[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

non-linear regression



Hello,

I'm attaching my suggestions for the regression.h and regression.c files.
I do think that I have corrected all the problems that were brought up.

So... what's the next step?

Daniel.
/* Non-linear Regression
 *
 * Daniel Carrera (dcarrera math toronto edu)
 *
 * Suggested additions to regression.h and regression.c
 * Version 0.0.1
 *
 * These additions add a non-linear regrssion function to Gnumeric
 * using the well-stablished Levenberg-Marquardt method.
 *
 * STATUS:
 * =======
 *    I prototyped and tested the method under Python first.  There should
 * not be major problems with the algorithm.  However, the C implementation
 * has received NO TESTING whatsoever.
 *
 * TODO:
 * =====
 * 1.-	Testing.
 * 2.-  Integration with Gnumeric.
 *
 */

/* ---------------------------- regression.h ------------------------------- */

/* How do I make a RegressionFunction?
 * ===================================
 *
 * SYNOPSIS:	
 *		RegressionResult f(gnum_float * , gnum_float * , gnum_float * )
 *
 * USAGE:
 *		result = f(x, par, &y);
 *
 * DETAILS:
 * x	-> independent variables.
 * par	-> parameters.
 *
 * The function must:
 *		-  	Be smooth on the parameters.
 *		-	Store the resulting value in 'y'.
 *
 * EXAMPLE:
 	
RegressionResult 
f (gnum_float * X, gnum_float * s, gnum_float *y)
{	
	gnum_float x = X[0];
	gnum_float y = X[1];

	*y = s[0] * sin( s[1]*x + s[2]*y ) + s[3];
	
	return REG_ok;
}
 
*/

/* ----------------------------  regression.h ------------------------------ */

#define LARGE		1e6
#define SMALL		0.01
#define TINY		1e-10
#define MAX_STEPS	200    /* FIXME:  I pulled this number out of my hat.
							*      I need some testing to pick a sensible value.
							*/

typedef enum {
	REG_ok,
	REG_invalid_dimensions,
	REG_invalid_data,
	REG_not_enough_data,
	REG_near_singular_good, /* Probably good result */
	REG_near_singular_bad,  /* Probably bad result */
	REG_singular,
	REG_parameters_out_of_range  /* NEW */
} RegressionResult;


typedef RegressionResult 
        (*RegressionFunction) (gnum_float * x, gnum_float * params, gnum_float *f);


/* ----------------------------  regression.c ------------------------------ */

/*
 * SYNOPSIS:
 *   	result = derivative( f, &df, x, par, i, p_dim)
 *
 * Approximates the partial derivative of a given function, at (x;params) with respect
 * to the parameter indicated by ith parameter.  The resulst is stored in 'df'.
 *
 * See the header file for more information.
 */
static RegressionResult 
derivative(
	RegressionFunction f,
	gnum_float * df,
	gnum_float * x,			/* Only one point, not the whole data set. */
	gnum_float * par,
	gint index,
	gint p_dim	/* Number of parameters.  */
)
{
	gnum_float y1, y2;
	RegressionResult result;
	gnum_float * tmp_par;
	gnum_float delta;
	gint i;
	
	/* Make a copy of the parameters so we can alter it with impunity. */
	tmp_par = g_new(gnum_float, p_dim);
	for (i = 0; i < p_dim; i++)
		tmp_par[i] = par[i];
	
	delta = (tmp_par[index] == 0? abs(tmp_par[index]/LARGE) : TINY);
	
	tmp_par[index] -= delta;
	status = (*f)(x,par, &y1);
	if (status != REGRESSION_OK) {
		g_free(tmp_par);
		return status;
	}
	
	tmp_par[index] += 2*delta;
	status = (*f)(x,par, &y2);
	if (status != REGRESSION_OK) {
		g_free(tmp_par);
		return status;
	}
	
#ifdef DEBUG
	printf("y1 = %lf\n",y1);
	printf("y2 = %lf\n",y2);
	printf("DELTA = %lf\n",delta);
#endif
	
	*df = (y2 - y1) / (2*delta);
	
	g_free(tmp_par);
	return REG_ok;
}

/*
 * SYNOPSIS:
 *      result = chi_derivative( f, &dchi, xvals, par, i, yvals, sigmas, p_dim)
 *
 * This is a simple adaptation of the derivative() function specific to 
 * the Chi Squared.
 */ 
static RegressionResult
chi_derivative(
	RegressionFunction f,
	gnum_float * dchi,
	gnum_float * xvals,		/* The entire data set, not just one point. */
	gnum_float * par,
	gint index,
	gnum_float * yvals,		/* Ditto. */
	gnum_float * sigmas,	/* Ditto. */
	gint		 p_dim		/* Number of parameters.  */
)
{
	gnum_float y1, y2;
	RegressionResult result;
	gnum_float * tmp_par;
	gint i;
	
	/* Make a copy of the parameters so we can alter it with impunity. */
	tmp_par = g_new(gnum_float, p_dim);
	for (i = 0; i < p_dim; i++)
		tmp_par[i] = par[i];
	
	delta = (tmp_par[index] == 0? abs(tmp_par[index]/LARGE) : TINY);
	
	
	tmp_par[index] -= delta;
	result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &y1);
	if (result != REG_ok) {
		g_free(tmp_par);
		return result;
	}
	
	tmp_par[index] += 2*delta;
	result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &y2);
	if (result != REG_ok) {
		g_free(tmp_par);
		return result;
	}
	
#ifdef DEBUG
	printf("y1 = %lf\n",y1);
	printf("y2 = %lf\n",y2);
	printf("DELTA = %lf\n",delta);
#endif
	
	g_free(tmp_par);
	*dchi = (y2 - y1) / (2*delta);
	return REG_ok;
}


/*
 * SYNOPSIS:
 *		result = coefficient_matrix(A, f, xvals, par, yvals, sigmas, x_dim, p_dim, r)
 *
 * RETURNS:
 *		The coefficient matrix of the LM method.
 *	
 * DETAIS:
 *		The coefficient matrix matrix is defined by
 * 
 * 	          N        1      df  df
 *     A   = Sum  ( -------   --  --  ( i == j? 1 + r : 1 )  )
 *      ij   k=1    sigma^2   dp  dp
 *                       k      i   j
 *
 * A		->	p_dim X p_dim coefficient matrix.  MUST ALREADY BE ALLOCATED.
 *
 * sigmas	->	Measurement errors in the dataset (along the y-axis).
 *				NULL means "no errors available", so they are all set to 1.
 *
 * x_dim	->	Number of data points.
 *
 * p_dim	->	Number of parameters.
 *
 * r		->	Positive constant.  It's value is altered during the LM procedure. 
 * 
 */
static RegressionResult 
coefficient_matrix(
	gnum_float ** A,		/* Location where the matrix will be stored. */
	RegressionFunction f,
	gnum_float ** xvals,    /* The entire data set, not just one point. */
	gnum_float  * par,
	gnum_float  * yvals,    /* Ditto. */
	gnum_float  * sigmas,   /* Ditto. */
	gint 		  x_dim,	/* Number of data points. */
	gint		  p_dim,	/* Number of parameters.  */
	gnum_float    r
)
{
	gint i, j, k;
	RegressionResult result;
	gnum_float df_i, df_j;
	gnum_float sum, sigma;
	
	/* Notice that the matrix is symetric.  */
	for (i = 0; i < p_dim; i++) {
		for (j = 0; j <= i; j++) {
			sum = 0;
			for (k = 0; k < x_dim; x++) {
				result = derivative( f, &df_i, xvals[k], par, i, p_dim);
				if (result != REG_ok) 
					return result;
				
				result = derivative( f, &df_i, xvals[k], par, j, p_dim);
				if (result != REG_ok) 
					return result;
				
				sigma = (sigmas == NULL? 1 : sigmas[k]);
				
				sum = sum +   1 / (sigma*sigma) * df_i*df_j * (i == j? 1 + r : 1) ;
			}
			A[i][j] = A[j][i] = sum;
		}
	}
		
	return REG_ok;
}

/*
 * SYNOPSIS:
 * 		result = parameter_errors(f, xvals, par, yvals, sigmas, x_dim, p_dim, errors)
 * 
 * Returns the errors associated with the parameters.
 * If an error is infinite, it is set to -1.
 *
 * sigmas	->	Measurement errors in the dataset (along the y-axis).
 *				NULL means "no errors available", so they are all set to 1.
 *
 * x_dim	->	Number of data points.
 *
 * p_dim	->	Number of parameters.
 *
 * errors	->  MUST ALREADY BE ALLOCATED.
 */

/* FIXME:  I am not happy with the behaviour with infinite errors.  */
static RegressionResult
parameter_errors(
	RegressionFunction f,
	gnum_float ** xvals,    /* The entire data set, not just one point. */
	gnum_float  * par,
	gnum_float  * yvals,    /* Ditto. */
	gnum_float  * sigmas,   /* Ditto. */
	gint 		  x_dim,	/* Number of data points. */
	gint		  p_dim,	/* Number of parameters.  */
	gnum_float  * errors
)
{
	RegressionResult result;
	gnum_float ** A;
	gint i;
	
	ALLOC_MATRIX(A, p_dim, p_dim)
	
	result = coefficient_matrix(A, f, x, par, y, sigmas, x_dim, p_dim, 0)
	if (result != REG_ok) {
		FREE_MATRIX(A, p_dim, p_dim);
		return result;
	}
	
	for (i = 0; i < p_dim; i++)
		errors[i] = (A[i][j] != 0? 1/sqrt(A[i][j]) : -1 );
	
	FREE_MATRIX(A, p_dim, p_dim);
	return REG_ok;
}

/*
 * SYNOPSIS:
 * result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chisq)
 *    
 *						  /  y  - f(x ; par) \ 2
 * 	      2              |    i      i        |
 * 	   Chi   ==   Sum (  | ------------------ |   )
 * 	                      \     sigma        /
 * 	                                 i
 * 
 * sigmas	->	Measurement errors in the dataset (along the y-axis).
 *				NULL means "no errors available", so they are all set to 1.
 *
 * x_dim	->	Number of data points.
 *
 * This value is not very meaningful without the sigmas.  However, it is still useful
 * for the fit.
 */
static RegressionResult
chi_squared(
	RegressionFunction f,
	gnum_float ** xvals,    /* The entire data set, not just one point. */
	gnum_float  * par,
	gnum_float  * yvals,    /* Ditto. */
	gnum_float  * sigmas,   /* Ditto. */
	gint 		  x_dim,	/* Number of data points. */
	gnum_float  * chisq     /* Chi Squared */
)
{
	gint i;
	RegressionResult result;
	gnum_float tmp, y;
	*chisq = 0;
	
	for (i = 0; i < x_dim; i++) {
		result = f(xvals[i], par, &y);
		if (result != REG_ok)
			return result;
		
		tmp = (yvals[i] - y ) / (sigmas == NULL ? 1 : sigmas[i] );
		
		*chisq += tmp*tmp;
	}
	
	return REG_ok;
}

/*
 * SYNOPSIS:
 *		result = non_linear_regression(f, xvals, par, yvals, sigmas, x_dim, p_dim,
 *                                     min_par, max_par, &chi, errors)
 * 
 * Returns the results of the non-linear regression from the given intial values.
 * The resulting parameters are placed back into 'par'.
 *		
 * PARAMETERS:
 *
 * sigmas	->	Measurement errors in the dataset (along the y-axis).
 *				NULL means "no errors available", so they are all set to 1.
 *
 * min_par 	->	Minimum accepted values for the parameterrs.
 *
 * max_par 	->	Maximum accepted values for the parameterrs.
 *
 * x_dim	->	Number of data points.
 *
 * p_dim	->	Number of parameters.
 *
 * errors	->  MUST ALREADY BE ALLOCATED.  These are the approximated standard
 *				deviation for each parameter.
 *
 * chi		->	Chi Squared of the final result.  This value is not very meaningful
 *				without the sigmas.
 *		
 */
static RegressionResult
non_linear_regression( 
	RegressionFunction f,
	gnum_float ** xvals,    /* The entire data set, not just one point. */
	gnum_float  * par,
	gnum_float  * yvals,    /* Ditto. */
	gnum_float  * sigmas,   /* Ditto. */
	gint  	      x_dim,	/* Number of data points. */
	gint  	      p_dim,	/* Number of parameters.  */
	gnum_float  * min_par,
	gnum_float  * max_par,
	gnum_float  * chi,
	gnum_float  * errors
)
{
	gnum_float   r = 0.001; /* Pick a conservative initial value. */
	gnum_float * b;
	gnum_float * dpar;
	gnum_float * tmp_par;
	gnum_float chi_pre, chi_pro, dchi;
	RegressionResult result;
	gint i, count;
	
	result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chi_pos);
	if (result != REG_ok)
	    return result;
	
	/* chi_pos will be re-evaluated with new parameters before we
	 * compare chi_pre and chi_pos .
	 */
	chi_pre = chi_pos; 
	
	ALLOC_MATRIX(A, p_dim, p_dim);
	dpar    = g_new(gnum_float, p_dim);
	tmp_par = g_new(gnum_float, p_dim);
	b       = g_new(gnum_float, p_dim);
	
#ifdef DEBUG
	printf("Chi Squared : %lf",chi_pre);
#endif
	
	result = REG_ok;
	for (count = 0; count < MAX_STEPS; count++) { 
		for (i = 0; i < p_dim; i++) {
			
			/*                d Chi
			 *  b   == ( - 1) -----
			 *   k            d p
			 *                   k
			 */
			result = chi_derivative( f, &dchi, xvals, par, i, yvals, sigmas, p_dim);
			if (result != REG_ok) break;
			
			b[i] = - dchi;
		}
		if (result != REG_ok) break;
		
		result = coefficient_matrix(A, f, xvals, par, yvals, sigmas, x_dim, p_dim, r);
		if (result != REG_ok) break;
			
		result = linear_solve(A, b, p_dim, dpar);
		if (result != REG_ok) break;
		
		for (i = 0; i < p_dim; i++) {
			tmp_par[i] = par[i] + dpar[i];

			/* Check that we're still in the acceptable region. */
			if ( (tmp_par[i] => min_par[i]) && (tmp_par[i] <= max_par[i]) )
			    continue;

			if (  abs(dpar[i]/par[i]) < TINY  ) {
				result = REG_parameters_out_of_range;
				break;
			}
		}
		if (result != REG_ok) break;
		
		result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chi_pos);
		if (result != REG_ok) break;
			
#ifdef DEBUG
		printf("Chi Squared : %lf",chi_pre);
		printf("Chi Squared : &lf",chi_pos);
		printf("r  :  %lf",r);
#endif
		
		if (chi_pos <= chi_pre + SMALL/2) { /* There is improvement */
			r /= 10;
			
			for (i = 0; i < p_dim; i++)
				par[i] = tmp_par[i];
			
			if (abs(chi_pos - chi_pre) < SMALL)
				break;
			
			chi_pre = chi_pos;
		} else {
			r *= 10;
		}
	}
	
	if (result != REG_ok ) {
		FREE_MATRIX(A, p_dim, p_dim);
		g_free(dpar);
		g_free(tmp_par);
		g_free(b);
		return result;
	}
			
	
	result = parameter_errors(f, xvals, par, yvals, sigmas, x_dim, p_dim, errors);
	if (result != REG_ok) {
		FREE_MATRIX(A, p_dim, p_dim);
		g_free(dpar);
		g_free(tmp_par);
		g_free(b);
		return result;
	}
		
	*chi = chi_pos;
	
	FREE_MATRIX(A, p_dim, p_dim);
	g_free(dpar);
	g_free(tmp_par);
	g_free(b);
	
	return REG_ok;
} 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]