[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index]
[Date Index]
[Author Index]
non-linear regression
- From: Daniel Carrera <dcarrera math toronto edu>
- To: Gnumeric Mailing List <gnumeric-list gnome org>
- Subject: non-linear regression
- Date: Wed, 5 Jun 2002 09:12:29 -0400 (EDT)
Hello,
I'm attaching my suggestions for the regression.h and regression.c files.
I do think that I have corrected all the problems that were brought up.
So... what's the next step?
Daniel.
/* Non-linear Regression
*
* Daniel Carrera (dcarrera math toronto edu)
*
* Suggested additions to regression.h and regression.c
* Version 0.0.1
*
* These additions add a non-linear regrssion function to Gnumeric
* using the well-stablished Levenberg-Marquardt method.
*
* STATUS:
* =======
* I prototyped and tested the method under Python first. There should
* not be major problems with the algorithm. However, the C implementation
* has received NO TESTING whatsoever.
*
* TODO:
* =====
* 1.- Testing.
* 2.- Integration with Gnumeric.
*
*/
/* ---------------------------- regression.h ------------------------------- */
/* How do I make a RegressionFunction?
* ===================================
*
* SYNOPSIS:
* RegressionResult f(gnum_float * , gnum_float * , gnum_float * )
*
* USAGE:
* result = f(x, par, &y);
*
* DETAILS:
* x -> independent variables.
* par -> parameters.
*
* The function must:
* - Be smooth on the parameters.
* - Store the resulting value in 'y'.
*
* EXAMPLE:
RegressionResult
f (gnum_float * X, gnum_float * s, gnum_float *y)
{
gnum_float x = X[0];
gnum_float y = X[1];
*y = s[0] * sin( s[1]*x + s[2]*y ) + s[3];
return REG_ok;
}
*/
/* ---------------------------- regression.h ------------------------------ */
#define LARGE 1e6
#define SMALL 0.01
#define TINY 1e-10
#define MAX_STEPS 200 /* FIXME: I pulled this number out of my hat.
* I need some testing to pick a sensible value.
*/
typedef enum {
REG_ok,
REG_invalid_dimensions,
REG_invalid_data,
REG_not_enough_data,
REG_near_singular_good, /* Probably good result */
REG_near_singular_bad, /* Probably bad result */
REG_singular,
REG_parameters_out_of_range /* NEW */
} RegressionResult;
typedef RegressionResult
(*RegressionFunction) (gnum_float * x, gnum_float * params, gnum_float *f);
/* ---------------------------- regression.c ------------------------------ */
/*
* SYNOPSIS:
* result = derivative( f, &df, x, par, i, p_dim)
*
* Approximates the partial derivative of a given function, at (x;params) with respect
* to the parameter indicated by ith parameter. The resulst is stored in 'df'.
*
* See the header file for more information.
*/
static RegressionResult
derivative(
RegressionFunction f,
gnum_float * df,
gnum_float * x, /* Only one point, not the whole data set. */
gnum_float * par,
gint index,
gint p_dim /* Number of parameters. */
)
{
gnum_float y1, y2;
RegressionResult result;
gnum_float * tmp_par;
gnum_float delta;
gint i;
/* Make a copy of the parameters so we can alter it with impunity. */
tmp_par = g_new(gnum_float, p_dim);
for (i = 0; i < p_dim; i++)
tmp_par[i] = par[i];
delta = (tmp_par[index] == 0? abs(tmp_par[index]/LARGE) : TINY);
tmp_par[index] -= delta;
status = (*f)(x,par, &y1);
if (status != REGRESSION_OK) {
g_free(tmp_par);
return status;
}
tmp_par[index] += 2*delta;
status = (*f)(x,par, &y2);
if (status != REGRESSION_OK) {
g_free(tmp_par);
return status;
}
#ifdef DEBUG
printf("y1 = %lf\n",y1);
printf("y2 = %lf\n",y2);
printf("DELTA = %lf\n",delta);
#endif
*df = (y2 - y1) / (2*delta);
g_free(tmp_par);
return REG_ok;
}
/*
* SYNOPSIS:
* result = chi_derivative( f, &dchi, xvals, par, i, yvals, sigmas, p_dim)
*
* This is a simple adaptation of the derivative() function specific to
* the Chi Squared.
*/
static RegressionResult
chi_derivative(
RegressionFunction f,
gnum_float * dchi,
gnum_float * xvals, /* The entire data set, not just one point. */
gnum_float * par,
gint index,
gnum_float * yvals, /* Ditto. */
gnum_float * sigmas, /* Ditto. */
gint p_dim /* Number of parameters. */
)
{
gnum_float y1, y2;
RegressionResult result;
gnum_float * tmp_par;
gint i;
/* Make a copy of the parameters so we can alter it with impunity. */
tmp_par = g_new(gnum_float, p_dim);
for (i = 0; i < p_dim; i++)
tmp_par[i] = par[i];
delta = (tmp_par[index] == 0? abs(tmp_par[index]/LARGE) : TINY);
tmp_par[index] -= delta;
result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &y1);
if (result != REG_ok) {
g_free(tmp_par);
return result;
}
tmp_par[index] += 2*delta;
result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &y2);
if (result != REG_ok) {
g_free(tmp_par);
return result;
}
#ifdef DEBUG
printf("y1 = %lf\n",y1);
printf("y2 = %lf\n",y2);
printf("DELTA = %lf\n",delta);
#endif
g_free(tmp_par);
*dchi = (y2 - y1) / (2*delta);
return REG_ok;
}
/*
* SYNOPSIS:
* result = coefficient_matrix(A, f, xvals, par, yvals, sigmas, x_dim, p_dim, r)
*
* RETURNS:
* The coefficient matrix of the LM method.
*
* DETAIS:
* The coefficient matrix matrix is defined by
*
* N 1 df df
* A = Sum ( ------- -- -- ( i == j? 1 + r : 1 ) )
* ij k=1 sigma^2 dp dp
* k i j
*
* A -> p_dim X p_dim coefficient matrix. MUST ALREADY BE ALLOCATED.
*
* sigmas -> Measurement errors in the dataset (along the y-axis).
* NULL means "no errors available", so they are all set to 1.
*
* x_dim -> Number of data points.
*
* p_dim -> Number of parameters.
*
* r -> Positive constant. It's value is altered during the LM procedure.
*
*/
static RegressionResult
coefficient_matrix(
gnum_float ** A, /* Location where the matrix will be stored. */
RegressionFunction f,
gnum_float ** xvals, /* The entire data set, not just one point. */
gnum_float * par,
gnum_float * yvals, /* Ditto. */
gnum_float * sigmas, /* Ditto. */
gint x_dim, /* Number of data points. */
gint p_dim, /* Number of parameters. */
gnum_float r
)
{
gint i, j, k;
RegressionResult result;
gnum_float df_i, df_j;
gnum_float sum, sigma;
/* Notice that the matrix is symetric. */
for (i = 0; i < p_dim; i++) {
for (j = 0; j <= i; j++) {
sum = 0;
for (k = 0; k < x_dim; x++) {
result = derivative( f, &df_i, xvals[k], par, i, p_dim);
if (result != REG_ok)
return result;
result = derivative( f, &df_i, xvals[k], par, j, p_dim);
if (result != REG_ok)
return result;
sigma = (sigmas == NULL? 1 : sigmas[k]);
sum = sum + 1 / (sigma*sigma) * df_i*df_j * (i == j? 1 + r : 1) ;
}
A[i][j] = A[j][i] = sum;
}
}
return REG_ok;
}
/*
* SYNOPSIS:
* result = parameter_errors(f, xvals, par, yvals, sigmas, x_dim, p_dim, errors)
*
* Returns the errors associated with the parameters.
* If an error is infinite, it is set to -1.
*
* sigmas -> Measurement errors in the dataset (along the y-axis).
* NULL means "no errors available", so they are all set to 1.
*
* x_dim -> Number of data points.
*
* p_dim -> Number of parameters.
*
* errors -> MUST ALREADY BE ALLOCATED.
*/
/* FIXME: I am not happy with the behaviour with infinite errors. */
static RegressionResult
parameter_errors(
RegressionFunction f,
gnum_float ** xvals, /* The entire data set, not just one point. */
gnum_float * par,
gnum_float * yvals, /* Ditto. */
gnum_float * sigmas, /* Ditto. */
gint x_dim, /* Number of data points. */
gint p_dim, /* Number of parameters. */
gnum_float * errors
)
{
RegressionResult result;
gnum_float ** A;
gint i;
ALLOC_MATRIX(A, p_dim, p_dim)
result = coefficient_matrix(A, f, x, par, y, sigmas, x_dim, p_dim, 0)
if (result != REG_ok) {
FREE_MATRIX(A, p_dim, p_dim);
return result;
}
for (i = 0; i < p_dim; i++)
errors[i] = (A[i][j] != 0? 1/sqrt(A[i][j]) : -1 );
FREE_MATRIX(A, p_dim, p_dim);
return REG_ok;
}
/*
* SYNOPSIS:
* result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chisq)
*
* / y - f(x ; par) \ 2
* 2 | i i |
* Chi == Sum ( | ------------------ | )
* \ sigma /
* i
*
* sigmas -> Measurement errors in the dataset (along the y-axis).
* NULL means "no errors available", so they are all set to 1.
*
* x_dim -> Number of data points.
*
* This value is not very meaningful without the sigmas. However, it is still useful
* for the fit.
*/
static RegressionResult
chi_squared(
RegressionFunction f,
gnum_float ** xvals, /* The entire data set, not just one point. */
gnum_float * par,
gnum_float * yvals, /* Ditto. */
gnum_float * sigmas, /* Ditto. */
gint x_dim, /* Number of data points. */
gnum_float * chisq /* Chi Squared */
)
{
gint i;
RegressionResult result;
gnum_float tmp, y;
*chisq = 0;
for (i = 0; i < x_dim; i++) {
result = f(xvals[i], par, &y);
if (result != REG_ok)
return result;
tmp = (yvals[i] - y ) / (sigmas == NULL ? 1 : sigmas[i] );
*chisq += tmp*tmp;
}
return REG_ok;
}
/*
* SYNOPSIS:
* result = non_linear_regression(f, xvals, par, yvals, sigmas, x_dim, p_dim,
* min_par, max_par, &chi, errors)
*
* Returns the results of the non-linear regression from the given intial values.
* The resulting parameters are placed back into 'par'.
*
* PARAMETERS:
*
* sigmas -> Measurement errors in the dataset (along the y-axis).
* NULL means "no errors available", so they are all set to 1.
*
* min_par -> Minimum accepted values for the parameterrs.
*
* max_par -> Maximum accepted values for the parameterrs.
*
* x_dim -> Number of data points.
*
* p_dim -> Number of parameters.
*
* errors -> MUST ALREADY BE ALLOCATED. These are the approximated standard
* deviation for each parameter.
*
* chi -> Chi Squared of the final result. This value is not very meaningful
* without the sigmas.
*
*/
static RegressionResult
non_linear_regression(
RegressionFunction f,
gnum_float ** xvals, /* The entire data set, not just one point. */
gnum_float * par,
gnum_float * yvals, /* Ditto. */
gnum_float * sigmas, /* Ditto. */
gint x_dim, /* Number of data points. */
gint p_dim, /* Number of parameters. */
gnum_float * min_par,
gnum_float * max_par,
gnum_float * chi,
gnum_float * errors
)
{
gnum_float r = 0.001; /* Pick a conservative initial value. */
gnum_float * b;
gnum_float * dpar;
gnum_float * tmp_par;
gnum_float chi_pre, chi_pro, dchi;
RegressionResult result;
gint i, count;
result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chi_pos);
if (result != REG_ok)
return result;
/* chi_pos will be re-evaluated with new parameters before we
* compare chi_pre and chi_pos .
*/
chi_pre = chi_pos;
ALLOC_MATRIX(A, p_dim, p_dim);
dpar = g_new(gnum_float, p_dim);
tmp_par = g_new(gnum_float, p_dim);
b = g_new(gnum_float, p_dim);
#ifdef DEBUG
printf("Chi Squared : %lf",chi_pre);
#endif
result = REG_ok;
for (count = 0; count < MAX_STEPS; count++) {
for (i = 0; i < p_dim; i++) {
/* d Chi
* b == ( - 1) -----
* k d p
* k
*/
result = chi_derivative( f, &dchi, xvals, par, i, yvals, sigmas, p_dim);
if (result != REG_ok) break;
b[i] = - dchi;
}
if (result != REG_ok) break;
result = coefficient_matrix(A, f, xvals, par, yvals, sigmas, x_dim, p_dim, r);
if (result != REG_ok) break;
result = linear_solve(A, b, p_dim, dpar);
if (result != REG_ok) break;
for (i = 0; i < p_dim; i++) {
tmp_par[i] = par[i] + dpar[i];
/* Check that we're still in the acceptable region. */
if ( (tmp_par[i] => min_par[i]) && (tmp_par[i] <= max_par[i]) )
continue;
if ( abs(dpar[i]/par[i]) < TINY ) {
result = REG_parameters_out_of_range;
break;
}
}
if (result != REG_ok) break;
result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chi_pos);
if (result != REG_ok) break;
#ifdef DEBUG
printf("Chi Squared : %lf",chi_pre);
printf("Chi Squared : &lf",chi_pos);
printf("r : %lf",r);
#endif
if (chi_pos <= chi_pre + SMALL/2) { /* There is improvement */
r /= 10;
for (i = 0; i < p_dim; i++)
par[i] = tmp_par[i];
if (abs(chi_pos - chi_pre) < SMALL)
break;
chi_pre = chi_pos;
} else {
r *= 10;
}
}
if (result != REG_ok ) {
FREE_MATRIX(A, p_dim, p_dim);
g_free(dpar);
g_free(tmp_par);
g_free(b);
return result;
}
result = parameter_errors(f, xvals, par, yvals, sigmas, x_dim, p_dim, errors);
if (result != REG_ok) {
FREE_MATRIX(A, p_dim, p_dim);
g_free(dpar);
g_free(tmp_par);
g_free(b);
return result;
}
*chi = chi_pos;
FREE_MATRIX(A, p_dim, p_dim);
g_free(dpar);
g_free(tmp_par);
g_free(b);
return REG_ok;
}
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index]
[Date Index]
[Author Index]