/* Non-linear Regression * * Daniel Carrera (dcarrera math toronto edu) * * Suggested additions to regression.h and regression.c * Version 0.0.1 * * These additions add a non-linear regrssion function to Gnumeric * using the well-stablished Levenberg-Marquardt method. * * STATUS: * ======= * I prototyped and tested the method under Python first. There should * not be major problems with the algorithm. However, the C implementation * has received NO TESTING whatsoever. * * TODO: * ===== * 1.- Testing. * 2.- Integration with Gnumeric. * */ /* ---------------------------- regression.h ------------------------------- */ /* How do I make a RegressionFunction? * =================================== * * SYNOPSIS: * RegressionResult f(gnum_float * , gnum_float * , gnum_float * ) * * USAGE: * result = f(x, par, &y); * * DETAILS: * x -> independent variables. * par -> parameters. * * The function must: * - Be smooth on the parameters. * - Store the resulting value in 'y'. * * EXAMPLE: RegressionResult f (gnum_float * X, gnum_float * s, gnum_float *y) { gnum_float x = X[0]; gnum_float y = X[1]; *y = s[0] * sin( s[1]*x + s[2]*y ) + s[3]; return REG_ok; } */ /* ---------------------------- regression.h ------------------------------ */ #define DELTA 1e-6 typedef RegressionResult (*RegressionFunction) (gnum_float * x, gnum_float * params, gnum_float *f); /* ---------------------------- regression.c ------------------------------ */ /* * SYNOPSIS: * result = derivative( f, &df, x, par, i) * * Approximates the partial derivative of a given function, at (x;params) with respect * to the parameter indicated by ith parameter. The resulst is stored in 'df'. * * See the header file for more information. */ static RegressionResult derivative( RegressionFunction f, gnum_float * df, gnum_float * x, /* Only one point, not the whole data set. */ gnum_float * par, gint index ) { gnum_float y1, y2; RegressionResult result; par[index] -= DELTA; status = (*f)(x,par, &y1); if (status != REGRESSION_OK) return status; params[index] += 2*DELTA; status = (*f)(x,par, &y2); if (status != REGRESSION_OK) return status; #ifdef DEBUG printf("y1 = %lf\n",y1); printf("y2 = %lf\n",y2); printf("DELTA = %lf\n",DELTA); #endif *df = (y2 - y1) / (2*DELTA); return REG_ok; } /* * SYNOPSIS: * result = chi_derivative( f, &dchi, xvals, par, i, yvals, sigmas) * * This is a simple adaptation of the derivative() function specific to * the Chi Squared. */ static RegressionResult chi_derivative( RegressionFunction f, gnum_float * dchi, gnum_float * xvals, /* The entire data set, not just one point. */ gnum_float * par, gint index, gnum_float * yvals, /* Ditto. */ gnum_float * sigmas /* Ditto. */ ) { gnum_float y1, y2; RegressionResult result; par[index] -= DELTA; result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &y1); if (result != REG_ok) return result; params[index] += 2*DELTA; result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &y2); if (result != REG_ok) return result; #ifdef DEBUG printf("y1 = %lf\n",y1); printf("y2 = %lf\n",y2); printf("DELTA = %lf\n",DELTA); #endif *dchi = (y2 - y1) / (2*DELTA); return REG_ok; } /* * SYNOPSIS: * result = coefficient_matrix(A, f, xvals, par, yvals, sigmas, x_dim, p_dim, r) * * RETURNS: * The coefficient matrix of the LM method. * * DETAIS: * The coefficient matrix matrix is defined by * * N 1 df df * A = Sum ( ------- -- -- ( i == j? 1 + r : 1 ) ) * ij k=1 sigma^2 dp dp * k i j * * A -> p_dim X p_dim coefficient matrix. MUST ALREADY BE ALLOCATED. * * sigmas -> Measurement errors in the dataset (along the y-axis). * NULL means "no errors available", so they are all set to 1. * * x_dim -> Number of data points. * * p_dim -> Number of parameters. * * r -> Positive constant. It's value is altered during the LM procedure. * */ static RegressionResult coefficient_matrix( gnum_float ** A, /* Location where the matrix will be stored. */ RegressionFunction f, gnum_float ** xvals, /* The entire data set, not just one point. */ gnum_float * par, gnum_float * yvals, /* Ditto. */ gnum_float * sigmas, /* Ditto. */ gint x_dim, /* Number of data points. */ gint p_dim, /* Number of parameters. */ gnum_float r ) { gint i, j, k; RegressionResult result; gnum_float df_i, df_j; gnum_float sum, sigma; /* Notice that the matrix is symetric. */ for (i = 0; i < p_dim; i++) { for (j = 0; j <= i; j++) { sum = 0; for (k = 0; k < x_dim; x++) { result = derivative( f, &df_i, xvals[k], par, i); if (result != REG_ok) return result; result = derivative( f, &df_i, xvals[k], par, j); if (result != REG_ok) return result; sigma = (sigmas == NULL? 1 : sigmas[k]); sum = sum + 1 / (sigma*sigma) * df_i*df_j * (i == j? 1 + r : 1) ; } A[i][j] = A[j][i] = sum; } } return REG_ok; } /* * SYNOPSIS: * result = parameter_errors(f, xvals, par, yvals, sigmas, x_dim, p_dim, errors) * * Returns the errors associated with the parameters. * If an error is infinite, it is set to -1. * * sigmas -> Measurement errors in the dataset (along the y-axis). * NULL means "no errors available", so they are all set to 1. * * x_dim -> Number of data points. * * p_dim -> Number of parameters. * * errors -> MUST ALREADY BE ALLOCATED. */ /* FIXME: I am not happy with the behaviour with infinite errors. */ static RegressionResult parameter_errors( RegressionFunction f, gnum_float ** xvals, /* The entire data set, not just one point. */ gnum_float * par, gnum_float * yvals, /* Ditto. */ gnum_float * sigmas, /* Ditto. */ gint x_dim, /* Number of data points. */ gint p_dim, /* Number of parameters. */ gnum_float * errors ) { RegressionResult result; gnum_float ** A; gint i; ALLOC_MATRIX(A, p_dim, p_dim) result = coefficient_matrix(A, f, x, par, y, sigmas, x_dim, p_dim, 0) if (result != REG_ok) return result; for (i = 0; i < p_dim; i++) errors[i] = (A[i][j] != 0? 1/sqrt(A[i][j]) : -1 ); FREE_MATRIX(A, p_dim, p_dim); return REG_ok; } /* * SYNOPSIS: * result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chisq) * * / y - f(x ; par) \ 2 * 2 | i i | * Chi == Sum ( | ------------------ | ) * \ sigma / * i * * sigmas -> Measurement errors in the dataset (along the y-axis). * NULL means "no errors available", so they are all set to 1. * * x_dim -> Number of data points. * * This value is not very meaningful without the sigmas. However, it is still useful * for the fit. */ static RegressionResult chi_squared( RegressionFunction f, gnum_float ** xvals, /* The entire data set, not just one point. */ gnum_float * par, gnum_float * yvals, /* Ditto. */ gnum_float * sigmas, /* Ditto. */ gint x_dim, /* Number of data points. */ gnum_float * chisq /* Chi Squared */ ) { gint i; RegressionResult result; gnum_float tmp, y; *chisq = 0; for (i = 0; i < x_dim; i++) { result = f(xvals[i], par, &y); if (result != REG_ok) return result; tmp = (yvals[i] - y ) / (sigmas == NULL ? 1 : sigmas[i] ); *chisq += tmp*tmp; } return REG_ok; } /* * SYNOPSIS: * result = non_linear_regression(f, xvals, par, yvals, sigmas, x_dim, p_dim, &chi, errors) * * Returns the results of the non-linear regression from the given intial values. * The resulting parameters are placed back into 'par'. * * PARAMETERS: * * sigmas -> Measurement errors in the dataset (along the y-axis). * NULL means "no errors available", so they are all set to 1. * * x_dim -> Number of data points. * * p_dim -> Number of parameters. * * errors -> MUST ALREADY BE ALLOCATED. These are the approximated standard * deviation for each parameter. * * chi -> Chi Squared of the final result. This value is not very meaningful * without the sigmas. * */ static RegressionResult non_linear_regression( RegressionFunction f, gnum_float ** xvals, /* The entire data set, not just one point. */ gnum_float * par, gnum_float * yvals, /* Ditto. */ gnum_float * sigmas, /* Ditto. */ gint x_dim, /* Number of data points. */ gint p_dim, /* Number of parameters. */ gnum_float * chi, gnum_float * errors ) { gnum_float r = 0.001; /* Pick a conservative initial value. */ gnum_float * b; gnum_float * dpar; gnum_float * tmp_par; gnum_float chi_pre, chi_pro, dchi; RegressionResult result; gint i; result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chi_pos); if (result != REG_ok) return result; ALLOC_MATRIX(A, p_dim, p_dim); dpar = g_new(gnum_float, p_dim); tmp_par = g_new(gnum_float, p_dim); b = g_new(gnum_float, p_dim); #ifdef DEBUG printf("Chi Squared : %lf",chi_pre); #endif while (1) { for (i = 0; i < p_dim; i++) { /* * d Chi * b == ----- * k d p * k */ result = chi_derivative( f, &dchi, xvals, par, i, yvals, sigmas); if (result != REG_ok) return result; b[i] = - dchi; } result = coefficient_matrix(A, f, xvals, par, yvals, sigmas, x_dim, p_dim, r); if (result != REG_ok) return result; result = linear_solve(A, b, p_dim, dpar); if (result != REG_ok) return result; for(i = 0; i < p_dim; i++) tmp_par[i] = par[i] + dpar[i]; result = chi_squared( f, xvals, par, yvals, sigmas, x_dim, &chi_pos); if (result != REG_ok) return result; #ifdef DEBUG printf("Chi Squared : %lf",chi_pre); printf("Chi Squared : &lf",chi_pos); printf("r : %lf",r); #endif if (chi_pos <= chi_pre) { /* There is improvement */ r /= 10; par = tmp_par; if (abs(chi_pos - chi_pre) < DELTA) break; chi_pre = chi_pos; } else { r *= 10; } } result = parameter_errors(f, xvals, par, yvals, sigmas, x_dim, p_dim, errors); if (result != REG_ok) return result; *chi = chi_pos; FREE_MATRIX(A, p_dim, p_dim); g_free(dpar); g_free(tmp_par); g_free(b); return REG_ok; }