? gnumeric.patch
Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/gnumeric/ChangeLog,v
retrieving revision 1.2451
diff -u -3 -p -r1.2451 ChangeLog
--- ChangeLog	5 Oct 2002 02:54:51 -0000	1.2451
+++ ChangeLog	6 Oct 2002 10:25:46 -0000
@@ -1,3 +1,11 @@
+2002-10-06  Olaf Till
+
+	* src/regression.[c,h] (logarithmic_regression,
+	  logarithmic_fit) : 2 new functions.
+
+	* plugins/fn-stat/functions.c (gnumeric_logreg,
+	  gnumeric_logfit) : interface for above functions.
+
 2002-10-04  Jody Goldberg
 
 	* src/workbook.c (workbook_metadata) : new.
Index: plugins/fn-stat/functions.c
===================================================================
RCS file: /cvs/gnome/gnumeric/plugins/fn-stat/functions.c,v
retrieving revision 1.188
diff -u -3 -p -r1.188 functions.c
--- plugins/fn-stat/functions.c	22 Sep 2002 21:17:37 -0000	1.188
+++ plugins/fn-stat/functions.c	6 Oct 2002 10:25:52 -0000
@@ -4069,6 +4069,384 @@ gnumeric_linest (FunctionEvalInfo *ei, V
 
 /***************************************************************************/
 
+static const char *help_logreg = {
+        N_("@FUNCTION=LOGREG\n"
+           "@SYNTAX=LOGREG(known_y's[,known_x's[,const[,stat]]])\n"
+
+           "@DESCRIPTION="
+           "LOGREG function transforms your x's to z=log(x) and "
+           "applies the ``least squares'' method to fit the linear equation\n"
+           "y = m * z + b \n"
+           "to your y's and z's --- equivalent to fitting the equation\n"
+           "y = m * log(x) + b \n"
+           "to y's and x's. \n"
+           "\n"
+           "If @known_x's is omitted, an array {1, 2, 3, ...} is used. "
+           "LOGREG returns an array having two columns and one row. "
+           "m is given in the first column and b in the second. \n"
+           "\n"
+           "If @known_y's and @known_x's have unequal number of data points, "
+           "LOGREG returns #NUM! error.\n"
+           "\n"
+           "If @const is FALSE, the line will be forced to go through [1; 0], "
+           "i.e., b will be zero. The default is TRUE.\n"
+           "\n"
+           "If @stat is TRUE, extra statistical information will be returned "
+           "which applies to the data AFTER transformation to z, "
+           "assuming that the z = log(x) are normally distributed. "
+           "Extra statistical information is written below m and b in the "
+           "result array. Extra statistical information consists of four "
+           "rows of data. In the first row the standard error values for the "
+           "coefficients m, b are represented. The second row "
+           "contains the square of R and the standard error for the y "
+           "estimate. The third row contains the F-observed value and the "
+           "degrees of freedom. The last row contains the regression sum "
+           "of squares and the residual sum of squares. "
+           "The default of @stat is FALSE.\n"
+           "@EXAMPLES=\n"
+           "\n"
+           "@SEEALSO=LOGFIT,LINEST,LOGEST")
+};
+/* The following is a copy of "gnumeric_linest" of Gnumeric version 1.1.9
+ * with "linear_regression" replaced by "logarithmic_regression".
+ *
+ * In Excel, this functionality is not available as a function, but only
+ * as a "trend curve" within graphs.
+ *
+ * The function "logarithmic_regression" transforms x's logarithmically
+ * before calling "general_linear_regression" written by others.
+ *
+ * I do not know if in statistical practice logarithmically transformed
+ * x-data is useful for *multidimensional* regression, and also whether
+ * extra statistical data is useful in this case, but since
+ * "general_linear_regression" written by others provides it, I have passed
+ * this functionality on to the user.
+ * But see the comment to "gnumeric_linest" for a problem with reading more
+ * than one x-range.
+ */
+
+static Value *
+gnumeric_logreg (FunctionEvalInfo *ei, Value *argv[])
+{
+        gnum_float **xss = NULL, *ys = NULL;
+        Value *result = NULL;
+        int nx, ny, dim, i;
+        int xarg = 0;
+        gnum_float *logreg_res = NULL;
+        gboolean affine, stat, err;
+        enum {
+                ARRAY = 1,
+                SINGLE_COL = 2,
+                SINGLE_ROW = 3,
+                OTHER = 4
+        } ytype;
+        regression_stat_t *extra_stat;
+
+        extra_stat = regression_stat_new ();
+        dim = 0;
+
+        if (argv[0] == NULL || (argv[0]->type != VALUE_ARRAY && argv[0]->type != VALUE_CELLRANGE)) {
+                goto out; /* Not a valid input for ys */
+        }
+
+        if (argv[0]->type == VALUE_ARRAY)
+                ytype = ARRAY;
+        else if (argv[0]->v_range.cell.a.col == argv[0]->v_range.cell.b.col)
+                ytype = SINGLE_COL;
+        else if (argv[0]->v_range.cell.a.row == argv[0]->v_range.cell.b.row)
+                ytype = SINGLE_ROW;
+        else ytype = OTHER;
+
+        if (argv[0]->type == VALUE_CELLRANGE)
+                ys = collect_floats_value (argv[0], ei->pos,
+                                           COLLECT_IGNORE_STRINGS |
+                                           COLLECT_IGNORE_BOOLS,
+                                           &ny, &result);
+        else if (argv[0]->type == VALUE_ARRAY) {
+                /*
+                 * Get ys from array argument argv[0]
+                 */
+        }
+
+        if (result)
+                goto out;
+
+        /* TODO Better error-checking in next statement */
+
+        if (argv[1] == NULL || (ytype == ARRAY && argv[1]->type != VALUE_ARRAY) ||
+            (ytype != ARRAY && argv[1]->type != VALUE_CELLRANGE)) {
+                dim = 1;
+                xss = g_new (gnum_float *, 1);
+                xss[0] = g_new (gnum_float, ny);
+                for (nx = 0; nx < ny; nx++)
+                        xss[0][nx] = nx + 1;
+        }
+        else if (ytype == ARRAY) {
+                xarg = 1;
+                /* Get xss from array argument argv[1] */
+        }
+        else if (ytype == SINGLE_COL) {
+                int firstcol, lastcol;
+                Value *copy;
+                xarg = 1;
+                firstcol = argv[1]->v_range.cell.a.col;
+                lastcol = argv[1]->v_range.cell.b.col;
+
+                if (firstcol > lastcol) {
+                        int tmp = firstcol;
+                        firstcol = lastcol;
+                        lastcol = tmp;
+                }
+
+                dim = lastcol - firstcol + 1;
+                copy = value_duplicate (argv[1]);
+                xss = g_new (gnum_float *, dim);
+                for (i = firstcol; i <= lastcol; i++) {
+                        copy->v_range.cell.a.col = i;
+                        copy->v_range.cell.b.col = i;
+                        xss[i - firstcol] = collect_floats_value (copy, ei->pos,
+                                                COLLECT_IGNORE_STRINGS |
+                                                COLLECT_IGNORE_BOOLS,
+                                                &nx, &result);
+                        if (result) {
+                                value_release (copy);
+                                dim = i - firstcol; /* How many got allocated before failure */
+                                goto out;
+                        }
+                        if (nx != ny) {
+                                value_release (copy);
+                                dim = i - firstcol + 1;
+                                result = value_new_error (ei->pos, gnumeric_err_NUM);
+                                goto out;
+                        }
+                }
+                value_release (copy);
+        }
+        else if (ytype == SINGLE_ROW) {
+                int firstrow, lastrow;
+                Value *copy;
+                xarg = 1;
+                firstrow = argv[1]->v_range.cell.a.row;
+                lastrow = argv[1]->v_range.cell.b.row;
+
+                if (firstrow > lastrow) {
+                        int tmp = firstrow;
+                        firstrow = lastrow;
+                        lastrow = tmp;
+                }
+
+                dim = lastrow - firstrow + 1;
+                copy = value_duplicate (argv[1]);
+                xss = g_new (gnum_float *, dim);
+                for (i = firstrow; i <= lastrow; i++) {
+                        copy->v_range.cell.a.row = i;
+                        copy->v_range.cell.b.row = i;
+                        xss[i - firstrow] = collect_floats_value (copy, ei->pos,
+                                                COLLECT_IGNORE_STRINGS |
+                                                COLLECT_IGNORE_BOOLS,
+                                                &nx, &result);
+                        if (result) {
+                                value_release (copy);
+                                dim = i - firstrow; /* How many got allocated before failure */
+                                goto out;
+                        }
+                        if (nx != ny) {
+                                value_release (copy);
+                                dim = i - firstrow + 1;
+                                result = value_new_error (ei->pos, gnumeric_err_NUM);
+                                goto out;
+                        }
+                }
+                value_release (copy);
+        }
+        else { /* Y is none of the above */
+                xarg = 1;
+                dim = 1;
+                xss = g_new (gnum_float *, dim);
+                xss[0] = collect_floats_value (argv[1], ei->pos,
+                                               COLLECT_IGNORE_STRINGS |
+                                               COLLECT_IGNORE_BOOLS,
+                                               &nx, &result);
+                if (result) {
+                        dim = 0;
+                        goto out;
+                }
+                if (nx != ny) {
+                        dim = 1;
+                        result = value_new_error (ei->pos, gnumeric_err_NUM);
+                        goto out;
+                }
+        }
+
+        if (argv[1 + xarg]) {
+                affine = value_get_as_bool (argv[1 + xarg], &err);
+                if (err) {
+                        result = value_new_error (ei->pos, gnumeric_err_VALUE);
+                        goto out;
+                }
+        } else
+                affine = TRUE;
+
+        if (argv[2 + xarg]) {
+                stat = value_get_as_bool (argv[2 + xarg], &err);
+                if (err) {
+                        result = value_new_error (ei->pos,
+                                                  gnumeric_err_VALUE);
+                        goto out;
+                }
+        } else
+                stat = FALSE;
+
+        logreg_res = g_new (gnum_float, dim + 1);
+
+        if (logarithmic_regression (xss, dim, ys, nx, affine,
+                                    logreg_res, extra_stat) != REG_ok) {
+                result = value_new_error (ei->pos, gnumeric_err_NUM);
+                goto out;
+        }
+
+        if (stat) {
+                result = value_new_array (dim + 1, 5);
+
+                value_array_set (result, 0, 2,
+                                 value_new_float (extra_stat->sqr_r));
+                value_array_set (result, 1, 2,
+                                 value_new_float (sqrtgnum (extra_stat->var)));
+                value_array_set (result, 0, 3,
+                                 value_new_float (extra_stat->F));
+                value_array_set (result, 1, 3,
+                                 value_new_float (extra_stat->df_resid));
+                value_array_set (result, 0, 4,
+                                 value_new_float (extra_stat->ss_reg));
+                value_array_set (result, 1, 4,
+                                 value_new_float (extra_stat->ss_resid));
+                for (i = 0; i < dim; i++)
+                        value_array_set (result, dim - i - 1, 1,
+                                         value_new_float (extra_stat->se[i + affine]));
+                value_array_set (result, dim, 1,
+                                 value_new_float (extra_stat->se[0]));
+        } else
+                result = value_new_array (dim + 1, 1);
+
+        value_array_set (result, dim, 0, value_new_float (logreg_res[0]));
+        for (i = 0; i < dim; i++)
+                value_array_set (result, dim - i - 1, 0,
+                                 value_new_float (logreg_res[i + 1]));
+
+ out:
+        if (xss) {
+                for (i = 0; i < dim; i++)
+                        g_free (xss[i]);
+                g_free (xss);
+        }
+        g_free (ys);
+        g_free (logreg_res);
+        regression_stat_destroy (extra_stat);
+
+        return result;
+}
+
+/***************************************************************************/
+
+static const char *help_logfit = {
+        N_("@FUNCTION=LOGFIT\n"
+           "@SYNTAX=LOGFIT(known_y's,known_x's)\n"
+
+           "@DESCRIPTION="
+           "LOGFIT function applies the ``least squares'' method to fit "
+           "the logarithmic equation\n"
+           "y = a + b * log(sign * (x - c)) , sign = +1 or -1 \n"
+           "to your data. The graph of the equation is a logarithmic curve "
+           "moved horizontally by c and possibly mirrored across the y-axis "
+           "(if sign = -1).\n"
+           "\n"
+           "LOGFIT returns an array having five columns and one row. "
+           "`Sign' is given in the first column, `a', `b', and `c' are "
+           "given in columns 2 to 4. Column 5 holds the sum of squared "
+           "residuals.\n"
+           "\n"
+           "An error is returned when there are fewer than 3 different x's "
+           "or y's, or when the shape of the point cloud is too different "
+           "from a ``logarithmic'' one.\n"
+           "\n"
+           "You can use the above formula \n"
+           "= a + b * log(sign * (x - c)) \n"
+           "or rearrange it to \n"
+           "= (exp((y - a) / b)) / sign + c \n"
+           "to compute unknown y's or x's, respectively. \n"
+           "\n"
+           "Technically, this is non-linear fitting by trial-and-error. "
+           "The accuracy of `c' is: the width of the x-range, rounded down "
+           "to the next smaller power of 10, times 0.000001. There might be "
+           "cases in which the returned fit is not the best possible.\n"
+           "@EXAMPLES=\n"
+           "\n"
+           "@SEEALSO=LOGREG,LINEST,LOGEST")
+};
+
+/* This function is not available in Excel.
+ * It is intended for calculation of unknowns from a calibration curve.
+ * It adapts well to some types of scientific data.
+ * It does not do multidimensional regression or extra statistics.
+ *
+ * One could do this kind of non-linear fitting with a general solver, too,
+ * but then success depends on choosing suitable starting values.
+ * Also, determination of `sign' would be complicated.
+ */
+static Value *
+gnumeric_logfit (FunctionEvalInfo *ei, Value *argv[])
+{
+        gnum_float *xs = NULL, *ys = NULL;
+        Value *result = NULL;
+        int nx, ny, i;
+        gnum_float *logfit_res = NULL;
+
+        if (argv[0] == NULL || argv[0]->type != VALUE_CELLRANGE)
+                goto out;
+        ys = collect_floats_value (argv[0], ei->pos,
+                                   COLLECT_IGNORE_BLANKS |  /* zeroing blanks is
+                                        prone to produce unwanted results */
+                                   COLLECT_IGNORE_STRINGS | /* dangerous, the user
+                                        should use the validation tool to prevent
+                                        erroneously inputting strings instead of
+                                        numbers */
+                                   COLLECT_IGNORE_BOOLS,
+                                   &ny, &result);
+        if (result)
+                goto out;
+        if (argv[1] == NULL || argv[1]->type != VALUE_CELLRANGE)
+                goto out;
+        xs = collect_floats_value (argv[1], ei->pos,
+                                   COLLECT_IGNORE_BLANKS |
+                                   COLLECT_IGNORE_STRINGS |
+                                   COLLECT_IGNORE_BOOLS,
+                                   &nx, &result);
+        if (result)
+                goto out;
+        if (nx != ny || nx < 3) {
+                result = value_new_error (ei->pos, gnumeric_err_VALUE);
+                goto out;
+        }
+
+        logfit_res = g_new (gnum_float, 5);
+
+        if (logarithmic_fit (xs, ys, nx, logfit_res) != REG_ok) {
+                result = value_new_error (ei->pos, gnumeric_err_NUM);
+                goto out;
+        }
+
+        result = value_new_array (5, 1);
+        for (i = 0; i < 5; i++)
+                value_array_set (result, i, 0,
+                                 value_new_float (logfit_res[i]));
+
+ out:
+        g_free (xs);
+        g_free (ys);
+        g_free (logfit_res);
+        return result;
+}
+
+/***************************************************************************/
+
 static const char *help_trend = {
         N_("@FUNCTION=TREND\n"
            "@SYNTAX=TREND(known_y's[,known_x's],new_x's])\n"
@@ -5244,12 +5622,16 @@ const ModulePluginFunctionInfo stat_func
           &help_linest, gnumeric_linest, NULL, NULL, NULL },
         { "logest", "A|Abb", N_("known_y's,known_x's,const,stat"),
           &help_logest, gnumeric_logest, NULL, NULL, NULL },
+        { "logfit", "rr", N_("known_y's,known_x's"),
+          &help_logfit, gnumeric_logfit, NULL, NULL, NULL },
         { "loginv", "fff", N_("p,mean,stddev"),
           &help_loginv, gnumeric_loginv, NULL, NULL, NULL },
         { "logistic", "ff", N_("x,a"),
           &help_logistic, gnumeric_logistic, NULL, NULL, NULL },
         { "lognormdist", "fff", N_("x,meean,stdev"),
           &help_lognormdist, gnumeric_lognormdist, NULL, NULL, NULL },
+        { "logreg", "A|Abb", N_("known_y's,known_x's,const,stat"),
+          &help_logreg, gnumeric_logreg, NULL, NULL, NULL },
         { "max", 0, N_("number,number,"),
           &help_max, NULL, gnumeric_max, NULL, NULL },
         { "maxa", 0, N_("number,number,"),
Index: plugins/fn-stat/plugin.xml.in
===================================================================
RCS file: /cvs/gnome/gnumeric/plugins/fn-stat/plugin.xml.in,v
retrieving revision 1.7
diff -u -3 -p -r1.7 plugin.xml.in
--- plugins/fn-stat/plugin.xml.in	4 Aug 2002 11:25:53 -0000	1.7
+++ plugins/fn-stat/plugin.xml.in	6 Oct 2002 10:25:53 -0000
@@ -57,9 +57,11 @@
+
+
Index: src/regression.c
===================================================================
RCS file: /cvs/gnome/gnumeric/src/regression.c,v
retrieving revision 1.38
diff -u -3 -p -r1.38 regression.c
--- src/regression.c	24 Jun 2002 20:51:31 -0000	1.38
+++ src/regression.c	6 Oct 2002 10:25:58 -0000
@@ -11,6 +11,7 @@
 #include "gnumeric.h"
 #include "regression.h"
 #include "rangefunc.h"
+#include "mathfunc.h"
 #include
 #include
@@ -451,6 +452,193 @@ general_linear_regression (gnum_float **
 }
 
 /* ------------------------------------------------------------------------- */
+
+typedef struct {
+        gnum_float min_x;
+        gnum_float max_x;
+        gnum_float min_y;
+        gnum_float max_y;
+        gnum_float mean_y;
+} point_cloud_measure_type;
+
+/* Takes the current 'sign' (res[0]) and 'c' (res[3]) from the calling
+ * function, transforms xs to log(sign*(x-c)), performs a simple
+ * linear regression to find the best fitting 'a' (res[1]) and 'b'
+ * (res[2]) for ys and transformed xs, and computes the sum of squared
+ * residuals.
+ * Needs 'sign' (i.e. +1 or -1) and 'c' adjusted so that (sign*(x-c)) is
+ * positive for all xs. n must be > 0. These conditions are trusted to be
+ * checked by the calling functions.
+ * Is called often, so do not make it too slow.
+ */
+
+static int
+transform_x_and_linear_regression_log_fitting (gnum_float *xs,
+                                               gnum_float *transf_xs,
+                                               const gnum_float *ys, int n,
+                                               gnum_float *res,
+                                               point_cloud_measure_type *point_cloud)
+{
+        int i;
+        int result = REG_ok;
+        gnum_float mean_transf_x, diff_x, resid_y;
+        gnum_float sum1 = 0;
+        gnum_float sum2 = 0;
+
+        /* log (always > 0) */
+        for (i=0; imean_y);
+                sum2 += diff_x * diff_x;
+        }
+        res[2] = sum1 / sum2;
+        res[1] = point_cloud->mean_y - (res[2] * mean_transf_x);
+        res[4] = 0;
+        for (i=0; imax_x) - (point_cloud->min_x);
+        c_accuracy = log10gnum (c_accuracy);
+        if (c_accuracy < 0)
+                if (modf (c_accuracy, &c_accuracy_int) != 0)
+                        c_accuracy--;
+        modf (c_accuracy, &c_accuracy_int);
+        c_accuracy = (gnum_float) c_accuracy_int;
+        c_accuracy = powgnum (10, c_accuracy);
+        c_accuracy *= LOGFIT_C_ACCURACY;
+
+        /* Determine sign. Take a c which is ``much too small'' since the part
+         * of the curve cutting the point cloud is almost not bent there.
+         * If making c still smaller does not make things still worse,
+         * assume that we have to change the direction of curve bending
+         * by changing sign.
+         */
+        c_step = c_accuracy * LOGFIT_C_STEP_FACTOR;
+        res[0] = 1; /* sign */
+        res[3] = point_cloud->min_x - c_accuracy * LOGFIT_C_RANGE_FACTOR;
+        temp_res[0] = 1;
+        temp_res[3] = res[3] - c_step;
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       res, point_cloud);
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       temp_res, point_cloud);
+        if (temp_res[4] <= res[4])
+                sign_plus_ok = 0;
+        /* check again with new sign */
+        res[0] = -1; /* sign */
+        res[3] = point_cloud->max_x + c_accuracy * LOGFIT_C_RANGE_FACTOR;
+        temp_res[0] = -1;
+        temp_res[3] = res[3] + c_step;
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       res, point_cloud);
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       temp_res, point_cloud);
+        if (temp_res[4] <= res[4])
+                sign_minus_ok = 0;
+        /* If not exactly one of plus or minus works, give up.
+         * This happens in point clouds which are very weakly bent.
+         */
+        if (sign_plus_ok && !sign_minus_ok)
+                res[0] = 1;
+        else if (sign_minus_ok && !sign_plus_ok)
+                res[0] = -1;
+        else {
+                result = REG_invalid_data;
+                goto out;
+        }
+
+        /* Start of fitted c-range. Rounded to final accuracy of c. */
+        c_offset = (res[0] == 1) ? point_cloud->min_x
+                                 : point_cloud->max_x;
+        c_offset = c_accuracy * ((res[0] == 1) ? floor (c_offset / c_accuracy)
+                                               : ceil (c_offset / c_accuracy));
+
+        /* Now the adapting of c starts. Find a local minimum of the sum
+         * of squared residuals. */
+
+        /* First, catch some unsuitably shaped point clouds. */
+        res[3] = c_offset - res[0] * c_accuracy;
+        temp_res[3] = c_offset - res[0] * 2 * c_accuracy;
+        temp_res[0] = res[0];
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       res, point_cloud);
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       temp_res, point_cloud);
+        if (temp_res[4] >= res[4]) {
+                result = REG_invalid_data;
+                goto out;
+        }
+        /* After the above check, any minimum reached will NOT be at
+         * the start of the c-range (c_offset - sign * c_accuracy) */
+        c_start = c_offset;
+        c_end = c_start - res[0] * c_accuracy * LOGFIT_C_RANGE_FACTOR;
+        c_dist = res[0] * (c_start - c_end) / 2;
+        res[3] = c_end + res[0] * c_dist;
+        do {
+                c_dist /= 2;
+                transform_x_and_linear_regression_log_fitting (xs, transf_xs,
+                                                               ys, n, res,
+                                                               point_cloud);
+                temp_res[3] = res[3] + res[0] * c_dist;
+                transform_x_and_linear_regression_log_fitting (xs, transf_xs,
+                                                               ys, n, temp_res,
+                                                               point_cloud);
+                if (temp_res[4] <= res[4])
+                        memcpy (res, temp_res, 5 * sizeof (gnum_float));
+                else {
+                        temp_res[3] = res[3] - res[0] * c_dist;
+                        transform_x_and_linear_regression_log_fitting (xs,
+                                                               transf_xs,
+                                                               ys, n,
+                                                               temp_res,
+                                                               point_cloud);
+                        if (temp_res[4] <= res[4])
+                                memcpy (res, temp_res, 5 * sizeof (gnum_float));
+                }
+        } while (c_dist > c_accuracy);
+
+        res[3] = c_accuracy * gnumeric_fake_round (res[3] / c_accuracy);
+        transform_x_and_linear_regression_log_fitting (xs, transf_xs, ys, n,
+                                                       res, point_cloud);
+
+        if ((res[0] * (res[3] - c_end)) < (1.1 * c_accuracy)) {
+                /* Allowing for some inaccuracy, we are at the end of the
+                 * range, so this is probably not a local minimum.
+                 * The start of the range has been checked above. */
+                result = REG_invalid_data;
+                goto out;
+        }
+
+ out:
+        g_free (transf_xs);
+        g_free (temp_res);
+        return result;
+}
+
+/* ------------------------------------------------------------------------- */
 /* Please refer to description in regression.h. */
 
 RegressionResult
@@ -529,6 +717,106 @@ exponential_regression (gnum_float **xss
  out:
 	g_free (log_ys);
 
+	return result;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Please refer to description in regression.h. */
+
+RegressionResult
+logarithmic_regression (gnum_float **xss, int dim,
+                        const gnum_float *ys, int n,
+                        gboolean affine,
+                        gnum_float *res,
+                        regression_stat_t *regression_stat)
+{
+        gnum_float **log_xss;
+        RegressionResult result;
+        int i, j;
+
+        g_return_val_if_fail (dim >= 1, REG_invalid_dimensions);
+        g_return_val_if_fail (n >= 1, REG_invalid_dimensions);
+
+        ALLOC_MATRIX (log_xss, dim, n);
+        for (i = 0; i < dim; i++)
+                for (j = 0; j < n; j++)
+                        if (xss[i][j] > 0)
+                                log_xss[i][j] = loggnum (xss[i][j]);
+                        else {
+                                result = REG_invalid_data;
+                                goto out;
+                        }
+
+
+        if (affine) {
+                gnum_float **log_xss2;
+                log_xss2 = g_new (gnum_float *, dim + 1);
+                log_xss2[0] = NULL; /* Substitute for 1-vector. */
+                memcpy (log_xss2 + 1, log_xss, dim * sizeof (gnum_float *));
+
+                result = general_linear_regression (log_xss2, dim + 1, ys, n,
+                                                    res, regression_stat,
+                                                    affine);
+                g_free (log_xss2);
+        } else {
+                res[0] = 0;
+                result = general_linear_regression (log_xss, dim, ys, n,
+                                                    res + 1, regression_stat,
+                                                    affine);
+        }
+
+ out:
+        FREE_MATRIX (log_xss, dim, n);
+        return result;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Please refer to description in regression.h. */
+
+RegressionResult
+logarithmic_fit (gnum_float *xs, const gnum_float *ys, int n, gnum_float *res)
+{
+        point_cloud_measure_type point_cloud_measures;
+        int i, result;
+        gboolean more_2_y = 0, more_2_x = 0;
+
+        /* Store useful measures for using them here and in subfunctions.
+         * The checking of n is paranoid -- the calling function should
+         * have cared for that. */
+        g_return_val_if_fail (n > 2, REG_invalid_dimensions);
+        result = range_min (xs, n, &(point_cloud_measures.min_x));
+        result = range_max (xs, n, &(point_cloud_measures.max_x));
+        result = range_min (ys, n, &(point_cloud_measures.min_y));
+        result = range_max (ys, n, &(point_cloud_measures.max_y));
+        result = range_average (ys, n, &(point_cloud_measures.mean_y));
+        /* Checking of error conditions. */
+        /* less than 2 different ys or less than 2 different xs */
+        g_return_val_if_fail (((point_cloud_measures.min_y !=
+                                point_cloud_measures.max_y) &&
+                               (point_cloud_measures.min_x !=
+                                point_cloud_measures.max_x)),
+                              REG_invalid_data);
+        /* less than 3 different ys */
+        for (i=0; i
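
For readers who want to see the core of what LOGREG computes without the Gnumeric plumbing, here is a minimal stand-alone sketch in plain C: it transforms x to z = log(x) and fits y = m * z + b by ordinary least squares, which is the one-dimensional case of what the patch does by handing the transformed data to general_linear_regression. It deliberately does not use the Gnumeric/glib types and helpers from the patch (gnum_float, collect_floats_value, regression_stat_t), and the function and variable names below are invented for the example only.

/* Illustration of the LOGREG idea: z = log(x), then least-squares y = m*z + b.
 * Plain C99; not part of the patch. */
#include <math.h>
#include <stdio.h>

static int
logreg_simple (const double *xs, const double *ys, int n,
               double *m, double *b)
{
        double mean_z = 0, mean_y = 0, sum_zz = 0, sum_zy = 0;
        int i;

        if (n < 2)
                return -1;
        for (i = 0; i < n; i++) {
                if (xs[i] <= 0)         /* log undefined, as in the patch */
                        return -1;
                mean_z += log (xs[i]);
                mean_y += ys[i];
        }
        mean_z /= n;
        mean_y /= n;
        for (i = 0; i < n; i++) {
                double dz = log (xs[i]) - mean_z;
                sum_zz += dz * dz;
                sum_zy += dz * (ys[i] - mean_y);
        }
        if (sum_zz == 0)                /* all x's equal: no slope defined */
                return -1;
        *m = sum_zy / sum_zz;           /* slope on the log scale */
        *b = mean_y - *m * mean_z;      /* intercept */
        return 0;
}

int
main (void)
{
        const double xs[] = { 1, 2, 4, 8 };
        const double ys[] = { 3, 5, 7, 9 };     /* y = 3 + 2 * log2(x) */
        double m, b;

        if (logreg_simple (xs, ys, 4, &m, &b) == 0)
                printf ("m = %g, b = %g\n", m, b);
        return 0;
}

Compiled with something like `cc logreg_sketch.c -lm`, this should report m close to 2/ln(2), roughly 2.885, and b close to 3, since the sample data follow y = 3 + 2*log2(x) exactly and the natural-log fit is just a rescaling of that relation.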