alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Commons Math example source code file (SimpleRegression.java)

This example Commons Math source code file (SimpleRegression.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Commons Math tags/keywords

io, mathexception, mathexception, serializable, serializable, simpleregression, simpleregression, tdistribution, tdistribution

The Commons Math SimpleRegression.java source code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math.stat.regression;
import java.io.Serializable;

import org.apache.commons.math.MathException;
import org.apache.commons.math.MathRuntimeException;
import org.apache.commons.math.distribution.TDistribution;
import org.apache.commons.math.distribution.TDistributionImpl;

/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x  

* <p> * Standard errors for <code>intercept and slope are * available as well as ANOVA, r-square and Pearson's r statistics.</p> * <p> * Observations (x,y pairs) can be added to the model one at a time or they * can be provided in a 2-dimensional array. The observations are not stored * in memory, so there is no limit to the number of observations that can be * added to the model.</p> * <p> * <strong>Usage Notes:
    * <li> When there are fewer than two observations in the model, or when * there is no variation in the x values (i.e. all x values are the same) * all statistics return <code>NaN. At least two observations with * different x coordinates are requred to estimate a bivariate regression * model. * </li> * <li> getters for the statistics always compute values based on the current * set of observations -- i.e., you can get statistics, then add more data * and get updated statistics without using a new instance. There is no * "compute" method that updates all statistics. Each of the getters performs * the necessary computations to return the requested statistic.</li> * </ul>

    * * @version $Revision: 811685 $ $Date: 2009-09-05 13:36:48 -0400 (Sat, 05 Sep 2009) $ */ public class SimpleRegression implements Serializable { /** Serializable version identifier */ private static final long serialVersionUID = -3004689053607543335L; /** the distribution used to compute inference statistics. */ private TDistribution distribution; /** sum of x values */ private double sumX = 0d; /** total variation in x (sum of squared deviations from xbar) */ private double sumXX = 0d; /** sum of y values */ private double sumY = 0d; /** total variation in y (sum of squared deviations from ybar) */ private double sumYY = 0d; /** sum of products */ private double sumXY = 0d; /** number of observations */ private long n = 0; /** mean of accumulated x values, used in updating formulas */ private double xbar = 0; /** mean of accumulated y values, used in updating formulas */ private double ybar = 0; // ---------------------Public methods-------------------------------------- /** * Create an empty SimpleRegression instance */ public SimpleRegression() { this(new TDistributionImpl(1.0)); } /** * Create an empty SimpleRegression using the given distribution object to * compute inference statistics. * @param t the distribution used to compute inference statistics. * @since 1.2 */ public SimpleRegression(TDistribution t) { super(); setDistribution(t); } /** * Adds the observation (x,y) to the regression data set. * <p> * Uses updating formulas for means and sums of squares defined in * "Algorithms for Computing the Sample Variance: Analysis and * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. * 1983, American Statistician, vol. 37, pp. 242-247, referenced in * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> * * * @param x independent variable value * @param y dependent variable value */ public void addData(double x, double y) { if (n == 0) { xbar = x; ybar = y; } else { double dx = x - xbar; double dy = y - ybar; sumXX += dx * dx * (double) n / (n + 1d); sumYY += dy * dy * (double) n / (n + 1d); sumXY += dx * dy * (double) n / (n + 1d); xbar += dx / (n + 1.0); ybar += dy / (n + 1.0); } sumX += x; sumY += y; n++; if (n > 2) { distribution.setDegreesOfFreedom(n - 2); } } /** * Removes the observation (x,y) from the regression data set. * <p> * Mirrors the addData method. This method permits the use of * SimpleRegression instances in streaming mode where the regression * is applied to a sliding "window" of observations, however the caller is * responsible for maintaining the set of observations in the window.</p> * * The method has no effect if there are no points of data (i.e. n=0) * * @param x independent variable value * @param y dependent variable value */ public void removeData(double x, double y) { if (n > 0) { double dx = x - xbar; double dy = y - ybar; sumXX -= dx * dx * (double) n / (n - 1d); sumYY -= dy * dy * (double) n / (n - 1d); sumXY -= dx * dy * (double) n / (n - 1d); xbar -= dx / (n - 1.0); ybar -= dy / (n - 1.0); sumX -= x; sumY -= y; n--; if (n > 2) { distribution.setDegreesOfFreedom(n - 2); } } } /** * Adds the observations represented by the elements in * <code>data. * <p> * <code>(data[0][0],data[0][1]) will be the first observation, then * <code>(data[1][0],data[1][1]), etc.

    * <p> * This method does not replace data that has already been added. The * observations represented by <code>data are added to the existing * dataset.</p> * <p> * To replace all data, use <code>clear() before adding the new * data.</p> * * @param data array of observations to be added */ public void addData(double[][] data) { for (int i = 0; i < data.length; i++) { addData(data[i][0], data[i][1]); } } /** * Removes observations represented by the elements in <code>data. * <p> * If the array is larger than the current n, only the first n elements are * processed. This method permits the use of SimpleRegression instances in * streaming mode where the regression is applied to a sliding "window" of * observations, however the caller is responsible for maintaining the set * of observations in the window.</p> * <p> * To remove all data, use <code>clear().

    * * @param data array of observations to be removed */ public void removeData(double[][] data) { for (int i = 0; i < data.length && n > 0; i++) { removeData(data[i][0], data[i][1]); } } /** * Clears all data from the model. */ public void clear() { sumX = 0d; sumXX = 0d; sumY = 0d; sumYY = 0d; sumXY = 0d; n = 0; } /** * Returns the number of observations that have been added to the model. * * @return n number of observations that have been added. */ public long getN() { return n; } /** * Returns the "predicted" <code>y value associated with the * supplied <code>x value, based on the data that has been * added to the model when this method is activated. * <p> * <code> predict(x) = intercept + slope * x

    * <p> * <strong>Preconditions:
      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN is * returned. * </li>

    * * @param x input <code>x value * @return predicted <code>y value */ public double predict(double x) { double b1 = getSlope(); return getIntercept(b1) + b1 * x; } /** * Returns the intercept of the estimated regression line. * <p> * The least squares estimate of the intercept is computed using the * <a href="http://www.xycoon.com/estimation4.htm">normal equations. * The intercept is sometimes denoted b0.</p> * <p> * <strong>Preconditions:
      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN is * returned. * </li>

    * * @return the intercept of the regression line */ public double getIntercept() { return getIntercept(getSlope()); } /** * Returns the slope of the estimated regression line. * <p> * The least squares estimate of the slope is computed using the * <a href="http://www.xycoon.com/estimation4.htm">normal equations. * The slope is sometimes denoted b1.</p> * <p> * <strong>Preconditions:
      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double.NaN is * returned. * </li>

    * * @return the slope of the regression line */ public double getSlope() { if (n < 2) { return Double.NaN; //not enough data } if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) { return Double.NaN; //not enough variation in x } return sumXY / sumXX; } /** * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> * sum of squared errors</a> (SSE) associated with the regression * model. * <p> * The sum is computed using the computational formula</p> * <p> * <code>SSE = SYY - (SXY * SXY / SXX)

    * <p> * where <code>SYY is the sum of the squared deviations of the y * values about their mean, <code>SXX is similarly defined and * <code>SXY is the sum of the products of x and y mean deviations. * </p>

    * The sums are accumulated using the updating algorithm referenced in * {@link #addData}.</p> * <p> * The return value is constrained to be non-negative - i.e., if due to * rounding errors the computational formula returns a negative result, * 0 is returned.</p> * <p> * <strong>Preconditions:

      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN is * returned. * </li>

    * * @return sum of squared errors associated with the regression model */ public double getSumSquaredErrors() { return Math.max(0d, sumYY - sumXY * sumXY / sumXX); } /** * Returns the sum of squared deviations of the y values about their mean. * <p> * This is defined as SSTO * <a href="http://www.xycoon.com/SumOfSquares.htm">here.

    * <p> * If <code>n < 2, this returns Double.NaN.

    * * @return sum of squared deviations of y values */ public double getTotalSumSquares() { if (n < 2) { return Double.NaN; } return sumYY; } /** * Returns the sum of squared deviations of the x values about their mean. * * If <code>n < 2, this returns Double.NaN.

    * * @return sum of squared deviations of x values */ public double getXSumSquares() { if (n < 2) { return Double.NaN; } return sumXX; } /** * Returns the sum of crossproducts, x<sub>i*yi. * * @return sum of cross products */ public double getSumOfCrossProducts() { return sumXY; } /** * Returns the sum of squared deviations of the predicted y values about * their mean (which equals the mean of y). * <p> * This is usually abbreviated SSR or SSM. It is defined as SSM * <a href="http://www.xycoon.com/SumOfSquares.htm">here

    * <p> * <strong>Preconditions:
      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double.NaN is * returned. * </li>

    * * @return sum of squared deviations of predicted y values */ public double getRegressionSumSquares() { return getRegressionSumSquares(getSlope()); } /** * Returns the sum of squared errors divided by the degrees of freedom, * usually abbreviated MSE. * <p> * If there are fewer than <strong>three data pairs in the model, * or if there is no variation in <code>x, this returns * <code>Double.NaN.

    * * @return sum of squared deviations of y values */ public double getMeanSquareError() { if (n < 3) { return Double.NaN; } return getSumSquaredErrors() / (n - 2); } /** * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> * Pearson's product moment correlation coefficient</a>, * usually denoted r. * <p> * <strong>Preconditions:
      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN is * returned. * </li>

    * * @return Pearson's r */ public double getR() { double b1 = getSlope(); double result = Math.sqrt(getRSquare()); if (b1 < 0) { result = -result; } return result; } /** * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> * coefficient of determination</a>, * usually denoted r-square. * <p> * <strong>Preconditions:
      * <li>At least two observations (with at least two different x values) * must have been added before invoking this method. If this method is * invoked before a model can be estimated, <code>Double,NaN is * returned. * </li>

    * * @return r-square */ public double getRSquare() { double ssto = getTotalSumSquares(); return (ssto - getSumSquaredErrors()) / ssto; } /** * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> * standard error of the intercept estimate</a>, * usually denoted s(b0). * <p> * If there are fewer that <strong>three observations in the * model, or if there is no variation in x, this returns * <code>Double.NaN.

    * * @return standard error associated with intercept estimate */ public double getInterceptStdErr() { return Math.sqrt( getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX)); } /** * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard * error of the slope estimate</a>, * usually denoted s(b1). * <p> * If there are fewer that <strong>three data pairs in the model, * or if there is no variation in x, this returns <code>Double.NaN. * </p> * * @return standard error associated with slope estimate */ public double getSlopeStdErr() { return Math.sqrt(getMeanSquareError() / sumXX); } /** * Returns the half-width of a 95% confidence interval for the slope * estimate. * <p> * The 95% confidence interval is</p> * <p> * <code>(getSlope() - getSlopeConfidenceInterval(), * getSlope() + getSlopeConfidenceInterval())</code>

    * <p> * If there are fewer that <strong>three observations in the * model, or if there is no variation in x, this returns * <code>Double.NaN.

    * <p> * <strong>Usage Note:
    * The validity of this statistic depends on the assumption that the * observations included in the model are drawn from a * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> * Bivariate Normal Distribution</a>.

    * * @return half-width of 95% confidence interval for the slope estimate * @throws MathException if the confidence interval can not be computed. */ public double getSlopeConfidenceInterval() throws MathException { return getSlopeConfidenceInterval(0.05d); } /** * Returns the half-width of a (100-100*alpha)% confidence interval for * the slope estimate. * <p> * The (100-100*alpha)% confidence interval is </p> * <p> * <code>(getSlope() - getSlopeConfidenceInterval(), * getSlope() + getSlopeConfidenceInterval())</code>

    * <p> * To request, for example, a 99% confidence interval, use * <code>alpha = .01

    * <p> * <strong>Usage Note:
    * The validity of this statistic depends on the assumption that the * observations included in the model are drawn from a * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> * Bivariate Normal Distribution</a>.

    * <p> * <strong> Preconditions:
      * <li>If there are fewer that three observations in the * model, or if there is no variation in x, this returns * <code>Double.NaN. * </li> * <li>(0 < alpha < 1); otherwise an * <code>IllegalArgumentException is thrown. * </li>

    * * @param alpha the desired significance level * @return half-width of 95% confidence interval for the slope estimate * @throws MathException if the confidence interval can not be computed. */ public double getSlopeConfidenceInterval(double alpha) throws MathException { if (alpha >= 1 || alpha <= 0) { throw MathRuntimeException.createIllegalArgumentException( "out of bounds significance level {0}, must be between {1} and {2}", alpha, 0.0, 1.0); } return getSlopeStdErr() * distribution.inverseCumulativeProbability(1d - alpha / 2d); } /** * Returns the significance level of the slope (equiv) correlation. * <p> * Specifically, the returned value is the smallest <code>alpha * such that the slope confidence interval with significance level * equal to <code>alpha does not include 0. * On regression output, this is often denoted <code>Prob(|t| > 0) * </p>

    * <strong>Usage Note:
    * The validity of this statistic depends on the assumption that the * observations included in the model are drawn from a * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> * Bivariate Normal Distribution</a>.

    * <p> * If there are fewer that <strong>three observations in the * model, or if there is no variation in x, this returns * <code>Double.NaN.

    * * @return significance level for slope/correlation * @throws MathException if the significance level can not be computed. */ public double getSignificance() throws MathException { return 2d * (1.0 - distribution.cumulativeProbability( Math.abs(getSlope()) / getSlopeStdErr())); } // ---------------------Private methods----------------------------------- /** * Returns the intercept of the estimated regression line, given the slope. * <p> * Will return <code>NaN if slope is NaN.

    * * @param slope current slope * @return the intercept of the regression line */ private double getIntercept(double slope) { return (sumY - slope * sumX) / n; } /** * Computes SSR from b1. * * @param slope regression slope estimate * @return sum of squared deviations of predicted y values */ private double getRegressionSumSquares(double slope) { return slope * slope * sumXX; } /** * Modify the distribution used to compute inference statistics. * @param value the new distribution * @since 1.2 */ public void setDistribution(TDistribution value) { distribution = value; // modify degrees of freedom if (n > 2) { distribution.setDegreesOfFreedom(n - 2); } } }

Other Commons Math examples (source code examples)

Here is a short list of links related to this Commons Math SimpleRegression.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.