Source code for gneiss.regression._ols

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from collections import OrderedDict
import numpy as np
import pandas as pd
from gneiss.regression._model import RegressionModel
from gneiss.util import _type_cast_to_float
from gneiss.balances import balance_basis
from skbio.stats.composition import ilr_inv

from statsmodels.iolib.summary2 import Summary
from statsmodels.sandbox.tools.cross_val import LeaveOneOut
from patsy import dmatrix
from scipy import stats


def ols(formula, table, metadata):
    """ Ordinary Least Squares applied to balances.

    An ordinary least squares (OLS) regression is a method for estimating
    parameters in a linear regression model.  OLS is a common statistical
    technique for fitting and testing the effects of covariates on a
    response.  This implementation is focused on performing a multivariate
    response regression where the response is a matrix of balances (`table`)
    and the covariates (`metadata`) are made up of external variables.

    Global statistical tests indicating goodness of fit and contributions
    from covariates can be accessed from a coefficient of determination
    (`r2`), leave-one-variable-out cross validation (`lovo`), leave-one-out
    cross validation (`loo`) and k-fold cross validation (`kfold`).
    In addition, residuals (`residuals`) can be accessed for diagnostic
    purposes.  T-statistics (`tvalues`) and p-values (`pvalues`) can be
    obtained to evaluate the statistical significance of a covariate for a
    given balance.  Predictions on the resulting model can be made using
    `predict`, and these results can be interpreted as either balances or
    proportions.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R and
        statsmodels.  Note that the dependent variable in this string should
        not be specified, since this method will be run on each of the
        individual balances.  See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    metadata : pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.

    Returns
    -------
    OLSModel
        Container object that holds information about the overall fit.
        This includes information about coefficients, pvalues, residuals
        and coefficient of determination from the resulting regression.

    Example
    -------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from skbio import TreeNode
    >>> from gneiss.regression import ols

    Here, we will define a table of balances as follows

    >>> np.random.seed(0)
    >>> n = 100
    >>> g1 = np.linspace(0, 15, n)
    >>> y1 = g1 + 5
    >>> y2 = -g1 - 2
    >>> Y = pd.DataFrame({'y1': y1, 'y2': y2})

    Once we have the balances defined, we will add some errors

    >>> e = np.random.normal(loc=1, scale=0.1, size=(n, 2))
    >>> Y = Y + e

    Now we will define the environment variables that we want to
    regress against the balances.

    >>> X = pd.DataFrame({'g1': g1})

    Once these variables are defined, a regression can be performed, where
    the covariate `g1` is regressed against each of the balances in `Y`
    within a single model.

    >>> res = ols('g1', Y, X)
    >>> res.fit()

    From the summary results of the `ols` function, we can view the
    pvalues according to how well each individual balance fitted in the
    regression model.

    >>> res.pvalues
                          y1             y2
    Intercept  8.826379e-148   7.842085e-71
    g1         1.923597e-163  1.277152e-163

    We can also view the balance coefficients estimated in the regression
    model.  These coefficients can also be viewed as proportions by passing
    `project=True` as an argument in `res.coefficients()`.

    >>> res.coefficients()
                     y1        y2
    Intercept  6.016459 -0.983476
    g1         0.997793 -1.000299

    The overall model fit can be obtained as follows

    >>> res.r2
    0.99945903186495066
    """
    # one-time creation of exogenous data matrix allows for faster run-time
    metadata = _type_cast_to_float(metadata.copy())
    x = dmatrix(formula, metadata, return_type='dataframe')
    ilr_table, x = table.align(x, join='inner', axis=0)
    return OLSModel(Y=ilr_table, Xs=x)
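
# The helper below is a minimal, self-contained sketch of how `ols` might be
# driven end to end; it mirrors the docstring example above.  The name
# `_example_ols_fit` and the toy data are illustrative assumptions only, not
# part of the gneiss API.
def _example_ols_fit():
    # build two noisy balances that respond linearly to a single covariate
    np.random.seed(0)
    n = 100
    g1 = np.linspace(0, 15, n)
    Y = pd.DataFrame({'y1': g1 + 5, 'y2': -g1 - 2})
    Y = Y + np.random.normal(loc=1, scale=0.1, size=(n, 2))
    X = pd.DataFrame({'g1': g1})

    # regress `g1` against each balance and estimate the coefficients
    res = ols('g1', Y, X)
    res.fit()
    return res.pvalues, res.r2
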
class OLSModel(RegressionModel):
    """ Summary object for storing ordinary least squares results.

    An `OLSModel` object stores information about the individual balances
    used in the regression, the coefficients and the residuals.  This object
    can be used to perform predictions.  In addition, summary statistics such
    as the coefficient of determination for the overall fit can be
    calculated.

    Attributes
    ----------
    submodels : list of statsmodels objects
        List of statsmodels result objects.
    balances : pd.DataFrame
        A table of balances where samples are rows and
        balances are columns.  These balances were calculated
        using `tree`.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def fit(self, **kwargs):
        """ Fit the ordinary least squares model.

        Here, the coefficients of the model are estimated.  In addition,
        summary statistics such as residuals, t-statistics, pvalues and the
        coefficient of determination are calculated.

        Parameters
        ----------
        **kwargs : dict
            Keyword arguments used to tune the parameter estimation.
        """
        Y = self.response_matrix
        X = self.design_matrices
        n, p = X.shape
        # solve the least squares problem via the pseudoinverse
        inv = np.linalg.pinv(np.dot(X.T, X))
        cross = np.dot(inv, X.T)
        beta = np.dot(cross, Y)
        # fitted values and residuals
        pX = np.dot(X, beta)
        resid = (Y - pX)
        sst = (Y - Y.mean(axis=0))
        sse = (resid**2).sum(axis=0)

        # per-balance sums of squares
        sst_balance = ((Y - Y.mean(axis=0))**2).sum(axis=0)
        sse_balance = (resid**2).sum(axis=0)
        ssr_balance = (sst_balance - sse_balance)

        df_resid = n - p + 1
        mse = sse / df_resid
        self._mse = mse

        # t tests
        # TODO: Look into more robust ways to calculate cov
        cov = np.linalg.pinv(np.dot(X.T, X))
        bse = np.sqrt(np.outer(np.diag(cov), mse))
        tvalues = np.divide(beta, bse)
        pvals = stats.t.sf(np.abs(tvalues), df_resid) * 2

        self._tvalues = pd.DataFrame(tvalues, index=X.columns,
                                     columns=Y.columns)
        self._pvalues = pd.DataFrame(pvals, index=X.columns,
                                     columns=Y.columns)
        self._beta = pd.DataFrame(beta, index=X.columns,
                                  columns=Y.columns)
        self._resid = pd.DataFrame(resid, index=Y.index,
                                   columns=Y.columns)
        self._fitted = True
        self._ess = ssr_balance
        self._r2 = 1 - ((resid**2).values.sum() / (sst**2).values.sum())

    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on the model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the fitted values
            calculated from training the model will be returned.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this is
            specified, then the prediction will be represented as
            proportions.  Otherwise, if this is not specified, the
            prediction will be represented as balances.  (default: None).
        **kwargs : dict
            Other arguments to be passed into the model prediction.

        Returns
        -------
        pd.DataFrame
            A table of predicted values, where rows are samples and columns
            are balances.  If `tree` is specified, then the columns are
            proportions.
        """
        if not self._fitted:
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        if X is None:
            X = self.design_matrices
        prediction = X.dot(self._beta)
        if tree is not None:
            basis, _ = balance_basis(tree)
            proj_prediction = ilr_inv(prediction.values, basis=basis)
            ids = [n.name for n in tree.tips()]
            return pd.DataFrame(proj_prediction,
                                columns=ids,
                                index=prediction.index)
        else:
            return prediction

    @property
    def pvalues(self):
        """ Return pvalues from each of the coefficients in the fit. """
        return self._pvalues

    @property
    def tvalues(self):
        """ Return t-statistics from each of the coefficients in the fit. """
        return self._tvalues

    @property
    def r2(self):
        """ Coefficient of determination for the overall fit. """
        return self._r2

    @property
    def mse(self):
        """ Mean sum of squares error. """
        return self._mse

    @property
    def ess(self):
        """ Explained sum of squares. """
        return self._ess

    def summary(self, kfolds, lovo):
        """ Summarize the Ordinary Least Squares Regression Results.

        Parameters
        ----------
        kfolds : pd.DataFrame
            Results from k-fold cross-validation.
        lovo : pd.DataFrame
            Results from leave-one-variable-out cross-validation.

        Returns
        -------
        str :
            This holds the summary of regression coefficients and fit
            information.
""" _r2 = self.r2 self.params = self._beta # number of observations self.nobs = self.response_matrix.shape[0] self.model = None # Start filling in summary information smry = Summary() # Top results info = OrderedDict() info["No. Observations"] = self.nobs info["Model:"] = "OLS" info["Rsquared: "] = _r2 # TODO: Investigate how to properly resize the tables smry.add_dict(info, ncols=1) smry.add_title("Simplicial Least Squares Results") smry.add_df(lovo, align='l') smry.add_df(kfolds, align='l') return smry def kfold(self, num_folds=10, **kwargs): """ K-fold cross-validation. Performs k-fold cross-validation by spliting the data into k partitions, building a model on k-1 partitions and evaluating the predictions on the remaining partition. This process is performed k times. Parameters ---------- num_folds: int, optional The number of partitions used for the cross validation. **kwargs : dict Keyword arguments used to tune the parameter estimation. Returns ------- pd.DataFrame model_mse : np.array, float The within model mean sum of squares error for each iteration of the cross validation. Rsquared : np.array, float The Rsquared of the model fitted on training data. pred_mse : np.array, float Prediction mean sum of squares error for each iteration of the cross validation. """ # number of observations (i.e. samples) nobs = self.response_matrix.shape[0] s = nobs // num_folds folds = [np.arange(i*s, ((i*s)+s) % nobs) for i in range(num_folds)] results = pd.DataFrame(index=['fold_%d' % i for i in range(num_folds)], columns=['model_mse', 'Rsquared', 'pred_mse'], dtype=np.float64) for k in range(num_folds): test = folds[k] train = np.hstack(folds[:k] + folds[k+1:]) res_i = OLSModel(self.response_matrix.iloc[train], self.design_matrix.iloc[train]) res_i.fit(**kwargs) # model error p = res_i.predict(X=self.design_matrix.iloc[train]).values r = self.response_matrix.iloc[train].values model_resid = ((p - r)**2) model_mse = np.mean(model_resid.sum(axis=0)) results.loc['fold_%d' % k, 'model_mse'] = model_mse results.loc['fold_%d' % k, 'Rsquared'] = res_i.r2 # prediction error p = res_i.predict(X=self.design_matrix.iloc[test]).values r = self.response_matrix.iloc[test].values pred_resid = ((p - r)**2) pred_mse = np.mean(pred_resid.sum(axis=0)) results.loc['fold_%d' % k, 'pred_mse'] = pred_mse return results def loo(self, **kwargs): """ Leave one out cross-validation. Calculates summary statistics for each iteraction of leave one out cross-validation, specially `mse` on entire model and `pred_err` to measure prediction error. Parameters ---------- **kwargs : dict Keyword arguments used to tune the parameter estimation. Returns ------- pd.DataFrame model_mse : np.array, float Mean sum of squares error for each iteration of the cross validation. pred_mse : np.array, float Prediction mean sum of squares error for each iteration of the cross validation. See Also -------- fit statsmodels.regression.linear_model. """ # number of observations (i.e. 
        nobs = self.response_matrix.shape[0]
        cv_iter = LeaveOneOut(nobs)
        results = pd.DataFrame(index=self.response_matrix.index,
                               columns=['model_mse', 'pred_mse'],
                               dtype=np.float64)

        for i, (train, test) in enumerate(cv_iter):
            sample_id = self.response_matrix.index[i]

            res_i = OLSModel(self.response_matrix.iloc[train],
                             self.design_matrix.iloc[train])
            res_i.fit(**kwargs)

            # model error
            predicted = res_i.predict(X=self.design_matrix.iloc[train])
            r = self.response_matrix.iloc[train].values
            p = predicted.values
            model_resid = ((r - p)**2)
            model_mse = np.mean(model_resid.sum(axis=0))
            results.loc[sample_id, 'model_mse'] = model_mse

            # prediction error
            predicted = res_i.predict(X=self.design_matrix.iloc[test])
            r = self.response_matrix.iloc[test].values
            p = predicted.values
            pred_resid = ((r - p)**2)
            pred_mse = np.mean(pred_resid.sum(axis=0))
            results.loc[sample_id, 'pred_mse'] = pred_mse

        return results

    def lovo(self, **kwargs):
        """ Leave-one-variable-out cross-validation.

        Calculates summary statistics for each iteration of
        leave-one-variable-out cross-validation, specifically `r2` and
        `mse` on the entire model.  This technique is particularly useful
        for feature selection.

        Parameters
        ----------
        **kwargs : dict
            Keyword arguments used to tune the parameter estimation.

        Returns
        -------
        pd.DataFrame
            mse : np.array, float
                Mean sum of squares error for each iteration of the
                cross validation.
            Rsquared : np.array, float
                Coefficient of determination for each variable left out.
            R2diff : np.array, float
                Decrease in Rsquared for each variable left out.
        """
        cv_iter = LeaveOneOut(len(self.design_matrix.columns))
        results = pd.DataFrame(index=self.design_matrix.columns,
                               columns=['mse', 'Rsquared', 'R2diff'],
                               dtype=np.float64)
        for i, (inidx, outidx) in enumerate(cv_iter):
            feature_id = self.design_matrix.columns[i]

            res_i = OLSModel(Y=self.response_matrix,
                             Xs=self.design_matrix.iloc[:, inidx])
            res_i.fit(**kwargs)
            predicted = res_i.predict()
            r = self.response_matrix.values
            p = predicted.values

            model_resid = ((r - p)**2)
            model_mse = np.mean(model_resid.sum(axis=0))
            results.loc[feature_id, 'mse'] = model_mse
            results.loc[feature_id, 'Rsquared'] = res_i.r2
            results.loc[feature_id, 'R2diff'] = self.r2 - res_i.r2
        return results
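
# A minimal sketch of the cross-validation diagnostics defined above, guarded
# so it only runs when this module is executed directly.  The toy data and
# variable names are illustrative assumptions, not part of the library.
if __name__ == "__main__":
    np.random.seed(0)
    n = 50
    g1 = np.linspace(0, 15, n)
    Y = pd.DataFrame({'y1': g1 + 5, 'y2': -g1 - 2})
    Y = Y + np.random.normal(loc=1, scale=0.1, size=(n, 2))
    X = pd.DataFrame({'g1': g1})

    res = ols('g1', Y, X)
    res.fit()

    # cross-validation diagnostics: k-fold, leave-one-out and
    # leave-one-variable-out
    kfolds = res.kfold(num_folds=5)
    lovo = res.lovo()
    print(res.loo().head())

    # the summary combines the k-fold and leave-one-variable-out tables
    print(res.summary(kfolds, lovo))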