diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a66d00fff9714..d8269b605eef9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5501,7 +5501,22 @@ def corr(self, method='pearson', min_periods=1): def cov(self, min_periods=None): """ - Compute pairwise covariance of columns, excluding NA/null values + Compute pairwise covariance of columns, excluding NA/null values. + + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + `__ of the columns + of the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. Parameters ---------- @@ -5511,12 +5526,71 @@ def cov(self, min_periods=None): Returns ------- - y : DataFrame + DataFrame + The covariance matrix of the series of the DataFrame. + + See Also + -------- + pandas.Series.cov : compute covariance with another Series + pandas.core.window.EWM.cov: expoential weighted sample covariance + pandas.core.window.Expanding.cov : expanding sample covariance + pandas.core.window.Rolling.cov : rolling sample covariance Notes ----- - `y` contains the covariance matrix of the DataFrame's time series. - The covariance is normalized by N-1 (unbiased estimator). + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-1. + + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + `__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimate covariance matrix is not guaranteed to be positive + semi-definite. This could lead to estimate correlations having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + `__ for more details. + + Examples + -------- + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) + >>> df.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns