Add reader for SPSS (.sav) files (#26537)

cbrnr · jreback · commit 21fe224627a0 · 2019-06-16T10:30:38.000-04:00
diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2013-2016
+COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT
@@ -0,0 +1,32 @@
+Based on http://opensource.org/licenses/MIT
+
+This is a template. Complete and ship as file LICENSE the following 2
+lines (only)
+
+YEAR:
+COPYRIGHT HOLDER:
+
+and specify as
+
+License: MIT + file LICENSE
+
+Copyright (c) <YEAR>, <COPYRIGHT HOLDER>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml
@@ -23,6 +23,7 @@ dependencies:
   - xlsxwriter
   - xlwt
   - pip:
+    - pyreadstat
     # universal
     - pytest==4.5.0
     - pytest-xdist
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
@@ -30,3 +30,4 @@ dependencies:
   - pytest-mock
   - moto
   - hypothesis>=3.58.0
+  - pyreadstat
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
@@ -19,5 +19,6 @@ dependencies:
   - hypothesis>=3.58.0
   - s3fs
   - pip
+  - pyreadstat
   - pip:
     - moto
diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -285,6 +285,7 @@ pandas-gbq                0.8.0              Google Big Query access
 psycopg2                                     PostgreSQL engine for sqlalchemy
 pyarrow                   0.9.0              Parquet and feather reading / writing
 pymysql                                      MySQL engine for sqlalchemy
+pyreadstat                                   SPSS files (.sav) reading
 qtpy                                         Clipboard I/O
 s3fs                      0.0.8              Amazon S3 access
 xarray                    0.8.2              pandas-like API for N-dimensional data
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -99,6 +99,7 @@ Other Enhancements
 - Error message for missing required imports now includes the original import error's text (:issue:`23868`)
 - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
 - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
+- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
 
 .. _whatsnew_0250.api_breaking:
 
diff --git a/environment.yml b/environment.yml
@@ -79,3 +79,5 @@ dependencies:
   - xlrd  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
   - xlsxwriter  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
   - xlwt  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
+  - pip:
+    - pyreadstat  # pandas.read_spss
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -105,7 +105,7 @@
 
     # misc
     read_clipboard, read_parquet, read_feather, read_gbq,
-    read_html, read_json, read_stata, read_sas)
+    read_html, read_json, read_stata, read_sas, read_spss)
 
 from pandas.util._tester import test
 import pandas.testing
diff --git a/pandas/io/api.py b/pandas/io/api.py
@@ -16,5 +16,6 @@
 from pandas.io.pickle import read_pickle, to_pickle
 from pandas.io.pytables import HDFStore, read_hdf
 from pandas.io.sas import read_sas
+from pandas.io.spss import read_spss
 from pandas.io.sql import read_sql, read_sql_query, read_sql_table
 from pandas.io.stata import read_stata
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+from typing import Optional, Sequence, Union
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.api.types import is_list_like
+from pandas.core.api import DataFrame
+
+
+def read_spss(path: Union[str, Path],
+              usecols: Optional[Sequence[str]] = None,
+              convert_categoricals: bool = True) -> DataFrame:
+    """
+    Load an SPSS file from the file path, returning a DataFrame.
+
+    .. versionadded 0.25.0
+
+    Parameters
+    ----------
+    path : string or Path
+        File path
+    usecols : list-like, optional
+        Return a subset of the columns. If None, return all columns.
+    convert_categoricals : bool, default is True
+        Convert categorical columns into pd.Categorical.
+
+    Returns
+    -------
+    DataFrame
+    """
+    pyreadstat = import_optional_dependency("pyreadstat")
+
+    if usecols is not None:
+        if not is_list_like(usecols):
+            raise TypeError("usecols must be list-like.")
+        else:
+            usecols = list(usecols)  # pyreadstat requires a list
+
+    df, _ = pyreadstat.read_sav(path, usecols=usecols,
+                                apply_value_formats=convert_categoricals)
+    return df
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
@@ -81,7 +81,7 @@ class TestPDApi(Base):
                   'read_gbq', 'read_hdf', 'read_html', 'read_json',
                   'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
                   'read_sql_query', 'read_sql_table', 'read_stata',
-                  'read_table', 'read_feather', 'read_parquet']
+                  'read_table', 'read_feather', 'read_parquet', 'read_spss']
 
     # top-level to_* funcs
     funcs_to = ['to_datetime', 'to_msgpack',
diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/labelled-num-na.sav
diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/labelled-num.sav
diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/labelled-str.sav
diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/umlauts.sav
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
@@ -0,0 +1,74 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.util import testing as tm
+
+# Skip every test in this module when the optional pyreadstat dependency
+# cannot be imported.
+pyreadstat = pytest.importorskip("pyreadstat")
+
+
+def test_spss_labelled_num(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "labelled-num.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0])
+    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"VAR00002": 1.0}, index=[0])
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_labelled_num_na(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "labelled-num-na.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"VAR00002": ["This is one", None]})
+    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"VAR00002": [1.0, np.nan]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_labelled_str(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "labelled-str.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"gender": ["Male", "Female"]})
+    expected["gender"] = pd.Categorical(expected["gender"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"gender": ["M", "F"]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_umlauts(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "umlauts.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"var1": ["the ä umlaut",
+                                      "the ü umlaut",
+                                      "the ä umlaut",
+                                      "the ö umlaut"]})
+    expected["var1"] = pd.Categorical(expected["var1"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_usecols(datapath):
+    # usecols must be list-like
+    fname = datapath("io", "data", "labelled-num.sav")
+
+    with pytest.raises(TypeError, match="usecols must be list-like."):
+        pd.read_spss(fname, usecols="VAR00002")
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -52,4 +52,5 @@ sqlalchemy
 xarray
 xlrd
 xlsxwriter
-xlwt
+xlwt
+pyreadstat

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+YEAR: 2013-2016`
	`2`	`+COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller`