Skip to content

Commit 21fe224

Browse files
cbrnr authored and jreback committed
Add reader for SPSS (.sav) files (#26537)
1 parent 3381c64 commit 21fe224

18 files changed

+161
-3
lines changed

LICENSES/HAVEN_LICENSE

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
YEAR: 2013-2016
2+
COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller

LICENSES/HAVEN_MIT

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
Based on http://opensource.org/licenses/MIT
2+
3+
This is a template. Complete and ship as file LICENSE the following 2
4+
lines (only)
5+
6+
YEAR:
7+
COPYRIGHT HOLDER:
8+
9+
and specify as
10+
11+
License: MIT + file LICENSE
12+
13+
Copyright (c) <YEAR>, <COPYRIGHT HOLDER>
14+
15+
Permission is hereby granted, free of charge, to any person obtaining
16+
a copy of this software and associated documentation files (the
17+
"Software"), to deal in the Software without restriction, including
18+
without limitation the rights to use, copy, modify, merge, publish,
19+
distribute, sublicense, and/or sell copies of the Software, and to
20+
permit persons to whom the Software is furnished to do so, subject to
21+
the following conditions:
22+
23+
The above copyright notice and this permission notice shall be
24+
included in all copies or substantial portions of the Software.
25+
26+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
30+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
31+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
32+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

ci/deps/azure-macos-35.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ dependencies:
2323
- xlsxwriter
2424
- xlwt
2525
- pip:
26+
- pyreadstat
2627
# universal
2728
- pytest==4.5.0
2829
- pytest-xdist

ci/deps/azure-windows-37.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@ dependencies:
3030
- pytest-mock
3131
- moto
3232
- hypothesis>=3.58.0
33+
- pyreadstat

ci/deps/travis-37.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ dependencies:
1919
- hypothesis>=3.58.0
2020
- s3fs
2121
- pip
22+
- pyreadstat
2223
- pip:
2324
- moto

doc/source/install.rst

+1
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access
285285
psycopg2 PostgreSQL engine for sqlalchemy
286286
pyarrow 0.9.0 Parquet and feather reading / writing
287287
pymysql MySQL engine for sqlalchemy
288+
pyreadstat SPSS files (.sav) reading
288289
qtpy Clipboard I/O
289290
s3fs 0.0.8 Amazon S3 access
290291
xarray 0.8.2 pandas-like API for N-dimensional data

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ Other Enhancements
9999
- Error message for missing required imports now includes the original import error's text (:issue:`23868`)
100100
- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
101101
- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
102+
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
102103

103104
.. _whatsnew_0250.api_breaking:
104105

environment.yml

+2
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,5 @@ dependencies:
7979
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
8080
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
8181
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
82+
- pip:
83+
- pyreadstat # pandas.read_spss

pandas/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@
105105

106106
# misc
107107
read_clipboard, read_parquet, read_feather, read_gbq,
108-
read_html, read_json, read_stata, read_sas)
108+
read_html, read_json, read_stata, read_sas, read_spss)
109109

110110
from pandas.util._tester import test
111111
import pandas.testing

pandas/io/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@
1616
from pandas.io.pickle import read_pickle, to_pickle
1717
from pandas.io.pytables import HDFStore, read_hdf
1818
from pandas.io.sas import read_sas
19+
from pandas.io.spss import read_spss
1920
from pandas.io.sql import read_sql, read_sql_query, read_sql_table
2021
from pandas.io.stata import read_stata

pandas/io/spss.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from pathlib import Path
2+
from typing import Optional, Sequence, Union
3+
4+
from pandas.compat._optional import import_optional_dependency
5+
6+
from pandas.api.types import is_list_like
7+
from pandas.core.api import DataFrame
8+
9+
10+
def read_spss(path: Union[str, Path],
              usecols: Optional[Sequence[str]] = None,
              convert_categoricals: bool = True) -> DataFrame:
    """
    Load an SPSS file from the file path, returning a DataFrame.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    path : str or Path
        File path.
    usecols : list-like, optional
        Return a subset of the columns. If None, return all columns.
    convert_categoricals : bool, default True
        Convert categorical columns into pd.Categorical.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``usecols`` is given but is not list-like.
    """
    # Validate the arguments first so that a caller mistake is reported
    # regardless of whether the optional dependency can be imported.
    if usecols is not None:
        if not is_list_like(usecols):
            raise TypeError("usecols must be list-like.")
        usecols = list(usecols)  # pyreadstat requires a list

    pyreadstat = import_optional_dependency("pyreadstat")

    # Hand pyreadstat a plain string: the signature advertises Path support,
    # but older pyreadstat releases may reject path-like objects.
    if isinstance(path, Path):
        path = str(path)

    df, _ = pyreadstat.read_sav(path, usecols=usecols,
                                apply_value_formats=convert_categoricals)
    return df

pandas/tests/api/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ class TestPDApi(Base):
8181
'read_gbq', 'read_hdf', 'read_html', 'read_json',
8282
'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
8383
'read_sql_query', 'read_sql_table', 'read_stata',
84-
'read_table', 'read_feather', 'read_parquet']
84+
'read_table', 'read_feather', 'read_parquet', 'read_spss']
8585

8686
# top-level to_* funcs
8787
funcs_to = ['to_datetime', 'to_msgpack',
535 Bytes
Binary file not shown.

pandas/tests/io/data/labelled-num.sav

507 Bytes
Binary file not shown.

pandas/tests/io/data/labelled-str.sav

525 Bytes
Binary file not shown.

pandas/tests/io/data/umlauts.sav

567 Bytes
Binary file not shown.

pandas/tests/io/test_spss.py

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pandas as pd
5+
from pandas.util import testing as tm
6+
7+
pyreadstat = pytest.importorskip("pyreadstat")
8+
9+
10+
def test_spss_labelled_num(datapath):
    """Check a labelled numeric column with and without label conversion."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "labelled-num.sav")
    column = "VAR00002"

    # With conversion the value label is expected as a categorical.
    want = pd.DataFrame({column: "This is one"}, index=[0])
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Without conversion the raw numeric code is expected.
    want = pd.DataFrame({column: 1.0}, index=[0])
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)
22+
23+
24+
def test_spss_labelled_num_na(datapath):
    """Check a labelled numeric column that also contains a missing value."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "labelled-num-na.sav")
    column = "VAR00002"

    # With conversion: one labelled category plus a missing entry.
    want = pd.DataFrame({column: ["This is one", None]})
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Without conversion: the raw code and NaN for the missing entry.
    want = pd.DataFrame({column: [1.0, np.nan]})
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)
36+
37+
38+
def test_spss_labelled_str(datapath):
    """Check a labelled string column with and without label conversion."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "labelled-str.sav")
    column = "gender"

    # With conversion the long labels are expected as a categorical.
    want = pd.DataFrame({column: ["Male", "Female"]})
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Without conversion the stored short codes are expected.
    want = pd.DataFrame({column: ["M", "F"]})
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)
50+
51+
52+
def test_spss_umlauts(datapath):
    """Check that value labels containing umlauts are read correctly."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "umlauts.sav")
    column = "var1"

    # With conversion the non-ASCII labels are expected as a categorical.
    labels = [
        "the ä umlaut",
        "the ü umlaut",
        "the ä umlaut",
        "the ö umlaut",
    ]
    want = pd.DataFrame({column: labels})
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Without conversion the underlying numeric codes are expected.
    codes = [1.0, 2.0, 1.0, 3.0]
    want = pd.DataFrame({column: codes})
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)
67+
68+
69+
def test_spss_usecols(datapath):
    """Check that a bare string passed as ``usecols`` raises TypeError."""
    # usecols must be list-like
    sav_path = datapath("io", "data", "labelled-num.sav")
    not_list_like = "VAR00002"

    with pytest.raises(TypeError, match="usecols must be list-like."):
        pd.read_spss(sav_path, usecols=not_list_like)

requirements-dev.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ sqlalchemy
5252
xarray
5353
xlrd
5454
xlsxwriter
55-
xlwt
55+
xlwt
56+
pyreadstat

0 commit comments

Comments
 (0)