Skip to content

Commit 402cd7f

Browse files
Merge pull request #116 from sinhrks/famafrenchreader
CLN: FamaFrenchReader
2 parents: f6813e3 + 779d87d; commit: 402cd7f

File tree

4 files changed

+143
-62
lines changed

4 files changed

+143
-62
lines changed

pandas_datareader/data.py

+12 -2
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,17 @@
1414
from pandas_datareader.yahoo.options import Options as YahooOptions
1515

1616
from pandas_datareader.fred import FredReader
17-
from pandas_datareader.famafrench import _get_data as get_data_famafrench
17+
from pandas_datareader.famafrench import FamaFrenchReader
1818
from pandas_datareader.oecd import OECDReader
1919

2020

2121
# ToDo: deprecate
2222
def get_data_fred(*args, **kwargs):
2323
return FredReader(*args, **kwargs).read()
2424

25+
def get_data_famafrench(*args, **kwargs):
26+
return FamaFrenchReader(*args, **kwargs).read()
27+
2528
def get_data_google(*args, **kwargs):
2629
return GoogleDailyReader(*args, **kwargs).read()
2730

@@ -88,21 +91,28 @@ def DataReader(name, data_source=None, start=None, end=None,
8891
adjust_price=False, chunksize=25,
8992
retry_count=retry_count, pause=pause,
9093
session=session).read()
94+
9195
elif data_source == "yahoo-actions":
9296
return YahooActionReader(symbols=name, start=start, end=end,
9397
retry_count=retry_count, pause=pause,
9498
session=session).read()
99+
95100
elif data_source == "google":
96101
return GoogleDailyReader(symbols=name, start=start, end=end,
97102
chunksize=25,
98103
retry_count=retry_count, pause=pause,
99104
session=session).read()
105+
100106
elif data_source == "fred":
101107
return FredReader(symbols=name, start=start, end=end,
102108
retry_count=retry_count, pause=pause,
103109
session=session).read()
110+
104111
elif data_source == "famafrench":
105-
return get_data_famafrench(name)
112+
return FamaFrenchReader(symbols=name, start=start, end=end,
113+
retry_count=retry_count, pause=pause,
114+
session=session).read()
115+
106116
elif data_source == "oecd":
107117
return OECDReader(symbols=name, start=start, end=end,
108118
retry_count=retry_count, pause=pause,

pandas_datareader/famafrench.py

+80 -58
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
11
import tempfile
22
import re
33
import datetime as dt
4-
from pandas.io.common import urlopen, ZipFile
4+
from pandas.io.common import ZipFile
55
from pandas.compat import lmap, StringIO
66
from pandas import read_csv, to_datetime
77

8+
from pandas_datareader.base import _BaseReader
89

910
_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
1011
_URL_PREFIX = 'ftp/'
@@ -33,28 +34,17 @@ def get_available_datasets():
3334
return lmap(lambda x: x[len(_URL_PREFIX):-len(_URL_SUFFIX)], l)
3435

3536

36-
def _download_data_famafrench(name):
37-
url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
38-
with urlopen(url) as socket:
39-
raw = socket.read()
40-
41-
with tempfile.TemporaryFile() as tmpf:
42-
tmpf.write(raw)
43-
44-
with ZipFile(tmpf, 'r') as zf:
45-
data = zf.open(zf.namelist()[0]).read().decode()
46-
47-
return data
48-
49-
5037
def _parse_date_famafrench(x):
5138
x = x.strip()
52-
try: return dt.datetime.strptime(x, '%Y%m')
53-
except: pass
39+
try:
40+
return dt.datetime.strptime(x, '%Y%m')
41+
except:
42+
pass
5443
return to_datetime(x)
5544

5645

57-
def _get_data(name):
46+
class FamaFrenchReader(_BaseReader):
47+
5848
"""
5949
Get data for the given name from the Fama/French data library.
6050
@@ -66,43 +56,75 @@ def _get_data(name):
6656
df : a dictionary of pandas.DataFrame. Tables are accessed by integer keys.
6757
See df['DESCR'] for a description of the dataset
6858
"""
69-
params = {'index_col': 0,
70-
'parse_dates': [0],
71-
'date_parser': _parse_date_famafrench}
72-
73-
# headers in these files are not valid
74-
if name.endswith('_Breakpoints'):
75-
c = ['<=0', '>0'] if name.find('-') > -1 else ['Count']
76-
r = list(range(0, 105, 5))
77-
params['names'] = ['Date'] + c + list(zip(r, r[1:]))
78-
params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3
79-
80-
doc_chunks, tables = [], []
81-
data = _download_data_famafrench(name)
82-
for chunk in data.split(2 * '\r\n'):
83-
if len(chunk) < 800:
84-
doc_chunks.append(chunk.replace('\r\n', ' ').strip())
85-
else:
86-
tables.append(chunk)
87-
88-
datasets, table_desc = {}, []
89-
for i, src in enumerate(tables):
90-
match = re.search('^\s*,', src, re.M) # the table starts there
91-
start = 0 if not match else match.start()
92-
93-
df = read_csv(StringIO('Date' + src[start:]), **params)
94-
try: df = df.to_period(df.index.inferred_freq[:1])
95-
except: pass
96-
datasets[i] = df
97-
98-
title = src[:start].replace('\r\n', ' ').strip()
99-
shape = '({0} rows x {1} cols)'.format(*df.shape)
100-
table_desc.append('{0} {1}'.format(title, shape).strip())
101-
102-
descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
103-
if doc_chunks: descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'
104-
105-
table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))
106-
107-
datasets['DESCR'] = descr + '\n'.join(table_descr)
108-
return datasets
59+
60+
@property
61+
def url(self):
62+
return ''.join([_URL, _URL_PREFIX, self.symbols, _URL_SUFFIX])
63+
64+
def _read_zipfile(self, url):
65+
raw = self._get_response(url).content
66+
67+
with tempfile.TemporaryFile() as tmpf:
68+
tmpf.write(raw)
69+
70+
with ZipFile(tmpf, 'r') as zf:
71+
data = zf.open(zf.namelist()[0]).read().decode()
72+
73+
return data
74+
75+
def _read_one_data(self, url, params):
76+
77+
params = {'index_col': 0,
78+
'parse_dates': [0],
79+
'date_parser': _parse_date_famafrench}
80+
81+
# headers in these files are not valid
82+
if self.symbols.endswith('_Breakpoints'):
83+
84+
if self.symbols.find('-') > -1:
85+
c = ['<=0', '>0']
86+
else:
87+
c = ['Count']
88+
r = list(range(0, 105, 5))
89+
params['names'] = ['Date'] + c + list(zip(r, r[1:]))
90+
91+
if self.symbols != 'Prior_2-12_Breakpoints':
92+
params['skiprows'] = 1
93+
else:
94+
params['skiprows'] = 3
95+
96+
doc_chunks, tables = [], []
97+
data = self._read_zipfile(url)
98+
99+
for chunk in data.split(2 * '\r\n'):
100+
if len(chunk) < 800:
101+
doc_chunks.append(chunk.replace('\r\n', ' ').strip())
102+
else:
103+
tables.append(chunk)
104+
105+
datasets, table_desc = {}, []
106+
for i, src in enumerate(tables):
107+
match = re.search('^\s*,', src, re.M) # the table starts there
108+
start = 0 if not match else match.start()
109+
110+
df = read_csv(StringIO('Date' + src[start:]), **params)
111+
try:
112+
idx_name = df.index.name # hack for pandas 0.16.2
113+
df = df.to_period(df.index.inferred_freq[:1])
114+
df.index.name = idx_name
115+
except:
116+
pass
117+
df = df.truncate(self.start, self.end)
118+
datasets[i] = df
119+
120+
title = src[:start].replace('\r\n', ' ').strip()
121+
shape = '({0} rows x {1} cols)'.format(*df.shape)
122+
table_desc.append('{0} {1}'.format(title, shape).strip())
123+
124+
descr = '{0}\n{1}\n\n'.format(self.symbols.replace('_', ' '), len(self.symbols) * '-')
125+
if doc_chunks:
126+
descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'
127+
table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))
128+
datasets['DESCR'] = descr + '\n'.join(table_descr)
129+
130+
return datasets

pandas_datareader/oecd.py

+1 -2
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,6 @@
66
from pandas import concat, read_csv
77

88
from pandas_datareader.io import read_jsdmx
9-
10-
119
from pandas_datareader.base import _BaseReader
1210

1311

@@ -18,6 +16,7 @@ class OECDReader(_BaseReader):
1816
@property
1917
def url(self):
2018
url = 'http://stats.oecd.org/SDMX-JSON/data'
19+
2120
if not isinstance(self.symbols, compat.string_types):
2221
raise ValueError('data name must be string')
2322

pandas_datareader/tests/test_famafrench.py

+50
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import nose
2+
import pandas as pd
23
import pandas.util.testing as tm
34

45
import pandas_datareader.data as web
@@ -27,6 +28,55 @@ def test_index(self):
2728
assert ff[0].index.freq == 'M'
2829
assert ff[1].index.freq == 'A-DEC'
2930

31+
def test_f_f_research(self):
32+
results = web.DataReader("F-F_Research_Data_Factors", "famafrench",
33+
start='2010-01-01', end='2010-12-01')
34+
self.assertTrue(isinstance(results, dict))
35+
self.assertEqual(len(results), 3)
36+
37+
exp = pd.DataFrame({'Mkt-RF': [-3.36, 3.4, 6.31, 2., -7.89, -5.56,
38+
6.93, -4.77, 9.54, 3.88, 0.6, 6.82],
39+
'SMB': [0.2, 1.44, 1.57, 4.92, -0.09, -2.15,
40+
0.24, -3.03, 3.84, 1.01, 3.69, 0.85],
41+
'HML': [0.61, 2.74, 2.01, 3.12, -2.32, -4.27,
42+
0.04, -1.51, -2.94, -2.23, -0.58, 3.47],
43+
'RF': [0., 0., 0.01, 0.01, 0.01, 0.01, 0.01,
44+
0.01, 0.01, 0.01, 0.01, 0.01]},
45+
index=pd.period_range('2010-01-01', '2010-12-01', freq='M', name='Date'),
46+
columns=['Mkt-RF', 'SMB', 'HML', 'RF'])
47+
tm.assert_frame_equal(results[0], exp)
48+
49+
def test_me_breakpoints(self):
50+
results = web.DataReader("ME_Breakpoints", "famafrench",
51+
start='2010-01-01', end='2010-12-01')
52+
self.assertTrue(isinstance(results, dict))
53+
self.assertEqual(len(results), 2)
54+
self.assertEqual(results[0].shape, (12, 21))
55+
56+
exp_columns = pd.Index(['Count', (0, 5), (5, 10), (10, 15), (15, 20), (20, 25),
57+
(25, 30), (30, 35), (35, 40), (40, 45), (45, 50), (50, 55),
58+
(55, 60), (60, 65), (65, 70), (70, 75), (75, 80), (80, 85),
59+
(85, 90), (90, 95), (95, 100)], dtype='object')
60+
tm.assert_index_equal(results[0].columns, exp_columns)
61+
62+
exp_index = pd.period_range('2010-01-01', '2010-12-01', freq='M', name='Date')
63+
tm.assert_index_equal(results[0].index, exp_index)
64+
65+
def test_prior_2_12_breakpoints(self):
66+
results = web.DataReader("Prior_2-12_Breakpoints", "famafrench",
67+
start='2010-01-01', end='2010-12-01')
68+
self.assertTrue(isinstance(results, dict))
69+
self.assertEqual(len(results), 2)
70+
self.assertEqual(results[0].shape, (12, 22))
71+
72+
exp_columns = pd.Index(['<=0', '>0', (0, 5), (5, 10), (10, 15), (15, 20), (20, 25),
73+
(25, 30), (30, 35), (35, 40), (40, 45), (45, 50), (50, 55),
74+
(55, 60), (60, 65), (65, 70), (70, 75), (75, 80), (80, 85),
75+
(85, 90), (90, 95), (95, 100)], dtype='object')
76+
tm.assert_index_equal(results[0].columns, exp_columns)
77+
78+
exp_index = pd.period_range('2010-01-01', '2010-12-01', freq='M', name='Date')
79+
tm.assert_index_equal(results[0].index, exp_index)
3080

3181

3282
if __name__ == '__main__':

0 commit comments

Comments
 (0)