Skip to content

Commit f7b92a2

Browse files
Merge pull request #56 from 0x0L/famafrench
ENH: Fama/French re-write.
2 parents 83a6e36 + 896fc02 commit f7b92a2

File tree

4 files changed

+136
-41
lines changed

4 files changed

+136
-41
lines changed

docs/source/remote_data.rst

+7-3
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,18 @@ FRED
155155
Fama/French
156156
===========
157157

158-
Dataset names are listed at `Fama/French Data Library
158+
Access datasets from the `Fama/French Data Library
159159
<http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html>`__.
160+
The ``get_available_datasets`` function returns a list of all available datasets.
160161

161162
.. ipython:: python
162163
164+
from pandas_datareader.famafrench import get_available_datasets
163165
import pandas_datareader.data as web
164-
ip = web.DataReader("5_Industry_Portfolios", "famafrench")
165-
ip[4].ix[192607]
166+
len(get_available_datasets())
167+
ds = web.DataReader("5_Industry_Portfolios", "famafrench")
168+
print(ds['DESCR'])
169+
ds[4].ix['1926-07']
166170
167171
.. _remote_data.wb:
168172

pandas_datareader/famafrench.py

+95-30
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,108 @@
11
import tempfile
2-
import numpy as np
2+
import re
3+
import datetime as dt
34
from pandas.io.common import urlopen, ZipFile
4-
from pandas.compat import lmap
5-
from pandas import DataFrame
5+
from pandas.compat import lmap, StringIO
6+
from pandas import read_csv, to_datetime
67

7-
_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'
88

9+
_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
10+
_URL_PREFIX = 'ftp/'
11+
_URL_SUFFIX = '_CSV.zip'
912

10-
def _get_data(name):
11-
# path of zip files
12-
zip_file_path = '{0}/{1}_TXT.zip'.format(_URL, name)
1313

14-
with urlopen(zip_file_path) as url:
15-
raw = url.read()
14+
def get_available_datasets():
    """
    Get the list of datasets available from the Fama/French data library.

    Scrapes the data library index page, so network access and the
    third-party ``lxml`` package are required.

    Returns
    -------
    list of str
        Valid dataset names to pass to the 'famafrench' DataReader.

    Raises
    ------
    ImportError
        If lxml is not installed.
    """
    try:
        from lxml.html import parse
    except ImportError:
        # Fix: the message previously referred to a non-existent
        # "get_datasets_famafrench" function.
        raise ImportError("Please install lxml if you want to use the "
                          "get_available_datasets function")

    root = parse(_URL + 'data_library.html')

    # Every dataset link has the form 'ftp/<name>_CSV.zip'; strip the
    # prefix and suffix to recover the bare dataset name.
    hrefs = [e.attrib['href'] for e in root.findall('.//a')
             if 'href' in e.attrib]
    return [h[len(_URL_PREFIX):-len(_URL_SUFFIX)] for h in hrefs
            if h.startswith(_URL_PREFIX) and h.endswith(_URL_SUFFIX)]
34+
35+
36+
def _download_data_famafrench(name):
    """
    Download the zipped CSV archive for *name* and return its text contents.

    The archive is fetched from the Fama/French library, buffered in a
    temporary file, and the first zip member is decoded and returned.
    """
    zip_url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(zip_url) as response:
        payload = response.read()

    with tempfile.TemporaryFile() as buf:
        buf.write(payload)

        with ZipFile(buf, 'r') as archive:
            first_member = archive.namelist()[0]
            contents = archive.open(first_member).read().decode()

    return contents
48+
49+
50+
def _parse_date_famafrench(x):
51+
x = x.strip()
52+
try: return dt.datetime.strptime(x, '%Y%m')
53+
except: pass
54+
return to_datetime(x)
55+
56+
57+
def _get_data(name):
    """
    Get data for the given name from the Fama/French data library.

    For annual and monthly data, index is a pandas.PeriodIndex, otherwise
    it's a pandas.DatetimeIndex.

    Parameters
    ----------
    name : str
        Dataset name, as returned by get_available_datasets.

    Returns
    -------
    datasets : dict
        A dictionary of pandas.DataFrame. Tables are accessed by integer
        keys. See datasets['DESCR'] for a description of the dataset.
    """
    params = {'index_col': 0,
              'parse_dates': [0],
              'date_parser': _parse_date_famafrench}

    # Headers in the breakpoint files are not valid, so supply column
    # names by hand instead of parsing them from the file.
    if name.endswith('_Breakpoints'):
        c = ['<=0', '>0'] if name.find('-') > -1 else ['Count']
        r = list(range(0, 105, 5))
        params['names'] = ['Date'] + c + list(zip(r, r[1:]))
        # Prior_2-12_Breakpoints carries extra description lines to skip.
        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3

    doc_chunks, tables = [], []
    data = _download_data_famafrench(name)
    # Blank lines (CRLF CRLF) split the file into chunks; short chunks are
    # taken to be prose documentation, long chunks to be data tables.
    for chunk in data.split(2 * '\r\n'):
        if len(chunk) < 800:
            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
        else:
            tables.append(chunk)

    datasets, table_desc = {}, []
    for i, src in enumerate(tables):
        # The table body starts at the first line beginning with a comma
        # (the unnamed index column); raw string fixes the invalid '\s'
        # escape in the original pattern.
        match = re.search(r'^\s*,', src, re.M)
        start = 0 if not match else match.start()

        df = read_csv(StringIO('Date' + src[start:]), **params)
        try:
            # Monthly/annual data gets a PeriodIndex from the first letter
            # of the inferred frequency ('M', 'A', ...).
            df = df.to_period(df.index.inferred_freq[:1])
        except (AttributeError, TypeError, ValueError):
            # inferred_freq is None (e.g. daily/weekly gaps) or is not a
            # valid period frequency; keep the DatetimeIndex.
            pass
        datasets[i] = df

        title = src[:start].replace('\r\n', ' ').strip()
        shape = '({0} rows x {1} cols)'.format(*df.shape)
        table_desc.append('{0} {1}'.format(title, shape).strip())

    # Build the human-readable DESCR entry: dataset title, prose chunks,
    # then one line per table with its shape.
    descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
    if doc_chunks:
        descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'

    table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))

    datasets['DESCR'] = descr + '\n'.join(table_descr)
    return datasets

pandas_datareader/tests/test_data.py

-8
Original file line numberDiff line numberDiff line change
@@ -465,14 +465,6 @@ def test_read_fred(self):
465465
vix = DataReader("VIXCLS", "fred")
466466
assert isinstance(vix, DataFrame)
467467

468-
def test_read_famafrench(self):
469-
for name in ("F-F_Research_Data_Factors",
470-
"F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
471-
"F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"):
472-
ff = DataReader(name, "famafrench")
473-
assert ff
474-
assert isinstance(ff, dict)
475-
476468
def test_not_implemented(self):
477469
self.assertRaises(NotImplementedError, DataReader, "NA", "NA")
478470

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import nose
2+
import pandas.util.testing as tm
3+
4+
import pandas_datareader.data as web
5+
from pandas_datareader.famafrench import get_available_datasets
6+
7+
8+
class TestFamaFrench(tm.TestCase):
    def test_get_data(self):
        # A cross-section of dataset flavours, including the breakpoint
        # files whose headers need special handling in the reader.
        names = [
            'F-F_Research_Data_Factors', 'F-F_ST_Reversal_Factor',
            '6_Portfolios_2x3', 'Portfolios_Formed_on_ME',
            'Prior_2-12_Breakpoints', 'ME_Breakpoints',
        ]
        for dataset in names:
            result = web.DataReader(dataset, 'famafrench')
            assert 'DESCR' in result
            assert len(result) > 1

    def test_get_available_datasets(self):
        # Scrapes the live library index page; requires lxml and network.
        available = get_available_datasets()
        assert len(available) > 100

    def test_index(self):
        result = web.DataReader('F-F_Research_Data_Factors', 'famafrench')
        # Table 0 is monthly, table 1 is annual (December year-end).
        assert result[0].index.freq == 'M'
        assert result[1].index.freq == 'A-DEC'
30+
31+
32+
# Allow running this test module directly under nose from the command line.
if __name__ == '__main__':
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)

0 commit comments

Comments
 (0)