|
1 | 1 | import tempfile
|
2 |
| -import numpy as np |
| 2 | +import re |
| 3 | +import datetime as dt |
3 | 4 | from pandas.io.common import urlopen, ZipFile
|
4 |
| -from pandas.compat import lmap |
5 |
| -from pandas import DataFrame |
| 5 | +from pandas.compat import lmap, StringIO |
| 6 | +from pandas import read_csv, to_datetime |
6 | 7 |
|
7 |
| -_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' |
8 | 8 |
|
# Base URL of Ken French's data library hosted at Tuck (Dartmouth).
_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
# Each dataset's zip archive lives at <_URL><_URL_PREFIX><name><_URL_SUFFIX>.
_URL_PREFIX = 'ftp/'
_URL_SUFFIX = '_CSV.zip'
9 | 12 |
|
10 |
| -def _get_data(name): |
11 |
| - # path of zip files |
12 |
| - zip_file_path = '{0}/{1}_TXT.zip'.format(_URL, name) |
13 | 13 |
|
14 |
| - with urlopen(zip_file_path) as url: |
15 |
| - raw = url.read() |
def get_available_datasets():
    """
    Get the list of datasets available from the Fama/French data library.

    Scrapes the library's index page and collects every link of the
    form ``ftp/<name>_CSV.zip``, returning the bare ``<name>`` parts.

    Returns
    -------
    A list of valid inputs for get_data_famafrench.

    Raises
    ------
    ImportError
        If lxml is not installed.
    """
    try:
        from lxml.html import parse
    except ImportError:
        raise ImportError("Please install lxml if you want to use the "
                          "get_datasets_famafrench function")

    root = parse(_URL + 'data_library.html')

    # Dataset links all look like ftp/<name>_CSV.zip; keep only those hrefs.
    # (Renamed from `l` -- single-letter ambiguous name, PEP 8 E741.)
    hrefs = [e.attrib['href'] for e in root.findall('.//a')
             if 'href' in e.attrib
             and e.attrib['href'].startswith(_URL_PREFIX)
             and e.attrib['href'].endswith(_URL_SUFFIX)]

    # Strip the prefix/suffix so each entry is a valid `name` argument.
    return lmap(lambda x: x[len(_URL_PREFIX):-len(_URL_SUFFIX)], hrefs)
| 34 | + |
| 35 | + |
def _download_data_famafrench(name):
    """
    Download the zipped CSV for dataset `name` and return its decoded text.

    Parameters
    ----------
    name : str
        Dataset name as listed by get_available_datasets().

    Returns
    -------
    str
        Contents of the (single) CSV file inside the zip archive.
    """
    url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            # The archive contains exactly one CSV member.  Close the
            # member handle explicitly instead of leaking it (the original
            # called zf.open(...) without ever closing the returned file).
            with zf.open(zf.namelist()[0]) as member:
                data = member.read().decode()

    return data
| 48 | + |
| 49 | + |
| 50 | +def _parse_date_famafrench(x): |
| 51 | + x = x.strip() |
| 52 | + try: return dt.datetime.strptime(x, '%Y%m') |
| 53 | + except: pass |
| 54 | + return to_datetime(x) |
| 55 | + |
| 56 | + |
def _get_data(name):
    """
    Get data for the given name from the Fama/French data library.

    For annual and monthly data, index is a pandas.PeriodIndex, otherwise
    it's a pandas.DatetimeIndex.

    Returns
    -------
    df : a dictionary of pandas.DataFrame. Tables are accessed by integer keys.
         See df['DESCR'] for a description of the dataset
    """
    params = {'index_col': 0,
              'parse_dates': [0],
              'date_parser': _parse_date_famafrench}

    # Headers in the breakpoint files are not valid, so build the column
    # names by hand and skip the raw header rows instead.
    if name.endswith('_Breakpoints'):
        c = ['<=0', '>0'] if '-' in name else ['Count']
        r = list(range(0, 105, 5))
        params['names'] = ['Date'] + c + list(zip(r, r[1:]))
        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3

    # Chunks are separated by a blank (CRLF CRLF) line: short chunks are
    # prose describing the dataset, long ones are data tables.
    doc_chunks, tables = [], []
    data = _download_data_famafrench(name)
    for chunk in data.split(2 * '\r\n'):
        if len(chunk) < 800:
            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
        else:
            tables.append(chunk)

    datasets, table_desc = {}, []
    for i, src in enumerate(tables):
        # The table body starts at the first line beginning with a comma
        # (the unnamed index column); anything before that is a title.
        # (Raw string fixes the invalid `\s` escape in the original.)
        match = re.search(r'^\s*,', src, re.M)
        start = 0 if not match else match.start()

        df = read_csv(StringIO('Date' + src[start:]), **params)
        try:
            # Annual/monthly data has an inferable frequency; convert the
            # index to periods.  Daily/irregular data stays as datetimes.
            df = df.to_period(df.index.inferred_freq[:1])
        except Exception:
            # Best-effort conversion only; keep the DatetimeIndex on failure.
            pass
        datasets[i] = df

        title = src[:start].replace('\r\n', ' ').strip()
        shape = '({0} rows x {1} cols)'.format(*df.shape)
        table_desc.append('{0} {1}'.format(title, shape).strip())

    descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
    if doc_chunks:
        descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'

    table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))

    datasets['DESCR'] = descr + '\n'.join(table_descr)
    return datasets
|
0 commit comments