|
1 | 1 | import tempfile
|
2 |
| -import numpy as np |
| 2 | +import re |
| 3 | +import datetime as dt |
3 | 4 | from pandas.io.common import urlopen, ZipFile
|
4 |
| -from pandas.compat import lmap |
5 |
| -from pandas import DataFrame |
| 5 | +from pandas.compat import lmap, StringIO |
| 6 | +from pandas import read_csv, to_datetime |
6 | 7 |
|
7 |
| -_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' |
8 | 8 |
|
# Base URL of Ken French's data library hosted at Tuck (Dartmouth).
_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
# Each dataset's zip archive lives at <_URL><_URL_PREFIX><name><_URL_SUFFIX>.
_URL_PREFIX = 'ftp/'
_URL_SUFFIX = '_CSV.zip'
9 | 12 |
|
10 |
| -def _get_data(name): |
11 |
| - # path of zip files |
12 |
| - zip_file_path = '{0}/{1}_TXT.zip'.format(_URL, name) |
13 | 13 |
|
14 |
| - with urlopen(zip_file_path) as url: |
15 |
| - raw = url.read() |
def get_available_datasets():
    """
    Get the list of datasets available from the Fama/French data library.

    Scrapes the library's index page and collects every link of the
    form ``ftp/<name>_CSV.zip``, returning the bare ``<name>`` parts.

    Returns
    -------
    A list of valid inputs for get_data_famafrench.

    Raises
    ------
    ImportError
        If lxml is not installed.
    """
    try:
        from lxml.html import parse
    except ImportError:
        raise ImportError("Please install lxml if you want to use the "
                          "get_datasets_famafrench function")

    root = parse(_URL + 'data_library.html')

    # Dataset links all look like ftp/<name>_CSV.zip; keep only those hrefs.
    # (Renamed from `l` -- single-letter ambiguous name, PEP 8 E741.)
    hrefs = [e.attrib['href'] for e in root.findall('.//a')
             if 'href' in e.attrib
             and e.attrib['href'].startswith(_URL_PREFIX)
             and e.attrib['href'].endswith(_URL_SUFFIX)]

    # Strip the prefix/suffix so each entry is a valid `name` argument.
    return lmap(lambda x: x[len(_URL_PREFIX):-len(_URL_SUFFIX)], hrefs)
| 34 | + |
| 35 | + |
def _download_data_famafrench(name):
    """
    Download the zipped CSV for dataset `name` and return its decoded text.

    Parameters
    ----------
    name : str
        Dataset name as listed by get_available_datasets().

    Returns
    -------
    str
        Contents of the (single) CSV file inside the zip archive.
    """
    url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            # The archive contains exactly one CSV member.  Close the
            # member handle explicitly instead of leaking it (the original
            # called zf.open(...) without ever closing the returned file).
            with zf.open(zf.namelist()[0]) as member:
                data = member.read().decode()

    return data
| 48 | + |
| 49 | + |
| 50 | +def _parse_date_famafrench(x): |
| 51 | + x = x.strip() |
| 52 | + try: return dt.datetime.strptime(x, '%Y%m') |
| 53 | + except: pass |
| 54 | + return to_datetime(x) |
| 55 | + |
| 56 | + |
def _get_data(name):
    """
    Get data for the given name from the Fama/French data library.

    For annual and monthly data, index is a pandas.PeriodIndex, otherwise
    it's a pandas.DatetimeIndex.

    Returns
    -------
    df : a dictionary of pandas.DataFrame. Tables are accessed by integer keys.
         See df['DESCR'] for a description of the dataset
    """
    params = {'index_col': 0,
              'parse_dates': [0],
              'date_parser': _parse_date_famafrench}

    # Headers in the breakpoint files are not valid, so build the column
    # names by hand and skip the raw header rows instead.
    if name.endswith('_Breakpoints'):
        c = ['<=0', '>0'] if '-' in name else ['Count']
        r = list(range(0, 105, 5))
        params['names'] = ['Date'] + c + list(zip(r, r[1:]))
        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3

    # Chunks are separated by a blank (CRLF CRLF) line: short chunks are
    # prose describing the dataset, long ones are data tables.
    doc_chunks, tables = [], []
    data = _download_data_famafrench(name)
    for chunk in data.split(2 * '\r\n'):
        if len(chunk) < 800:
            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
        else:
            tables.append(chunk)

    datasets, table_desc = {}, []
    for i, src in enumerate(tables):
        # The table body starts at the first line beginning with a comma
        # (the unnamed index column); anything before that is a title.
        # (Raw string fixes the invalid `\s` escape in the original.)
        match = re.search(r'^\s*,', src, re.M)
        start = 0 if not match else match.start()

        df = read_csv(StringIO('Date' + src[start:]), **params)
        try:
            # Annual/monthly data has an inferable frequency; convert the
            # index to periods.  Daily/irregular data stays as datetimes.
            df = df.to_period(df.index.inferred_freq[:1])
        except Exception:
            # Best-effort conversion only; keep the DatetimeIndex on failure.
            pass
        datasets[i] = df

        title = src[:start].replace('\r\n', ' ').strip()
        shape = '({0} rows x {1} cols)'.format(*df.shape)
        table_desc.append('{0} {1}'.format(title, shape).strip())

    descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
    if doc_chunks:
        descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'

    table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))

    datasets['DESCR'] = descr + '\n'.join(table_descr)
    return datasets
|
0 commit comments