Skip to content

CLN: Cleanup subclass #110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 11, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 0 additions & 122 deletions pandas_datareader/_utils.py
Original file line number Diff line number Diff line change
@@ -1,134 +1,12 @@
import time
import warnings
import numpy as np
import datetime as dt

from pandas import to_datetime
import pandas.compat as compat
from pandas.core.common import PandasError
from pandas import Panel, DataFrame
from pandas.io.common import urlopen
from pandas import read_csv
from pandas.compat import StringIO, bytes_to_str
from pandas.util.testing import _network_error_classes

if compat.PY3:
from urllib.parse import urlencode
else:
from urllib import urlencode

class SymbolWarning(UserWarning):
    """Warning emitted when a single symbol's data cannot be fetched."""

class RemoteDataError(PandasError, IOError):
    """Error raised when a remote data source cannot be read."""

def _get_data_from(symbols, start, end, interval, retry_count, pause,
                   chunksize, src_fn):
    """
    Dispatch to the right download strategy for *symbols*.

    A single ticker (str/int) is fetched directly via *src_fn*; a DataFrame
    uses its index as the ticker list; any other iterable is fetched in
    chunks through _dl_mult_symbols.
    """
    # Single symbol, e.g. 'GOOG'
    if isinstance(symbols, (compat.string_types, int)):
        return src_fn(symbols, start, end, interval, retry_count, pause)
    # DataFrame of symbols: tickers live in the index
    if isinstance(symbols, DataFrame):
        return _dl_mult_symbols(symbols.index, start, end, interval,
                                chunksize, retry_count, pause, src_fn)
    # Any other iterable of symbols, e.g. ['GOOG', 'AAPL', 'MSFT']
    return _dl_mult_symbols(symbols, start, end, interval, chunksize,
                            retry_count, pause, src_fn)

def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause,
                     method):
    """
    Download many symbols chunk by chunk via *method*.

    Symbols that raise IOError are replaced with all-NaN frames (with a
    SymbolWarning). Raises RemoteDataError when no symbol succeeds, or when
    the successful results cannot be assembled into a Panel.
    """
    stocks = {}
    failed, passed = [], []
    for chunk in _in_chunks(symbols, chunksize):
        for ticker in chunk:
            try:
                stocks[ticker] = method(ticker, start, end, interval,
                                        retry_count, pause)
                passed.append(ticker)
            except IOError:
                warnings.warn('Failed to read symbol: {0!r}, replacing with '
                              'NaN.'.format(ticker), SymbolWarning)
                failed.append(ticker)

    if not passed:
        raise RemoteDataError("No data fetched using "
                              "{0!r}".format(method.__name__))
    try:
        if stocks and failed and passed:
            # clone one successful frame and blank it out as the NaN template
            template = stocks[passed[0]].copy()
            template[:] = np.nan
            for ticker in failed:
                stocks[ticker] = template
        return Panel(stocks).swapaxes('items', 'minor')
    except AttributeError:
        # cannot construct a panel with just 1D nans indicating no data
        raise RemoteDataError("No data fetched using "
                              "{0!r}".format(method.__name__))


def _sanitize_dates(start, end):
"""
Return (datetime_start, datetime_end) tuple
if start is None - default is 2010/01/01
if end is None - default is today
"""
start = to_datetime(start)
end = to_datetime(end)
if start is None:
start = dt.datetime(2010, 1, 1)
if end is None:
end = dt.datetime.today()
return start, end

def _in_chunks(seq, size):
"""
Return sequence in 'chunks' of size defined by size
"""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def _encode_url(url, params):
"""
Return encoded url with parameters
"""
s_params = urlencode(params)
if s_params:
return url + '?' + s_params
else:
return url

def _retry_read_url(url, retry_count, pause, name):
    """
    Open *url*, parse the response as CSV, and retry on network errors.

    Retries up to *retry_count* times, sleeping *pause* seconds between
    attempts. Returns a DataFrame in chronological order (the source serves
    newest-first, hence the reversal). Raises IOError when every attempt
    fails; *name* is only used in that error message.
    """
    for _ in range(retry_count):

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            # transient network failure: fall through to sleep and retry
            pass
        else:
            # '-' marks missing values; [::-1] flips newest-first to oldest-first
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True, na_values='-')[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]

            #Get rid of unicode characters in index name.
            try:
                rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore')
            except AttributeError:
                #Python 3 string has no decode method.
                rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()

            return rs

        time.sleep(pause)

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
200 changes: 200 additions & 0 deletions pandas_datareader/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import time
import warnings
import numpy as np
import datetime as dt

import requests

from pandas import to_datetime
import pandas.compat as compat
from pandas.core.common import PandasError
from pandas import Panel, DataFrame
from pandas import read_csv
from pandas.compat import StringIO, bytes_to_str
from pandas.util.testing import _network_error_classes

from pandas_datareader._utils import RemoteDataError, SymbolWarning


class _BaseReader(object):

"""

Parameters
----------
sym : string with a single Single stock symbol (ticker).
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kind of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : int, default 0
Time, in seconds, of the pause between retries.
session : Session, default None
requests.sessions.Session instance to be used
"""

_chunk_size = 1024 * 1024

def __init__(self, symbols, start=None, end=None,
retry_count=3, pause=0.001, session=None):
self.symbols = symbols

start, end = self._sanitize_dates(start, end)
self.start = start
self.end = end

if not isinstance(retry_count, int) or retry_count < 0:
raise ValueError("'retry_count' must be integer larger than 0")
self.retry_count = retry_count
self.pause = pause
self.session = self._init_session(session, retry_count)

def _init_session(self, session, retry_count):
if session is None:
session = requests.Session()
# do not set requests max_retries here to support arbitrary pause
return session

@property
def url(self):
# must be overridden in subclass
raise NotImplementedError

@property
def params(self):
return None

def read(self):
""" read data """
return self._read_one_data(self.url, self.params)

def _read_one_data(self, url, params):
""" read one data from specified URL """
out = self._read_url_as_StringIO(self.url, params=params)
return self._read_lines(out)

def _read_url_as_StringIO(self, url, params=None):
"""
Open url (and retry)
"""
response = self._get_response(url, params=params)
out = StringIO()
if isinstance(response.content, compat.binary_type):
out.write(bytes_to_str(response.content))
else:
out.write(response.content)
out.seek(0)
return out

def _get_response(self, url, params=None):
""" send raw HTTP request to get requests.Response from the specified url
Parameters
----------
url : str
target URL
params : dict or None
parameters passed to the URL
"""

# initial attempt + retry
for i in range(self.retry_count + 1):
response = self.session.get(url, params=params)
if response.status_code == requests.codes.ok:
return response
time.sleep(self.pause)

raise RemoteDataError('Unable to read URL: {0}'.format(url))

def _read_lines(self, out):
rs = read_csv(out, index_col=0, parse_dates=True, na_values='-')[::-1]
# Yahoo! Finance sometimes does this awesome thing where they
# return 2 rows for the most recent business day
if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover
rs = rs[:-1]
#Get rid of unicode characters in index name.
try:
rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore')
except AttributeError:
#Python 3 string has no decode method.
rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()
return rs

def _sanitize_dates(self, start, end):
"""
Return (datetime_start, datetime_end) tuple
if start is None - default is 2010/01/01
if end is None - default is today
"""
start = to_datetime(start)
end = to_datetime(end)
if start is None:
start = dt.datetime(2010, 1, 1)
if end is None:
end = dt.datetime.today()
return start, end


class _DailyBaseReader(_BaseReader):
    """ Base class for Google / Yahoo daily reader """

    def __init__(self, symbols=None, start=None, end=None, retry_count=3,
                 pause=0.001, session=None, chunksize=25):
        super(_DailyBaseReader, self).__init__(symbols=symbols,
                                               start=start, end=end,
                                               retry_count=retry_count,
                                               pause=pause, session=session)
        # number of symbols requested per batch
        self.chunksize = chunksize

    def _get_params(self, *args, **kwargs):
        # subclasses build the per-symbol query parameters here
        raise NotImplementedError

    def read(self):
        """ read data """
        # Single symbol, e.g. 'GOOG'
        if isinstance(self.symbols, (compat.string_types, int)):
            return self._read_one_data(self.url,
                                       params=self._get_params(self.symbols))
        # DataFrame of symbols: tickers live in the index
        if isinstance(self.symbols, DataFrame):
            return self._dl_mult_symbols(self.symbols.index)
        # Any other iterable, e.g. ['GOOG', 'AAPL', 'MSFT']
        return self._dl_mult_symbols(self.symbols)

    def _dl_mult_symbols(self, symbols):
        """Fetch many symbols; failures become all-NaN frames with a warning."""
        stocks = {}
        failed, passed = [], []
        for chunk in _in_chunks(symbols, self.chunksize):
            for ticker in chunk:
                try:
                    stocks[ticker] = self._read_one_data(
                        self.url, self._get_params(ticker))
                    passed.append(ticker)
                except IOError:
                    msg = 'Failed to read symbol: {0!r}, replacing with NaN.'
                    warnings.warn(msg.format(ticker), SymbolWarning)
                    failed.append(ticker)

        if not passed:
            msg = "No data fetched using {0!r}"
            raise RemoteDataError(msg.format(self.__class__.__name__))
        try:
            if stocks and failed and passed:
                # clone one successful frame and blank it as the NaN template
                nan_frame = stocks[passed[0]].copy()
                nan_frame[:] = np.nan
                for ticker in failed:
                    stocks[ticker] = nan_frame
            return Panel(stocks).swapaxes('items', 'minor')
        except AttributeError:
            # cannot construct a panel with just 1D nans indicating no data
            msg = "No data fetched using {0!r}"
            raise RemoteDataError(msg.format(self.__class__.__name__))


def _in_chunks(seq, size):
"""
Return sequence in 'chunks' of size defined by size
"""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
Loading