Commit 779b6f0

Merge branch 'master' into bug_delta_time
2 parents 9577408 + 5b88d2f commit 779b6f0

66 files changed: +779 -396 lines

Diff for: .github/PULL_REQUEST_TEMPLATE.md (+1 -1)

@@ -1,4 +1,4 @@
 - [ ] closes #xxxx
 - [ ] tests added / passed
-- [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff``
+- [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff`` (On Windows, ``git diff upstream/master -u -- "*.py" | flake8 --diff`` might work as an alternative.)
 - [ ] whatsnew entry

Diff for: asv_bench/benchmarks/hdfstore_bench.py (+9)

@@ -90,6 +90,15 @@ def time_query_store_table(self):
         stop = self.df2.index[15000]
         self.store.select('table', where="index > start and index < stop")
 
+    def time_store_repr(self):
+        repr(self.store)
+
+    def time_store_str(self):
+        str(self.store)
+
+    def time_store_info(self):
+        self.store.info()
+
 
 class HDF5Panel(object):
     goal_time = 0.2
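
The three benchmarks added above time ``HDFStore``'s string-rendering paths
(``repr``, ``str``, and the ``HDFStore.info()`` method this commit also
documents in api.rst). A minimal sketch of the calls being timed, using a
hypothetical throwaway store in place of the benchmark's ``self.store``
fixture::

    import numpy as np
    import pandas as pd

    # Hypothetical stand-in for the benchmark fixture; requires PyTables.
    store = pd.HDFStore('demo.h5', mode='w')
    store.put('df', pd.DataFrame(np.random.randn(10, 3), columns=list('abc')))

    repr(store)   # exercised by time_store_repr
    str(store)    # exercised by time_store_str
    store.info()  # exercised by time_store_info; detailed per-key summary

    store.close()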

Diff for: ci/requirements-2.7.build (+1 -1)

@@ -2,5 +2,5 @@ python=2.7*
 python-dateutil=2.4.1
 pytz=2013b
 nomkl
-numpy=1.12*
+numpy
 cython=0.23

Diff for: ci/requirements-2.7.sh (+1 -1)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format
+conda install -n pandas -c conda-forge feather-format jemalloc=4.4.0

Diff for: ci/requirements-2.7_BUILD_TEST.sh (+1 -1)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27 BUILD_TEST"
 
-conda install -n pandas -c conda-forge pyarrow dask
+conda install -n pandas -c conda-forge pyarrow dask jemalloc=4.4.0

Diff for: ci/requirements-3.5.sh (+1 -1)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 35"
 
-conda install -n pandas -c conda-forge feather-format
+conda install -n pandas -c conda-forge feather-format jemalloc=4.4.0

Diff for: ci/requirements-3.6.build (+1 -1)

@@ -2,5 +2,5 @@ python=3.6*
 python-dateutil
 pytz
 nomkl
-numpy=1.12*
+numpy
 cython

Diff for: ci/requirements-3.6.run (+1)

@@ -14,6 +14,7 @@ html5lib
 jinja2
 sqlalchemy
 pymysql
+jemalloc=4.4.0
 feather-format
 # psycopg2 (not avail on defaults ATM)
 beautifulsoup4

Diff for: ci/requirements-3.6_DOC.run (+1 -1)

@@ -1,7 +1,7 @@
 ipython
 ipykernel
 ipywidgets
-sphinx
+sphinx=1.5*
 nbconvert
 nbformat
 notebook

Diff for: ci/requirements-3.6_DOC.sh (+1 -1)

@@ -6,6 +6,6 @@ echo "[install DOC_BUILD deps]"
 
 pip install pandas-gbq
 
-conda install -n pandas -c conda-forge feather-format nbsphinx pandoc
+conda install -n pandas -c conda-forge feather-format nbsphinx pandoc jemalloc=4.4.0
 
 conda install -n pandas -c r r rpy2 --yes

Diff for: ci/requirements-3.6_NUMPY_DEV.build (-1)

@@ -1,4 +1,3 @@
 python=3.6*
-python-dateutil
 pytz
 cython

Diff for: ci/requirements-3.6_NUMPY_DEV.build.sh (+3)

@@ -11,4 +11,7 @@ pip uninstall numpy -y
 PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com"
 pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
 
+# install dateutil from master
+pip install -U git+git://github.com/dateutil/dateutil.git
+
 true

Diff for: doc/source/api.rst (+1)

@@ -99,6 +99,7 @@ HDFStore: PyTables (HDF5)
    HDFStore.append
    HDFStore.get
    HDFStore.select
+   HDFStore.info
 
 Feather
 ~~~~~~~

Diff for: doc/source/contributing.rst (+6)

@@ -525,6 +525,12 @@ run this slightly modified command::
 
     git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8
 
+Note that on Windows, ``grep``, ``xargs``, and other tools are likely
+unavailable. However, this has been shown to work on smaller commits in the
+standard Windows command line::
+
+    git diff master -u -- "*.py" | flake8 --diff
+
 Backwards Compatibility
 ~~~~~~~~~~~~~~~~~~~~~~~

Diff for: doc/source/ecosystem.rst (+11)

@@ -239,3 +239,14 @@ pandas own ``read_csv`` for CSV IO and leverages many existing packages such as
 PyTables, h5py, and pymongo to move data between non pandas formats. Its graph
 based approach is also extensible by end users for custom formats that may be
 too specific for the core of odo.
+
+.. _ecosystem.data_validation:
+
+Data validation
+---------------
+
+`Engarde <http://engarde.readthedocs.io/en/latest/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Engarde is a lightweight library used to explicitly state your assumptions about your datasets
+and check that they're *actually* true.
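
For context, engarde expresses those dataset assumptions as decorators on
DataFrame-returning functions. A minimal sketch in the style of engarde's
documentation (the input data and transformation are made up)::

    import pandas as pd
    import engarde.decorators as ed

    @ed.none_missing()   # assert: the returned frame contains no NaNs
    @ed.unique_index()   # assert: the returned frame's index is unique
    def clean(df):
        # made-up transformation for illustration
        return df.dropna()

    clean(pd.DataFrame({'price': [1.0, 2.5, None]}))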

Diff for: doc/source/groupby.rst (+2 -2)

@@ -1200,14 +1200,14 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
     df
     df.groupby(df.sum(), axis=1).sum()
 
-.. _groupby.multicolumn_factorization
+.. _groupby.multicolumn_factorization:
 
 Multi-column factorization
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 By using ``.ngroup()``, we can extract information about the groups in
 a way similar to :func:`factorize` (as described further in the
-:ref:`reshaping API <reshaping.factorization>`) but which applies
+:ref:`reshaping API <reshaping.factorize>`) but which applies
 naturally to multiple columns of mixed type and different
 sources. This can be useful as an intermediate categorical-like step
 in processing, when the relationships between the group rows are more
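
The ``.ngroup()`` method documented in this hunk assigns each row the number of
its group, in order of first appearance, over any number of grouping columns.
A small illustration with made-up data::

    import pandas as pd

    df = pd.DataFrame({'A': list('aabba'), 'B': [1, 1, 2, 2, 3]})

    # One integer label per row; rows in the same (A, B) group share a label,
    # much like factorize() but over multiple columns of mixed type.
    df.groupby(['A', 'B']).ngroup()  # -> 0, 0, 1, 1, 2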

Diff for: doc/source/io.rst (+92 -56)

@@ -137,8 +137,10 @@ usecols : array-like or callable, default ``None``
 
   Using this parameter results in much faster parsing time and lower memory usage.
 as_recarray : boolean, default ``False``
-  DEPRECATED: this argument will be removed in a future version. Please call
-  ``pd.read_csv(...).to_records()`` instead.
+
+  .. deprecated:: 0.18.2
+
+     Please call ``pd.read_csv(...).to_records()`` instead.
 
   Return a NumPy recarray instead of a DataFrame after parsing the data. If
   set to ``True``, this option takes precedence over the ``squeeze`` parameter.

@@ -191,7 +193,11 @@ skiprows : list-like or integer, default ``None``
 skipfooter : int, default ``0``
   Number of lines at bottom of file to skip (unsupported with engine='c').
 skip_footer : int, default ``0``
-  DEPRECATED: use the ``skipfooter`` parameter instead, as they are identical
+
+  .. deprecated:: 0.19.0
+
+     Use the ``skipfooter`` parameter instead, as they are identical
+
 nrows : int, default ``None``
   Number of rows of file to read. Useful for reading pieces of large files.
 low_memory : boolean, default ``True``

@@ -202,16 +208,25 @@ low_memory : boolean, default ``True``
   use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
   (Only valid with C parser)
 buffer_lines : int, default None
-  DEPRECATED: this argument will be removed in a future version because its
-  value is not respected by the parser
+
+  .. deprecated:: 0.19.0
+
+     Argument removed because its value is not respected by the parser
+
 compact_ints : boolean, default False
-  DEPRECATED: this argument will be removed in a future version
+
+  .. deprecated:: 0.19.0
+
+     Argument moved to ``pd.to_numeric``
 
   If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
   parser will attempt to cast it as the smallest integer ``dtype`` possible, either
   signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
 use_unsigned : boolean, default False
-  DEPRECATED: this argument will be removed in a future version
+
+  .. deprecated:: 0.18.2
+
+     Argument moved to ``pd.to_numeric``
 
   If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
   the column should be compacted to the smallest signed or unsigned integer dtype.

@@ -225,9 +240,9 @@ NA and Missing Data Handling
 
 na_values : scalar, str, list-like, or dict, default ``None``
   Additional strings to recognize as NA/NaN. If dict passed, specific per-column
-  NA values. By default the following values are interpreted as NaN:
-  ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA',
-  '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``.
+  NA values. See :ref:`na values const <io.navaluesconst>` below
+  for a list of the values interpreted as NaN by default.
+
 keep_default_na : boolean, default ``True``
   If na_values are specified and keep_default_na is ``False`` the default NaN
   values are overridden, otherwise they're appended to.

@@ -712,6 +727,16 @@ index column inference and discard the last column, pass ``index_col=False``:
     pd.read_csv(StringIO(data))
     pd.read_csv(StringIO(data), index_col=False)
 
+If a subset of data is being parsed using the ``usecols`` option, the
+``index_col`` specification is based on that subset, not the original data.
+
+.. ipython:: python
+
+   data = 'a,b,c\n4,apple,bat,\n8,orange,cow,'
+   print(data)
+   pd.read_csv(StringIO(data), usecols=['b', 'c'])
+   pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0)
+
 .. _io.parse_dates:
 
 Date Handling

@@ -1020,10 +1045,11 @@ the corresponding equivalent values will also imply a missing value (in this cas
 ``[5.0,5]`` are recognized as ``NaN``.
 
 To completely override the default values that are recognized as missing, specify ``keep_default_na=False``.
-The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A','N/A', 'NA',
-'#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan']``. Although a 0-length string
-``''`` is not included in the default ``NaN`` values list, it is still treated
-as a missing value.
+
+.. _io.navaluesconst:
+
+The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A',
+'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``.
 
 .. code-block:: python

@@ -3396,7 +3422,7 @@ Fixed Format
 This was prior to 0.13.0 the ``Storer`` format.
 
 The examples above show storing using ``put``, which write the HDF5 to ``PyTables`` in a fixed array format, called
-the ``fixed`` format. These types of stores are are **not** appendable once written (though you can simply
+the ``fixed`` format. These types of stores are **not** appendable once written (though you can simply
 remove them and rewrite). Nor are they **queryable**; they must be
 retrieved in their entirety. They also do not support dataframes with non-unique column names.
 The ``fixed`` format stores offer very fast writing and slightly faster reading than ``table`` stores.

@@ -4056,26 +4082,64 @@ Compression
 +++++++++++
 
 ``PyTables`` allows the stored data to be compressed. This applies to
-all kinds of stores, not just tables.
+all kinds of stores, not just tables. Two parameters are used to
+control compression: ``complevel`` and ``complib``.
+
+``complevel`` specifies if and how hard data is to be compressed.
+``complevel=0`` and ``complevel=None`` disables
+compression and ``0<complevel<10`` enables compression.
+
+``complib`` specifies which compression library to use. If nothing is
+specified the default library ``zlib`` is used. A
+compression library usually optimizes for either good
+compression rates or speed and the results will depend on
+the type of data. Which type of
+compression to choose depends on your specific needs and
+data. The list of supported compression libraries:
+
+- `zlib <http://zlib.net/>`_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow.
+- `lzo <http://www.oberhumer.com/opensource/lzo/>`_: Fast compression and decompression.
+- `bzip2 <http://bzip.org/>`_: Good compression rates.
+- `blosc <http://www.blosc.org/>`_: Fast compression and decompression.
+
+.. versionadded:: 0.20.2
+
+   Support for alternative blosc compressors:
+
+   - `blosc:blosclz <http://www.blosc.org/>`_ This is the
+     default compressor for ``blosc``
+   - `blosc:lz4 <https://fastcompression.blogspot.dk/p/lz4.html>`_:
+     A compact, very popular and fast compressor.
+   - `blosc:lz4hc <https://fastcompression.blogspot.dk/p/lz4.html>`_:
+     A tweaked version of LZ4, produces better
+     compression ratios at the expense of speed.
+   - `blosc:snappy <https://google.github.io/snappy/>`_:
+     A popular compressor used in many places.
+   - `blosc:zlib <http://zlib.net/>`_: A classic;
+     somewhat slower than the previous ones, but
+     achieving better compression ratios.
+   - `blosc:zstd <https://facebook.github.io/zstd/>`_: An
+     extremely well balanced codec; it provides the best
+     compression ratios among the others above, and at
+     reasonably fast speed.
+
+If ``complib`` is defined as something other than the
+listed libraries a ``ValueError`` exception is issued.
 
-- Pass ``complevel=int`` for a compression level (1-9, with 0 being no
-  compression, and the default)
-- Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for
-  whichever compression library you prefer.
+.. note::
 
-  ``HDFStore`` will use the file based compression scheme if no overriding
-  ``complib`` or ``complevel`` options are provided. ``blosc`` offers very
-  fast compression, and is my most used. Note that ``lzo`` and ``bzip2``
-  may not be installed (by Python) by default.
+   If the library specified with the ``complib`` option is missing on your platform,
+   compression defaults to ``zlib`` without further ado.
 
-Compression for all objects within the file
+Enable compression for all objects within the file:
 
 .. code-block:: python
 
-   store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc')
+   store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc:blosclz')
 
-Or on-the-fly compression (this only applies to tables). You can turn
-off file compression for a specific table by passing ``complevel=0``
+Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled:
 
 .. code-block:: python

@@ -4410,34 +4474,6 @@ Performance
 `Here <http://stackoverflow.com/questions/14355151/how-to-make-pandas-hdfstore-put-operation-faster/14370190#14370190>`__
 for more information and some solutions.
 
-Experimental
-''''''''''''
-
-HDFStore supports ``Panel4D`` storage.
-
-.. ipython:: python
-   :okwarning:
-
-   wp = pd.Panel(randn(2, 5, 4), items=['Item1', 'Item2'],
-                 major_axis=pd.date_range('1/1/2000', periods=5),
-                 minor_axis=['A', 'B', 'C', 'D'])
-   p4d = pd.Panel4D({ 'l1' : wp })
-   p4d
-   store.append('p4d', p4d)
-   store
-
-These, by default, index the three axes ``items, major_axis,
-minor_axis``. On an ``AppendableTable`` it is possible to setup with the
-first append a different indexing scheme, depending on how you want to
-store your data. Pass the ``axes`` keyword with a list of dimensions
-(currently must by exactly 1 less than the total dimensions of the
-object). This cannot be changed after table creation.
-
-.. ipython:: python
-   :okwarning:
-
-   store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis'])
-   store.select('p4d2', where='labels=l1 and items=Item1 and minor_axis=A')
 
 .. ipython:: python
    :suppress:
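
Summarizing the compression hunks above: a ``complevel`` between 1 and 9
enables compression, ``complib`` picks the codec (``zlib`` by default), and the
``blosc:*`` variants need pandas 0.20.2+. A short sketch of both spellings,
with made-up file names::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(100, 3), columns=list('abc'))

    # Whole-file compression, as in the updated io.rst example.
    store = pd.HDFStore('compressed.h5', complevel=9, complib='blosc:blosclz')
    store.put('df', df, format='table')
    store.close()

    # Or per-call compression when writing a table directly.
    df.to_hdf('also_compressed.h5', 'df', format='table',
              complevel=5, complib='zlib')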
