Commit ca286f7
Merge remote-tracking branch 'upstream/master' into ea-unstack
2 parents: a9e6263 + 1651a10

19 files changed: +93 −149 lines

ci/requirements-optional-conda.txt
Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
-fastparquet
+fastparquet>=0.1.2
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -12,7 +12,7 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
 pytables>=3.4.2
 pytest-cov

ci/requirements-optional-pip.txt
Lines changed: 4 additions & 4 deletions

@@ -3,7 +3,7 @@
 beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
-fastparquet
+fastparquet>=0.1.2
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -14,9 +14,9 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow>=0.4.1
+pyarrow>=0.7.0
 pymysql
-tables
+pytables>=3.4.2
 pytest-cov
 pytest-xdist
 s3fs
@@ -27,4 +27,4 @@ statsmodels
 xarray
 xlrd
 xlsxwriter
-xlwt
+xlwt

ci/travis-27.yaml
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ dependencies:
   - patsy
   - psycopg2
   - py
-  - pyarrow=0.4.1
+  - pyarrow=0.7.0
   - PyCrypto
   - pymysql=0.6.3
   - pytables

doc/source/install.rst
Lines changed: 2 additions & 2 deletions

@@ -258,8 +258,8 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
-* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
+* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0): necessary for feather-based storage.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.1.2) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL

doc/source/whatsnew/v0.24.0.txt
Lines changed: 6 additions & 1 deletion

@@ -250,7 +250,7 @@ Backwards incompatible API changes
 Dependencies have increased minimum versions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We have updated our minimum supported versions of dependencies (:issue:`21242`).
+We have updated our minimum supported versions of dependencies (:issue:`21242`, `18742`).
 If installed, we now require:
 
 +-----------------+-----------------+----------+
@@ -268,6 +268,10 @@ If installed, we now require:
 +-----------------+-----------------+----------+
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
+| pyarrow         | 0.7.0           |          |
++-----------------+-----------------+----------+
+| fastparquet     | 0.1.2           |          |
++-----------------+-----------------+----------+
 
 Additionally we no longer depend on `feather-format` for feather based storage
 and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`).
@@ -1211,6 +1215,7 @@ Indexing
 - :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`)
 - Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`)
 - Bug in `MultiIndex.set_levels` when levels value is not subscriptable (:issue:`23273`)
+- Bug where setting a timedelta column by ``Index`` causes it to be casted to double, and therefore lose precision (:issue:`23511`)
 
 Missing
 ^^^^^^^

pandas/core/internals/blocks.py
Lines changed: 2 additions & 2 deletions

@@ -2173,9 +2173,9 @@ def _box_func(self):
     def _can_hold_element(self, element):
         tipo = maybe_infer_dtype_type(element)
         if tipo is not None:
-            return issubclass(tipo.type, np.timedelta64)
+            return issubclass(tipo.type, (np.timedelta64, np.int64))
         return is_integer(element) or isinstance(
-            element, (timedelta, np.timedelta64))
+            element, (timedelta, np.timedelta64, np.int64))
 
     def fillna(self, value, **kwargs):
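The intent of that two-line change can be sketched outside pandas internals: the timedelta block's element check now accepts int64 values (interpreted as nanosecond counts) alongside timedelta64, so setting a timedelta column no longer falls back to a lossy cast to float64. The helper below is a simplified, hypothetical stand-in for ``_can_hold_element``, not the real method:

```python
from datetime import timedelta

import numpy as np


def can_hold_timedelta_element(element):
    # Simplified stand-in for TimeDeltaBlock._can_hold_element after this
    # change: np.int64 data (nanosecond counts) is accepted alongside
    # timedelta64, avoiding an upcast to double that loses precision.
    if isinstance(element, np.ndarray):
        return issubclass(element.dtype.type, (np.timedelta64, np.int64))
    return isinstance(element, (int, timedelta, np.timedelta64))


assert can_hold_timedelta_element(np.array([1, 2], dtype=np.int64))
assert can_hold_timedelta_element(timedelta(seconds=1))
assert not can_hold_timedelta_element(np.array([1.5], dtype=np.float64))
```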

pandas/io/parquet.py
Lines changed: 13 additions & 65 deletions

@@ -5,7 +5,7 @@
 
 from pandas.compat import string_types
 
-from pandas import DataFrame, Int64Index, RangeIndex, get_option
+from pandas import DataFrame, get_option
 import pandas.core.common as com
 
 from pandas.io.common import get_filepath_or_buffer, is_s3_url
@@ -89,57 +89,38 @@ def __init__(self):
             "\nor via pip\n"
             "pip install -U pyarrow\n"
         )
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
             raise ImportError(
-                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
                 "you can install via conda\n"
                 "conda install pyarrow -c conda-forge\n"
                 "\nor via pip\n"
                 "pip install -U pyarrow\n"
             )
 
-        self._pyarrow_lt_060 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
-        self._pyarrow_lt_070 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
-
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', index=None, **kwargs):
         self.validate_dataframe(df)
-
-        # Only validate the index if we're writing it.
-        if self._pyarrow_lt_070 and index is not False:
-            self._validate_write_lt_070(df)
         path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
 
         if index is None:
             from_pandas_kwargs = {}
         else:
             from_pandas_kwargs = {'preserve_index': index}
 
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
-                                               **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_070:
-            result = self.api.parquet.read_pandas(path, columns=columns,
-                                                  **kwargs).to_pandas()
-        else:
-            kwargs['use_pandas_metadata'] = True
-            result = self.api.parquet.read_table(path, columns=columns,
-                                                 **kwargs).to_pandas()
+
+        kwargs['use_pandas_metadata'] = True
+        result = self.api.parquet.read_table(path, columns=columns,
+                                             **kwargs).to_pandas()
         if should_close:
             try:
                 path.close()
@@ -148,39 +129,6 @@ def read(self, path, columns=None, **kwargs):
 
         return result
 
-    def _validate_write_lt_070(self, df):
-        # Compatibility shim for pyarrow < 0.7.0
-        # TODO: Remove in pandas 0.23.0
-        from pandas.core.indexes.multi import MultiIndex
-        if isinstance(df.index, MultiIndex):
-            msg = (
-                "Multi-index DataFrames are only supported "
-                "with pyarrow >= 0.7.0"
-            )
-            raise ValueError(msg)
-        # Validate index
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "pyarrow < 0.7.0 does not support serializing {} for the "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "pyarrow < 0.7.0 does not support serializing a non-default "
-                "index; you can .reset_index() to make the index into "
-                "column(s), or install the latest version of pyarrow or "
-                "fastparquet."
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "pyarrow < 0.7.0 does not serialize indexes with a name; you "
-                "can set the index.name to None or install the latest version "
-                "of pyarrow or fastparquet."
-            )
-
 
 class FastParquetImpl(BaseImpl):
 
@@ -197,9 +145,9 @@ def __init__(self):
             "\nor via pip\n"
             "pip install -U fastparquet"
         )
-        if LooseVersion(fastparquet.__version__) < '0.1.0':
+        if LooseVersion(fastparquet.__version__) < '0.1.2':
             raise ImportError(
-                "fastparquet >= 0.1.0 is required for parquet "
+                "fastparquet >= 0.1.2 is required for parquet "
                 "support\n\n"
                 "you can install via conda\n"
                 "conda install fastparquet -c conda-forge\n"

pandas/tests/arrays/categorical/test_missing.py
Lines changed: 4 additions & 2 deletions

@@ -4,11 +4,13 @@
 import numpy as np
 import pytest
 
-import pandas.util.testing as tm
-from pandas import Categorical, Index, isna
 from pandas.compat import lrange
+
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
+from pandas import Categorical, Index, isna
+import pandas.util.testing as tm
+
 
 class TestCategoricalMissing(object):

pandas/tests/arrays/categorical/test_sorting.py
Lines changed: 1 addition & 1 deletion

@@ -2,8 +2,8 @@
 
 import numpy as np
 
-import pandas.util.testing as tm
 from pandas import Categorical, Index
+import pandas.util.testing as tm
 
 
 class TestCategoricalSort(object):

pandas/tests/arrays/test_datetimelike.py
Lines changed: 2 additions & 3 deletions

@@ -3,10 +3,9 @@
 import pytest
 
 import pandas as pd
-import pandas.util.testing as tm
 from pandas.core.arrays import (
-    DatetimeArrayMixin, PeriodArray, TimedeltaArrayMixin
-)
+    DatetimeArrayMixin, PeriodArray, TimedeltaArrayMixin)
+import pandas.util.testing as tm
 
 
 # TODO: more freq variants

pandas/tests/arrays/test_integer.py
Lines changed: 4 additions & 4 deletions

@@ -2,16 +2,16 @@
 import numpy as np
 import pytest
 
+from pandas.core.dtypes.generic import ABCIndexClass
+
 import pandas as pd
-import pandas.util.testing as tm
 from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar
 from pandas.core.arrays import IntegerArray, integer_array
 from pandas.core.arrays.integer import (
     Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype,
-    UInt32Dtype, UInt64Dtype
-)
-from pandas.core.dtypes.generic import ABCIndexClass
+    UInt32Dtype, UInt64Dtype)
 from pandas.tests.extension.base import BaseOpsUtil
+import pandas.util.testing as tm
 
 
 def make_data():

pandas/tests/indexing/test_chaining_and_caching.py
Lines changed: 17 additions & 6 deletions

@@ -337,13 +337,24 @@ def f():
         df2['y'] = ['g', 'h', 'i']
 
     def test_detect_chained_assignment_warnings(self):
+        with option_context("chained_assignment", "warn"):
+            df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
 
-        # warnings
-        with option_context('chained_assignment', 'warn'):
-            df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
-            with tm.assert_produces_warning(
-                    expected_warning=com.SettingWithCopyWarning):
-                df.loc[0]['A'] = 111
+            with tm.assert_produces_warning(com.SettingWithCopyWarning):
+                df.loc[0]["A"] = 111
+
+    def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self):
+        # xref gh-13017.
+        with option_context("chained_assignment", "warn"):
+            df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]],
+                              columns=["a", "a", "c"])
+
+            with tm.assert_produces_warning(com.SettingWithCopyWarning):
+                df.c.loc[df.c > 0] = None
+
+            expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]],
+                                    columns=["a", "a", "c"])
+            tm.assert_frame_equal(df, expected)
 
     def test_chained_getitem_with_lists(self):
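The behavior these tests exercise is easy to reproduce standalone: a chained ``df.loc[0]['A'] = ...`` writes through an intermediate object that is typically a copy, so the assignment never reaches the original frame, which is why pandas warns. A minimal illustration (warnings are suppressed here so the snippet runs quietly on any pandas version):

```python
import warnings

import pandas as pd

df = pd.DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # silence the chained-assignment warning
    # Chained assignment: df.loc[0] materializes an intermediate Series
    # (a copy for this mixed-dtype frame), so the write lands on the copy.
    df.loc[0]["A"] = 111

assert df.loc[0, "A"] == "aaa"  # the original frame is unchanged
```

The single-step form ``df.loc[0, "A"] = 111`` writes directly into the frame and is the recommended spelling.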

pandas/tests/indexing/test_timedelta.py
Lines changed: 15 additions & 0 deletions

@@ -80,3 +80,18 @@ def test_numpy_timedelta_scalar_indexing(self, start, stop,
         result = s.loc[slice(start, stop)]
         expected = s.iloc[expected_slice]
         tm.assert_series_equal(result, expected)
+
+    def test_roundtrip_thru_setitem(self):
+        # PR 23462
+        dt1 = pd.Timedelta(0)
+        dt2 = pd.Timedelta(28767471428571405)
+        df = pd.DataFrame({'dt': pd.Series([dt1, dt2])})
+        df_copy = df.copy()
+        s = pd.Series([dt1])
+
+        expected = df['dt'].iloc[1].value
+        df.loc[[True, False]] = s
+        result = df['dt'].iloc[1].value
+
+        assert expected == result
+        tm.assert_frame_equal(df, df_copy)
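The magic constant in that test is chosen so that float64 cannot hold it: 28767471428571405 nanoseconds exceeds 2**53, the end of the range in which a double represents every integer exactly. Before this fix, setting the timedelta column routed the data through a cast to double and silently rounded the value. The mechanism can be checked with plain numpy, no pandas internals:

```python
import numpy as np

ns = 28767471428571405        # nanosecond count from the test, > 2**53
as_double = np.float64(ns)    # the lossy cast the bug used to perform

assert ns > 2**53
assert int(as_double) != ns   # the double round-trip loses the low bits

# Keeping the value as int64 and viewing it as timedelta64[ns] is exact:
assert np.int64(ns).astype("timedelta64[ns]") == np.timedelta64(ns, "ns")
```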
