Skip to content

Commit 690fac2

Browse files
authored
Merge branch 'master' into group_rank_perf
2 parents dca8d48 + 7dc6f70 commit 690fac2

File tree

14 files changed

+139
-44
lines changed

14 files changed

+139
-44
lines changed

asv_bench/benchmarks/frame_methods.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,3 +512,21 @@ def time_nlargest(self, keep):
512512

513513
def time_nsmallest(self, keep):
514514
self.df.nsmallest(100, 'A', keep=keep)
515+
516+
517+
class Describe(object):
518+
519+
goal_time = 0.2
520+
521+
def setup(self):
522+
self.df = DataFrame({
523+
'a': np.random.randint(0, 100, int(1e6)),
524+
'b': np.random.randint(0, 100, int(1e6)),
525+
'c': np.random.randint(0, 100, int(1e6))
526+
})
527+
528+
def time_series_describe(self):
529+
self.df['a'].describe()
530+
531+
def time_dataframe_describe(self):
532+
self.df.describe()

doc/source/ecosystem.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput
3838
Use pandas DataFrames in your `scikit-learn <http://scikit-learn.org/>`__
3939
ML pipeline.
4040

41+
`Featuretools <https://github.com/featuretools/featuretools/>`__
42+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4143

44+
Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community.
4245

4346
.. _ecosystem.visualization:
4447

doc/source/whatsnew/v0.23.1.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Performance Improvements
3232
~~~~~~~~~~~~~~~~~~~~~~~~
3333

3434
- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
35+
- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`)
3536
-
3637
-
3738

@@ -85,13 +86,16 @@ Indexing
8586
- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
8687
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
8788
- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
89+
- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
90+
- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
8891
-
8992

9093
I/O
9194
^^^
9295

9396
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9497
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
98+
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
9599
-
96100

97101
Plotting

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8519,7 +8519,7 @@ def describe_numeric_1d(series):
85198519
stat_index = (['count', 'mean', 'std', 'min'] +
85208520
formatted_percentiles + ['max'])
85218521
d = ([series.count(), series.mean(), series.std(), series.min()] +
8522-
[series.quantile(x) for x in percentiles] + [series.max()])
8522+
series.quantile(percentiles).tolist() + [series.max()])
85238523
return pd.Series(d, index=stat_index, name=series.name)
85248524

85258525
def describe_categorical_1d(data):

pandas/core/indexes/category.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def _engine(self):
378378
# introspection
379379
@cache_readonly
380380
def is_unique(self):
381-
return not self.duplicated().any()
381+
return self._engine.is_unique
382382

383383
@property
384384
def is_monotonic_increasing(self):

pandas/core/indexes/interval.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values):
112112
-------
113113
array
114114
"""
115+
if is_categorical_dtype(values):
116+
# GH 21243/21253
117+
values = np.array(values)
118+
115119
if isinstance(values, (list, tuple)) and len(values) == 0:
116120
# GH 19016
117121
# empty lists/tuples get object dtype by default, but this is not

pandas/io/formats/csvs.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from pandas.core.dtypes.missing import notna
12+
from pandas.core.dtypes.inference import is_file_like
1213
from pandas.core.index import Index, MultiIndex
1314
from pandas import compat
1415
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
127128
else:
128129
encoding = self.encoding
129130

130-
if hasattr(self.path_or_buf, 'write'):
131-
f = self.path_or_buf
132-
close = False
131+
# PR 21300 uses string buffer to receive csv writing and dump into
132+
# file-like output with compression as option. GH 21241, 21118
133+
f = StringIO()
134+
if not is_file_like(self.path_or_buf):
135+
# path_or_buf is path
136+
path_or_buf = self.path_or_buf
137+
elif hasattr(self.path_or_buf, 'name'):
138+
# path_or_buf is file handle
139+
path_or_buf = self.path_or_buf.name
133140
else:
134-
f, handles = _get_handle(self.path_or_buf, self.mode,
135-
encoding=encoding,
136-
compression=None)
137-
close = True if self.compression is None else False
141+
# path_or_buf is file-like IO objects.
142+
f = self.path_or_buf
143+
path_or_buf = None
138144

139145
try:
140146
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
151157
self._save()
152158

153159
finally:
154-
# GH 17778 handles compression for byte strings.
155-
if not close and self.compression:
156-
f.close()
157-
with open(f.name, 'r') as f:
158-
data = f.read()
159-
f, handles = _get_handle(f.name, self.mode,
160+
# GH 17778 handles zip compression for byte strings separately.
161+
buf = f.getvalue()
162+
if path_or_buf:
163+
f, handles = _get_handle(path_or_buf, self.mode,
160164
encoding=encoding,
161165
compression=self.compression)
162-
f.write(data)
163-
close = True
164-
if close:
166+
f.write(buf)
165167
f.close()
168+
for _fh in handles:
169+
_fh.close()
166170

167171
def _save_header(self):
168172

pandas/tests/frame/test_to_csv.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression):
923-
924-
df = DataFrame([[0.123456, 0.234567, 0.567567],
925-
[12.32112, 123123.2, 321321.2]],
926-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
922+
@pytest.mark.parametrize('df,encoding', [
923+
(DataFrame([[0.123456, 0.234567, 0.567567],
924+
[12.32112, 123123.2, 321321.2]],
925+
index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
926+
# GH 21241, 21118
927+
(DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
928+
(DataFrame(5 * [[123, u"你好", u"世界"]],
929+
columns=['X', 'Y', 'Z']), 'gb2312'),
930+
(DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
931+
columns=['X', 'Y', 'Z']), 'cp737')
932+
])
933+
def test_to_csv_compression(self, df, encoding, compression):
927934

928935
with ensure_clean() as filename:
929936

930-
df.to_csv(filename, compression=compression)
937+
df.to_csv(filename, compression=compression, encoding=encoding)
931938

932939
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression,
934-
index_col=0)
935-
assert_frame_equal(df, rs)
940+
result = read_csv(filename, compression=compression,
941+
index_col=0, encoding=encoding)
942+
943+
with open(filename, 'w') as fh:
944+
df.to_csv(fh, compression=compression, encoding=encoding)
945+
946+
result_fh = read_csv(filename, compression=compression,
947+
index_col=0, encoding=encoding)
948+
assert_frame_equal(df, result)
949+
assert_frame_equal(df, result_fh)
936950

937951
# explicitly make sure file is compressed
938952
with tm.decompress_file(filename, compression) as fh:
939-
text = fh.read().decode('utf8')
953+
text = fh.read().decode(encoding or 'utf8')
940954
for col in df.columns:
941955
assert col in text
942956

943957
with tm.decompress_file(filename, compression) as fh:
944-
assert_frame_equal(df, read_csv(fh, index_col=0))
958+
assert_frame_equal(df, read_csv(fh,
959+
index_col=0,
960+
encoding=encoding))
945961

946962
def test_to_csv_date_format(self):
947963
with ensure_clean('__tmp_to_csv_date_format__') as path:

pandas/tests/indexes/interval/test_construction.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66

77
from pandas import (
88
Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical,
9-
date_range, timedelta_range, period_range, notna)
9+
CategoricalIndex, date_range, timedelta_range, period_range, notna)
1010
from pandas.compat import lzip
11+
from pandas.core.dtypes.common import is_categorical_dtype
1112
from pandas.core.dtypes.dtypes import IntervalDtype
1213
import pandas.core.common as com
1314
import pandas.util.testing as tm
@@ -111,6 +112,22 @@ def test_constructor_string(self, constructor, breaks):
111112
with tm.assert_raises_regex(TypeError, msg):
112113
constructor(**self.get_kwargs_from_breaks(breaks))
113114

115+
@pytest.mark.parametrize('cat_constructor', [
116+
Categorical, CategoricalIndex])
117+
def test_constructor_categorical_valid(self, constructor, cat_constructor):
118+
# GH 21243/21253
119+
if isinstance(constructor, partial) and constructor.func is Index:
120+
# Index is defined to create CategoricalIndex from categorical data
121+
pytest.skip()
122+
123+
breaks = np.arange(10, dtype='int64')
124+
expected = IntervalIndex.from_breaks(breaks)
125+
126+
cat_breaks = cat_constructor(breaks)
127+
result_kwargs = self.get_kwargs_from_breaks(cat_breaks)
128+
result = constructor(**result_kwargs)
129+
tm.assert_index_equal(result, expected)
130+
114131
def test_generic_errors(self, constructor):
115132
# filler input data to be used when supplying invalid kwargs
116133
filler = self.get_kwargs_from_breaks(range(10))
@@ -238,6 +255,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'):
238255
tuples = lzip(breaks[:-1], breaks[1:])
239256
if isinstance(breaks, (list, tuple)):
240257
return {'data': tuples}
258+
elif is_categorical_dtype(breaks):
259+
return {'data': breaks._constructor(tuples)}
241260
return {'data': com._asarray_tuplesafe(tuples)}
242261

243262
def test_constructor_errors(self):
@@ -286,6 +305,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'):
286305

287306
if isinstance(breaks, list):
288307
return {'data': ivs}
308+
elif is_categorical_dtype(breaks):
309+
return {'data': breaks._constructor(ivs)}
289310
return {'data': np.array(ivs, dtype=object)}
290311

291312
def test_generic_errors(self, constructor):

pandas/tests/indexes/test_category.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,15 @@ def test_is_monotonic(self, data, non_lexsorted_data):
581581
assert c.is_monotonic_increasing
582582
assert not c.is_monotonic_decreasing
583583

584+
@pytest.mark.parametrize('values, expected', [
585+
([1, 2, 3], True),
586+
([1, 3, 1], False),
587+
(list('abc'), True),
588+
(list('aba'), False)])
589+
def test_is_unique(self, values, expected):
590+
ci = CategoricalIndex(values)
591+
assert ci.is_unique is expected
592+
584593
def test_duplicates(self):
585594

586595
idx = CategoricalIndex([0, 0, 0], name='foo')

pandas/tests/io/generate_legacy_storage_files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/env/bin python
1+
#!/usr/bin/env python
22

33
"""
44
self-contained to write legacy storage (pickle/msgpack) files

pandas/tests/series/test_io.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -137,29 +137,45 @@ def test_to_csv_path_is_none(self):
137137
csv_str = s.to_csv(path=None)
138138
assert isinstance(csv_str, str)
139139

140-
def test_to_csv_compression(self, compression):
141-
142-
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
143-
name='X')
140+
@pytest.mark.parametrize('s,encoding', [
141+
(Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
142+
name='X'), None),
143+
# GH 21241, 21118
144+
(Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
145+
(Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
146+
(Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
147+
])
148+
def test_to_csv_compression(self, s, encoding, compression):
144149

145150
with ensure_clean() as filename:
146151

147-
s.to_csv(filename, compression=compression, header=True)
152+
s.to_csv(filename, compression=compression, encoding=encoding,
153+
header=True)
148154

149155
# test the round trip - to_csv -> read_csv
150-
rs = pd.read_csv(filename, compression=compression,
151-
index_col=0, squeeze=True)
152-
assert_series_equal(s, rs)
156+
result = pd.read_csv(filename, compression=compression,
157+
encoding=encoding, index_col=0, squeeze=True)
158+
159+
with open(filename, 'w') as fh:
160+
s.to_csv(fh, compression=compression, encoding=encoding,
161+
header=True)
162+
163+
result_fh = pd.read_csv(filename, compression=compression,
164+
encoding=encoding, index_col=0,
165+
squeeze=True)
166+
assert_series_equal(s, result)
167+
assert_series_equal(s, result_fh)
153168

154169
# explicitly ensure file was compressed
155170
with tm.decompress_file(filename, compression) as fh:
156-
text = fh.read().decode('utf8')
171+
text = fh.read().decode(encoding or 'utf8')
157172
assert s.name in text
158173

159174
with tm.decompress_file(filename, compression) as fh:
160175
assert_series_equal(s, pd.read_csv(fh,
161176
index_col=0,
162-
squeeze=True))
177+
squeeze=True,
178+
encoding=encoding))
163179

164180

165181
class TestSeriesIO(TestData):

pandas/tests/test_common.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,12 +252,13 @@ def test_compression_size_fh(obj, method, compression_only):
252252
with tm.ensure_clean() as filename:
253253
with open(filename, 'w') as fh:
254254
getattr(obj, method)(fh, compression=compression_only)
255-
# GH 17778
256-
assert fh.closed
255+
assert not fh.closed
256+
assert fh.closed
257257
compressed = os.path.getsize(filename)
258258
with tm.ensure_clean() as filename:
259259
with open(filename, 'w') as fh:
260260
getattr(obj, method)(fh, compression=None)
261261
assert not fh.closed
262+
assert fh.closed
262263
uncompressed = os.path.getsize(filename)
263264
assert uncompressed > compressed

pandas/tests/test_downstream.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ def test_pandas_datareader():
103103
'F', 'quandl', '2017-01-01', '2017-02-01')
104104

105105

106-
@pytest.mark.xfail(reaason="downstream install issue")
107106
def test_geopandas():
108107

109108
geopandas = import_module('geopandas') # noqa

0 commit comments

Comments
 (0)