Skip to content

Commit 690fac2

Browse files
authored
Merge branch 'master' into group_rank_perf
2 parents dca8d48 + 7dc6f70 commit 690fac2

File tree

14 files changed

+139
-44
lines changed

14 files changed

+139
-44
lines changed

asv_bench/benchmarks/frame_methods.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,3 +512,21 @@ def time_nlargest(self, keep):
512512

513513
def time_nsmallest(self, keep):
514514
self.df.nsmallest(100, 'A', keep=keep)
515+
516+
517+
class Describe(object):
518+
519+
goal_time = 0.2
520+
521+
def setup(self):
522+
self.df = DataFrame({
523+
'a': np.random.randint(0, 100, int(1e6)),
524+
'b': np.random.randint(0, 100, int(1e6)),
525+
'c': np.random.randint(0, 100, int(1e6))
526+
})
527+
528+
def time_series_describe(self):
529+
self.df['a'].describe()
530+
531+
def time_dataframe_describe(self):
532+
self.df.describe()

doc/source/ecosystem.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput
3838
Use pandas DataFrames in your `scikit-learn <http://scikit-learn.org/>`__
3939
ML pipeline.
4040

41+
`Featuretools <https://github.com/featuretools/featuretools/>`__
42+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4143

44+
Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community.
4245

4346
.. _ecosystem.visualization:
4447

doc/source/whatsnew/v0.23.1.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Performance Improvements
3232
~~~~~~~~~~~~~~~~~~~~~~~~
3333

3434
- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
35+
- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`)
3536
-
3637
-
3738

@@ -85,13 +86,16 @@ Indexing
8586
- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
8687
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
8788
- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
89+
- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
90+
- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
8891
-
8992

9093
I/O
9194
^^^
9295

9396
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9497
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
98+
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
9599
-
96100

97101
Plotting

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8519,7 +8519,7 @@ def describe_numeric_1d(series):
85198519
stat_index = (['count', 'mean', 'std', 'min'] +
85208520
formatted_percentiles + ['max'])
85218521
d = ([series.count(), series.mean(), series.std(), series.min()] +
8522-
[series.quantile(x) for x in percentiles] + [series.max()])
8522+
series.quantile(percentiles).tolist() + [series.max()])
85238523
return pd.Series(d, index=stat_index, name=series.name)
85248524

85258525
def describe_categorical_1d(data):

pandas/core/indexes/category.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def _engine(self):
378378
# introspection
379379
@cache_readonly
380380
def is_unique(self):
381-
return not self.duplicated().any()
381+
return self._engine.is_unique
382382

383383
@property
384384
def is_monotonic_increasing(self):

pandas/core/indexes/interval.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values):
112112
-------
113113
array
114114
"""
115+
if is_categorical_dtype(values):
116+
# GH 21243/21253
117+
values = np.array(values)
118+
115119
if isinstance(values, (list, tuple)) and len(values) == 0:
116120
# GH 19016
117121
# empty lists/tuples get object dtype by default, but this is not

pandas/io/formats/csvs.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from pandas.core.dtypes.missing import notna
12+
from pandas.core.dtypes.inference import is_file_like
1213
from pandas.core.index import Index, MultiIndex
1314
from pandas import compat
1415
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
127128
else:
128129
encoding = self.encoding
129130

130-
if hasattr(self.path_or_buf, 'write'):
131-
f = self.path_or_buf
132-
close = False
131+
# PR 21300 uses string buffer to receive csv writing and dump into
132+
# file-like output with compression as option. GH 21241, 21118
133+
f = StringIO()
134+
if not is_file_like(self.path_or_buf):
135+
# path_or_buf is path
136+
path_or_buf = self.path_or_buf
137+
elif hasattr(self.path_or_buf, 'name'):
138+
# path_or_buf is file handle
139+
path_or_buf = self.path_or_buf.name
133140
else:
134-
f, handles = _get_handle(self.path_or_buf, self.mode,
135-
encoding=encoding,
136-
compression=None)
137-
close = True if self.compression is None else False
141+
# path_or_buf is file-like IO objects.
142+
f = self.path_or_buf
143+
path_or_buf = None
138144

139145
try:
140146
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
151157
self._save()
152158

153159
finally:
154-
# GH 17778 handles compression for byte strings.
155-
if not close and self.compression:
156-
f.close()
157-
with open(f.name, 'r') as f:
158-
data = f.read()
159-
f, handles = _get_handle(f.name, self.mode,
160+
# GH 17778 handles zip compression for byte strings separately.
161+
buf = f.getvalue()
162+
if path_or_buf:
163+
f, handles = _get_handle(path_or_buf, self.mode,
160164
encoding=encoding,
161165
compression=self.compression)
162-
f.write(data)
163-
close = True
164-
if close:
166+
f.write(buf)
165167
f.close()
168+
for _fh in handles:
169+
_fh.close()
166170

167171
def _save_header(self):
168172

pandas/tests/frame/test_to_csv.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression):
923-
924-
df = DataFrame([[0.123456, 0.234567, 0.567567],
925-
[12.32112, 123123.2, 321321.2]],
926-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
922+
@pytest.mark.parametrize('df,encoding', [
923+
(DataFrame([[0.123456, 0.234567, 0.567567],
924+
[12.32112, 123123.2, 321321.2]],
925+
index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
926+
# GH 21241, 21118
927+
(DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
928+
(DataFrame(5 * [[123, u"你好", u"世界"]],
929+
columns=['X', 'Y', 'Z']), 'gb2312'),
930+
(DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
931+
columns=['X', 'Y', 'Z']), 'cp737')
932+
])
933+
def test_to_csv_compression(self, df, encoding, compression):
927934

928935
with ensure_clean() as filename:
929936

930-
df.to_csv(filename, compression=compression)
937+
df.to_csv(filename, compression=compression, encoding=encoding)
931938

932939
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression,
934-
index_col=0)
935-
assert_frame_equal(df, rs)
940+
result = read_csv(filename, compression=compression,
941+
index_col=0, encoding=encoding)
942+
943+
with open(filename, 'w') as fh:
944+
df.to_csv(fh, compression=compression, encoding=encoding)
945+
946+
result_fh = read_csv(filename, compression=compression,
947+
index_col=0, encoding=encoding)
948+
assert_frame_equal(df, result)
949+
assert_frame_equal(df, result_fh)
936950

937951
# explicitly make sure file is compressed
938952
with tm.decompress_file(filename, compression) as fh:
939-
text = fh.read().decode('utf8')
953+
text = fh.read().decode(encoding or 'utf8')
940954
for col in df.columns:
941955
assert col in text
942956

943957
with tm.decompress_file(filename, compression) as fh:
944-
assert_frame_equal(df, read_csv(fh, index_col=0))
958+
assert_frame_equal(df, read_csv(fh,
959+
index_col=0,
960+
encoding=encoding))
945961

946962
def test_to_csv_date_format(self):
947963
with ensure_clean('__tmp_to_csv_date_format__') as path:

pandas/tests/indexes/interval/test_construction.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66

77
from pandas import (
88
Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical,
9-
date_range, timedelta_range, period_range, notna)
9+
CategoricalIndex, date_range, timedelta_range, period_range, notna)
1010
from pandas.compat import lzip
11+
from pandas.core.dtypes.common import is_categorical_dtype
1112
from pandas.core.dtypes.dtypes import IntervalDtype
1213
import pandas.core.common as com
1314
import pandas.util.testing as tm
@@ -111,6 +112,22 @@ def test_constructor_string(self, constructor, breaks):
111112
with tm.assert_raises_regex(TypeError, msg):
112113
constructor(**self.get_kwargs_from_breaks(breaks))
113114

115+
@pytest.mark.parametrize('cat_constructor', [
116+
Categorical, CategoricalIndex])
117+
def test_constructor_categorical_valid(self, constructor, cat_constructor):
118+
# GH 21243/21253
119+
if isinstance(constructor, partial) and constructor.func is Index:
120+
# Index is defined to create CategoricalIndex from categorical data
121+
pytest.skip()
122+
123+
breaks = np.arange(10, dtype='int64')
124+
expected = IntervalIndex.from_breaks(breaks)
125+
126+
cat_breaks = cat_constructor(breaks)
127+
result_kwargs = self.get_kwargs_from_breaks(cat_breaks)
128+
result = constructor(**result_kwargs)
129+
tm.assert_index_equal(result, expected)
130+
114131
def test_generic_errors(self, constructor):
115132
# filler input data to be used when supplying invalid kwargs
116133
filler = self.get_kwargs_from_breaks(range(10))
@@ -238,6 +255,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'):
238255
tuples = lzip(breaks[:-1], breaks[1:])
239256
if isinstance(breaks, (list, tuple)):
240257
return {'data': tuples}
258+
elif is_categorical_dtype(breaks):
259+
return {'data': breaks._constructor(tuples)}
241260
return {'data': com._asarray_tuplesafe(tuples)}
242261

243262
def test_constructor_errors(self):
@@ -286,6 +305,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'):
286305

287306
if isinstance(breaks, list):
288307
return {'data': ivs}
308+
elif is_categorical_dtype(breaks):
309+
return {'data': breaks._constructor(ivs)}
289310
return {'data': np.array(ivs, dtype=object)}
290311

291312
def test_generic_errors(self, constructor):

pandas/tests/indexes/test_category.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,15 @@ def test_is_monotonic(self, data, non_lexsorted_data):
581581
assert c.is_monotonic_increasing
582582
assert not c.is_monotonic_decreasing
583583

584+
@pytest.mark.parametrize('values, expected', [
585+
([1, 2, 3], True),
586+
([1, 3, 1], False),
587+
(list('abc'), True),
588+
(list('aba'), False)])
589+
def test_is_unique(self, values, expected):
590+
ci = CategoricalIndex(values)
591+
assert ci.is_unique is expected
592+
584593
def test_duplicates(self):
585594

586595
idx = CategoricalIndex([0, 0, 0], name='foo')

pandas/tests/io/generate_legacy_storage_files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/env/bin python
1+
#!/usr/bin/env python
22

33
"""
44
self-contained to write legacy storage (pickle/msgpack) files

pandas/tests/series/test_io.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -137,29 +137,45 @@ def test_to_csv_path_is_none(self):
137137
csv_str = s.to_csv(path=None)
138138
assert isinstance(csv_str, str)
139139

140-
def test_to_csv_compression(self, compression):
141-
142-
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
143-
name='X')
140+
@pytest.mark.parametrize('s,encoding', [
141+
(Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
142+
name='X'), None),
143+
# GH 21241, 21118
144+
(Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
145+
(Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
146+
(Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
147+
])
148+
def test_to_csv_compression(self, s, encoding, compression):
144149

145150
with ensure_clean() as filename:
146151

147-
s.to_csv(filename, compression=compression, header=True)
152+
s.to_csv(filename, compression=compression, encoding=encoding,
153+
header=True)
148154

149155
# test the round trip - to_csv -> read_csv
150-
rs = pd.read_csv(filename, compression=compression,
151-
index_col=0, squeeze=True)
152-
assert_series_equal(s, rs)
156+
result = pd.read_csv(filename, compression=compression,
157+
encoding=encoding, index_col=0, squeeze=True)
158+
159+
with open(filename, 'w') as fh:
160+
s.to_csv(fh, compression=compression, encoding=encoding,
161+
header=True)
162+
163+
result_fh = pd.read_csv(filename, compression=compression,
164+
encoding=encoding, index_col=0,
165+
squeeze=True)
166+
assert_series_equal(s, result)
167+
assert_series_equal(s, result_fh)
153168

154169
# explicitly ensure file was compressed
155170
with tm.decompress_file(filename, compression) as fh:
156-
text = fh.read().decode('utf8')
171+
text = fh.read().decode(encoding or 'utf8')
157172
assert s.name in text
158173

159174
with tm.decompress_file(filename, compression) as fh:
160175
assert_series_equal(s, pd.read_csv(fh,
161176
index_col=0,
162-
squeeze=True))
177+
squeeze=True,
178+
encoding=encoding))
163179

164180

165181
class TestSeriesIO(TestData):

pandas/tests/test_common.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,12 +252,13 @@ def test_compression_size_fh(obj, method, compression_only):
252252
with tm.ensure_clean() as filename:
253253
with open(filename, 'w') as fh:
254254
getattr(obj, method)(fh, compression=compression_only)
255-
# GH 17778
256-
assert fh.closed
255+
assert not fh.closed
256+
assert fh.closed
257257
compressed = os.path.getsize(filename)
258258
with tm.ensure_clean() as filename:
259259
with open(filename, 'w') as fh:
260260
getattr(obj, method)(fh, compression=None)
261261
assert not fh.closed
262+
assert fh.closed
262263
uncompressed = os.path.getsize(filename)
263264
assert uncompressed > compressed

pandas/tests/test_downstream.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ def test_pandas_datareader():
103103
'F', 'quandl', '2017-01-01', '2017-02-01')
104104

105105

106-
@pytest.mark.xfail(reaason="downstream install issue")
107106
def test_geopandas():
108107

109108
geopandas = import_module('geopandas') # noqa

0 commit comments

Comments
 (0)