Skip to content

Commit 6e9882d

Browse files
committed
fetch from master and merge
2 parents 101c0c2 + 40faba2 commit 6e9882d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+1023
-263
lines changed

asv_bench/benchmarks/inference.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -115,19 +115,27 @@ def time_maybe_convert_objects(self):
115115
class ToDatetimeFromIntsFloats:
116116
def setup(self):
117117
self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
118+
self.ts_sec_uint = Series(range(1521080307, 1521685107), dtype="uint64")
118119
self.ts_sec_float = self.ts_sec.astype("float64")
119120

120121
self.ts_nanosec = 1_000_000 * self.ts_sec
122+
self.ts_nanosec_uint = 1_000_000 * self.ts_sec_uint
121123
self.ts_nanosec_float = self.ts_nanosec.astype("float64")
122124

123-
# speed of int64 and float64 paths should be comparable
125+
# speed of int64, uint64 and float64 paths should be comparable
124126

125127
def time_nanosec_int64(self):
126128
to_datetime(self.ts_nanosec, unit="ns")
127129

130+
def time_nanosec_uint64(self):
131+
to_datetime(self.ts_nanosec_uint, unit="ns")
132+
128133
def time_nanosec_float64(self):
129134
to_datetime(self.ts_nanosec_float, unit="ns")
130135

136+
def time_sec_uint64(self):
137+
to_datetime(self.ts_sec_uint, unit="s")
138+
131139
def time_sec_int64(self):
132140
to_datetime(self.ts_sec, unit="s")
133141

asv_bench/benchmarks/io/csv.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def time_read_csv(self, bad_date_value):
206206
class ReadCSVSkipRows(BaseIO):
207207

208208
fname = "__test__.csv"
209-
params = ([None, 10000], ["c", "python"])
209+
params = ([None, 10000], ["c", "python", "pyarrow"])
210210
param_names = ["skiprows", "engine"]
211211

212212
def setup(self, skiprows, engine):
@@ -320,7 +320,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
320320

321321

322322
class ReadCSVEngine(StringIORewind):
323-
params = ["c", "python"]
323+
params = ["c", "python", "pyarrow"]
324324
param_names = ["engine"]
325325

326326
def setup(self, engine):

ci/deps/actions-39.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ dependencies:
3131
- python-dateutil
3232
- pytz
3333
- s3fs>=0.4.2
34-
- aiobotocore<=1.3.3
3534
- scipy
3635
- sqlalchemy
3736
- xlrd

ci/deps/azure-windows-38.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ dependencies:
3030
- python-dateutil
3131
- pytz
3232
- s3fs>=0.4.0
33-
- aiobotocore<=1.3.3
3433
- scipy
3534
- xlrd
3635
- xlsxwriter

ci/deps/azure-windows-39.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ dependencies:
3232
- python-dateutil
3333
- pytz
3434
- s3fs>=0.4.2
35-
- aiobotocore<=1.3.3
3635
- scipy
3736
- sqlalchemy
3837
- xlrd

doc/source/getting_started/install.rst

+3
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ Installing from PyPI
132132
pandas can be installed via pip from
133133
`PyPI <https://pypi.org/project/pandas>`__.
134134

135+
.. note::
136+
You must have ``pip>=19.3`` to install from PyPI.
137+
135138
::
136139

137140
pip install pandas

doc/source/user_guide/io.rst

+46-8
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,15 @@ dtype : Type name or dict of column -> type, default ``None``
160160
(unsupported with ``engine='python'``). Use ``str`` or ``object`` together
161161
with suitable ``na_values`` settings to preserve and
162162
not interpret dtype.
163-
engine : {``'c'``, ``'python'``}
164-
Parser engine to use. The C engine is faster while the Python engine is
165-
currently more feature-complete.
163+
engine : {``'c'``, ``'python'``, ``'pyarrow'``}
164+
Parser engine to use. The C and pyarrow engines are faster, while the python engine
165+
is currently more feature-complete. Multithreading is currently only supported by
166+
the pyarrow engine.
167+
168+
.. versionadded:: 1.4.0
169+
170+
The "pyarrow" engine was added as an *experimental* engine, and some features
171+
are unsupported, or may not work correctly, with this engine.
166172
converters : dict, default ``None``
167173
Dict of functions for converting values in certain columns. Keys can either be
168174
integers or column labels.
@@ -1622,11 +1628,17 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
16221628
Specifying the parser engine
16231629
''''''''''''''''''''''''''''
16241630

1625-
Under the hood pandas uses a fast and efficient parser implemented in C as well
1626-
as a Python implementation which is currently more feature-complete. Where
1627-
possible pandas uses the C parser (specified as ``engine='c'``), but may fall
1628-
back to Python if C-unsupported options are specified. Currently, C-unsupported
1629-
options include:
1631+
Pandas currently supports three engines: the C engine, the python engine, and an experimental
1632+
pyarrow engine (requires the ``pyarrow`` package). In general, the pyarrow engine is fastest
1633+
on larger workloads and is equivalent in speed to the C engine on most other workloads.
1634+
The python engine tends to be slower than the pyarrow and C engines on most workloads. However,
1635+
the pyarrow engine is much less robust than the C engine, which in turn lacks a few features compared to the
1636+
Python engine.
1637+
1638+
Where possible, pandas uses the C parser (specified as ``engine='c'``), but it may fall
1639+
back to Python if C-unsupported options are specified.
1640+
1641+
Currently, options unsupported by the C and pyarrow engines include:
16301642

16311643
* ``sep`` other than a single character (e.g. regex separators)
16321644
* ``skipfooter``
@@ -1635,6 +1647,32 @@ options include:
16351647
Specifying any of the above options will produce a ``ParserWarning`` unless the
16361648
python engine is selected explicitly using ``engine='python'``.
16371649

1650+
Options that are unsupported by the pyarrow engine and are not covered by the list above include:
1651+
1652+
* ``float_precision``
1653+
* ``chunksize``
1654+
* ``comment``
1655+
* ``nrows``
1656+
* ``thousands``
1657+
* ``memory_map``
1658+
* ``dialect``
1659+
* ``warn_bad_lines``
1660+
* ``error_bad_lines``
1661+
* ``on_bad_lines``
1662+
* ``delim_whitespace``
1663+
* ``quoting``
1664+
* ``lineterminator``
1665+
* ``converters``
1666+
* ``decimal``
1667+
* ``iterator``
1668+
* ``dayfirst``
1669+
* ``infer_datetime_format``
1670+
* ``verbose``
1671+
* ``skipinitialspace``
1672+
* ``low_memory``
1673+
1674+
Specifying these options with ``engine='pyarrow'`` will raise a ``ValueError``.
1675+
16381676
.. _io.remote:
16391677

16401678
Reading/writing remote files

doc/source/whatsnew/v1.3.3.rst

+11-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ Fixed regressions
1717
- Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`)
1818
- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`)
1919
- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`)
20+
- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
21+
22+
.. ---------------------------------------------------------------------------
23+
24+
.. _whatsnew_133.performance:
25+
26+
Performance improvements
27+
~~~~~~~~~~~~~~~~~~~~~~~~
28+
- Performance improvement for :meth:`DataFrame.__setitem__` when the key or value is not a :class:`DataFrame`, or key is not list-like (:issue:`43274`)
29+
-
2030
-
2131

2232
.. ---------------------------------------------------------------------------
@@ -25,7 +35,7 @@ Fixed regressions
2535

2636
Bug fixes
2737
~~~~~~~~~
28-
-
38+
- Bug in :meth:`.DataFrameGroupBy.agg` and :meth:`.DataFrameGroupBy.transform` with ``engine="numba"`` where ``index`` data was not being correctly passed into ``func`` (:issue:`43133`)
2939
-
3040

3141
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.4.0.rst

+8-4
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,13 @@ Styler
7878

7979
There are also bug fixes and deprecations listed below.
8080

81-
.. _whatsnew_140.enhancements.enhancement2:
81+
.. _whatsnew_140.enhancements.pyarrow_csv_engine:
8282

83-
enhancement2
84-
^^^^^^^^^^^^
83+
Multithreaded CSV reading with a new CSV Engine based on pyarrow
84+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85+
86+
:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster CSV parsing on multicore machines
87+
with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
8588

8689
.. _whatsnew_140.enhancements.other:
8790

@@ -242,7 +245,7 @@ Performance improvements
242245
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
243246
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
244247
- Performance improvement in :func:`read_stata` (:issue:`43059`)
245-
-
248+
- Performance improvement in :func:`to_datetime` with ``uint`` dtypes (:issue:`42606`)
246249

247250
.. ---------------------------------------------------------------------------
248251
@@ -322,6 +325,7 @@ MultiIndex
322325
- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
323326
- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
324327
- Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`)
328+
- Bug in :meth:`MultiIndex.putmask` where the other value was also a :class:`MultiIndex` (:issue:`43212`)
325329
-
326330

327331
I/O

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ dependencies:
105105

106106
- pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf
107107
- s3fs>=0.4.0 # file IO when using 's3://...' path
108-
- aiobotocore<=1.3.3 # Remove when s3fs is at 2021.08.0
108+
- aiobotocore
109109
- fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
110110
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
111111
- sqlalchemy # pandas.read_sql, DataFrame.to_sql

pandas/_libs/tslib.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def array_with_unit_to_datetime(
248248
# if we have nulls that are not type-compat
249249
# then need to iterate
250250

251-
if values.dtype.kind == "i" or values.dtype.kind == "f":
251+
if values.dtype.kind in ["i", "f", "u"]:
252252
iresult = values.astype("i8", copy=False)
253253
# fill missing values by comparing to NPY_NAT
254254
mask = iresult == NPY_NAT
@@ -263,7 +263,7 @@ def array_with_unit_to_datetime(
263263
):
264264
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
265265

266-
if values.dtype.kind == "i":
266+
if values.dtype.kind in ["i", "u"]:
267267
result = (iresult * m).astype("M8[ns]")
268268

269269
elif values.dtype.kind == "f":

pandas/core/frame.py

+27-21
Original file line numberDiff line numberDiff line change
@@ -1653,6 +1653,8 @@ def to_numpy(
16531653
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
16541654
"""
16551655
self._consolidate_inplace()
1656+
if dtype is not None:
1657+
dtype = np.dtype(dtype)
16561658
result = self._mgr.as_array(
16571659
transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
16581660
)
@@ -3620,9 +3622,11 @@ def __setitem__(self, key, value):
36203622
self._setitem_array(key, value)
36213623
elif isinstance(value, DataFrame):
36223624
self._set_item_frame_value(key, value)
3623-
elif is_list_like(value) and 1 < len(
3624-
self.columns.get_indexer_for([key])
3625-
) == len(value):
3625+
elif (
3626+
is_list_like(value)
3627+
and not self.columns.is_unique
3628+
and 1 < len(self.columns.get_indexer_for([key])) == len(value)
3629+
):
36263630
# Column to set is duplicated
36273631
self._setitem_array([key], value)
36283632
else:
@@ -9824,26 +9828,28 @@ def _reduce(
98249828
assert filter_type is None or filter_type == "bool", filter_type
98259829
out_dtype = "bool" if filter_type == "bool" else None
98269830

9827-
own_dtypes = [arr.dtype for arr in self._iter_column_arrays()]
9831+
if numeric_only is None and name in ["mean", "median"]:
9832+
own_dtypes = [arr.dtype for arr in self._mgr.arrays]
98289833

9829-
dtype_is_dt = np.array(
9830-
[is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
9831-
dtype=bool,
9832-
)
9833-
if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any():
9834-
warnings.warn(
9835-
"DataFrame.mean and DataFrame.median with numeric_only=None "
9836-
"will include datetime64 and datetime64tz columns in a "
9837-
"future version.",
9838-
FutureWarning,
9839-
stacklevel=5,
9834+
dtype_is_dt = np.array(
9835+
[is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
9836+
dtype=bool,
98409837
)
9841-
# Non-copy equivalent to
9842-
# cols = self.columns[~dtype_is_dt]
9843-
# self = self[cols]
9844-
predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
9845-
mgr = self._mgr._get_data_subset(predicate)
9846-
self = type(self)(mgr)
9838+
if dtype_is_dt.any():
9839+
warnings.warn(
9840+
"DataFrame.mean and DataFrame.median with numeric_only=None "
9841+
"will include datetime64 and datetime64tz columns in a "
9842+
"future version.",
9843+
FutureWarning,
9844+
stacklevel=5,
9845+
)
9846+
# Non-copy equivalent to
9847+
# dt64_cols = self.dtypes.apply(is_datetime64_any_dtype)
9848+
# cols = self.columns[~dt64_cols]
9849+
# self = self[cols]
9850+
predicate = lambda x: not is_datetime64_any_dtype(x.dtype)
9851+
mgr = self._mgr._get_data_subset(predicate)
9852+
self = type(self)(mgr)
98479853

98489854
# TODO: Make other agg func handle axis=None properly GH#21597
98499855
axis = self._get_axis_number(axis)

0 commit comments

Comments
 (0)