Commit 5395227

Merge branch 'master' into rename_DataFrame.applymap_to_DataFrame.map
2 parents: 7d36014 + 0e39016
30 files changed: +248, -151 lines

asv_bench/benchmarks/categoricals.py

Lines changed: 2 additions & 1 deletion

@@ -42,7 +42,8 @@ def time_regular(self):
         pd.Categorical(self.values, self.categories)
 
     def time_fastpath(self):
-        pd.Categorical(self.codes, self.cat_idx, fastpath=True)
+        dtype = pd.CategoricalDtype(categories=self.cat_idx)
+        pd.Categorical._simple_new(self.codes, dtype)
 
     def time_datetimes(self):
         pd.Categorical(self.datetimes)
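The benchmark now goes through the private ``Categorical._simple_new`` instead of the deprecated ``fastpath`` keyword. For code outside pandas, the public equivalent is ``Categorical.from_codes``; a minimal sketch with illustrative values:

    import pandas as pd

    categories = ["a", "b", "c"]
    codes = [0, 2, 1, -1]  # -1 marks a missing value

    dtype = pd.CategoricalDtype(categories=categories)
    cat = pd.Categorical.from_codes(codes, dtype=dtype)
    print(cat)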

asv_bench/benchmarks/series_methods.py

Lines changed: 3 additions & 0 deletions

@@ -385,6 +385,9 @@ def time_to_numpy_double_copy(self):
     def time_to_numpy_copy(self):
         self.ser.to_numpy(copy=True)
 
+    def time_to_numpy_float_with_nan(self):
+        self.ser.to_numpy(dtype="float64", na_value=np.nan)
+
 
 class Replace:
     param_names = ["num_to_replace"]
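The added benchmark times the conversion that the ``pandas/core/base.py`` change below optimizes. Roughly, with illustrative data standing in for the benchmark's ``self.ser``:

    import numpy as np
    import pandas as pd

    ser = pd.Series(np.random.default_rng(0).standard_normal(100_000))
    ser.iloc[::100] = np.nan  # sprinkle in some missing values

    # The call being timed: convert to a float64 ndarray, representing NA as NaN.
    arr = ser.to_numpy(dtype="float64", na_value=np.nan)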

asv_bench/benchmarks/strings.py

Lines changed: 25 additions & 27 deletions

@@ -25,33 +25,31 @@ def setup(self, dtype):
 
 
 class Construction:
-    params = ["str", "string"]
-    param_names = ["dtype"]
-
-    def setup(self, dtype):
-        self.series_arr = tm.rands_array(nchars=10, size=10**5)
-        self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
-
-        # GH37371. Testing construction of string series/frames from ExtensionArrays
-        self.series_cat_arr = Categorical(self.series_arr)
-
-    def time_series_construction(self, dtype):
-        Series(self.series_arr, dtype=dtype)
-
-    def peakmem_series_construction(self, dtype):
-        Series(self.series_arr, dtype=dtype)
-
-    def time_frame_construction(self, dtype):
-        DataFrame(self.frame_arr, dtype=dtype)
-
-    def peakmem_frame_construction(self, dtype):
-        DataFrame(self.frame_arr, dtype=dtype)
-
-    def time_cat_series_construction(self, dtype):
-        Series(self.series_cat_arr, dtype=dtype)
-
-    def peakmem_cat_series_construction(self, dtype):
-        Series(self.series_cat_arr, dtype=dtype)
+    params = (
+        ["series", "frame", "categorical_series"],
+        ["str", "string[python]", "string[pyarrow]"],
+    )
+    param_names = ["pd_type", "dtype"]
+    pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series}
+    dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object}
+
+    def setup(self, pd_type, dtype):
+        series_arr = tm.rands_array(
+            nchars=10, size=10**5, dtype=self.dtype_mapping[dtype]
+        )
+        if pd_type == "series":
+            self.arr = series_arr
+        elif pd_type == "frame":
+            self.arr = series_arr.reshape((50_000, 2)).copy()
+        elif pd_type == "categorical_series":
+            # GH37371. Testing construction of string series/frames from ExtensionArrays
+            self.arr = Categorical(series_arr)
+
+    def time_construction(self, pd_type, dtype):
+        self.pd_mapping[pd_type](self.arr, dtype=dtype)
+
+    def peakmem_construction(self, pd_type, dtype):
+        self.pd_mapping[pd_type](self.arr, dtype=dtype)
 
 
 class Methods(Dtypes):
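The consolidated ``Construction`` benchmark sweeps one constructor call over several container/dtype combinations. The equivalent direct calls look roughly like this (the sample data is illustrative, and the ``string[pyarrow]`` case assumes ``pyarrow`` is installed):

    import numpy as np
    import pandas as pd

    arr = np.array(["apple", "banana", "cherry", "date"] * 25_000, dtype=object)

    pd.Series(arr, dtype="str")                  # plain object-backed strings
    pd.Series(arr, dtype="string[python]")       # StringDtype, python storage
    pd.Series(arr, dtype="string[pyarrow]")      # StringDtype, pyarrow storage
    pd.DataFrame(arr.reshape(50_000, 2), dtype="string[python]")
    pd.Series(pd.Categorical(arr), dtype="str")  # from an ExtensionArray (GH37371)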

doc/source/development/contributing.rst

Lines changed: 5 additions & 4 deletions

@@ -47,7 +47,7 @@ that is assigned, feel free to kindly ask the current assignee if you can take i
 
 We have several :ref:`contributor community <community>` communication channels, which you are
 welcome to join, and ask questions as you figure things out. Among them are regular meetings for
-new contributors, dev meetings, a dev mailing list, and a slack for the contributor community.
+new contributors, dev meetings, a dev mailing list, and a Slack for the contributor community.
 All pandas contributors are welcome to these spaces, where they can connect with each other. Even
 maintainers who have been with us for a long time felt just like you when they started out, and
 are happy to welcome you and support you as you get to know how we work, and where things are.

@@ -308,8 +308,9 @@ default commit message will open, and you can simply save and quit this file.
 If there are merge conflicts, you need to solve those conflicts. See for
 example at https://help.github.com/articles/resolving-a-merge-conflict-using-the-command-line/
 for an explanation on how to do this.
-Once the conflicts are merged and the files where the conflicts were solved are
-added, you can run ``git commit`` to save those fixes.
+Once the conflicts are resolved, you should do:
+1. ``git add -u`` to stage any files you've updated;
+2. ``git commit`` to finish the merge.
 
 If you have uncommitted changes at the moment you want to update the branch with
 main, you will need to ``stash`` them prior to updating (see the

@@ -324,7 +325,7 @@ request by pushing to the branch on GitHub::
 Autofixing formatting errors
 ----------------------------
 
-We use several styling checks (e.g. ``black``, ``flake8``, ``isort``) which are run after
+We use several styling checks (e.g. ``black``, ``ruff``, ``isort``) which are run after
 you make a pull request.
 
 To automatically fix formatting errors on each commit you make, you can

doc/source/development/contributing_codebase.rst

Lines changed: 18 additions & 60 deletions

@@ -18,40 +18,32 @@ tools will be run to check your code for stylistic errors.
 Generating any warnings will cause the test to fail.
 Thus, good style is a requirement for submitting code to pandas.
 
-There is a tool in pandas to help contributors verify their changes before
-contributing them to the project::
+There are a couple of tools in pandas to help contributors verify their changes
+before contributing to the project
 
-    ./ci/code_checks.sh
-
-The script validates the doctests, formatting in docstrings, and
-imported modules. It is possible to run the checks independently by using the
-parameters ``docstrings``, ``code``, and ``doctests``
-(e.g. ``./ci/code_checks.sh doctests``).
+- ``./ci/code_checks.sh``: a script validates the doctests, formatting in docstrings,
+  and imported modules. It is possible to run the checks independently by using the
+  parameters ``docstrings``, ``code``, and ``doctests``
+  (e.g. ``./ci/code_checks.sh doctests``);
+- ``pre-commit``, which we go into detail on in the next section.
 
 In addition, because a lot of people use our library, it is important that we
 do not make sudden changes to the code that could have the potential to break
 a lot of user code as a result, that is, we need it to be as *backwards compatible*
 as possible to avoid mass breakages.
 
-In addition to ``./ci/code_checks.sh``, some extra checks (including static type
-checking) are run by ``pre-commit`` - see :ref:`here <contributing.pre-commit>`
-for how to run them.
-
 .. _contributing.pre-commit:
 
 Pre-commit
 ----------
 
 Additionally, :ref:`Continuous Integration <contributing.ci>` will run code formatting checks
 like ``black``, ``ruff``,
-``isort``, and ``cpplint`` and more using `pre-commit hooks <https://pre-commit.com/>`_
+``isort``, and ``cpplint`` and more using `pre-commit hooks <https://pre-commit.com/>`_.
 Any warnings from these checks will cause the :ref:`Continuous Integration <contributing.ci>` to fail; therefore,
 it is helpful to run the check yourself before submitting code. This
-can be done by installing ``pre-commit``::
-
-    pip install pre-commit
-
-and then running::
+can be done by installing ``pre-commit`` (which should already have happened if you followed the instructions
+in :ref:`Setting up your development environment <contributing_environment>`) and then running::
 
     pre-commit install
 

@@ -63,17 +55,17 @@ remain up-to-date with our code checks as they change.
 Note that if needed, you can skip these checks with ``git commit --no-verify``.
 
 If you don't want to use ``pre-commit`` as part of your workflow, you can still use it
-to run its checks with::
+to run its checks with one of the following::
 
     pre-commit run --files <files you have modified>
+    pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files
 
 without needing to have done ``pre-commit install`` beforehand.
 
-If you want to run checks on all recently committed files on upstream/main you can use::
-
-    pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files
+Finally, we also have some slow pre-commit checks, which don't run on each commit
+but which do run during continuous integration. You can trigger them manually with::
 
-without needing to have done ``pre-commit install`` beforehand.
+    pre-commit run --hook-stage manual --all-files
 
 .. note::
 

@@ -170,43 +162,9 @@ pandas strongly encourages the use of :pep:`484` style type hints. New developme
 Style guidelines
 ~~~~~~~~~~~~~~~~
 
-Type imports should follow the ``from typing import ...`` convention. Some types do not need to be imported since :pep:`585` some builtin constructs, such as ``list`` and ``tuple``, can directly be used for type annotations. So rather than
-
-.. code-block:: python
-
-    import typing
-
-    primes: typing.List[int] = []
-
-You should write
-
-.. code-block:: python
-
-    primes: list[int] = []
-
-``Optional`` should be avoided in favor of the shorter ``| None``, so instead of
-
-.. code-block:: python
-
-    from typing import Union
-
-    maybe_primes: list[Union[int, None]] = []
-
-or
-
-.. code-block:: python
-
-    from typing import Optional
-
-    maybe_primes: list[Optional[int]] = []
-
-You should write
-
-.. code-block:: python
-
-    from __future__ import annotations  # noqa: F404
-
-    maybe_primes: list[int | None] = []
+Type imports should follow the ``from typing import ...`` convention.
+Your code may be automatically re-written to use some modern constructs (e.g. using the built-in ``list`` instead of ``typing.List``)
+by the :ref:`pre-commit checks <contributing.pre-commit>`.
 
 In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 <https://github.com/python/mypy/issues/1775#issuecomment-310969854>`_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like
 
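For reference, the modern spellings that those automatic rewrites target look like this (a minimal sketch, not taken from the pandas codebase):

    from __future__ import annotations  # lets the | syntax work on older Python versions

    primes: list[int] = []               # builtin generic instead of typing.List
    maybe_primes: list[int | None] = []  # X | None instead of Optional[X] or Union[X, None]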

doc/source/development/contributing_environment.rst

Lines changed: 5 additions & 5 deletions

@@ -21,7 +21,7 @@ locally before pushing your changes. It's recommended to also install the :ref:`
 Step 1: install a C compiler
 ----------------------------
 
-How to do this will depend on your platform. If you choose to use ``Docker``
+How to do this will depend on your platform. If you choose to use ``Docker`` or ``GitPod``
 in the next step, then you can skip this step.
 
 **Windows**

@@ -213,6 +213,10 @@ You can now run::
    python setup.py build_ext -j 4
    python -m pip install -e . --no-build-isolation --no-use-pep517
 
+.. note::
+   You will need to repeat this step each time the C extensions change, for example
+   if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``.
+
 At this point you should be able to import pandas from your locally built version::
 
    $ python

@@ -222,7 +226,3 @@ At this point you should be able to import pandas from your locally built versio
 
 This will create the new environment, and not touch any of your existing environments,
 nor any existing Python installation.
-
-.. note::
-   You will need to repeat this step each time the C extensions change, for example
-   if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``.

doc/source/user_guide/style.ipynb

Lines changed: 1 addition & 1 deletion

@@ -2131,4 +2131,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions

@@ -120,6 +120,7 @@ The following functions gained a new keyword ``dtype_backend`` (:issue:`36712`)
 * :func:`read_sql`
 * :func:`read_sql_query`
 * :func:`read_sql_table`
+* :func:`read_parquet`
 * :func:`read_orc`
 * :func:`read_feather`
 * :func:`read_spss`
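With this addition, ``read_parquet`` accepts the same keyword as the other readers listed; a short sketch (the file path is a placeholder):

    import pandas as pd

    # dtype_backend selects the nullable-dtype implementation used for the result.
    df_numpy = pd.read_parquet("data.parquet", dtype_backend="numpy_nullable")
    df_arrow = pd.read_parquet("data.parquet", dtype_backend="pyarrow")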

doc/source/whatsnew/v2.1.0.rst

Lines changed: 3 additions & 0 deletions

@@ -169,6 +169,7 @@ Deprecations
 - Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`)
 - Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`)
 - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`)
+- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`)
 - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`)
 - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`)
 - Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`)

@@ -198,6 +199,7 @@ Performance improvements
 - Performance improvement in :meth:`MultiIndex.set_levels` and :meth:`MultiIndex.set_codes` when ``verify_integrity=True`` (:issue:`51873`)
 - Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
 - Performance improvement in :class:`Series` reductions (:issue:`52341`)
+- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
 -
 
 .. ---------------------------------------------------------------------------

@@ -298,6 +300,7 @@ Groupby/resample/rolling
   or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument,
   the function operated on the whole index rather than each element of the index. (:issue:`51979`)
 - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64 or :class:`PeriodDtype` values (:issue:`52128`)
+- Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`)
 -
 
 Reshaping
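As an example of one of the deprecations above, logical operators between a pandas object and a bare sequence should now go through an explicit wrapper; a hedged sketch with illustrative data:

    import numpy as np
    import pandas as pd

    ser = pd.Series([True, False, True])

    # Deprecated in 2.1: ser | [False, True, True]
    # Wrap the sequence first instead:
    result = ser | pd.Series([False, True, True])
    # or: ser | np.array([False, True, True])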

pandas/core/arrays/categorical.py

Lines changed: 27 additions & 4 deletions

@@ -355,15 +355,38 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
 
     _dtype: CategoricalDtype
 
+    @classmethod
+    # error: Argument 2 of "_simple_new" is incompatible with supertype
+    # "NDArrayBacked"; supertype defines the argument type as
+    # "Union[dtype[Any], ExtensionDtype]"
+    def _simple_new(  # type: ignore[override]
+        cls, codes: np.ndarray, dtype: CategoricalDtype
+    ) -> Self:
+        # NB: This is not _quite_ as simple as the "usual" _simple_new
+        codes = coerce_indexer_dtype(codes, dtype.categories)
+        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
+        return super()._simple_new(codes, dtype)
+
     def __init__(
         self,
         values,
         categories=None,
         ordered=None,
         dtype: Dtype | None = None,
-        fastpath: bool = False,
+        fastpath: bool | lib.NoDefault = lib.no_default,
         copy: bool = True,
     ) -> None:
+        if fastpath is not lib.no_default:
+            # GH#20110
+            warnings.warn(
+                "The 'fastpath' keyword in Categorical is deprecated and will "
+                "be removed in a future version. Use Categorical.from_codes instead",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+        else:
+            fastpath = False
+
         dtype = CategoricalDtype._from_values_or_dtype(
             values, categories, ordered, dtype
         )

@@ -626,7 +649,7 @@ def _from_inferred_categories(
         dtype = CategoricalDtype(cats, ordered=False)
         codes = inferred_codes
 
-        return cls(codes, dtype=dtype, fastpath=True)
+        return cls._simple_new(codes, dtype=dtype)
 
     @classmethod
     def from_codes(

@@ -693,7 +716,7 @@ def from_codes(
         if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and len(categories)-1")
 
-        return cls(codes, dtype=dtype, fastpath=True)
+        return cls._simple_new(codes, dtype=dtype)
 
     # ------------------------------------------------------------------
     # Categories/Codes/Ordered

@@ -805,7 +828,7 @@ def _set_dtype(self, dtype: CategoricalDtype) -> Self:
         a (valid) instance of `CategoricalDtype`.
         """
         codes = recode_for_categories(self.codes, self.categories, dtype.categories)
-        return type(self)(codes, dtype=dtype, fastpath=True)
+        return type(self)._simple_new(codes, dtype=dtype)
 
     def set_ordered(self, value: bool) -> Self:
         """

pandas/core/base.py

Lines changed: 9 additions & 8 deletions

@@ -573,25 +573,26 @@ def to_numpy(
                 f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
             )
 
-        if na_value is not lib.no_default:
-            values = self._values
+        fillna = (
+            na_value is not lib.no_default
+            # no need to fillna with np.nan if we already have a float dtype
+            and not (na_value is np.nan and np.issubdtype(self.dtype, np.floating))
+        )
+
+        values = self._values
+        if fillna:
             if not can_hold_element(values, na_value):
                 # if we can't hold the na_value asarray either makes a copy or we
                 # error before modifying values. The asarray later on thus won't make
                 # another copy
                 values = np.asarray(values, dtype=dtype)
             else:
                 values = values.copy()
-
             values[np.asanyarray(self.isna())] = na_value
-        else:
-            values = self._values
 
         result = np.asarray(values, dtype=dtype)
 
-        if (copy and na_value is lib.no_default) or (
-            not copy and using_copy_on_write()
-        ):
+        if (copy and not fillna) or (not copy and using_copy_on_write()):
             if np.shares_memory(self._values[:2], result[:2]):
                 # Take slices to improve performance of check
                 if using_copy_on_write() and not copy:
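The user-visible effect is that ``to_numpy`` can skip the mask-and-fill pass when the data is already float and ``na_value`` is ``np.nan``, while a non-float source still takes the fill path; a hedged sketch with illustrative data:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1.5, np.nan, 3.0])

    # Fast path from this change: dtype is already float and na_value is np.nan,
    # so no fill pass is needed before the ndarray conversion.
    print(ser.to_numpy(dtype="float64", na_value=np.nan))    # [1.5 nan 3. ]

    # A nullable-integer source still takes the fill path: NA is replaced explicitly.
    ser2 = pd.Series([1, None, 3], dtype="Int64")
    print(ser2.to_numpy(dtype="float64", na_value=np.nan))   # [ 1. nan  3.]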
