Skip to content

Commit 84c01da

Browse files
committed
Merge remote-tracking branch 'upstream/main' into sequence_to_td64ns
2 parents d93d5b2 + 4651ddb commit 84c01da

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+526
-356
lines changed

.github/workflows/unit-tests.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ jobs:
8686
TEST_ARGS: ${{ matrix.test_args || '' }}
8787
PYTEST_WORKERS: 'auto'
8888
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
89-
NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
9089
# Clipboard tests
9190
QT_QPA_PLATFORM: offscreen
9291
REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}

.github/workflows/wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ jobs:
156156
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
157157

158158
- name: Build wheels
159-
uses: pypa/cibuildwheel@v2.21.0
159+
uses: pypa/cibuildwheel@v2.21.3
160160
with:
161161
package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
162162
env:

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ minimum_pre_commit_version: 2.15.0
22
exclude: ^LICENSES/|\.(html|csv|svg)$
33
# reserve "manual" for relatively slow hooks which we still want to run in CI
44
default_stages: [
5-
commit,
6-
merge-commit,
7-
push,
5+
pre-commit,
6+
pre-merge-commit,
7+
pre-push,
88
prepare-commit-msg,
99
commit-msg,
1010
post-checkout,

ci/code_checks.sh

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8585
-i "pandas.Timestamp.resolution PR02" \
8686
-i "pandas.Timestamp.tzinfo GL08" \
8787
-i "pandas.Timestamp.year GL08" \
88-
-i "pandas.api.types.is_integer PR01,SA01" \
89-
-i "pandas.api.types.is_iterator PR07,SA01" \
9088
-i "pandas.api.types.is_re_compilable PR07,SA01" \
9189
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
9290
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
@@ -123,11 +121,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
123121
-i "pandas.core.resample.Resampler.quantile PR01,PR07" \
124122
-i "pandas.core.resample.Resampler.sem SA01" \
125123
-i "pandas.core.resample.Resampler.std SA01" \
126-
-i "pandas.core.resample.Resampler.sum SA01" \
127124
-i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
128125
-i "pandas.core.resample.Resampler.var SA01" \
129126
-i "pandas.errors.AttributeConflictWarning SA01" \
130-
-i "pandas.errors.CSSWarning SA01" \
131127
-i "pandas.errors.ChainedAssignmentError SA01" \
132128
-i "pandas.errors.DataError SA01" \
133129
-i "pandas.errors.DuplicateLabelError SA01" \
@@ -136,22 +132,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
136132
-i "pandas.errors.NullFrequencyError SA01" \
137133
-i "pandas.errors.NumExprClobberingError SA01" \
138134
-i "pandas.errors.NumbaUtilError SA01" \
139-
-i "pandas.errors.OptionError SA01" \
140135
-i "pandas.errors.OutOfBoundsTimedelta SA01" \
141136
-i "pandas.errors.PerformanceWarning SA01" \
142137
-i "pandas.errors.PossibleDataLossError SA01" \
143-
-i "pandas.errors.PossiblePrecisionLoss SA01" \
144138
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
145139
-i "pandas.errors.UnsortedIndexError SA01" \
146-
-i "pandas.errors.UnsupportedFunctionCall SA01" \
147140
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
148141
-i "pandas.infer_freq SA01" \
149142
-i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
150143
-i "pandas.io.stata.StataWriter.write_file SA01" \
151-
-i "pandas.json_normalize RT03,SA01" \
152144
-i "pandas.plotting.andrews_curves RT03,SA01" \
153145
-i "pandas.plotting.scatter_matrix PR07,SA01" \
154-
-i "pandas.set_eng_float_format RT03,SA01" \
155146
-i "pandas.tseries.offsets.BDay PR02,SA01" \
156147
-i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
157148
-i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
@@ -297,7 +288,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
297288
-i "pandas.tseries.offsets.Second.is_on_offset GL08" \
298289
-i "pandas.tseries.offsets.Second.n GL08" \
299290
-i "pandas.tseries.offsets.Second.normalize GL08" \
300-
-i "pandas.tseries.offsets.SemiMonthBegin SA01" \
301291
-i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \
302292
-i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \
303293
-i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \

ci/deps/actions-311-pyarrownightly.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dependencies:
1818

1919
# required dependencies
2020
- python-dateutil
21-
- numpy<2
21+
- numpy
2222
- pip
2323

2424
- pip:

doc/source/development/contributing_codebase.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,12 @@ So, before actually writing any code, you should write your tests. Often the te
298298
taken from the original GitHub issue. However, it is always worth considering additional
299299
use cases and writing corresponding tests.
300300

301+
We use `code coverage <https://en.wikipedia.org/wiki/Code_coverage>`_ to help understand
302+
the amount of code which is covered by a test. We recommend striving to ensure code
303+
you add or change within pandas is covered by a test. Please see our
304+
`code coverage dashboard through Codecov <https://app.codecov.io/github/pandas-dev/pandas>`_
305+
for more information.
306+
301307
Adding tests is one of the most common requests after code is pushed to pandas. Therefore,
302308
it is worth getting in the habit of writing tests ahead of time so this is never an issue.
303309

doc/source/user_guide/cookbook.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ These are some neat pandas ``idioms``
3535
)
3636
df
3737
38-
if-then...
38+
If-then...
3939
**********
4040

4141
An if-then on one column
@@ -176,7 +176,7 @@ One could hard code:
176176
Selection
177177
---------
178178

179-
Dataframes
179+
DataFrames
180180
**********
181181

182182
The :ref:`indexing <indexing>` docs.
@@ -1489,7 +1489,7 @@ of the data values:
14891489
)
14901490
df
14911491
1492-
Constant series
1492+
Constant Series
14931493
---------------
14941494

14951495
To assess if a series has a constant value, we can check if ``series.nunique() <= 1``.

doc/source/user_guide/gotchas.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ Below is how to check if any of the values are ``True``:
121121
if pd.Series([False, True, False]).any():
122122
print("I am any")
123123
124-
Bitwise boolean
124+
Bitwise Boolean
125125
~~~~~~~~~~~~~~~
126126

127127
Bitwise boolean operators like ``==`` and ``!=`` return a boolean :class:`Series`

doc/source/user_guide/groupby.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ this will make an extra copy.
618618
619619
.. _groupby.aggregate.udf:
620620

621-
Aggregation with User-Defined Functions
621+
Aggregation with user-defined functions
622622
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
623623

624624
Users can also provide their own User-Defined Functions (UDFs) for custom aggregations.
@@ -1261,7 +1261,7 @@ with
12611261
df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False)
12621262
12631263
1264-
Numba Accelerated Routines
1264+
Numba accelerated routines
12651265
--------------------------
12661266

12671267
.. versionadded:: 1.1
@@ -1696,7 +1696,7 @@ introduction <categorical>` and the
16961696
16971697
dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
16981698
1699-
Groupby by indexer to 'resample' data
1699+
GroupBy by indexer to 'resample' data
17001700
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17011701

17021702
Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.

doc/source/user_guide/integer_na.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well.
147147
df.sum()
148148
df.groupby("B").A.sum()
149149
150-
Scalar NA Value
150+
Scalar NA value
151151
---------------
152152

153153
:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar

doc/source/user_guide/io.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5996,7 +5996,7 @@ Full documentation can be found `here <https://pandas-gbq.readthedocs.io/en/late
59965996

59975997
.. _io.stata:
59985998

5999-
Stata format
5999+
STATA format
60006000
------------
60016001

60026002
.. _io.stata_writer:

doc/source/whatsnew/v1.0.2.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ Fixed regressions
4747

4848
.. ---------------------------------------------------------------------------
4949
50-
Indexing with nullable boolean arrays
50+
Indexing with nullable Boolean arrays
5151
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5252

5353
Previously indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``, however this is now permitted with ``NA`` being treated as ``False``. (:issue:`31503`)

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ enhancement1
3232
Other enhancements
3333
^^^^^^^^^^^^^^^^^^
3434

35-
-
35+
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
3636
-
3737

3838
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,7 @@ Performance improvements
592592
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
593593
- Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`)
594594
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
595+
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
595596
- Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`)
596597
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
597598
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

pandas/_config/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ class OptionError(AttributeError, KeyError):
105105
106106
Backwards compatible with KeyError checks.
107107
108+
See Also
109+
--------
110+
options : Access and modify global pandas settings.
111+
108112
Examples
109113
--------
110114
>>> pd.options.context

pandas/_libs/lib.pyx

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,15 +259,23 @@ def is_iterator(obj: object) -> bool:
259259
Check if the object is an iterator.
260260

261261
This is intended for generators, not list-like objects.
262+
This method checks whether the passed object is an iterator. It
263+
returns `True` if the object is an iterator, and `False` otherwise.
262264

263265
Parameters
264266
----------
265267
obj : The object to check
268+
The object to check for iterator type.
266269

267270
Returns
268271
-------
269272
is_iter : bool
270273
Whether `obj` is an iterator.
274+
`True` if the object is of iterator type, otherwise `False`.
275+
276+
See Also
277+
--------
278+
api.types.is_list_like : Check if the input is list-like.
271279

272280
Examples
273281
--------
@@ -1122,9 +1130,23 @@ def is_integer(obj: object) -> bool:
11221130
"""
11231131
Return True if given object is integer.
11241132

1133+
This method checks whether the passed object is an integer type. It
1134+
returns `True` if the object is an integer, and `False` otherwise.
1135+
1136+
Parameters
1137+
----------
1138+
obj : object
1139+
The object to check for integer type.
1140+
11251141
Returns
11261142
-------
11271143
bool
1144+
`True` if the object is of integer type, otherwise `False`.
1145+
1146+
See Also
1147+
--------
1148+
api.types.is_float : Check if an object is of float type.
1149+
api.types.is_numeric_dtype : Check if an object is of numeric type.
11281150

11291151
Examples
11301152
--------

pandas/_libs/tslibs/offsets.pyx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3371,6 +3371,10 @@ cdef class SemiMonthBegin(SemiMonthOffset):
33713371
"""
33723372
Two DateOffset's per month repeating on the first day of the month & day_of_month.
33733373
3374+
This offset moves dates to the first day of the month and an additional specified
3375+
day (typically the 15th by default), useful in scenarios where bi-monthly processing
3376+
occurs on set days.
3377+
33743378
Attributes
33753379
----------
33763380
n : int, default 1
@@ -3380,6 +3384,13 @@ cdef class SemiMonthBegin(SemiMonthOffset):
33803384
day_of_month : int, {1, 3,...,27}, default 15
33813385
A specific integer for the day of the month.
33823386
3387+
See Also
3388+
--------
3389+
tseries.offsets.SemiMonthEnd : Two DateOffset's per month repeating on the last day
3390+
of the month & day_of_month.
3391+
tseries.offsets.MonthEnd : Offset to the last calendar day of the month.
3392+
tseries.offsets.MonthBegin : Offset to the first calendar day of the month.
3393+
33833394
Examples
33843395
--------
33853396
>>> ts = pd.Timestamp(2022, 1, 1)

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,10 +1864,12 @@ class Timedelta(_Timedelta):
18641864
18651865
Parameters
18661866
----------
1867-
value : Timedelta, timedelta, np.timedelta64, str, or int
1867+
value : Timedelta, timedelta, np.timedelta64, str, int or float
18681868
Input value.
18691869
unit : str, default 'ns'
1870-
Denote the unit of the input, if input is an integer.
1870+
If input is an integer, denote the unit of the input.
1871+
If input is a float, denote the unit of the integer parts.
1872+
The decimal parts with resolution lower than 1 nanosecond are ignored.
18711873
18721874
Possible values:
18731875
@@ -2176,8 +2178,10 @@ class Timedelta(_Timedelta):
21762178
Parameters
21772179
----------
21782180
freq : str
2179-
Frequency string indicating the ceiling resolution.
2180-
It uses the same units as class constructor :class:`~pandas.Timedelta`.
2181+
Frequency string indicating the ceiling resolution. Must be a fixed
2182+
frequency like 's' (second) not 'ME' (month end). See
2183+
:ref:`frequency aliases <timeseries.offset_aliases>` for
2184+
a list of possible `freq` values.
21812185
21822186
Returns
21832187
-------

pandas/conftest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -667,7 +667,8 @@ def _create_mi_with_dt64tz_level():
667667

668668

669669
indices_dict = {
670-
"string": Index([f"pandas_{i}" for i in range(10)]),
670+
"object": Index([f"pandas_{i}" for i in range(10)], dtype=object),
671+
"string": Index([f"pandas_{i}" for i in range(10)], dtype="str"),
671672
"datetime": date_range("2020-01-01", periods=10),
672673
"datetime-tz": date_range("2020-01-01", periods=10, tz="US/Pacific"),
673674
"period": period_range("2020-01-01", periods=10, freq="D"),

pandas/core/array_algos/masked_reductions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ def _reductions(
6262
):
6363
return libmissing.NA
6464

65+
if values.dtype == np.dtype(object):
66+
# object dtype does not support `where` without passing an initial
67+
values = values[~mask]
68+
return func(values, axis=axis, **kwargs)
6569
return func(values, where=~mask, axis=axis, **kwargs)
6670

6771

pandas/core/arrays/arrow/array.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
unpack_tuple_and_ellipses,
6969
validate_indices,
7070
)
71+
from pandas.core.nanops import check_below_min_count
7172
from pandas.core.strings.base import BaseStringArrayMethods
7273

7374
from pandas.io._util import _arrow_dtype_mapping
@@ -1705,6 +1706,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
17051706
denominator = pc.sqrt_checked(pc.count(self._pa_array))
17061707
return pc.divide_checked(numerator, denominator)
17071708

1709+
elif name == "sum" and (
1710+
pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type)
1711+
):
1712+
1713+
def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc]
1714+
mask = pc.is_null(data) if data.null_count > 0 else None
1715+
if skip_nulls:
1716+
if min_count > 0 and check_below_min_count(
1717+
(len(data),),
1718+
None if mask is None else mask.to_numpy(),
1719+
min_count,
1720+
):
1721+
return pa.scalar(None, type=data.type)
1722+
if data.null_count > 0:
1723+
# binary_join returns null if there is any null ->
1724+
# have to filter out any nulls
1725+
data = data.filter(pc.invert(mask))
1726+
else:
1727+
if mask is not None or check_below_min_count(
1728+
(len(data),), None, min_count
1729+
):
1730+
return pa.scalar(None, type=data.type)
1731+
1732+
if pa.types.is_large_string(data.type):
1733+
# binary_join only supports string, not large_string
1734+
data = data.cast(pa.string())
1735+
data_list = pa.ListArray.from_arrays(
1736+
[0, len(data)], data.combine_chunks()
1737+
)[0]
1738+
return pc.binary_join(data_list, "")
1739+
17081740
else:
17091741
pyarrow_name = {
17101742
"median": "quantile",

0 commit comments

Comments
 (0)