
Commit 55ca1f1

Merge branch 'master' of https://github.com/pandas-dev/pandas into perf-fillna

2 parents: f60d43e + 9f6a91a


47 files changed: +557 / -175 lines

.github/workflows/sdist.yml

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+name: sdist
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+      - 1.2.x
+      - 1.3.x
+    paths-ignore:
+      - "doc/**"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+
+          # GH 39416
+          pip install numpy
+
+      - name: Build pandas sdist
+        run: |
+          pip list
+          python setup.py sdist --formats=gztar
+
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          activate-environment: pandas-sdist
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install pandas from sdist
+        run: |
+          conda list
+          python -m pip install dist/*.gz
+
+      - name: Import pandas
+        run: |
+          cd ..
+          conda list
+          python -c "import pandas; pandas.show_versions();"

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions

@@ -9,11 +9,11 @@ repos:
     -   id: absolufy-imports
         files: ^pandas/
 -   repo: https://github.com/python/black
-    rev: 21.5b2
+    rev: 21.6b0
     hooks:
     -   id: black
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.0.0
+    rev: v2.1.0
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown]
@@ -53,16 +53,16 @@ repos:
         types: [text]
         args: [--append-config=flake8/cython-template.cfg]
 -   repo: https://github.com/PyCQA/isort
-    rev: 5.8.0
+    rev: 5.9.0
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.18.3
+    rev: v2.19.4
     hooks:
     -   id: pyupgrade
         args: [--py37-plus]
 -   repo: https://github.com/pre-commit/pygrep-hooks
-    rev: v1.8.0
+    rev: v1.9.0
     hooks:
     -   id: rst-backticks
     -   id: rst-directive-colons

asv_bench/benchmarks/algos/isin.py

Lines changed: 10 additions & 0 deletions

@@ -325,3 +325,13 @@ def setup(self, dtype, series_type):

     def time_isin(self, dtypes, series_type):
         self.series.isin(self.values)
+
+
+class IsInWithLongTupples:
+    def setup(self):
+        t = tuple(range(1000))
+        self.series = Series([t] * 1000)
+        self.values = [t]
+
+    def time_isin(self):
+        self.series.isin(self.values)
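The new benchmark times membership tests where every row holds a long tuple. A minimal pure-Python sketch of the operation it exercises, with a plain `set` standing in for the pandas-internal hash table (an assumption for illustration, not the actual implementation):

```python
# Stand-in for Series([t] * 1000).isin([t]): hash the long tuple once,
# then probe each row against the hashed values.
t = tuple(range(1000))
rows = [t] * 1000          # every row holds the same 1000-element tuple
values = {t}               # set membership mirrors the hash-table probe
mask = [row in values for row in rows]
assert all(mask) and len(mask) == 1000
```

The cost being measured is dominated by hashing and comparing 1000-element tuples, which is why an identity fast path in the comparison function (added elsewhere in this commit) matters.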

doc/source/user_guide/indexing.rst

Lines changed: 4 additions & 5 deletions

@@ -1523,18 +1523,17 @@ Looking up values by index/column labels
 ----------------------------------------

 Sometimes you want to extract a set of values given a sequence of row labels
-and column labels, this can be achieved by ``DataFrame.melt`` combined by filtering the corresponding
-rows with ``DataFrame.loc``. For instance:
+and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing.
+For instance:

 .. ipython:: python

    df = pd.DataFrame({'col': ["A", "A", "B", "B"],
                       'A': [80, 23, np.nan, 22],
                       'B': [80, 55, 76, 67]})
    df
-   melt = df.melt('col')
-   melt = melt.loc[melt['col'] == melt['variable'], 'value']
-   melt.reset_index(drop=True)
+   idx, cols = pd.factorize(df['col'])
+   df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]

 Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method
 which was deprecated in version 1.2.0.
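The recipe in the updated docs pairs ``pd.factorize`` with NumPy fancy indexing. A dependency-free sketch of the same lookup, with hypothetical pure-Python loops standing in for ``pd.factorize`` and the row-wise NumPy take:

```python
data = {"col": ["A", "A", "B", "B"],
        "A": [80, 23, None, 22],
        "B": [80, 55, 76, 67]}

# factorize: encode each label in 'col' as an integer code plus the
# list of unique labels, mirroring idx, cols = pd.factorize(df['col'])
uniques = []
codes = []
for label in data["col"]:
    if label not in uniques:
        uniques.append(label)
    codes.append(uniques.index(label))

# for each row i, pick the value from the column named by its code,
# mirroring df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
result = [data[uniques[c]][i] for i, c in enumerate(codes)]
assert result == [80, 23, 76, 67]
```

Row 0 and 1 read from column "A", rows 2 and 3 from column "B", matching what the vectorized version produces.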

doc/source/whatsnew/v1.2.5.rst

Lines changed: 7 additions & 27 deletions

@@ -1,7 +1,7 @@
 .. _whatsnew_125:

-What's new in 1.2.5 (May ??, 2021)
-----------------------------------
+What's new in 1.2.5 (June 22, 2021)
+-----------------------------------

 These are the changes in pandas 1.2.5. See :ref:`release` for a full changelog
 including other versions of pandas.
@@ -14,32 +14,12 @@ including other versions of pandas.

 Fixed regressions
 ~~~~~~~~~~~~~~~~~
-- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
+- Fixed regression in :func:`concat` between two :class:`DataFrame` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
 - Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`)
-- Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`)
-- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`)
-- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
-
-.. ---------------------------------------------------------------------------
-
-
-.. _whatsnew_125.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
-
--
--
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_125.other:
-
-Other
-~~~~~
-
--
--
+- Fixed regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`)
+- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`)
+- Fixed regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
+- Fixed regression in :meth:`DataFrame.astype` with ``dtype=str`` failing to convert ``NaN`` in categorical columns (:issue:`41797`)

 .. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.3.0.rst

Lines changed: 3 additions & 0 deletions

@@ -269,12 +269,14 @@ Other enhancements
 - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
+- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising with ``object`` data containing ``pd.NA`` even when ``skipna=True`` (:issue:`37501`)
 - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`)
 - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
 - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
 - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
+- :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`)

 .. ---------------------------------------------------------------------------

@@ -914,6 +916,7 @@ Datetimelike
 - Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise a ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`)
 - Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`)
 - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
+- Bug in :meth:`DatetimeIndex.intersection` giving incorrect results with non-Tick frequencies with ``n != 1`` (:issue:`42104`)
 - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
 - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`)
 - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`)

doc/source/whatsnew/v1.4.0.rst

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ Other API changes

 Deprecations
 ~~~~~~~~~~~~
--
+- Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`)
 -

 .. ---------------------------------------------------------------------------

pandas/_libs/hashtable.pyi

Lines changed: 2 additions & 0 deletions

@@ -228,3 +228,5 @@ def ismember(
     arr: np.ndarray,
     values: np.ndarray,
 ) -> np.ndarray: ...  # np.ndarray[bool]
+def object_hash(obj) -> int: ...
+def objects_are_equal(a, b) -> bool: ...

pandas/_libs/hashtable.pyx

Lines changed: 10 additions & 0 deletions

@@ -34,6 +34,8 @@ from pandas._libs.khash cimport (
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
     kh_needed_n_buckets,
+    kh_python_hash_equal,
+    kh_python_hash_func,
     kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -46,6 +48,14 @@ def get_hashtable_trace_domain():
     return KHASH_TRACE_DOMAIN


+def object_hash(obj):
+    return kh_python_hash_func(obj)
+
+
+def objects_are_equal(a, b):
+    return kh_python_hash_equal(a, b)
+
+
 cdef int64_t NPY_NAT = util.get_nat()
 SIZE_HINT_LIMIT = (1 << 20) + 7

pandas/_libs/khash.pxd

Lines changed: 3 additions & 0 deletions

@@ -41,6 +41,9 @@ cdef extern from "khash_python.h":
     bint are_equivalent_float32_t \
         "kh_floats_hash_equal" (float32_t a, float32_t b) nogil

+    uint32_t kh_python_hash_func(object key)
+    bint kh_python_hash_equal(object a, object b)
+
     ctypedef struct kh_pymap_t:
         khuint_t n_buckets, size, n_occupied, upper_bound
         uint32_t *flags
uint32_t *flags

pandas/_libs/src/klib/khash_python.h

Lines changed: 5 additions & 2 deletions

@@ -226,6 +226,9 @@ int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){


 int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
+    if (a == b) {
+        return 1;
+    }
     if (Py_TYPE(a) == Py_TYPE(b)) {
         // special handling for some built-in types which could have NaNs
         // as we would like to have them equivalent, but the usual
@@ -284,7 +287,7 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
 }


-khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
+khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);

 //we could use any hashing algorithm, this is the original CPython's for tuples

@@ -325,7 +328,7 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
 }


-khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
+khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
     Py_hash_t hash;
     // For PyObject_Hash holds:
     // hash(0.0) == 0 == hash(-0.0)
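The identity check added to `pyobject_cmp` short-circuits before any potentially expensive comparison, and it is also what makes a `NaN` object compare equal to itself inside the hash table. The same idea in Python terms (a sketch, not the C code):

```python
def objects_equivalent(a, b):
    # the new `if (a == b) return 1;` in C compares pointers, i.e.
    # Python identity, before falling back to value equality
    if a is b:
        return True
    return a == b

nan = float("nan")
assert objects_equivalent(nan, nan)   # same object: identity wins
assert not (nan == nan)               # value equality alone says False
```

For long tuples this also skips an element-by-element comparison whenever the exact same tuple object appears on both sides, which is the case the new `isin` benchmark stresses.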

pandas/_libs/tslibs/timestamps.pyx

Lines changed: 8 additions & 1 deletion

@@ -129,6 +129,13 @@ cdef inline object create_timestamp_from_ts(int64_t value,
     return ts_base


+def _unpickle_timestamp(value, freq, tz):
+    # GH#41949 dont warn on unpickle if we have a freq
+    ts = Timestamp(value, tz=tz)
+    ts._set_freq(freq)
+    return ts
+
+
 # ----------------------------------------------------------------------

 def integer_op_not_supported(obj):
@@ -725,7 +732,7 @@ cdef class _Timestamp(ABCTimestamp):

     def __reduce__(self):
         object_state = self.value, self._freq, self.tzinfo
-        return (Timestamp, object_state)
+        return (_unpickle_timestamp, object_state)

     # -----------------------------------------------------------------
     # Rendering Methods
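Pointing `__reduce__` at a module-level helper instead of the class itself lets unpickling take a different path than normal construction (here, restoring `freq` without going through the deprecation-warning path). A generic sketch of the pattern with a hypothetical `Stamp` class:

```python
import pickle

def _unpickle_stamp(value, freq):
    # reconstruct without the argument that would warn in __init__,
    # then restore the attribute directly -- as _unpickle_timestamp does
    ts = Stamp(value)
    ts.freq = freq
    return ts

class Stamp:
    def __init__(self, value, freq=None):
        self.value = value
        self.freq = freq

    def __reduce__(self):
        # pickle will call _unpickle_stamp(value, freq) on load
        return (_unpickle_stamp, (self.value, self.freq))

rt = pickle.loads(pickle.dumps(Stamp(42, freq="D")))
assert (rt.value, rt.freq) == (42, "D")
```

The helper must live at module level so pickle can import it by name when deserializing.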

pandas/_typing.py

Lines changed: 1 addition & 0 deletions

@@ -122,6 +122,7 @@
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
 Frequency = Union[str, "DateOffset"]
 Axes = Collection[Any]
+RandomState = Union[int, ArrayLike, np.random.Generator, np.random.RandomState]

 # dtypes
 NpDtype = Union[str, np.dtype]

pandas/core/algorithms.py

Lines changed: 5 additions & 1 deletion

@@ -140,7 +140,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
             return np.asarray(values).view("uint8"), values.dtype
         else:
             # i.e. all-bool Categorical, BooleanArray
-            return np.asarray(values).astype("uint8", copy=False), values.dtype
+            try:
+                return np.asarray(values).astype("uint8", copy=False), values.dtype
+            except TypeError:
+                # GH#42107 we have pd.NAs present
+                return np.asarray(values), values.dtype

     elif is_integer_dtype(values.dtype):
         return np.asarray(values), values.dtype
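The change wraps the boolean-to-`uint8` cast so that data containing missing values falls back to object dtype instead of raising. A pure-Python sketch of that control flow, with `int()` standing in for `astype("uint8")` and `None` standing in for `pd.NA` (both stand-ins are assumptions for illustration):

```python
def ensure_bool_data(values):
    try:
        # fast path: every element casts cleanly to an integer 0/1
        return [int(v) for v in values]
    except TypeError:
        # GH#42107-style fallback: missing values present, keep objects
        return list(values)

assert ensure_bool_data([True, False, True]) == [1, 0, 1]
assert ensure_bool_data([True, None, False]) == [True, None, False]
```

The try/except keeps the common all-boolean case on the cheap path while only paying for the fallback when the cast actually fails.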

pandas/core/arrays/categorical.py

Lines changed: 5 additions & 1 deletion

@@ -26,6 +26,7 @@
     NaT,
     algos as libalgos,
     hashtable as htable,
+    lib,
 )
 from pandas._libs.arrays import NDArrayBacked
 from pandas._libs.lib import no_default
@@ -523,14 +524,17 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
         try:
             new_cats = np.asarray(self.categories)
             new_cats = new_cats.astype(dtype=dtype, copy=copy)
+            fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype))
         except (
             TypeError,  # downstream error msg for CategoricalIndex is misleading
             ValueError,
         ):
             msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
             raise ValueError(msg)

-        result = take_nd(new_cats, ensure_platform_int(self._codes))
+        result = take_nd(
+            new_cats, ensure_platform_int(self._codes), fill_value=fill_value
+        )

        return result
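The fix passes an explicit `fill_value` (`NaN` cast to the target dtype) so that `-1` codes, the sentinel for missing categories, materialize as a proper missing value. A simplified sketch of a take-with-fill helper (a hypothetical stand-in for `take_nd`, not the real function):

```python
def take_with_fill(categories, codes, fill_value):
    # a -1 code marks a missing entry; without an explicit fill_value,
    # plain indexing would silently grab categories[-1] (the last one)
    return [fill_value if code == -1 else categories[code]
            for code in codes]

# casting NaN to str first is what makes the missing entry come out
# as the string "nan" rather than the last category
assert take_with_fill(["a", "b"], [0, 1, -1, 0], "nan") == ["a", "b", "nan", "a"]
```

This is the behavior referenced by the v1.2.5 note about `DataFrame.astype` with `dtype=str` failing to convert `NaN` in categorical columns (:issue:`41797`).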

pandas/core/arrays/sparse/array.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1448,7 +1448,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
14481448
sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
14491449
)
14501450

1451-
result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs)
1451+
result = getattr(ufunc, method)(*(np.asarray(x) for x in inputs), **kwargs)
14521452
if out:
14531453
if len(out) == 1:
14541454
out = out[0]

pandas/core/arrays/sparse/scipy_sparse.py

Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ def _get_label_to_i_dict(labels, sort_labels=False):
     return {k: i for i, k in enumerate(labels)}


 def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
-    ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
+    ilabels = list(zip(*(index._get_level_values(i) for i in subset)))
     labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels)
     labels_to_i = Series(labels_to_i)
     if len(subset) > 1:
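Both one-line changes in the two sparse files replace a list comprehension with a generator expression where the result is immediately unpacked into a call, avoiding the construction of a named intermediate list. A small sketch of the equivalence:

```python
rows = [(1, "a"), (2, "b"), (3, "c")]

# list comprehension: builds a throwaway list before zip sees it
cols_from_list = list(zip(*[row for row in rows]))
# generator expression: consumed during unpacking, so no separate
# list object is materialized first
cols_from_gen = list(zip(*(row for row in rows)))

assert cols_from_list == cols_from_gen == [(1, 2, 3), ("a", "b", "c")]
```

The results are identical; the generator form just skips one allocation, a common micro-cleanup flagged by linters such as flake8-comprehensions.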
