
Commit 206a997

Merge branch 'master' of https://github.com/pandas-dev/pandas into ref-libreduction-5

2 parents: 2201ece + b9a9769

File tree: 249 files changed, +6472 / -1999 lines


Makefile

Lines changed: 7 additions & 0 deletions
@@ -25,3 +25,10 @@ doc:
 	cd doc; \
 	python make.py clean; \
 	python make.py html
+
+check:
+	python3 scripts/validate_unwanted_patterns.py \
+		--validation-type="private_function_across_module" \
+		--included-file-extensions="py" \
+		--excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored \
+		pandas/
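The `check` target above delegates to `scripts/validate_unwanted_patterns.py`. As a rough, hypothetical sketch of what such a "private function across module" scan does (this is illustrative simplified logic, not the actual script), a regex pass over source lines might look like:

```python
import re

# Hypothetical, simplified sketch: flag usages like `other_module._helper(...)`
# while ignoring access to private attributes of `self`.
PRIVATE_ACCESS = re.compile(r"\b(?!self\b)(\w+)\._(\w+)\b")

def find_private_access(source: str):
    """Return (line_number, match_text) pairs for cross-module private access."""
    hits = []
    for lineno, line in enumerate(source.splitlines(), start=1):
        for match in PRIVATE_ACCESS.finditer(line):
            hits.append((lineno, match.group(0)))
    return hits

code = "import lib\nx = lib._internal(1)\ny = self._ok()\n"
print(find_private_access(code))  # flags lib._internal, not self._ok
```

The real script is considerably more careful (it parses files, honors the excluded paths, and formats errors for CI); this only conveys the shape of the check.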

ci/code_checks.sh

Lines changed: 23 additions & 4 deletions
@@ -116,6 +116,14 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
     fi
     RET=$(($RET + $?)) ; echo $MSG "DONE"

+    MSG='Check for use of private module attribute access' ; echo $MSG
+    if [[ "$GITHUB_ACTIONS" == "true" ]]; then
+        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/
+    else
+        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/
+    fi
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     echo "isort --version-number"
     isort --version-number

@@ -179,6 +187,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     invgrep -R --include="*.py" -E "super\(\w*, (self|cls)\)" pandas
     RET=$(($RET + $?)) ; echo $MSG "DONE"

+    MSG='Check for use of builtin filter function' ; echo $MSG
+    invgrep -R --include="*.py" -P '(?<!def)[\(\s]filter\(' pandas
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Check for the following code in testing: `np.testing` and `np.array_equal`
     MSG='Check for invalid testing' ; echo $MSG
     invgrep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/

@@ -226,15 +238,22 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     invgrep -R --include=*.{py,pyx} '!r}' pandas
     RET=$(($RET + $?)) ; echo $MSG "DONE"

+    # -------------------------------------------------------------------------
+    # Type annotations
+
     MSG='Check for use of comment-based annotation syntax' ; echo $MSG
     invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas
     RET=$(($RET + $?)) ; echo $MSG "DONE"

-    # https://github.com/python/mypy/issues/7384
-    # MSG='Check for missing error codes with # type: ignore' ; echo $MSG
-    # invgrep -R --include="*.py" -P '# type: ignore(?!\[)' pandas
-    # RET=$(($RET + $?)) ; echo $MSG "DONE"
+    MSG='Check for missing error codes with # type: ignore' ; echo $MSG
+    invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+    MSG='Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias' ; echo $MSG
+    invgrep -R --include="*.py" --exclude=_typing.py -E 'Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]' pandas
+    RET=$(($RET + $?)) ; echo $MSG "DONE"

+    # -------------------------------------------------------------------------
     MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG
     invgrep -R --include=*.{py,pyx} '\.__class__' pandas
     RET=$(($RET + $?)) ; echo $MSG "DONE"
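The newly enabled `# type: ignore` check above hinges on the perl-style negative lookahead `(?!\[)`, which grep receives via `-P`. Python's `re` supports the same syntax, so the pattern's behavior can be demonstrated directly (the sample lines below are illustrative, not from the pandas codebase):

```python
import re

# The same pattern the check passes to grep via `invgrep ... -P`:
# flag `# type: ignore` comments that lack an error code like [attr-defined].
pattern = re.compile(r"# type:\s?ignore(?!\[)")

lines = [
    "x = f()  # type: ignore",                # flagged: no error code
    "y = g()  # type: ignore[attr-defined]",  # ok: explicit error code
    "z = h()  # type:ignore",                 # flagged: \s? also allows no space
]
for line in lines:
    print(bool(pattern.search(line)), line)
```

The lookahead rejects a match only when `ignore` is immediately followed by `[`, which is exactly how mypy spells an error code.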

doc/source/development/contributing_docstring.rst

Lines changed: 5 additions & 5 deletions
@@ -32,18 +32,18 @@ The next example gives an idea of what a docstring looks like:
     Parameters
     ----------
     num1 : int
-        First number to add
+        First number to add.
     num2 : int
-        Second number to add
+        Second number to add.

     Returns
     -------
     int
-        The sum of `num1` and `num2`
+        The sum of `num1` and `num2`.

     See Also
     --------
-    subtract : Subtract one integer from another
+    subtract : Subtract one integer from another.

     Examples
     --------

@@ -998,4 +998,4 @@ mapping function names to docstrings. Wherever possible, we prefer using

 See ``pandas.core.generic.NDFrame.fillna`` for an example template, and
 ``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna``
-for the filled versions.
+for the filled versions.

doc/source/reference/frame.rst

Lines changed: 16 additions & 0 deletions
@@ -37,6 +37,7 @@ Attributes and underlying data
    DataFrame.shape
    DataFrame.memory_usage
    DataFrame.empty
+   DataFrame.set_flags

 Conversion
 ~~~~~~~~~~

@@ -276,6 +277,21 @@ Time Series-related
    DataFrame.tz_convert
    DataFrame.tz_localize

+.. _api.frame.flags:
+
+Flags
+~~~~~
+
+Flags refer to attributes of the pandas object. Properties of the dataset (like
+the date it was recorded, the URL it was accessed from, etc.) should be stored
+in :attr:`DataFrame.attrs`.
+
+.. autosummary::
+   :toctree: api/
+
+   Flags
+
 .. _api.frame.metadata:

 Metadata

doc/source/reference/general_utility_functions.rst

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ Exceptions and warnings

    errors.AccessorRegistrationWarning
    errors.DtypeWarning
+   errors.DuplicateLabelError
    errors.EmptyDataError
    errors.InvalidIndexError
    errors.MergeError

doc/source/reference/series.rst

Lines changed: 15 additions & 0 deletions
@@ -39,6 +39,8 @@ Attributes
    Series.empty
    Series.dtypes
    Series.name
+   Series.flags
+   Series.set_flags

 Conversion
 ----------

@@ -527,6 +529,19 @@ Sparse-dtype specific methods and attributes are provided under the
    Series.sparse.from_coo
    Series.sparse.to_coo

+.. _api.series.flags:
+
+Flags
+~~~~~
+
+Flags refer to attributes of the pandas object. Properties of the dataset (like
+the date it was recorded, the URL it was accessed from, etc.) should be stored
+in :attr:`Series.attrs`.
+
+.. autosummary::
+   :toctree: api/
+
+   Flags

 .. _api.series.metadata:

doc/source/user_guide/computation.rst

Lines changed: 3 additions & 0 deletions
@@ -361,6 +361,9 @@ compute the mean absolute deviation on a rolling basis:
     @savefig rolling_apply_ex.png
     s.rolling(window=60).apply(mad, raw=True).plot(style='k')

+Using the Numba engine
+~~~~~~~~~~~~~~~~~~~~~~
+
 .. versionadded:: 1.0

 Additionally, :meth:`~Rolling.apply` can leverage `Numba <https://numba.pydata.org/>`__

doc/source/user_guide/duplicates.rst

Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
+.. _duplicates:
+
+****************
+Duplicate Labels
+****************
+
+:class:`Index` objects are not required to be unique; you can have duplicate row
+or column labels. This may be a bit confusing at first. If you're familiar with
+SQL, you know that row labels are similar to a primary key on a table, and you
+would never want duplicates in a SQL table. But one of pandas' roles is to clean
+messy, real-world data before it goes to some downstream system. And real-world
+data has duplicates, even in fields that are supposed to be unique.
+
+This section describes how duplicate labels change the behavior of certain
+operations, and how to prevent duplicates from arising during operations, or to
+detect them if they do.
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+
+Consequences of Duplicate Labels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some pandas methods (:meth:`Series.reindex` for example) just don't work with
+duplicates present. The output can't be determined, and so pandas raises.
+
+.. ipython:: python
+   :okexcept:
+
+   s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])
+   s1.reindex(['a', 'b', 'c'])
+
+Other methods, like indexing, can give very surprising results. Typically
+indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame``
+with a scalar will return a ``Series``. Slicing a ``Series`` with a scalar will
+return a scalar. But with duplicates, this isn't the case.
+
+.. ipython:: python
+
+   df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B'])
+   df1
+
+We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series``.
+
+.. ipython:: python
+
+   df1['B']  # a Series
+
+But slicing ``'A'`` returns a ``DataFrame``.
+
+.. ipython:: python
+
+   df1['A']  # a DataFrame
+
+This applies to row labels as well.
+
+.. ipython:: python
+
+   df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b'])
+   df2
+   df2.loc['b', 'A']  # a scalar
+   df2.loc['a', 'A']  # a Series
+
+Duplicate Label Detection
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can check whether an :class:`Index` (storing the row or column labels) is
+unique with :attr:`Index.is_unique`:
+
+.. ipython:: python
+
+   df2
+   df2.index.is_unique
+   df2.columns.is_unique
+
+.. note::
+
+   Checking whether an index is unique is somewhat expensive for large datasets.
+   Pandas does cache this result, so re-checking on the same index is very fast.
+
+:meth:`Index.duplicated` will return a boolean ndarray indicating whether a
+label is repeated.
+
+.. ipython:: python
+
+   df2.index.duplicated()
+
+This can be used as a boolean filter to drop duplicate rows.
+
+.. ipython:: python
+
+   df2.loc[~df2.index.duplicated(), :]
+
+If you need additional logic to handle duplicate labels, rather than just
+dropping the repeats, using :meth:`~DataFrame.groupby` on the index is a common
+trick. For example, we'll resolve duplicates by taking the average of all rows
+with the same label.
+
+.. ipython:: python
+
+   df2.groupby(level=0).mean()
+
+.. _duplicates.disallow:
+
+Disallowing Duplicate Labels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.2.0
+
+As noted above, handling duplicates is an important feature when reading in raw
+data. That said, you may want to avoid introducing duplicates as part of a data
+processing pipeline (from methods like :meth:`pandas.concat`,
+:meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame`
+can *disallow* duplicate labels by calling ``.set_flags(allows_duplicate_labels=False)``
+(the default is to allow them). If there are duplicate labels, an exception
+will be raised.
+
+.. ipython:: python
+   :okexcept:
+
+   pd.Series(
+       [0, 1, 2],
+       index=['a', 'b', 'b']
+   ).set_flags(allows_duplicate_labels=False)
+
+This applies to both row and column labels for a :class:`DataFrame`.
+
+.. ipython:: python
+   :okexcept:
+
+   pd.DataFrame(
+       [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],
+   ).set_flags(allows_duplicate_labels=False)
+
+This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`,
+which indicates whether that object can have duplicate labels.
+
+.. ipython:: python
+
+   df = (
+       pd.DataFrame({"A": [0, 1, 2, 3]},
+                    index=['x', 'y', 'X', 'Y'])
+       .set_flags(allows_duplicate_labels=False)
+   )
+   df
+   df.flags.allows_duplicate_labels
+
+:meth:`DataFrame.set_flags` can be used to return a new ``DataFrame`` with attributes
+like ``allows_duplicate_labels`` set to some value.
+
+.. ipython:: python
+
+   df2 = df.set_flags(allows_duplicate_labels=True)
+   df2.flags.allows_duplicate_labels
+
+The new ``DataFrame`` returned is a view on the same data as the old ``DataFrame``.
+Or the property can just be set directly on the same object.
+
+.. ipython:: python
+
+   df2.flags.allows_duplicate_labels = False
+   df2.flags.allows_duplicate_labels
+
+When processing raw, messy data you might initially read in the messy data
+(which potentially has duplicate labels), deduplicate, and then disallow duplicates
+going forward, to ensure that your data pipeline doesn't introduce duplicates.
+
+.. code-block:: python
+
+   >>> raw = pd.read_csv("...")
+   >>> deduplicated = raw.groupby(level=0).first()  # remove duplicates
+   >>> deduplicated.flags.allows_duplicate_labels = False  # disallow going forward
+
+Setting ``allows_duplicate_labels=True`` on a ``Series`` or ``DataFrame`` with duplicate
+labels, or performing an operation that introduces duplicate labels on a ``Series`` or
+``DataFrame`` that disallows duplicates, will raise an
+:class:`errors.DuplicateLabelError`.
+
+.. ipython:: python
+   :okexcept:
+
+   df.rename(str.upper)
+
+This error message contains the labels that are duplicated, and the numeric positions
+of all the duplicates (including the "original") in the ``Series`` or ``DataFrame``.
+
+Duplicate Label Propagation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In general, disallowing duplicates is "sticky". It's preserved through
+operations.
+
+.. ipython:: python
+   :okexcept:
+
+   s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False)
+   s1
+   s1.head().rename({"a": "b"})
+
+.. warning::
+
+   This is an experimental feature. Currently, many methods fail to
+   propagate the ``allows_duplicate_labels`` value. In future versions
+   it is expected that every method taking or returning one or more
+   DataFrame or Series objects will propagate ``allows_duplicate_labels``.
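The ``Index.duplicated`` semantics described in this new user-guide page (mark every occurrence of a label after its first appearance) can be mimicked in plain Python. This is a sketch of the semantics only, not pandas' actual hashtable-based implementation:

```python
def duplicated(labels):
    """Mimic pandas' Index.duplicated(keep='first'): True for every
    occurrence of a label after its first appearance."""
    seen = set()
    mask = []
    for label in labels:
        mask.append(label in seen)  # repeat if we've seen this label before
        seen.add(label)
    return mask

print(duplicated(['a', 'a', 'b']))  # [False, True, False]
```

Negating this mask and using it as a row filter is exactly the `df2.loc[~df2.index.duplicated(), :]` idiom shown in the guide.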

doc/source/user_guide/enhancingperf.rst

Lines changed: 7 additions & 0 deletions
@@ -373,6 +373,13 @@ nicer interface by passing/returning pandas objects.

 In this example, using Numba was faster than Cython.

+Numba as an argument
+~~~~~~~~~~~~~~~~~~~~
+
+Additionally, we can leverage the power of `Numba <https://numba.pydata.org/>`__
+by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools
+<stats.rolling_apply>` for an extensive example.
+
 Vectorize
 ~~~~~~~~~
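The rolling ``mad`` example these new doc sections cross-reference can be expressed without pandas or Numba at all. Below is a plain-Python sketch, under the assumption that `mad` means mean absolute deviation as in the computation guide; the helper names are illustrative, and in real use one would pass `mad` to `Rolling.apply(..., engine='numba')` for the JIT speedup:

```python
def mad(window):
    """Mean absolute deviation of one window of values."""
    mean = sum(window) / len(window)
    return sum(abs(x - mean) for x in window) / len(window)

def rolling_apply(values, window, func):
    """Apply func over each full trailing window; None until the window fills.
    A sketch of what Rolling.apply computes for a fixed integer window."""
    out = []
    for i in range(len(values)):
        if i + 1 < window:
            out.append(None)  # not enough observations yet
        else:
            out.append(func(values[i + 1 - window : i + 1]))
    return out

print(rolling_apply([1.0, 2.0, 3.0, 4.0], window=2, func=mad))
```

For a window of 2, each pair `(x, x+1)` has mean `x + 0.5`, so every filled window yields a deviation of 0.5.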
