Skip to content

Commit b443ea7

Browse files
Merge remote-tracking branch 'upstream/1.1.x' into auto-backport-of-pr-37198-on-1.1.x
2 parents 79a58db + c132858 commit b443ea7

File tree

10 files changed

+117
-53
lines changed

10 files changed

+117
-53
lines changed

doc/source/whatsnew/v1.1.4.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ Fixed regressions
2121
- Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`)
2222
- Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`)
2323
- Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raised ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
24+
- Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
25+
- Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`)
26+
- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)
2427

2528
.. ---------------------------------------------------------------------------
2629
@@ -31,6 +34,7 @@ Bug fixes
3134
- Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`)
3235
- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`)
3336
- Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`)
37+
- Bug in :meth:`DataFrame.info` was raising a ``KeyError`` when the DataFrame has integer column names (:issue:`37245`)
3438

3539
.. ---------------------------------------------------------------------------
3640

pandas/_libs/tslibs/offsets.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,11 @@ cdef class Tick(SingleConstructorOffset):
785785
def is_anchored(self) -> bool:
786786
return False
787787

788+
# This is identical to BaseOffset.__hash__, but has to be redefined here
789+
# for Python 3, because we've redefined __eq__.
790+
def __hash__(self) -> int:
791+
return hash(self._params)
792+
788793
# --------------------------------------------------------------------
789794
# Comparison and Arithmetic Methods
790795

pandas/core/dtypes/dtypes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,9 @@ def __eq__(self, other: Any) -> bool:
894894

895895
return isinstance(other, PeriodDtype) and self.freq == other.freq
896896

897+
def __ne__(self, other: Any) -> bool:
898+
return not self.__eq__(other)
899+
897900
def __setstate__(self, state):
898901
# for pickle compat. __getstate__ is defined in the
899902
# PandasExtensionDtype superclass and uses the public properties to

pandas/io/formats/info.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,13 +332,13 @@ def _verbose_repr(
332332
)
333333

334334
for i, col in enumerate(ids):
335-
dtype = dtypes[i]
335+
dtype = dtypes.iloc[i]
336336
col = pprint_thing(col)
337337

338338
line_no = _put_str(f" {i}", space_num)
339339
count = ""
340340
if show_counts:
341-
count = counts[i]
341+
count = counts.iloc[i]
342342

343343
lines.append(
344344
line_no

pandas/io/stata.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning):
477477

478478

479479
precision_loss_doc = """
480-
Column converted from %s to %s, and some data are outside of the lossless
480+
Column converted from {0} to {1}, and some data are outside of the lossless
481481
conversion range. This may result in a loss of precision in the saved data.
482482
"""
483483

@@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
551551
object in a DataFrame.
552552
"""
553553
ws = ""
554-
# original, if small, if large
554+
# original, if small, if large
555555
conversion_data = (
556556
(np.bool_, np.int8, np.int8),
557557
(np.uint8, np.int8, np.int16),
@@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
571571
dtype = c_data[1]
572572
else:
573573
dtype = c_data[2]
574-
if c_data[2] == np.float64: # Warn if necessary
574+
if c_data[2] == np.int64: # Warn if necessary
575575
if data[col].max() >= 2 ** 53:
576576
ws = precision_loss_doc.format("uint64", "float64")
577577

@@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
635635
self.value_labels = list(zip(np.arange(len(categories)), categories))
636636
self.value_labels.sort(key=lambda x: x[0])
637637
self.text_len = 0
638-
self.off: List[int] = []
639-
self.val: List[int] = []
640638
self.txt: List[bytes] = []
641639
self.n = 0
642640

643641
# Compute lengths and setup lists of offsets and labels
642+
offsets: List[int] = []
643+
values: List[int] = []
644644
for vl in self.value_labels:
645645
category = vl[1]
646646
if not isinstance(category, str):
@@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
650650
ValueLabelTypeMismatch,
651651
)
652652
category = category.encode(encoding)
653-
self.off.append(self.text_len)
653+
offsets.append(self.text_len)
654654
self.text_len += len(category) + 1 # +1 for the padding
655-
self.val.append(vl[0])
655+
values.append(vl[0])
656656
self.txt.append(category)
657657
self.n += 1
658658

@@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
663663
)
664664

665665
# Ensure int32
666-
self.off = np.array(self.off, dtype=np.int32)
667-
self.val = np.array(self.val, dtype=np.int32)
666+
self.off = np.array(offsets, dtype=np.int32)
667+
self.val = np.array(values, dtype=np.int32)
668668

669669
# Total length
670670
self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
@@ -876,23 +876,23 @@ def __init__(self):
876876
# with a label, but the underlying variable is -127 to 100
877877
# we're going to drop the label and cast to int
878878
self.DTYPE_MAP = dict(
879-
list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
879+
list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
880880
+ [
881-
(251, np.int8),
882-
(252, np.int16),
883-
(253, np.int32),
884-
(254, np.float32),
885-
(255, np.float64),
881+
(251, np.dtype(np.int8)),
882+
(252, np.dtype(np.int16)),
883+
(253, np.dtype(np.int32)),
884+
(254, np.dtype(np.float32)),
885+
(255, np.dtype(np.float64)),
886886
]
887887
)
888888
self.DTYPE_MAP_XML = dict(
889889
[
890-
(32768, np.uint8), # Keys to GSO
891-
(65526, np.float64),
892-
(65527, np.float32),
893-
(65528, np.int32),
894-
(65529, np.int16),
895-
(65530, np.int8),
890+
(32768, np.dtype(np.uint8)), # Keys to GSO
891+
(65526, np.dtype(np.float64)),
892+
(65527, np.dtype(np.float32)),
893+
(65528, np.dtype(np.int32)),
894+
(65529, np.dtype(np.int16)),
895+
(65530, np.dtype(np.int8)),
896896
]
897897
)
898898
self.TYPE_MAP = list(range(251)) + list("bhlfd")
@@ -1050,9 +1050,10 @@ def __init__(
10501050
self._order_categoricals = order_categoricals
10511051
self._encoding = ""
10521052
self._chunksize = chunksize
1053-
if self._chunksize is not None and (
1054-
not isinstance(chunksize, int) or chunksize <= 0
1055-
):
1053+
self._using_iterator = False
1054+
if self._chunksize is None:
1055+
self._chunksize = 1
1056+
elif not isinstance(chunksize, int) or chunksize <= 0:
10561057
raise ValueError("chunksize must be a positive integer when set.")
10571058

10581059
# State variables for the file
@@ -1062,7 +1063,7 @@ def __init__(
10621063
self._column_selector_set = False
10631064
self._value_labels_read = False
10641065
self._data_read = False
1065-
self._dtype = None
1066+
self._dtype: Optional[np.dtype] = None
10661067
self._lines_read = 0
10671068

10681069
self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None:
11951196
# Get data type information, works for versions 117-119.
11961197
def _get_dtypes(
11971198
self, seek_vartypes: int
1198-
) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
1199+
) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:
11991200

12001201
self.path_or_buf.seek(seek_vartypes)
12011202
raw_typlist = [
@@ -1519,11 +1520,8 @@ def _read_strls(self) -> None:
15191520
self.GSO[str(v_o)] = decoded_va
15201521

15211522
def __next__(self) -> DataFrame:
1522-
if self._chunksize is None:
1523-
raise ValueError(
1524-
"chunksize must be set to a positive integer to use as an iterator."
1525-
)
1526-
return self.read(nrows=self._chunksize or 1)
1523+
self._using_iterator = True
1524+
return self.read(nrows=self._chunksize)
15271525

15281526
def get_chunk(self, size: Optional[int] = None) -> DataFrame:
15291527
"""
@@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool:
16921690
convert = False
16931691
for col in data:
16941692
dtype = data[col].dtype
1695-
if dtype in (np.float16, np.float32):
1696-
dtype = np.float64
1693+
if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
1694+
dtype = np.dtype(np.float64)
16971695
convert = True
1698-
elif dtype in (np.int8, np.int16, np.int32):
1699-
dtype = np.int64
1696+
elif dtype in (
1697+
np.dtype(np.int8),
1698+
np.dtype(np.int16),
1699+
np.dtype(np.int32),
1700+
):
1701+
dtype = np.dtype(np.int64)
17001702
convert = True
17011703
retyped_data.append((col, data[col].astype(dtype)))
17021704
if convert:
@@ -1807,14 +1809,14 @@ def _do_convert_categoricals(
18071809
keys = np.array(list(vl.keys()))
18081810
column = data[col]
18091811
key_matches = column.isin(keys)
1810-
if self._chunksize is not None and key_matches.all():
1811-
initial_categories = keys
1812+
if self._using_iterator and key_matches.all():
1813+
initial_categories: Optional[np.ndarray] = keys
18121814
# If all categories are in the keys and we are iterating,
18131815
# use the same keys for all chunks. If some are missing
18141816
# value labels, then we will fall back to the categories
18151817
# varying across chunks.
18161818
else:
1817-
if self._chunksize is not None:
1819+
if self._using_iterator:
18181820
# warn is using an iterator
18191821
warnings.warn(
18201822
categorical_conversion_warning, CategoricalConversionWarning
@@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
20102012
"ty",
20112013
"%ty",
20122014
]:
2013-
return np.float64 # Stata expects doubles for SIFs
2015+
return np.dtype(np.float64) # Stata expects doubles for SIFs
20142016
else:
20152017
raise NotImplementedError(f"Format {fmt} not implemented")
20162018

pandas/tests/dtypes/test_dtypes.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -991,3 +991,10 @@ def test_is_dtype_no_warning(check):
991991

992992
with tm.assert_produces_warning(None):
993993
check(data["A"])
994+
995+
996+
def test_period_dtype_compare_to_string():
997+
# https://github.com/pandas-dev/pandas/issues/37265
998+
dtype = PeriodDtype(freq="M")
999+
assert (dtype == "period[M]") is True
1000+
assert (dtype != "period[M]") is False

pandas/tests/io/formats/test_info.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,3 +403,25 @@ def test_info_categorical():
403403

404404
buf = StringIO()
405405
df.info(buf=buf)
406+
407+
408+
def test_info_int_columns():
409+
# GH#37245
410+
df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
411+
buf = StringIO()
412+
df.info(null_counts=True, buf=buf)
413+
result = buf.getvalue()
414+
expected = textwrap.dedent(
415+
"""\
416+
<class 'pandas.core.frame.DataFrame'>
417+
Index: 2 entries, A to B
418+
Data columns (total 2 columns):
419+
# Column Non-Null Count Dtype
420+
--- ------ -------------- -----
421+
0 1 2 non-null int64
422+
1 2 2 non-null int64
423+
dtypes: int64(2)
424+
memory usage: 48.0+ bytes
425+
"""
426+
)
427+
assert result == expected

pandas/tests/io/test_parquet.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -563,16 +563,20 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
563563
# read_table uses the new Arrow Datasets API since pyarrow 1.0.0
564564
# Previous behaviour was pyarrow partitioned columns become 'category' dtypes
565565
# These are added to back of dataframe on read. In new API category dtype is
566-
# only used if partition field is string.
567-
legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0")
568-
if partition_col and legacy_read_table:
569-
partition_col_type = "category"
570-
else:
571-
partition_col_type = "int32"
572-
573-
expected_df[partition_col] = expected_df[partition_col].astype(
574-
partition_col_type
566+
# only used if partition field is string, but this changed again to use
567+
# category dtype for all types (not only strings) in pyarrow 2.0.0
568+
pa10 = (LooseVersion(pyarrow.__version__) >= LooseVersion("1.0.0")) and (
569+
LooseVersion(pyarrow.__version__) < LooseVersion("2.0.0")
575570
)
571+
if partition_col:
572+
if pa10:
573+
partition_col_type = "int32"
574+
else:
575+
partition_col_type = "category"
576+
577+
expected_df[partition_col] = expected_df[partition_col].astype(
578+
partition_col_type
579+
)
576580

577581
check_round_trip(
578582
df_compat,

pandas/tests/io/test_stata.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath):
19661966
StataReader(dta_file, chunksize=0)
19671967
with pytest.raises(ValueError, match="chunksize must be a positive"):
19681968
StataReader(dta_file, chunksize="apple")
1969-
with pytest.raises(ValueError, match="chunksize must be set to a positive"):
1970-
with StataReader(dta_file) as reader:
1971-
reader.__next__()
19721969

19731970

19741971
def test_iterator_value_labels():
@@ -1983,3 +1980,18 @@ def test_iterator_value_labels():
19831980
for i in range(2):
19841981
tm.assert_index_equal(chunk.dtypes[i].categories, expected)
19851982
tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
1983+
1984+
1985+
def test_precision_loss():
1986+
df = DataFrame(
1987+
[[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]],
1988+
columns=["big", "little"],
1989+
)
1990+
with tm.ensure_clean() as path:
1991+
with tm.assert_produces_warning(PossiblePrecisionLoss):
1992+
df.to_stata(path, write_index=False)
1993+
reread = read_stata(path)
1994+
expected_dt = Series([np.float64, np.float64], index=["big", "little"])
1995+
tm.assert_series_equal(reread.dtypes, expected_dt)
1996+
assert reread.loc[0, "little"] == df.loc[0, "little"]
1997+
assert reread.loc[0, "big"] == float(df.loc[0, "big"])

pandas/tests/tseries/offsets/test_offsets.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,11 @@ def test_isAnchored_deprecated(self, offset_types):
685685
expected = off.is_anchored()
686686
assert result == expected
687687

688+
def test_offsets_hashable(self, offset_types):
689+
# GH: 37267
690+
off = self._get_offset(offset_types)
691+
assert hash(off) is not None
692+
688693

689694
class TestDateOffset(Base):
690695
def setup_method(self, method):

0 commit comments

Comments (0)