Skip to content

Commit b443ea7

Browse files
Merge remote-tracking branch 'upstream/1.1.x' into auto-backport-of-pr-37198-on-1.1.x
2 parents 79a58db + c132858 commit b443ea7

File tree

10 files changed

+117
-53
lines changed

10 files changed

+117
-53
lines changed

doc/source/whatsnew/v1.1.4.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ Fixed regressions
2121
- Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`)
2222
- Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`)
2323
- Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raised ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
24+
- Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
25+
- Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`)
26+
- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)
2427

2528
.. ---------------------------------------------------------------------------
2629
@@ -31,6 +34,7 @@ Bug fixes
3134
- Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`)
3235
- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`)
3336
- Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`)
37+
- Bug in :meth:`DataFrame.info` was raising a ``KeyError`` when the DataFrame has integer column names (:issue:`37245`)
3438

3539
.. ---------------------------------------------------------------------------
3640

pandas/_libs/tslibs/offsets.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,11 @@ cdef class Tick(SingleConstructorOffset):
785785
def is_anchored(self) -> bool:
786786
return False
787787

788+
# This is identical to BaseOffset.__hash__, but has to be redefined here
789+
# for Python 3, because we've redefined __eq__.
790+
def __hash__(self) -> int:
791+
return hash(self._params)
792+
788793
# --------------------------------------------------------------------
789794
# Comparison and Arithmetic Methods
790795

pandas/core/dtypes/dtypes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,9 @@ def __eq__(self, other: Any) -> bool:
894894

895895
return isinstance(other, PeriodDtype) and self.freq == other.freq
896896

897+
def __ne__(self, other: Any) -> bool:
898+
return not self.__eq__(other)
899+
897900
def __setstate__(self, state):
898901
# for pickle compat. __getstate__ is defined in the
899902
# PandasExtensionDtype superclass and uses the public properties to

pandas/io/formats/info.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,13 +332,13 @@ def _verbose_repr(
332332
)
333333

334334
for i, col in enumerate(ids):
335-
dtype = dtypes[i]
335+
dtype = dtypes.iloc[i]
336336
col = pprint_thing(col)
337337

338338
line_no = _put_str(f" {i}", space_num)
339339
count = ""
340340
if show_counts:
341-
count = counts[i]
341+
count = counts.iloc[i]
342342

343343
lines.append(
344344
line_no

pandas/io/stata.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning):
477477

478478

479479
precision_loss_doc = """
480-
Column converted from %s to %s, and some data are outside of the lossless
480+
Column converted from {0} to {1}, and some data are outside of the lossless
481481
conversion range. This may result in a loss of precision in the saved data.
482482
"""
483483

@@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
551551
object in a DataFrame.
552552
"""
553553
ws = ""
554-
# original, if small, if large
554+
# original, if small, if large
555555
conversion_data = (
556556
(np.bool_, np.int8, np.int8),
557557
(np.uint8, np.int8, np.int16),
@@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
571571
dtype = c_data[1]
572572
else:
573573
dtype = c_data[2]
574-
if c_data[2] == np.float64: # Warn if necessary
574+
if c_data[2] == np.int64: # Warn if necessary
575575
if data[col].max() >= 2 ** 53:
576576
ws = precision_loss_doc.format("uint64", "float64")
577577

@@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
635635
self.value_labels = list(zip(np.arange(len(categories)), categories))
636636
self.value_labels.sort(key=lambda x: x[0])
637637
self.text_len = 0
638-
self.off: List[int] = []
639-
self.val: List[int] = []
640638
self.txt: List[bytes] = []
641639
self.n = 0
642640

643641
# Compute lengths and setup lists of offsets and labels
642+
offsets: List[int] = []
643+
values: List[int] = []
644644
for vl in self.value_labels:
645645
category = vl[1]
646646
if not isinstance(category, str):
@@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
650650
ValueLabelTypeMismatch,
651651
)
652652
category = category.encode(encoding)
653-
self.off.append(self.text_len)
653+
offsets.append(self.text_len)
654654
self.text_len += len(category) + 1 # +1 for the padding
655-
self.val.append(vl[0])
655+
values.append(vl[0])
656656
self.txt.append(category)
657657
self.n += 1
658658

@@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
663663
)
664664

665665
# Ensure int32
666-
self.off = np.array(self.off, dtype=np.int32)
667-
self.val = np.array(self.val, dtype=np.int32)
666+
self.off = np.array(offsets, dtype=np.int32)
667+
self.val = np.array(values, dtype=np.int32)
668668

669669
# Total length
670670
self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
@@ -876,23 +876,23 @@ def __init__(self):
876876
# with a label, but the underlying variable is -127 to 100
877877
# we're going to drop the label and cast to int
878878
self.DTYPE_MAP = dict(
879-
list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
879+
list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
880880
+ [
881-
(251, np.int8),
882-
(252, np.int16),
883-
(253, np.int32),
884-
(254, np.float32),
885-
(255, np.float64),
881+
(251, np.dtype(np.int8)),
882+
(252, np.dtype(np.int16)),
883+
(253, np.dtype(np.int32)),
884+
(254, np.dtype(np.float32)),
885+
(255, np.dtype(np.float64)),
886886
]
887887
)
888888
self.DTYPE_MAP_XML = dict(
889889
[
890-
(32768, np.uint8), # Keys to GSO
891-
(65526, np.float64),
892-
(65527, np.float32),
893-
(65528, np.int32),
894-
(65529, np.int16),
895-
(65530, np.int8),
890+
(32768, np.dtype(np.uint8)), # Keys to GSO
891+
(65526, np.dtype(np.float64)),
892+
(65527, np.dtype(np.float32)),
893+
(65528, np.dtype(np.int32)),
894+
(65529, np.dtype(np.int16)),
895+
(65530, np.dtype(np.int8)),
896896
]
897897
)
898898
self.TYPE_MAP = list(range(251)) + list("bhlfd")
@@ -1050,9 +1050,10 @@ def __init__(
10501050
self._order_categoricals = order_categoricals
10511051
self._encoding = ""
10521052
self._chunksize = chunksize
1053-
if self._chunksize is not None and (
1054-
not isinstance(chunksize, int) or chunksize <= 0
1055-
):
1053+
self._using_iterator = False
1054+
if self._chunksize is None:
1055+
self._chunksize = 1
1056+
elif not isinstance(chunksize, int) or chunksize <= 0:
10561057
raise ValueError("chunksize must be a positive integer when set.")
10571058

10581059
# State variables for the file
@@ -1062,7 +1063,7 @@ def __init__(
10621063
self._column_selector_set = False
10631064
self._value_labels_read = False
10641065
self._data_read = False
1065-
self._dtype = None
1066+
self._dtype: Optional[np.dtype] = None
10661067
self._lines_read = 0
10671068

10681069
self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None:
11951196
# Get data type information, works for versions 117-119.
11961197
def _get_dtypes(
11971198
self, seek_vartypes: int
1198-
) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
1199+
) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:
11991200

12001201
self.path_or_buf.seek(seek_vartypes)
12011202
raw_typlist = [
@@ -1519,11 +1520,8 @@ def _read_strls(self) -> None:
15191520
self.GSO[str(v_o)] = decoded_va
15201521

15211522
def __next__(self) -> DataFrame:
1522-
if self._chunksize is None:
1523-
raise ValueError(
1524-
"chunksize must be set to a positive integer to use as an iterator."
1525-
)
1526-
return self.read(nrows=self._chunksize or 1)
1523+
self._using_iterator = True
1524+
return self.read(nrows=self._chunksize)
15271525

15281526
def get_chunk(self, size: Optional[int] = None) -> DataFrame:
15291527
"""
@@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool:
16921690
convert = False
16931691
for col in data:
16941692
dtype = data[col].dtype
1695-
if dtype in (np.float16, np.float32):
1696-
dtype = np.float64
1693+
if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
1694+
dtype = np.dtype(np.float64)
16971695
convert = True
1698-
elif dtype in (np.int8, np.int16, np.int32):
1699-
dtype = np.int64
1696+
elif dtype in (
1697+
np.dtype(np.int8),
1698+
np.dtype(np.int16),
1699+
np.dtype(np.int32),
1700+
):
1701+
dtype = np.dtype(np.int64)
17001702
convert = True
17011703
retyped_data.append((col, data[col].astype(dtype)))
17021704
if convert:
@@ -1807,14 +1809,14 @@ def _do_convert_categoricals(
18071809
keys = np.array(list(vl.keys()))
18081810
column = data[col]
18091811
key_matches = column.isin(keys)
1810-
if self._chunksize is not None and key_matches.all():
1811-
initial_categories = keys
1812+
if self._using_iterator and key_matches.all():
1813+
initial_categories: Optional[np.ndarray] = keys
18121814
# If all categories are in the keys and we are iterating,
18131815
# use the same keys for all chunks. If some are missing
18141816
# value labels, then we will fall back to the categories
18151817
# varying across chunks.
18161818
else:
1817-
if self._chunksize is not None:
1819+
if self._using_iterator:
18181820
# warn is using an iterator
18191821
warnings.warn(
18201822
categorical_conversion_warning, CategoricalConversionWarning
@@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
20102012
"ty",
20112013
"%ty",
20122014
]:
2013-
return np.float64 # Stata expects doubles for SIFs
2015+
return np.dtype(np.float64) # Stata expects doubles for SIFs
20142016
else:
20152017
raise NotImplementedError(f"Format {fmt} not implemented")
20162018

pandas/tests/dtypes/test_dtypes.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -991,3 +991,10 @@ def test_is_dtype_no_warning(check):
991991

992992
with tm.assert_produces_warning(None):
993993
check(data["A"])
994+
995+
996+
def test_period_dtype_compare_to_string():
997+
# https://github.com/pandas-dev/pandas/issues/37265
998+
dtype = PeriodDtype(freq="M")
999+
assert (dtype == "period[M]") is True
1000+
assert (dtype != "period[M]") is False

pandas/tests/io/formats/test_info.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,3 +403,25 @@ def test_info_categorical():
403403

404404
buf = StringIO()
405405
df.info(buf=buf)
406+
407+
408+
def test_info_int_columns():
409+
# GH#37245
410+
df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
411+
buf = StringIO()
412+
df.info(null_counts=True, buf=buf)
413+
result = buf.getvalue()
414+
expected = textwrap.dedent(
415+
"""\
416+
<class 'pandas.core.frame.DataFrame'>
417+
Index: 2 entries, A to B
418+
Data columns (total 2 columns):
419+
# Column Non-Null Count Dtype
420+
--- ------ -------------- -----
421+
0 1 2 non-null int64
422+
1 2 2 non-null int64
423+
dtypes: int64(2)
424+
memory usage: 48.0+ bytes
425+
"""
426+
)
427+
assert result == expected

pandas/tests/io/test_parquet.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -563,16 +563,20 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
563563
# read_table uses the new Arrow Datasets API since pyarrow 1.0.0
564564
# Previous behaviour was pyarrow partitioned columns become 'category' dtypes
565565
# These are added to back of dataframe on read. In new API category dtype is
566-
# only used if partition field is string.
567-
legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0")
568-
if partition_col and legacy_read_table:
569-
partition_col_type = "category"
570-
else:
571-
partition_col_type = "int32"
572-
573-
expected_df[partition_col] = expected_df[partition_col].astype(
574-
partition_col_type
566+
# only used if partition field is string, but this changed again to use
567+
# category dtype for all types (not only strings) in pyarrow 2.0.0
568+
pa10 = (LooseVersion(pyarrow.__version__) >= LooseVersion("1.0.0")) and (
569+
LooseVersion(pyarrow.__version__) < LooseVersion("2.0.0")
575570
)
571+
if partition_col:
572+
if pa10:
573+
partition_col_type = "int32"
574+
else:
575+
partition_col_type = "category"
576+
577+
expected_df[partition_col] = expected_df[partition_col].astype(
578+
partition_col_type
579+
)
576580

577581
check_round_trip(
578582
df_compat,

pandas/tests/io/test_stata.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath):
19661966
StataReader(dta_file, chunksize=0)
19671967
with pytest.raises(ValueError, match="chunksize must be a positive"):
19681968
StataReader(dta_file, chunksize="apple")
1969-
with pytest.raises(ValueError, match="chunksize must be set to a positive"):
1970-
with StataReader(dta_file) as reader:
1971-
reader.__next__()
19721969

19731970

19741971
def test_iterator_value_labels():
@@ -1983,3 +1980,18 @@ def test_iterator_value_labels():
19831980
for i in range(2):
19841981
tm.assert_index_equal(chunk.dtypes[i].categories, expected)
19851982
tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
1983+
1984+
1985+
def test_precision_loss():
1986+
df = DataFrame(
1987+
[[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]],
1988+
columns=["big", "little"],
1989+
)
1990+
with tm.ensure_clean() as path:
1991+
with tm.assert_produces_warning(PossiblePrecisionLoss):
1992+
df.to_stata(path, write_index=False)
1993+
reread = read_stata(path)
1994+
expected_dt = Series([np.float64, np.float64], index=["big", "little"])
1995+
tm.assert_series_equal(reread.dtypes, expected_dt)
1996+
assert reread.loc[0, "little"] == df.loc[0, "little"]
1997+
assert reread.loc[0, "big"] == float(df.loc[0, "big"])

pandas/tests/tseries/offsets/test_offsets.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,11 @@ def test_isAnchored_deprecated(self, offset_types):
685685
expected = off.is_anchored()
686686
assert result == expected
687687

688+
def test_offsets_hashable(self, offset_types):
689+
# GH: 37267
690+
off = self._get_offset(offset_types)
691+
assert hash(off) is not None
692+
688693

689694
class TestDateOffset(Base):
690695
def setup_method(self, method):

0 commit comments

Comments (0)