Skip to content

Commit 69fe98d

Browse files
authored
REF: Make read_json less stateful (#59124)
* REF: Make read_json less stateful
* Fix typing
* Clean up dataframe column casting
* Remove extra bool return
* Add whatsnew
1 parent bc79c52 commit 69fe98d

File tree

4 files changed

+86
-131
lines changed

4 files changed

+86
-131
lines changed

doc/source/whatsnew/v3.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -563,8 +563,8 @@ I/O
563563
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
564564
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
565565
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
566+
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
566567
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
567-
-
568568

569569
Period
570570
^^^^^^

pandas/io/json/_json.py

+82-127
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,7 @@ def read(self) -> DataFrame | Series:
969969
else:
970970
return obj
971971

972-
def _get_object_parser(self, json) -> DataFrame | Series:
972+
def _get_object_parser(self, json: str) -> DataFrame | Series:
973973
"""
974974
Parses a json document into a pandas object.
975975
"""
@@ -985,16 +985,14 @@ def _get_object_parser(self, json) -> DataFrame | Series:
985985
"date_unit": self.date_unit,
986986
"dtype_backend": self.dtype_backend,
987987
}
988-
obj = None
989988
if typ == "frame":
990-
obj = FrameParser(json, **kwargs).parse()
991-
992-
if typ == "series" or obj is None:
989+
return FrameParser(json, **kwargs).parse()
990+
elif typ == "series":
993991
if not isinstance(dtype, bool):
994992
kwargs["dtype"] = dtype
995-
obj = SeriesParser(json, **kwargs).parse()
996-
997-
return obj
993+
return SeriesParser(json, **kwargs).parse()
994+
else:
995+
raise ValueError(f"{typ=} must be 'frame' or 'series'.")
998996

999997
def close(self) -> None:
1000998
"""
@@ -1107,7 +1105,6 @@ def __init__(
11071105
self.convert_dates = convert_dates
11081106
self.date_unit = date_unit
11091107
self.keep_default_dates = keep_default_dates
1110-
self.obj: DataFrame | Series | None = None
11111108
self.dtype_backend = dtype_backend
11121109

11131110
@final
@@ -1121,26 +1118,22 @@ def check_keys_split(self, decoded: dict) -> None:
11211118
raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")
11221119

11231120
@final
1124-
def parse(self):
1125-
self._parse()
1121+
def parse(self) -> DataFrame | Series:
1122+
obj = self._parse()
11261123

1127-
if self.obj is None:
1128-
return None
11291124
if self.convert_axes:
1130-
self._convert_axes()
1131-
self._try_convert_types()
1132-
return self.obj
1125+
obj = self._convert_axes(obj)
1126+
obj = self._try_convert_types(obj)
1127+
return obj
11331128

1134-
def _parse(self) -> None:
1129+
def _parse(self) -> DataFrame | Series:
11351130
raise AbstractMethodError(self)
11361131

11371132
@final
1138-
def _convert_axes(self) -> None:
1133+
def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series:
11391134
"""
11401135
Try to convert axes.
11411136
"""
1142-
obj = self.obj
1143-
assert obj is not None # for mypy
11441137
for axis_name in obj._AXIS_ORDERS:
11451138
ax = obj._get_axis(axis_name)
11461139
ser = Series(ax, dtype=ax.dtype, copy=False)
@@ -1153,9 +1146,10 @@ def _convert_axes(self) -> None:
11531146
)
11541147
if result:
11551148
new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
1156-
setattr(self.obj, axis_name, new_axis)
1149+
setattr(obj, axis_name, new_axis)
1150+
return obj
11571151

1158-
def _try_convert_types(self) -> None:
1152+
def _try_convert_types(self, obj):
11591153
raise AbstractMethodError(self)
11601154

11611155
@final
@@ -1182,8 +1176,10 @@ def _try_convert_data(
11821176

11831177
elif self.dtype is True:
11841178
pass
1185-
else:
1186-
# dtype to force
1179+
elif not _should_convert_dates(
1180+
convert_dates, self.keep_default_dates, name
1181+
):
1182+
# convert_dates takes precedence over columns listed in dtypes
11871183
dtype = (
11881184
self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
11891185
)
@@ -1194,8 +1190,8 @@ def _try_convert_data(
11941190
return data, False
11951191

11961192
if convert_dates:
1197-
new_data, result = self._try_convert_to_date(data)
1198-
if result:
1193+
new_data = self._try_convert_to_date(data)
1194+
if new_data is not data:
11991195
return new_data, True
12001196

12011197
converted = False
@@ -1245,16 +1241,16 @@ def _try_convert_data(
12451241
return data, converted
12461242

12471243
@final
1248-
def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
1244+
def _try_convert_to_date(self, data: Series) -> Series:
12491245
"""
12501246
Try to parse a ndarray like into a date column.
12511247
12521248
Try to coerce object in epoch/iso formats and integer/float in epoch
1253-
formats. Return a boolean if parsing was successful.
1249+
formats.
12541250
"""
12551251
# no conversion on empty
12561252
if not len(data):
1257-
return data, False
1253+
return data
12581254

12591255
new_data = data
12601256

@@ -1265,7 +1261,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
12651261
try:
12661262
new_data = data.astype("int64")
12671263
except OverflowError:
1268-
return data, False
1264+
return data
12691265
except (TypeError, ValueError):
12701266
pass
12711267

@@ -1277,57 +1273,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
12771273
| (new_data._values == iNaT)
12781274
)
12791275
if not in_range.all():
1280-
return data, False
1276+
return data
12811277

12821278
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
12831279
for date_unit in date_units:
12841280
try:
1285-
new_data = to_datetime(new_data, errors="raise", unit=date_unit)
1281+
return to_datetime(new_data, errors="raise", unit=date_unit)
12861282
except (ValueError, OverflowError, TypeError):
12871283
continue
1288-
return new_data, True
1289-
return data, False
1284+
return data
12901285

12911286

12921287
class SeriesParser(Parser):
12931288
_default_orient = "index"
12941289
_split_keys = ("name", "index", "data")
1295-
obj: Series | None
12961290

1297-
def _parse(self) -> None:
1291+
def _parse(self) -> Series:
12981292
data = ujson_loads(self.json, precise_float=self.precise_float)
12991293

13001294
if self.orient == "split":
13011295
decoded = {str(k): v for k, v in data.items()}
13021296
self.check_keys_split(decoded)
1303-
self.obj = Series(**decoded)
1297+
return Series(**decoded)
13041298
else:
1305-
self.obj = Series(data)
1299+
return Series(data)
13061300

1307-
def _try_convert_types(self) -> None:
1308-
if self.obj is None:
1309-
return
1310-
obj, result = self._try_convert_data(
1311-
"data", self.obj, convert_dates=self.convert_dates
1312-
)
1313-
if result:
1314-
self.obj = obj
1301+
def _try_convert_types(self, obj: Series) -> Series:
1302+
obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates)
1303+
return obj
13151304

13161305

13171306
class FrameParser(Parser):
13181307
_default_orient = "columns"
13191308
_split_keys = ("columns", "index", "data")
1320-
obj: DataFrame | None
13211309

1322-
def _parse(self) -> None:
1310+
def _parse(self) -> DataFrame:
13231311
json = self.json
13241312
orient = self.orient
13251313

1326-
if orient == "columns":
1327-
self.obj = DataFrame(
1328-
ujson_loads(json, precise_float=self.precise_float), dtype=None
1329-
)
1330-
elif orient == "split":
1314+
if orient == "split":
13311315
decoded = {
13321316
str(k): v
13331317
for k, v in ujson_loads(json, precise_float=self.precise_float).items()
@@ -1341,90 +1325,61 @@ def _parse(self) -> None:
13411325
orig_names,
13421326
is_potential_multi_index(orig_names, None),
13431327
)
1344-
self.obj = DataFrame(dtype=None, **decoded)
1328+
return DataFrame(dtype=None, **decoded)
13451329
elif orient == "index":
1346-
self.obj = DataFrame.from_dict(
1330+
return DataFrame.from_dict(
13471331
ujson_loads(json, precise_float=self.precise_float),
13481332
dtype=None,
13491333
orient="index",
13501334
)
13511335
elif orient == "table":
1352-
self.obj = parse_table_schema(json, precise_float=self.precise_float)
1336+
return parse_table_schema(json, precise_float=self.precise_float)
13531337
else:
1354-
self.obj = DataFrame(
1338+
# includes orient == "columns"
1339+
return DataFrame(
13551340
ujson_loads(json, precise_float=self.precise_float), dtype=None
13561341
)
13571342

1358-
def _process_converter(
1359-
self,
1360-
f: Callable[[Hashable, Series], tuple[Series, bool]],
1361-
filt: Callable[[Hashable], bool] | None = None,
1362-
) -> None:
1363-
"""
1364-
Take a conversion function and possibly recreate the frame.
1365-
"""
1366-
if filt is None:
1367-
filt = lambda col: True
1368-
1369-
obj = self.obj
1370-
assert obj is not None # for mypy
1371-
1372-
needs_new_obj = False
1373-
new_obj = {}
1374-
for i, (col, c) in enumerate(obj.items()):
1375-
if filt(col):
1376-
new_data, result = f(col, c)
1377-
if result:
1378-
c = new_data
1379-
needs_new_obj = True
1380-
new_obj[i] = c
1381-
1382-
if needs_new_obj:
1383-
# possibly handle dup columns
1384-
new_frame = DataFrame(new_obj, index=obj.index)
1385-
new_frame.columns = obj.columns
1386-
self.obj = new_frame
1387-
1388-
def _try_convert_types(self) -> None:
1389-
if self.obj is None:
1390-
return
1391-
if self.convert_dates:
1392-
self._try_convert_dates()
1393-
1394-
self._process_converter(
1395-
lambda col, c: self._try_convert_data(col, c, convert_dates=False)
1343+
def _try_convert_types(self, obj: DataFrame) -> DataFrame:
1344+
arrays = []
1345+
for col_label, series in obj.items():
1346+
result, _ = self._try_convert_data(
1347+
col_label,
1348+
series,
1349+
convert_dates=_should_convert_dates(
1350+
self.convert_dates,
1351+
keep_default_dates=self.keep_default_dates,
1352+
col=col_label,
1353+
),
1354+
)
1355+
arrays.append(result.array)
1356+
return DataFrame._from_arrays(
1357+
arrays, obj.columns, obj.index, verify_integrity=False
13961358
)
13971359

1398-
def _try_convert_dates(self) -> None:
1399-
if self.obj is None:
1400-
return
1401-
1402-
# our columns to parse
1403-
convert_dates_list_bool = self.convert_dates
1404-
if isinstance(convert_dates_list_bool, bool):
1405-
convert_dates_list_bool = []
1406-
convert_dates = set(convert_dates_list_bool)
1407-
1408-
def is_ok(col) -> bool:
1409-
"""
1410-
Return if this col is ok to try for a date parse.
1411-
"""
1412-
if col in convert_dates:
1413-
return True
1414-
if not self.keep_default_dates:
1415-
return False
1416-
if not isinstance(col, str):
1417-
return False
1418-
1419-
col_lower = col.lower()
1420-
if (
1421-
col_lower.endswith(("_at", "_time"))
1422-
or col_lower == "modified"
1423-
or col_lower == "date"
1424-
or col_lower == "datetime"
1425-
or col_lower.startswith("timestamp")
1426-
):
1427-
return True
1428-
return False
14291360

1430-
self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)
1361+
def _should_convert_dates(
1362+
convert_dates: bool | list[str],
1363+
keep_default_dates: bool,
1364+
col: Hashable,
1365+
) -> bool:
1366+
"""
1367+
Return bool whether a DataFrame column should be cast to datetime.
1368+
"""
1369+
if convert_dates is False:
1370+
# convert_dates=True means follow keep_default_dates
1371+
return False
1372+
elif not isinstance(convert_dates, bool) and col in set(convert_dates):
1373+
return True
1374+
elif not keep_default_dates:
1375+
return False
1376+
elif not isinstance(col, str):
1377+
return False
1378+
col_lower = col.lower()
1379+
if (
1380+
col_lower.endswith(("_at", "_time"))
1381+
or col_lower in {"modified", "date", "datetime"}
1382+
or col_lower.startswith("timestamp")
1383+
):
1384+
return True
1385+
return False

pandas/tests/io/json/test_pandas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ def test_frame_from_json_precise_float(self):
792792

793793
def test_typ(self):
794794
s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
795-
result = read_json(StringIO(s.to_json()), typ=None)
795+
result = read_json(StringIO(s.to_json()), typ="series")
796796
tm.assert_series_equal(result, s)
797797

798798
def test_reconstruction_index(self):

pandas/tests/io/json/test_readlines.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine):
165165
s = pd.Series({"A": 1, "B": 2})
166166

167167
strio = StringIO(s.to_json(lines=True, orient="records"))
168-
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
168+
unchunked = read_json(strio, lines=True, typ="series", engine=engine)
169169

170170
strio = StringIO(s.to_json(lines=True, orient="records"))
171171
with read_json(
172-
strio, lines=True, typ="Series", chunksize=1, engine=engine
172+
strio, lines=True, typ="series", chunksize=1, engine=engine
173173
) as reader:
174174
chunked = pd.concat(reader)
175175

0 commit comments

Comments (0)