Skip to content

Commit 69fe98d

Browse files
authored
REF: Make read_json less stateful (#59124)
* REF: Make read_json less stateful
* Fix typing
* Clean up dataframe column casting
* Remove extra bool return
* Add whatsnew
1 parent bc79c52 commit 69fe98d

File tree

4 files changed

+86
-131
lines changed

4 files changed

+86
-131
lines changed

doc/source/whatsnew/v3.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -563,8 +563,8 @@ I/O
563563
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
564564
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
565565
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
566+
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
566567
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
567-
-
568568

569569
Period
570570
^^^^^^

pandas/io/json/_json.py

+82-127
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,7 @@ def read(self) -> DataFrame | Series:
969969
else:
970970
return obj
971971

972-
def _get_object_parser(self, json) -> DataFrame | Series:
972+
def _get_object_parser(self, json: str) -> DataFrame | Series:
973973
"""
974974
Parses a json document into a pandas object.
975975
"""
@@ -985,16 +985,14 @@ def _get_object_parser(self, json) -> DataFrame | Series:
985985
"date_unit": self.date_unit,
986986
"dtype_backend": self.dtype_backend,
987987
}
988-
obj = None
989988
if typ == "frame":
990-
obj = FrameParser(json, **kwargs).parse()
991-
992-
if typ == "series" or obj is None:
989+
return FrameParser(json, **kwargs).parse()
990+
elif typ == "series":
993991
if not isinstance(dtype, bool):
994992
kwargs["dtype"] = dtype
995-
obj = SeriesParser(json, **kwargs).parse()
996-
997-
return obj
993+
return SeriesParser(json, **kwargs).parse()
994+
else:
995+
raise ValueError(f"{typ=} must be 'frame' or 'series'.")
998996

999997
def close(self) -> None:
1000998
"""
@@ -1107,7 +1105,6 @@ def __init__(
11071105
self.convert_dates = convert_dates
11081106
self.date_unit = date_unit
11091107
self.keep_default_dates = keep_default_dates
1110-
self.obj: DataFrame | Series | None = None
11111108
self.dtype_backend = dtype_backend
11121109

11131110
@final
@@ -1121,26 +1118,22 @@ def check_keys_split(self, decoded: dict) -> None:
11211118
raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")
11221119

11231120
@final
1124-
def parse(self):
1125-
self._parse()
1121+
def parse(self) -> DataFrame | Series:
1122+
obj = self._parse()
11261123

1127-
if self.obj is None:
1128-
return None
11291124
if self.convert_axes:
1130-
self._convert_axes()
1131-
self._try_convert_types()
1132-
return self.obj
1125+
obj = self._convert_axes(obj)
1126+
obj = self._try_convert_types(obj)
1127+
return obj
11331128

1134-
def _parse(self) -> None:
1129+
def _parse(self) -> DataFrame | Series:
11351130
raise AbstractMethodError(self)
11361131

11371132
@final
1138-
def _convert_axes(self) -> None:
1133+
def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series:
11391134
"""
11401135
Try to convert axes.
11411136
"""
1142-
obj = self.obj
1143-
assert obj is not None # for mypy
11441137
for axis_name in obj._AXIS_ORDERS:
11451138
ax = obj._get_axis(axis_name)
11461139
ser = Series(ax, dtype=ax.dtype, copy=False)
@@ -1153,9 +1146,10 @@ def _convert_axes(self) -> None:
11531146
)
11541147
if result:
11551148
new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
1156-
setattr(self.obj, axis_name, new_axis)
1149+
setattr(obj, axis_name, new_axis)
1150+
return obj
11571151

1158-
def _try_convert_types(self) -> None:
1152+
def _try_convert_types(self, obj):
11591153
raise AbstractMethodError(self)
11601154

11611155
@final
@@ -1182,8 +1176,10 @@ def _try_convert_data(
11821176

11831177
elif self.dtype is True:
11841178
pass
1185-
else:
1186-
# dtype to force
1179+
elif not _should_convert_dates(
1180+
convert_dates, self.keep_default_dates, name
1181+
):
1182+
# convert_dates takes precedence over columns listed in dtypes
11871183
dtype = (
11881184
self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
11891185
)
@@ -1194,8 +1190,8 @@ def _try_convert_data(
11941190
return data, False
11951191

11961192
if convert_dates:
1197-
new_data, result = self._try_convert_to_date(data)
1198-
if result:
1193+
new_data = self._try_convert_to_date(data)
1194+
if new_data is not data:
11991195
return new_data, True
12001196

12011197
converted = False
@@ -1245,16 +1241,16 @@ def _try_convert_data(
12451241
return data, converted
12461242

12471243
@final
1248-
def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
1244+
def _try_convert_to_date(self, data: Series) -> Series:
12491245
"""
12501246
Try to parse a ndarray like into a date column.
12511247
12521248
Try to coerce object in epoch/iso formats and integer/float in epoch
1253-
formats. Return a boolean if parsing was successful.
1249+
formats.
12541250
"""
12551251
# no conversion on empty
12561252
if not len(data):
1257-
return data, False
1253+
return data
12581254

12591255
new_data = data
12601256

@@ -1265,7 +1261,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
12651261
try:
12661262
new_data = data.astype("int64")
12671263
except OverflowError:
1268-
return data, False
1264+
return data
12691265
except (TypeError, ValueError):
12701266
pass
12711267

@@ -1277,57 +1273,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
12771273
| (new_data._values == iNaT)
12781274
)
12791275
if not in_range.all():
1280-
return data, False
1276+
return data
12811277

12821278
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
12831279
for date_unit in date_units:
12841280
try:
1285-
new_data = to_datetime(new_data, errors="raise", unit=date_unit)
1281+
return to_datetime(new_data, errors="raise", unit=date_unit)
12861282
except (ValueError, OverflowError, TypeError):
12871283
continue
1288-
return new_data, True
1289-
return data, False
1284+
return data
12901285

12911286

12921287
class SeriesParser(Parser):
12931288
_default_orient = "index"
12941289
_split_keys = ("name", "index", "data")
1295-
obj: Series | None
12961290

1297-
def _parse(self) -> None:
1291+
def _parse(self) -> Series:
12981292
data = ujson_loads(self.json, precise_float=self.precise_float)
12991293

13001294
if self.orient == "split":
13011295
decoded = {str(k): v for k, v in data.items()}
13021296
self.check_keys_split(decoded)
1303-
self.obj = Series(**decoded)
1297+
return Series(**decoded)
13041298
else:
1305-
self.obj = Series(data)
1299+
return Series(data)
13061300

1307-
def _try_convert_types(self) -> None:
1308-
if self.obj is None:
1309-
return
1310-
obj, result = self._try_convert_data(
1311-
"data", self.obj, convert_dates=self.convert_dates
1312-
)
1313-
if result:
1314-
self.obj = obj
1301+
def _try_convert_types(self, obj: Series) -> Series:
1302+
obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates)
1303+
return obj
13151304

13161305

13171306
class FrameParser(Parser):
13181307
_default_orient = "columns"
13191308
_split_keys = ("columns", "index", "data")
1320-
obj: DataFrame | None
13211309

1322-
def _parse(self) -> None:
1310+
def _parse(self) -> DataFrame:
13231311
json = self.json
13241312
orient = self.orient
13251313

1326-
if orient == "columns":
1327-
self.obj = DataFrame(
1328-
ujson_loads(json, precise_float=self.precise_float), dtype=None
1329-
)
1330-
elif orient == "split":
1314+
if orient == "split":
13311315
decoded = {
13321316
str(k): v
13331317
for k, v in ujson_loads(json, precise_float=self.precise_float).items()
@@ -1341,90 +1325,61 @@ def _parse(self) -> None:
13411325
orig_names,
13421326
is_potential_multi_index(orig_names, None),
13431327
)
1344-
self.obj = DataFrame(dtype=None, **decoded)
1328+
return DataFrame(dtype=None, **decoded)
13451329
elif orient == "index":
1346-
self.obj = DataFrame.from_dict(
1330+
return DataFrame.from_dict(
13471331
ujson_loads(json, precise_float=self.precise_float),
13481332
dtype=None,
13491333
orient="index",
13501334
)
13511335
elif orient == "table":
1352-
self.obj = parse_table_schema(json, precise_float=self.precise_float)
1336+
return parse_table_schema(json, precise_float=self.precise_float)
13531337
else:
1354-
self.obj = DataFrame(
1338+
# includes orient == "columns"
1339+
return DataFrame(
13551340
ujson_loads(json, precise_float=self.precise_float), dtype=None
13561341
)
13571342

1358-
def _process_converter(
1359-
self,
1360-
f: Callable[[Hashable, Series], tuple[Series, bool]],
1361-
filt: Callable[[Hashable], bool] | None = None,
1362-
) -> None:
1363-
"""
1364-
Take a conversion function and possibly recreate the frame.
1365-
"""
1366-
if filt is None:
1367-
filt = lambda col: True
1368-
1369-
obj = self.obj
1370-
assert obj is not None # for mypy
1371-
1372-
needs_new_obj = False
1373-
new_obj = {}
1374-
for i, (col, c) in enumerate(obj.items()):
1375-
if filt(col):
1376-
new_data, result = f(col, c)
1377-
if result:
1378-
c = new_data
1379-
needs_new_obj = True
1380-
new_obj[i] = c
1381-
1382-
if needs_new_obj:
1383-
# possibly handle dup columns
1384-
new_frame = DataFrame(new_obj, index=obj.index)
1385-
new_frame.columns = obj.columns
1386-
self.obj = new_frame
1387-
1388-
def _try_convert_types(self) -> None:
1389-
if self.obj is None:
1390-
return
1391-
if self.convert_dates:
1392-
self._try_convert_dates()
1393-
1394-
self._process_converter(
1395-
lambda col, c: self._try_convert_data(col, c, convert_dates=False)
1343+
def _try_convert_types(self, obj: DataFrame) -> DataFrame:
1344+
arrays = []
1345+
for col_label, series in obj.items():
1346+
result, _ = self._try_convert_data(
1347+
col_label,
1348+
series,
1349+
convert_dates=_should_convert_dates(
1350+
self.convert_dates,
1351+
keep_default_dates=self.keep_default_dates,
1352+
col=col_label,
1353+
),
1354+
)
1355+
arrays.append(result.array)
1356+
return DataFrame._from_arrays(
1357+
arrays, obj.columns, obj.index, verify_integrity=False
13961358
)
13971359

1398-
def _try_convert_dates(self) -> None:
1399-
if self.obj is None:
1400-
return
1401-
1402-
# our columns to parse
1403-
convert_dates_list_bool = self.convert_dates
1404-
if isinstance(convert_dates_list_bool, bool):
1405-
convert_dates_list_bool = []
1406-
convert_dates = set(convert_dates_list_bool)
1407-
1408-
def is_ok(col) -> bool:
1409-
"""
1410-
Return if this col is ok to try for a date parse.
1411-
"""
1412-
if col in convert_dates:
1413-
return True
1414-
if not self.keep_default_dates:
1415-
return False
1416-
if not isinstance(col, str):
1417-
return False
1418-
1419-
col_lower = col.lower()
1420-
if (
1421-
col_lower.endswith(("_at", "_time"))
1422-
or col_lower == "modified"
1423-
or col_lower == "date"
1424-
or col_lower == "datetime"
1425-
or col_lower.startswith("timestamp")
1426-
):
1427-
return True
1428-
return False
14291360

1430-
self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)
1361+
def _should_convert_dates(
1362+
convert_dates: bool | list[str],
1363+
keep_default_dates: bool,
1364+
col: Hashable,
1365+
) -> bool:
1366+
"""
1367+
Return bool whether a DataFrame column should be cast to datetime.
1368+
"""
1369+
if convert_dates is False:
1370+
# convert_dates=True means follow keep_default_dates
1371+
return False
1372+
elif not isinstance(convert_dates, bool) and col in set(convert_dates):
1373+
return True
1374+
elif not keep_default_dates:
1375+
return False
1376+
elif not isinstance(col, str):
1377+
return False
1378+
col_lower = col.lower()
1379+
if (
1380+
col_lower.endswith(("_at", "_time"))
1381+
or col_lower in {"modified", "date", "datetime"}
1382+
or col_lower.startswith("timestamp")
1383+
):
1384+
return True
1385+
return False

pandas/tests/io/json/test_pandas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ def test_frame_from_json_precise_float(self):
792792

793793
def test_typ(self):
794794
s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
795-
result = read_json(StringIO(s.to_json()), typ=None)
795+
result = read_json(StringIO(s.to_json()), typ="series")
796796
tm.assert_series_equal(result, s)
797797

798798
def test_reconstruction_index(self):

pandas/tests/io/json/test_readlines.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine):
165165
s = pd.Series({"A": 1, "B": 2})
166166

167167
strio = StringIO(s.to_json(lines=True, orient="records"))
168-
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
168+
unchunked = read_json(strio, lines=True, typ="series", engine=engine)
169169

170170
strio = StringIO(s.to_json(lines=True, orient="records"))
171171
with read_json(
172-
strio, lines=True, typ="Series", chunksize=1, engine=engine
172+
strio, lines=True, typ="series", chunksize=1, engine=engine
173173
) as reader:
174174
chunked = pd.concat(reader)
175175

0 commit comments

Comments (0)