Skip to content

Commit 92bd9cb

Browse files
GH1033 Add overloads of engine for pd.read_json (#1035)
* GHXXX Add overloads of engine for pd.read_json
* GH1033 PR Feedback
* GH1033 PR Feedback
* GH1033 Fix ignore type
* GH1033 PR feedback
1 parent e610b76 commit 92bd9cb

File tree

2 files changed

+135
-1
lines changed

2 files changed

+135
-1
lines changed

pandas-stubs/io/json/_json.pyi

Lines changed: 105 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,10 +48,61 @@ def read_json(
4848
nrows: int | None = ...,
4949
storage_options: StorageOptions = ...,
5050
dtype_backend: DtypeBackend | NoDefault = ...,
51+
engine: Literal["ujson"] = ...,
5152
) -> JsonReader[Series]: ...
5253
@overload
5354
def read_json(
54-
path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
55+
path_or_buf: FilePath | ReadBuffer[bytes],
56+
*,
57+
orient: JsonSeriesOrient | None = ...,
58+
typ: Literal["series"],
59+
dtype: bool | Mapping[HashableT, DtypeArg] | None = ...,
60+
convert_axes: bool | None = ...,
61+
convert_dates: bool | list[str] = ...,
62+
keep_default_dates: bool = ...,
63+
precise_float: bool = ...,
64+
date_unit: TimeUnit | None = ...,
65+
encoding: str | None = ...,
66+
encoding_errors: (
67+
Literal["strict", "ignore", "replace", "backslashreplace", "surrogateescape"]
68+
| None
69+
) = ...,
70+
lines: Literal[True],
71+
chunksize: int,
72+
compression: CompressionOptions = ...,
73+
nrows: int | None = ...,
74+
storage_options: StorageOptions = ...,
75+
dtype_backend: DtypeBackend | NoDefault = ...,
76+
engine: Literal["pyarrow"],
77+
) -> JsonReader[Series]: ...
78+
@overload
79+
def read_json(
80+
path_or_buf: FilePath | ReadBuffer[bytes],
81+
*,
82+
orient: JsonFrameOrient | None = ...,
83+
typ: Literal["frame"] = ...,
84+
dtype: bool | Mapping[HashableT, DtypeArg] | None = ...,
85+
convert_axes: bool | None = ...,
86+
convert_dates: bool | list[str] = ...,
87+
keep_default_dates: bool = ...,
88+
precise_float: bool = ...,
89+
date_unit: TimeUnit | None = ...,
90+
encoding: str | None = ...,
91+
encoding_errors: (
92+
Literal["strict", "ignore", "replace", "backslashreplace", "surrogateescape"]
93+
| None
94+
) = ...,
95+
lines: Literal[True],
96+
chunksize: int,
97+
compression: CompressionOptions = ...,
98+
nrows: int | None = ...,
99+
storage_options: StorageOptions = ...,
100+
dtype_backend: DtypeBackend | NoDefault = ...,
101+
engine: Literal["ujson"] = ...,
102+
) -> JsonReader[DataFrame]: ...
103+
@overload
104+
def read_json(
105+
path_or_buf: FilePath | ReadBuffer[bytes],
55106
*,
56107
orient: JsonFrameOrient | None = ...,
57108
typ: Literal["frame"] = ...,
@@ -72,6 +123,7 @@ def read_json(
72123
nrows: int | None = ...,
73124
storage_options: StorageOptions = ...,
74125
dtype_backend: DtypeBackend | NoDefault = ...,
126+
engine: Literal["pyarrow"],
75127
) -> JsonReader[DataFrame]: ...
76128
@overload
77129
def read_json(
@@ -96,6 +148,32 @@ def read_json(
96148
nrows: int | None = ...,
97149
storage_options: StorageOptions = ...,
98150
dtype_backend: DtypeBackend | NoDefault = ...,
151+
engine: Literal["ujson"] = ...,
152+
) -> Series: ...
153+
@overload
154+
def read_json(
155+
path_or_buf: FilePath | ReadBuffer[bytes],
156+
*,
157+
orient: JsonSeriesOrient | None = ...,
158+
typ: Literal["series"],
159+
dtype: bool | Mapping[HashableT, DtypeArg] | None = ...,
160+
convert_axes: bool | None = ...,
161+
convert_dates: bool | list[str] = ...,
162+
keep_default_dates: bool = ...,
163+
precise_float: bool = ...,
164+
date_unit: TimeUnit | None = ...,
165+
encoding: str | None = ...,
166+
encoding_errors: (
167+
Literal["strict", "ignore", "replace", "backslashreplace", "surrogateescape"]
168+
| None
169+
) = ...,
170+
lines: Literal[True],
171+
chunksize: None = ...,
172+
compression: CompressionOptions = ...,
173+
nrows: int | None = ...,
174+
storage_options: StorageOptions = ...,
175+
dtype_backend: DtypeBackend | NoDefault = ...,
176+
engine: Literal["pyarrow"],
99177
) -> Series: ...
100178
@overload
101179
def read_json(
@@ -120,6 +198,32 @@ def read_json(
120198
nrows: int | None = ...,
121199
storage_options: StorageOptions = ...,
122200
dtype_backend: DtypeBackend | NoDefault = ...,
201+
engine: Literal["ujson"] = ...,
202+
) -> DataFrame: ...
203+
@overload
204+
def read_json(
205+
path_or_buf: FilePath | ReadBuffer[bytes],
206+
*,
207+
orient: JsonFrameOrient | None = ...,
208+
typ: Literal["frame"] = ...,
209+
dtype: bool | Mapping[HashableT, DtypeArg] | None = ...,
210+
convert_axes: bool | None = ...,
211+
convert_dates: bool | list[str] = ...,
212+
keep_default_dates: bool = ...,
213+
precise_float: bool = ...,
214+
date_unit: TimeUnit | None = ...,
215+
encoding: str | None = ...,
216+
encoding_errors: (
217+
Literal["strict", "ignore", "replace", "backslashreplace", "surrogateescape"]
218+
| None
219+
) = ...,
220+
lines: Literal[True],
221+
chunksize: None = ...,
222+
compression: CompressionOptions = ...,
223+
nrows: int | None = ...,
224+
storage_options: StorageOptions = ...,
225+
dtype_backend: DtypeBackend | NoDefault = ...,
226+
engine: Literal["pyarrow"],
123227
) -> DataFrame: ...
124228

125229
class JsonReader(abc.Iterator, Generic[NDFrameT]):

tests/test_io.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1714,3 +1714,33 @@ def test_read_excel_index_col() -> None:
17141714
),
17151715
pd.DataFrame,
17161716
)
1717+
1718+
1719+
def test_read_json_engine() -> None:
1720+
"""Test the engine argument for `pd.read_json` introduced with pandas 2.0."""
1721+
data = """{"index": {"0": 0, "1": 1},
1722+
"a": {"0": 1, "1": null},
1723+
"b": {"0": 2.5, "1": 4.5},
1724+
"c": {"0": true, "1": false},
1725+
"d": {"0": "a", "1": "b"},
1726+
"e": {"0": 1577.2, "1": 1577.1}}"""
1727+
check(
1728+
assert_type(pd.read_json(io.StringIO(data), engine="ujson"), pd.DataFrame),
1729+
pd.DataFrame,
1730+
)
1731+
1732+
data_lines = b"""{"col 1":"a","col 2":"b"}
1733+
{"col 1":"c","col 2":"d"}"""
1734+
dd = io.BytesIO(data_lines)
1735+
check(
1736+
assert_type(
1737+
pd.read_json(dd, lines=True, engine="pyarrow"),
1738+
pd.DataFrame,
1739+
),
1740+
pd.DataFrame,
1741+
)
1742+
1743+
if TYPE_CHECKING_INVALID_USAGE:
1744+
pd.read_json(dd, lines=False, engine="pyarrow") # type: ignore[call-overload] # pyright: ignore[reportArgumentType, reportCallIssue]
1745+
pd.read_json(io.StringIO(data), engine="pyarrow") # type: ignore[call-overload] # pyright: ignore[reportArgumentType]
1746+
pd.read_json(io.StringIO(data), lines=True, engine="pyarrow") # type: ignore[call-overload] # pyright: ignore[reportArgumentType, reportCallIssue]

0 commit comments

Comments (0)