Skip to content

Commit 5adf8a3

Browse files
committed
ENH: read_json engine keyword and pyarrow integration (#49249)
1 parent f569301 commit 5adf8a3

File tree

5 files changed

+202
-48
lines changed

5 files changed

+202
-48
lines changed

pandas/_typing.py

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -319,6 +319,9 @@ def closed(self) -> bool:
319319
# read_csv engines
320320
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
321321

322+
# read_json engines
323+
JSONEngine = Literal["ujson", "pyarrow"]
324+
322325
# read_xml parsers
323326
XMLParsers = Literal["lxml", "etree"]
324327

pandas/io/json/_json.py

Lines changed: 70 additions & 15 deletions
Original file line number · Diff line number · Diff line change
@@ -28,6 +28,7 @@
2828
DtypeArg,
2929
FilePath,
3030
IndexLabel,
31+
JSONEngine,
3132
JSONSerializable,
3233
ReadBuffer,
3334
StorageOptions,
@@ -66,6 +67,7 @@
6667
build_table_schema,
6768
parse_table_schema,
6869
)
70+
from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
6971
from pandas.io.parsers.readers import validate_integer
7072

7173
if TYPE_CHECKING:
@@ -389,6 +391,7 @@ def read_json(
389391
date_unit: str | None = ...,
390392
encoding: str | None = ...,
391393
encoding_errors: str | None = ...,
394+
engine: JSONEngine = ...,
392395
lines: bool = ...,
393396
chunksize: int,
394397
compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417420
compression: CompressionOptions = ...,
418421
nrows: int | None = ...,
419422
storage_options: StorageOptions = ...,
423+
engine: JSONEngine = ...,
420424
) -> JsonReader[Literal["series"]]:
421425
...
422426

@@ -440,6 +444,7 @@ def read_json(
440444
compression: CompressionOptions = ...,
441445
nrows: int | None = ...,
442446
storage_options: StorageOptions = ...,
447+
engine: JSONEngine = ...,
443448
) -> Series:
444449
...
445450

@@ -463,6 +468,7 @@ def read_json(
463468
compression: CompressionOptions = ...,
464469
nrows: int | None = ...,
465470
storage_options: StorageOptions = ...,
471+
engine: JSONEngine = ...,
466472
) -> DataFrame:
467473
...
468474

@@ -489,6 +495,7 @@ def read_json(
489495
compression: CompressionOptions = "infer",
490496
nrows: int | None = None,
491497
storage_options: StorageOptions = None,
498+
engine: JSONEngine = "ujson",
492499
) -> DataFrame | Series | JsonReader:
493500
"""
494501
Convert a JSON string to pandas object.
@@ -605,6 +612,9 @@ def read_json(
605612
606613
.. versionadded:: 1.3.0
607614
615+
engine : {{'ujson', 'pyarrow'}}, default "ujson"
616+
Parser engine to use.
617+
608618
lines : bool, default False
609619
Read the file as a json object per line.
610620
@@ -746,6 +756,7 @@ def read_json(
746756
nrows=nrows,
747757
storage_options=storage_options,
748758
encoding_errors=encoding_errors,
759+
engine=engine,
749760
)
750761

751762
if chunksize:
@@ -782,6 +793,7 @@ def __init__(
782793
nrows: int | None,
783794
storage_options: StorageOptions = None,
784795
encoding_errors: str | None = "strict",
796+
engine: JSONEngine = "ujson",
785797
) -> None:
786798

787799
self.orient = orient
@@ -793,6 +805,7 @@ def __init__(
793805
self.precise_float = precise_float
794806
self.date_unit = date_unit
795807
self.encoding = encoding
808+
self.engine = engine
796809
self.compression = compression
797810
self.storage_options = storage_options
798811
self.lines = lines
@@ -810,9 +823,48 @@ def __init__(
810823
self.nrows = validate_integer("nrows", self.nrows, 0)
811824
if not self.lines:
812825
raise ValueError("nrows can only be passed if lines=True")
826+
if self.engine == "pyarrow":
827+
if not self.lines:
828+
raise ValueError(
829+
"currently pyarrow engine only supports "
830+
"the line-delimited JSON format"
831+
)
832+
if self.engine not in ["pyarrow", "ujson"]:
833+
raise ValueError(
834+
f"The engine type {self.engine} is currently not supported."
835+
)
836+
837+
if self.engine == "pyarrow":
838+
self._engine = self._make_engine(filepath_or_buffer)
839+
if self.engine == "ujson":
840+
data = self._get_data_from_filepath(filepath_or_buffer)
841+
self.data = self._preprocess_data(data)
842+
843+
def _make_engine(
844+
self,
845+
filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
846+
) -> ArrowJsonParserWrapper:
847+
848+
if not isinstance(filepath_or_buffer, list):
849+
is_text = False
850+
mode = "rb"
851+
self.handles = get_handle(
852+
self._get_data_from_filepath(filepath_or_buffer),
853+
mode=mode,
854+
encoding=self.encoding,
855+
is_text=is_text,
856+
compression=self.compression,
857+
storage_options=self.storage_options,
858+
errors=self.encoding_errors,
859+
)
860+
filepath_or_buffer = self.handles.handle
813861

814-
data = self._get_data_from_filepath(filepath_or_buffer)
815-
self.data = self._preprocess_data(data)
862+
try:
863+
return ArrowJsonParserWrapper(filepath_or_buffer)
864+
except Exception:
865+
if self.handles is not None:
866+
self.handles.close()
867+
raise
816868

817869
def _preprocess_data(self, data):
818870
"""
def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.

        Returns
        -------
        DataFrame or Series
            The parsed contents, depending on the ``typ`` requested.
        """
        obj: DataFrame | Series
        if self.engine == "pyarrow":
            # The pyarrow engine parses the (line-delimited) input in one shot.
            obj = self._engine.read()
        elif self.engine == "ujson":
            if self.lines:
                if self.chunksize:
                    # Chunked reading: concatenating self iterates the chunks.
                    obj = concat(self)
                elif self.nrows:
                    lines = list(islice(self.data, self.nrows))
                    lines_json = self._combine_lines(lines)
                    obj = self._get_object_parser(lines_json)
                else:
                    data = ensure_str(self.data)
                    data_lines = data.split("\n")
                    obj = self._get_object_parser(self._combine_lines(data_lines))
            else:
                obj = self._get_object_parser(self.data)
        # BUG FIX: close() previously ran only on the ujson path, so the
        # file handle opened by _make_engine() leaked when engine="pyarrow".
        # close() is idempotent for handles already released during iteration.
        self.close()
        return obj
914969

915970
def _get_object_parser(self, json) -> DataFrame | Series:
pandas/io/json/arrow_json_parser_wrapper.py

Lines changed: 35 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from pandas._typing import ReadBuffer
from pandas.compat._optional import import_optional_dependency

if TYPE_CHECKING:
    from pandas import DataFrame


class ArrowJsonParserWrapper:
    """
    Thin adapter that exposes ``pyarrow.json`` through the engine
    interface ``read_json`` expects (a ``read()`` method).
    """

    def __init__(self, src: ReadBuffer[bytes]) -> None:
        # Open, binary-mode source the pyarrow reader will consume.
        self.src = src

    def read(self) -> DataFrame:
        """
        Parse the entire JSON source into a DataFrame.

        Returns
        -------
        DataFrame
            The parsed contents of the JSON source.
        """
        # Imported lazily so pandas remains usable without pyarrow installed.
        pa_json = import_optional_dependency("pyarrow.json")
        arrow_table = pa_json.read_json(self.src)
        return arrow_table.to_pandas()

pandas/tests/io/json/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -7,3 +7,27 @@ def orient(request):
77
Fixture for orients excluding the table format.
88
"""
99
return request.param
10+
11+
12+
@pytest.fixture
def json_dir_path(datapath):
    """
    The directory path to the data files needed for parser tests.
    """
    return datapath("io", "json", "data")
18+
19+
20+
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    """
    Fixture parameterizing tests over every supported read_json engine.
    """
    return request.param
23+
24+
25+
@pytest.fixture
def json_engine_pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.

    Request this from tests exercising behavior the pyarrow engine
    does not support.
    """
    engine = request.getfixturevalue("engine")
    if engine == "pyarrow":
        # xfail (not skip) so the suite flags when pyarrow gains support.
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
        request.node.add_marker(mark)

0 commit comments

Comments
 (0)