Skip to content

Commit 5adf8a3

Browse files
committed
ENH: read_json engine keyword and pyarrow integration (#49249)
1 parent f569301 commit 5adf8a3

File tree

5 files changed

+202
-48
lines changed

5 files changed

+202
-48
lines changed

pandas/_typing.py

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -319,6 +319,9 @@ def closed(self) -> bool:
319319
# read_csv engines
320320
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
321321

322+
# read_json engines
323+
JSONEngine = Literal["ujson", "pyarrow"]
324+
322325
# read_xml parsers
323326
XMLParsers = Literal["lxml", "etree"]
324327

pandas/io/json/_json.py

Lines changed: 70 additions & 15 deletions
Original file line number · Diff line number · Diff line change
@@ -28,6 +28,7 @@
2828
DtypeArg,
2929
FilePath,
3030
IndexLabel,
31+
JSONEngine,
3132
JSONSerializable,
3233
ReadBuffer,
3334
StorageOptions,
@@ -66,6 +67,7 @@
6667
build_table_schema,
6768
parse_table_schema,
6869
)
70+
from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
6971
from pandas.io.parsers.readers import validate_integer
7072

7173
if TYPE_CHECKING:
@@ -389,6 +391,7 @@ def read_json(
389391
date_unit: str | None = ...,
390392
encoding: str | None = ...,
391393
encoding_errors: str | None = ...,
394+
engine: JSONEngine = ...,
392395
lines: bool = ...,
393396
chunksize: int,
394397
compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417420
compression: CompressionOptions = ...,
418421
nrows: int | None = ...,
419422
storage_options: StorageOptions = ...,
423+
engine: JSONEngine = ...,
420424
) -> JsonReader[Literal["series"]]:
421425
...
422426

@@ -440,6 +444,7 @@ def read_json(
440444
compression: CompressionOptions = ...,
441445
nrows: int | None = ...,
442446
storage_options: StorageOptions = ...,
447+
engine: JSONEngine = ...,
443448
) -> Series:
444449
...
445450

@@ -463,6 +468,7 @@ def read_json(
463468
compression: CompressionOptions = ...,
464469
nrows: int | None = ...,
465470
storage_options: StorageOptions = ...,
471+
engine: JSONEngine = ...,
466472
) -> DataFrame:
467473
...
468474

@@ -489,6 +495,7 @@ def read_json(
489495
compression: CompressionOptions = "infer",
490496
nrows: int | None = None,
491497
storage_options: StorageOptions = None,
498+
engine: JSONEngine = "ujson",
492499
) -> DataFrame | Series | JsonReader:
493500
"""
494501
Convert a JSON string to pandas object.
@@ -605,6 +612,9 @@ def read_json(
605612
606613
.. versionadded:: 1.3.0
607614
615+
engine : {{'ujson', 'pyarrow'}}, default "ujson"
616+
Parser engine to use.
617+
608618
lines : bool, default False
609619
Read the file as a json object per line.
610620
@@ -746,6 +756,7 @@ def read_json(
746756
nrows=nrows,
747757
storage_options=storage_options,
748758
encoding_errors=encoding_errors,
759+
engine=engine,
749760
)
750761

751762
if chunksize:
@@ -782,6 +793,7 @@ def __init__(
782793
nrows: int | None,
783794
storage_options: StorageOptions = None,
784795
encoding_errors: str | None = "strict",
796+
engine: JSONEngine = "ujson",
785797
) -> None:
786798

787799
self.orient = orient
@@ -793,6 +805,7 @@ def __init__(
793805
self.precise_float = precise_float
794806
self.date_unit = date_unit
795807
self.encoding = encoding
808+
self.engine = engine
796809
self.compression = compression
797810
self.storage_options = storage_options
798811
self.lines = lines
@@ -810,9 +823,48 @@ def __init__(
810823
self.nrows = validate_integer("nrows", self.nrows, 0)
811824
if not self.lines:
812825
raise ValueError("nrows can only be passed if lines=True")
826+
if self.engine == "pyarrow":
827+
if not self.lines:
828+
raise ValueError(
829+
"currently pyarrow engine only supports "
830+
"the line-delimited JSON format"
831+
)
832+
if self.engine not in ["pyarrow", "ujson"]:
833+
raise ValueError(
834+
f"The engine type {self.engine} is currently not supported."
835+
)
836+
837+
if self.engine == "pyarrow":
838+
self._engine = self._make_engine(filepath_or_buffer)
839+
if self.engine == "ujson":
840+
data = self._get_data_from_filepath(filepath_or_buffer)
841+
self.data = self._preprocess_data(data)
842+
843+
def _make_engine(
844+
self,
845+
filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
846+
) -> ArrowJsonParserWrapper:
847+
848+
if not isinstance(filepath_or_buffer, list):
849+
is_text = False
850+
mode = "rb"
851+
self.handles = get_handle(
852+
self._get_data_from_filepath(filepath_or_buffer),
853+
mode=mode,
854+
encoding=self.encoding,
855+
is_text=is_text,
856+
compression=self.compression,
857+
storage_options=self.storage_options,
858+
errors=self.encoding_errors,
859+
)
860+
filepath_or_buffer = self.handles.handle
813861

814-
data = self._get_data_from_filepath(filepath_or_buffer)
815-
self.data = self._preprocess_data(data)
862+
try:
863+
return ArrowJsonParserWrapper(filepath_or_buffer)
864+
except Exception:
865+
if self.handles is not None:
866+
self.handles.close()
867+
raise
816868

817869
def _preprocess_data(self, data):
818870
"""
def read(self) -> DataFrame | Series:
        """
        Read the whole JSON input into a pandas object.

        Returns
        -------
        DataFrame or Series
            The parsed contents, depending on the ``typ`` requested.
        """
        obj: DataFrame | Series
        if self.engine == "pyarrow":
            # The pyarrow engine parses the (line-delimited) input in one shot.
            obj = self._engine.read()
        elif self.engine == "ujson":
            if self.lines:
                if self.chunksize:
                    # Chunked reading: concatenating self iterates the chunks.
                    obj = concat(self)
                elif self.nrows:
                    lines = list(islice(self.data, self.nrows))
                    lines_json = self._combine_lines(lines)
                    obj = self._get_object_parser(lines_json)
                else:
                    data = ensure_str(self.data)
                    data_lines = data.split("\n")
                    obj = self._get_object_parser(self._combine_lines(data_lines))
            else:
                obj = self._get_object_parser(self.data)
        # BUG FIX: close() previously ran only on the ujson path, so the
        # file handle opened by _make_engine() leaked when engine="pyarrow".
        # close() is idempotent for handles already released during iteration.
        self.close()
        return obj
914969

915970
def _get_object_parser(self, json) -> DataFrame | Series:
pandas/io/json/arrow_json_parser_wrapper.py

Lines changed: 35 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,35 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from pandas._typing import ReadBuffer
from pandas.compat._optional import import_optional_dependency

if TYPE_CHECKING:
    from pandas import DataFrame


class ArrowJsonParserWrapper:
    """
    Thin adapter that exposes ``pyarrow.json`` through the engine
    interface ``read_json`` expects (a ``read()`` method).
    """

    def __init__(self, src: ReadBuffer[bytes]) -> None:
        # Open, binary-mode source the pyarrow reader will consume.
        self.src = src

    def read(self) -> DataFrame:
        """
        Parse the entire JSON source into a DataFrame.

        Returns
        -------
        DataFrame
            The parsed contents of the JSON source.
        """
        # Imported lazily so pandas remains usable without pyarrow installed.
        pa_json = import_optional_dependency("pyarrow.json")
        arrow_table = pa_json.read_json(self.src)
        return arrow_table.to_pandas()

pandas/tests/io/json/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -7,3 +7,27 @@ def orient(request):
77
Fixture for orients excluding the table format.
88
"""
99
return request.param
10+
11+
12+
@pytest.fixture
def json_dir_path(datapath):
    """
    The directory path to the data files needed for parser tests.
    """
    return datapath("io", "json", "data")
18+
19+
20+
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    """
    Fixture parameterizing tests over every supported read_json engine.
    """
    return request.param
23+
24+
25+
@pytest.fixture
def json_engine_pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.

    Request this from tests exercising behavior the pyarrow engine
    does not support.
    """
    engine = request.getfixturevalue("engine")
    if engine == "pyarrow":
        # xfail (not skip) so the suite flags when pyarrow gains support.
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
        request.node.add_marker(mark)

0 commit comments

Comments
 (0)