
Commit 4f69b94

Added a fixture for the engine keyword and added it to all existing JSON tests.
Removed unused methods from arrow_json_parser_wrapper.py. Fixed the placement of the engine keyword.
1 parent f569301 commit 4f69b94

5 files changed: +270 / -48 lines changed

pandas/_typing.py

Lines changed: 3 additions & 0 deletions

@@ -319,6 +319,9 @@ def closed(self) -> bool:
 # read_csv engines
 CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
 
+# read_json engines
+JSONEngine = Literal["ujson", "pyarrow"]
+
 # read_xml parsers
 XMLParsers = Literal["lxml", "etree"]
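For illustration, the new alias means a type checker will reject unsupported engine strings at call sites. A minimal sketch of how such a Literal alias behaves (check_engine is a hypothetical function, not part of the commit):

    from typing import Literal

    JSONEngine = Literal["ujson", "pyarrow"]

    def check_engine(engine: JSONEngine = "ujson") -> None:
        # mypy/pyright accept "ujson" and "pyarrow" but flag any other string
        ...

    check_engine("pyarrow")  # OK
    check_engine("simplejson")  # type checker error: not a valid JSONEngine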

pandas/io/json/_json.py

Lines changed: 68 additions & 15 deletions

@@ -28,6 +28,7 @@
     DtypeArg,
     FilePath,
     IndexLabel,
+    JSONEngine,
     JSONSerializable,
     ReadBuffer,
     StorageOptions,
@@ -66,6 +67,7 @@
     build_table_schema,
     parse_table_schema,
 )
+from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
 from pandas.io.parsers.readers import validate_integer
 
 if TYPE_CHECKING:
@@ -389,6 +391,7 @@ def read_json(
     date_unit: str | None = ...,
     encoding: str | None = ...,
     encoding_errors: str | None = ...,
+    engine: JSONEngine = ...,
     lines: bool = ...,
     chunksize: int,
     compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@
     compression: CompressionOptions = ...,
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
+    engine: JSONEngine = ...,
 ) -> JsonReader[Literal["series"]]:
     ...
 
@@ -440,6 +444,7 @@
     compression: CompressionOptions = ...,
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
+    engine: JSONEngine = ...,
 ) -> Series:
     ...
 
@@ -463,6 +468,7 @@
     compression: CompressionOptions = ...,
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
+    engine: JSONEngine = ...,
 ) -> DataFrame:
     ...
 
@@ -489,6 +495,7 @@
     compression: CompressionOptions = "infer",
     nrows: int | None = None,
     storage_options: StorageOptions = None,
+    engine: JSONEngine = "ujson",
 ) -> DataFrame | Series | JsonReader:
     """
     Convert a JSON string to pandas object.
@@ -605,6 +612,9 @@ def read_json(
 
         .. versionadded:: 1.3.0
 
+    engine : {{'ujson', 'pyarrow'}}, default "ujson"
+        Parser engine to use.
+
     lines : bool, default False
         Read the file as a json object per line.
 
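Taken together, the signature and docstring changes expose the engine choice directly to read_json callers. A minimal usage sketch, assuming a hypothetical line-delimited file sample.jsonl:

    import pandas as pd

    # Each line of sample.jsonl is one JSON record, e.g. {"a": 1, "b": 2}
    df = pd.read_json("sample.jsonl", lines=True, engine="pyarrow")

    # The default engine remains ujson, so existing calls behave as before:
    df2 = pd.read_json("sample.jsonl", lines=True)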
@@ -740,6 +750,7 @@
         precise_float=precise_float,
         date_unit=date_unit,
         encoding=encoding,
+        engine=engine,
         lines=lines,
         chunksize=chunksize,
         compression=compression,
@@ -782,6 +793,7 @@ def __init__(
         nrows: int | None,
         storage_options: StorageOptions = None,
         encoding_errors: str | None = "strict",
+        engine: JSONEngine = "ujson",
     ) -> None:
 
         self.orient = orient
@@ -793,6 +805,7 @@
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.engine = engine
         self.compression = compression
         self.storage_options = storage_options
         self.lines = lines
@@ -810,9 +823,46 @@
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
+        if self.engine == "pyarrow":
+            if not self.lines:
+                raise ValueError(
+                    "currently pyarrow engine only supports "
+                    "the line-delimited JSON format"
+                )
+        if self.engine not in ["pyarrow", "ujson"]:
+            raise ValueError("This engine type is currently not supported.")
+
+        if self.engine == "pyarrow":
+            self._engine = self._make_engine(filepath_or_buffer)
+        if self.engine == "ujson":
+            data = self._get_data_from_filepath(filepath_or_buffer)
+            self.data = self._preprocess_data(data)
+
+    def _make_engine(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
+    ) -> ArrowJsonParserWrapper:
+
+        if not isinstance(filepath_or_buffer, list):
+            is_text = False
+            mode = "rb"
+            self.handles = get_handle(
+                self._get_data_from_filepath(filepath_or_buffer),
+                mode=mode,
+                encoding=self.encoding,
+                is_text=is_text,
+                compression=self.compression,
+                storage_options=self.storage_options,
+                errors=self.encoding_errors,
+            )
+            filepath_or_buffer = self.handles.handle
 
-        data = self._get_data_from_filepath(filepath_or_buffer)
-        self.data = self._preprocess_data(data)
+        try:
+            return ArrowJsonParserWrapper(filepath_or_buffer)
+        except Exception:
+            if self.handles is not None:
+                self.handles.close()
+            raise
 
     def _preprocess_data(self, data):
         """
@@ -896,20 +946,23 @@ def read(self) -> DataFrame | Series:
         Read the whole JSON input into a pandas object.
         """
         obj: DataFrame | Series
-        if self.lines:
-            if self.chunksize:
-                obj = concat(self)
-            elif self.nrows:
-                lines = list(islice(self.data, self.nrows))
-                lines_json = self._combine_lines(lines)
-                obj = self._get_object_parser(lines_json)
+        if self.engine == "pyarrow":
+            obj = self._engine.read()
+        if self.engine == "ujson":
+            if self.lines:
+                if self.chunksize:
+                    obj = concat(self)
+                elif self.nrows:
+                    lines = list(islice(self.data, self.nrows))
+                    lines_json = self._combine_lines(lines)
+                    obj = self._get_object_parser(lines_json)
+                else:
+                    data = ensure_str(self.data)
+                    data_lines = data.split("\n")
+                    obj = self._get_object_parser(self._combine_lines(data_lines))
             else:
-                data = ensure_str(self.data)
-                data_lines = data.split("\n")
-                obj = self._get_object_parser(self._combine_lines(data_lines))
-        else:
-            obj = self._get_object_parser(self.data)
-        self.close()
+                obj = self._get_object_parser(self.data)
+            self.close()
         return obj
 
     def _get_object_parser(self, json) -> DataFrame | Series:
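The reworked read() dispatches on the engine: the pyarrow branch delegates the whole read to ArrowJsonParserWrapper.read(), while the ujson branch keeps the existing lines/chunksize/nrows behavior. A sketch of the ujson-only features that remain untouched (file name hypothetical):

    import pandas as pd

    # chunksize returns a JsonReader that yields DataFrames lazily:
    with pd.read_json("sample.jsonl", lines=True, chunksize=2) as reader:
        for chunk in reader:
            print(chunk.shape)

    # nrows limits how many lines are parsed; it also requires lines=True:
    head = pd.read_json("sample.jsonl", lines=True, nrows=1)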
pandas/io/json/arrow_json_parser_wrapper.py (new file)

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pandas._typing import ReadBuffer
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.dtypes.inference import is_integer
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+class ArrowJsonParserWrapper:
+    """
+    Wrapper for the pyarrow engine for read_json()
+    """
+
+    def __init__(self, src: ReadBuffer[bytes]) -> None:
+        super().__init__()
+        self.src = src
+
+    def read(self) -> DataFrame:
+        """
+        Reads the contents of a JSON file into a DataFrame and
+        processes it according to the kwargs passed in the
+        constructor.
+
+        Returns
+        -------
+        DataFrame
+            The DataFrame created from the JSON file.
+        """
+        pyarrow_json = import_optional_dependency("pyarrow.json")
+        table = pyarrow_json.read_json(self.src)
+
+        frame = table.to_pandas()
+        return frame
+
+    def _finalize_output(self, frame: DataFrame) -> DataFrame:
+        """
+        Processes data read in based on kwargs.
+
+        Parameters
+        ----------
+        frame: DataFrame
+            The DataFrame to process.
+
+        Returns
+        -------
+        DataFrame
+            The processed DataFrame.
+        """
+        num_cols = len(frame.columns)
+        multi_index_named = True
+        if self.header is None:
+            if self.names is None:
+                if self.prefix is not None:
+                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
+                elif self.header is None:
+                    self.names = range(num_cols)
+            if len(self.names) != num_cols:
+                # usecols is passed through to pyarrow, we only handle index col here
+                # The only way self.names is not the same length as number of cols is
+                # if we have int index_col. We should just pad the names(they will get
+                # removed anyways) to expected length then.
+                self.names = list(range(num_cols - len(self.names))) + self.names
+                multi_index_named = False
+            frame.columns = self.names
+        # we only need the frame not the names
+        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
+        if self.index_col is not None:
+            for i, item in enumerate(self.index_col):
+                if is_integer(item):
+                    self.index_col[i] = frame.columns[item]
+                else:
+                    # String case
+                    if item not in frame.columns:
+                        raise ValueError(f"Index {item} invalid")
+            frame.set_index(self.index_col, drop=True, inplace=True)
+            # Clear names if headerless and no name given
+            if self.header is None and not multi_index_named:
+                frame.index.names = [None] * len(frame.index.names)
+
+        if self.kwds.get("dtype") is not None:
+            try:
+                frame = frame.astype(self.kwds.get("dtype"))
+            except TypeError as e:
+                # GH#44901 reraise to keep api consistent
+                raise ValueError(e)
+        return frame
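ArrowJsonParserWrapper.read() is a thin shim over pyarrow's own JSON reader; note that _finalize_output is not yet called from read() in this commit and refers to parser attributes the constructor does not set. Roughly equivalent standalone usage, assuming pyarrow is installed and sample.jsonl is a hypothetical line-delimited file:

    import pyarrow.json

    # pyarrow.json.read_json parses line-delimited JSON into an Arrow Table
    table = pyarrow.json.read_json("sample.jsonl")

    # to_pandas() converts the Table into a DataFrame, just as the wrapper does
    df = table.to_pandas()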

pandas/tests/io/json/conftest.py

Lines changed: 24 additions & 0 deletions

@@ -7,3 +7,27 @@ def orient(request):
     Fixture for orients excluding the table format.
     """
     return request.param
+
+
+@pytest.fixture
+def json_dir_path(datapath):
+    """
+    The directory path to the data files needed for parser tests.
+    """
+    return datapath("io", "json", "data")
+
+
+@pytest.fixture(params=["ujson", "pyarrow"])
+def engine(request):
+    return request.param
+
+
+@pytest.fixture
+def json_engine_pyarrow_xfail(request):
+    """
+    Fixture that xfails a test if the engine is pyarrow.
+    """
+    engine = request.getfixturevalue("engine")
+    if engine == "pyarrow":
+        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
+        request.node.add_marker(mark)
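The engine fixture parametrizes any test that requests it over both parsers, and json_engine_pyarrow_xfail marks the pyarrow run as an expected failure where the feature is unsupported. A hypothetical sketch of how a test could consume these fixtures (names and data are illustrative, not tests from this commit):

    from io import StringIO

    import pytest

    import pandas as pd
    import pandas._testing as tm

    def test_read_json_roundtrip(engine, tmp_path):
        # runs twice: once with engine="ujson", once with engine="pyarrow"
        df = pd.DataFrame({"a": [1, 2]})
        path = tmp_path / "sample.jsonl"
        df.to_json(path, orient="records", lines=True)
        result = pd.read_json(path, lines=True, engine=engine)
        tm.assert_frame_equal(result, df)

    @pytest.mark.usefixtures("json_engine_pyarrow_xfail")
    def test_read_json_chunksize(engine):
        # chunksize is a ujson-only feature, so the pyarrow run is xfailed
        data = StringIO('{"a": 1}\n{"a": 2}\n')
        with pd.read_json(data, lines=True, chunksize=1, engine=engine) as reader:
            assert sum(len(chunk) for chunk in reader) == 2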
