read_json engine keyword and pyarrow integration #49249

Merged: 31 commits, Feb 10, 2023
Commits (31 total; the diff below shows changes from 19 commits):
6aa72ca
ENH: Add engine keyword to read_json to enable reading from pyarrow #…
abkosar Dec 13, 2022
c248b84
moved argument to the end of signature, fixed elifs
abkosar Jan 20, 2023
91b81cf
Adding finally
abkosar Jan 20, 2023
9020663
Updated the _make_engine try-finally block
abkosar Jan 25, 2023
0388c4e
Fixing merge conflicts
abkosar Dec 13, 2022
6911791
Merge branch 'main' into main
abkosar Jan 26, 2023
5ecd3e0
Merge branch 'pandas-dev:main' into main
abkosar Jan 27, 2023
c660395
Refactored pyarrow engine code
abkosar Dec 13, 2022
54570ca
Merge branch 'pandas-dev:main' into main
abkosar Jan 29, 2023
eb709e7
Refactored pyarrow implementation to inline
abkosar Dec 13, 2022
0a9c841
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
7eccd83
Small refactors
abkosar Feb 1, 2023
986d8cb
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
b4409bb
Fixing double lines
abkosar Feb 1, 2023
0467fe8
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
569ab9b
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
4dc9adc
Merge branch 'pandas-dev:main' into main
abkosar Feb 2, 2023
38bc7db
- Added the logic for skipping test if pyarrow is not installed.
abkosar Feb 2, 2023
bed15df
Added else statement to conftest engine
abkosar Feb 2, 2023
a29e96a
Merge branch 'main' into main
abkosar Feb 2, 2023
cdfd747
- removed xfail decorators
abkosar Feb 2, 2023
c70c0b4
Merge branch 'main' into test-fixes
abkosar Feb 2, 2023
fe2b3ef
Merge branch 'main' into main
abkosar Feb 3, 2023
228ca64
Merge branch 'main' into main
abkosar Feb 3, 2023
d1acc94
Merge branch 'pandas-dev:main' into main
abkosar Feb 4, 2023
0885f07
Merge remote-tracking branch 'upstream/main' into abkosar/main
mroeschke Feb 9, 2023
ab7af44
add whatsnew, address comments
mroeschke Feb 9, 2023
c9cde9e
Merge remote-tracking branch 'upstream/main' into abkosar/main
mroeschke Feb 9, 2023
8c96553
address review
mroeschke Feb 9, 2023
9cbf598
Add note about param
mroeschke Feb 9, 2023
c59310b
Add test with lines=false
mroeschke Feb 10, 2023
3 changes: 3 additions & 0 deletions pandas/_typing.py
@@ -324,6 +324,9 @@ def closed(self) -> bool:
# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]

# read_json engines
JSONEngine = Literal["ujson", "pyarrow"]

# read_xml parsers
XMLParsers = Literal["lxml", "etree"]

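For context, a minimal sketch (not part of the diff) of how a Literal alias like the one added above constrains call sites under a static type checker; the stub function is purely illustrative:

from typing import Literal

JSONEngine = Literal["ujson", "pyarrow"]  # mirrors the alias added in pandas/_typing.py

def read_json_stub(engine: JSONEngine = "ujson") -> None:
    # mypy/pyright flag any value outside the two literals,
    # e.g. read_json_stub(engine="json") is a type error.
    ...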
63 changes: 49 additions & 14 deletions pandas/io/json/_json.py
Expand Up @@ -34,11 +34,13 @@
DtypeArg,
FilePath,
IndexLabel,
JSONEngine,
JSONSerializable,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc

@@ -401,6 +403,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...

@@ -425,6 +428,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...

@@ -449,6 +453,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
    engine: JSONEngine = ...,
) -> Series:
    ...

@@ -473,6 +478,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
    engine: JSONEngine = ...,
) -> DataFrame:
    ...

@@ -500,6 +506,7 @@ def read_json(
    nrows: int | None = None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to pandas object.
@@ -653,6 +660,9 @@ def read_json(

        .. versionadded:: 2.0

    engine : {{'ujson', 'pyarrow'}}, default "ujson"
        Parser engine to use.

    Returns
    -------
    Series or DataFrame
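For readers skimming the diff, a hedged usage sketch of the new keyword (the file path is hypothetical; pyarrow must be installed, and per the validation added further down the pyarrow engine requires lines=True and does not accept chunksize):

import pandas as pd

# Hypothetical line-delimited (NDJSON) file: one JSON record per line.
df = pd.read_json("data.jsonl", lines=True, engine="pyarrow")

# The default behavior is unchanged:
df_default = pd.read_json("data.jsonl", lines=True)  # engine="ujson"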
@@ -771,6 +781,7 @@ def read_json(
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        use_nullable_dtypes=use_nullable_dtypes,
        engine=engine,
    )

    if chunksize:
@@ -807,6 +818,7 @@ def __init__(
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
        use_nullable_dtypes: bool = False,
        engine: JSONEngine = "ujson",
    ) -> None:

        self.orient = orient
@@ -818,6 +830,7 @@ def __init__(
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
@@ -832,13 +845,30 @@ def __init__(
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
            if self.engine == "pyarrow":
                raise ValueError(
                    "currently pyarrow engine doesn't support chunksize parameter"
                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
        if self.engine == "pyarrow":
            if not self.lines:
                raise ValueError(
                    "currently pyarrow engine only supports "
                    "the line-delimited JSON format"
                )
        if self.engine not in {"pyarrow", "ujson"}:
            raise ValueError(
                f"The engine type {self.engine} is currently not supported."
            )

        data = self._get_data_from_filepath(filepath_or_buffer)
        self.data = self._preprocess_data(data)
        if self.engine == "pyarrow":
            self.data = filepath_or_buffer
        elif self.engine == "ujson":
            data = self._get_data_from_filepath(filepath_or_buffer)
            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
@@ -923,19 +953,24 @@ def read(self) -> DataFrame | Series:
        """
        obj: DataFrame | Series
        with self:
            if self.lines:
                if self.chunksize:
                    obj = concat(self)
                elif self.nrows:
                    lines = list(islice(self.data, self.nrows))
                    lines_json = self._combine_lines(lines)
                    obj = self._get_object_parser(lines_json)
            if self.engine == "pyarrow":
                pyarrow_json = import_optional_dependency("pyarrow.json")
                table = pyarrow_json.read_json(self.data)
                obj = table.to_pandas()
            elif self.engine == "ujson":
                if self.lines:
                    if self.chunksize:
                        obj = concat(self)
                    elif self.nrows:
                        lines = list(islice(self.data, self.nrows))
                        lines_json = self._combine_lines(lines)
                        obj = self._get_object_parser(lines_json)
                    else:
                        data = ensure_str(self.data)
                        data_lines = data.split("\n")
                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
                    data = ensure_str(self.data)
                    data_lines = data.split("\n")
                    obj = self._get_object_parser(self._combine_lines(data_lines))
            else:
                obj = self._get_object_parser(self.data)
                    obj = self._get_object_parser(self.data)
        if self.use_nullable_dtypes:
            return obj.convert_dtypes(infer_objects=False)
        else:
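As a rough standalone sketch of what the new pyarrow branch in read() does (the file name is hypothetical; pyarrow must be installed):

from pandas.compat._optional import import_optional_dependency

# Same pattern as the diff: raises an informative ImportError if pyarrow is missing.
pyarrow_json = import_optional_dependency("pyarrow.json")

table = pyarrow_json.read_json("data.jsonl")  # parse line-delimited JSON into a pyarrow.Table
df = table.to_pandas()                        # convert to a pandas DataFrame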
28 changes: 28 additions & 0 deletions pandas/tests/io/json/conftest.py
@@ -7,3 +7,31 @@ def orient(request):
    Fixture for orients excluding the table format.
    """
    return request.param


@pytest.fixture
def json_dir_path(datapath):
    """
    The directory path to the data files needed for parser tests.
    """
    return datapath("io", "json", "data")


@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    if request.param == "pyarrow":
        pytest.importorskip("pyarrow.json")
        return request.param
Review comment (Member):
Nit: Instead of the else, could you do

if request.param == "pyarrow":
    pytest.importorskip(...)
return request.param

    else:
        return request.param
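For clarity, a sketch of the full fixture as the reviewer suggests it, with the else branch dropped and behavior unchanged:

import pytest

@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    # Skip the pyarrow parametrization entirely when pyarrow isn't installed.
    if request.param == "pyarrow":
        pytest.importorskip("pyarrow.json")
    return request.param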


@pytest.fixture
def json_engine_pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.
    """
    engine = request.getfixturevalue("engine")
    if engine == "pyarrow":
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
Review comment (Member):
Yeah I would prefer this to be added to each xfailed test so the reason more clearly describes the exact thing that is not supported

Reply (abkosar, Contributor and PR author, Jan 16, 2023):

@mroeschke I added a comment to each xfailed test. Is there another special way to do it?

Reply (Member):

def test_foo(request, ...):
    if engine == "pyarrow":
        request.node.add_marker(pytest.mark.xfail(reason="the comment"))

        request.node.add_marker(mark)
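A fuller sketch of the per-test pattern being suggested; the test name, body, and xfail reason are illustrative only:

import pytest

def test_read_json_unsupported_option(request, engine):
    if engine == "pyarrow":
        # Name the exact unsupported behavior in the reason.
        request.node.add_marker(
            pytest.mark.xfail(reason="pyarrow engine does not support this option")
        )
    # Placeholder body; a real test would exercise the unsupported option here.
    assert engine in ("ujson", "pyarrow")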