Skip to content

read_json engine keyword and pyarrow integration #49249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
6aa72ca
ENH: Add engine keyword to read_json to enable reading from pyarrow #…
abkosar Dec 13, 2022
c248b84
moved argument to the end of signature, fixed elifs
abkosar Jan 20, 2023
91b81cf
Adding finally
abkosar Jan 20, 2023
9020663
Updated the _make_engine try-finally block
abkosar Jan 25, 2023
0388c4e
Fixing merge conflicts
abkosar Dec 13, 2022
6911791
Merge branch 'main' into main
abkosar Jan 26, 2023
5ecd3e0
Merge branch 'pandas-dev:main' into main
abkosar Jan 27, 2023
c660395
Refactored pyarrow engine code
abkosar Dec 13, 2022
54570ca
Merge branch 'pandas-dev:main' into main
abkosar Jan 29, 2023
eb709e7
Refactored pyarrow implementation to inline
abkosar Dec 13, 2022
0a9c841
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
7eccd83
Small refactors
abkosar Feb 1, 2023
986d8cb
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
b4409bb
Fixing double lines
abkosar Feb 1, 2023
0467fe8
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
569ab9b
Merge branch 'pandas-dev:main' into main
abkosar Feb 1, 2023
4dc9adc
Merge branch 'pandas-dev:main' into main
abkosar Feb 2, 2023
38bc7db
- Added the logic for skipping test if pyarrow is not installed.
abkosar Feb 2, 2023
bed15df
Added else statement to conftest engine
abkosar Feb 2, 2023
a29e96a
Merge branch 'main' into main
abkosar Feb 2, 2023
cdfd747
- removed xfail decorators
abkosar Feb 2, 2023
c70c0b4
Merge branch 'main' into test-fixes
abkosar Feb 2, 2023
fe2b3ef
Merge branch 'main' into main
abkosar Feb 3, 2023
228ca64
Merge branch 'main' into main
abkosar Feb 3, 2023
d1acc94
Merge branch 'pandas-dev:main' into main
abkosar Feb 4, 2023
0885f07
Merge remote-tracking branch 'upstream/main' into abkosar/main
mroeschke Feb 9, 2023
ab7af44
add whatsnew, address comments
mroeschke Feb 9, 2023
c9cde9e
Merge remote-tracking branch 'upstream/main' into abkosar/main
mroeschke Feb 9, 2023
8c96553
address review
mroeschke Feb 9, 2023
9cbf598
Add note about param
mroeschke Feb 9, 2023
c59310b
Add test with lines=false
mroeschke Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2069,6 +2069,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
* ``lines`` : reads file as one json object per line.
* ``encoding`` : The encoding to use to decode py3 bytes.
* ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
* ``engine``: Either ``"ujson"``, the built-in JSON parser, or ``"pyarrow"`` which dispatches to pyarrow's ``pyarrow.json.read_json``.
  The ``"pyarrow"`` engine is only available when ``lines=True``.

The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.

Expand Down Expand Up @@ -2250,6 +2252,16 @@ For line-delimited json files, pandas can also return an iterator which reads in
for chunk in reader:
print(chunk)

Line-delimited json can also be read using the pyarrow reader by specifying ``engine="pyarrow"``.

.. ipython:: python

from io import BytesIO
df = pd.read_json(BytesIO(jsonl.encode()), lines=True, engine="pyarrow")
df

.. versionadded:: 2.0.0

.. _io.table_schema:

Table schema
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ Other enhancements
- Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
- Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
- Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
- Added support for SQLAlchemy 2.0 (:issue:`40686`)
-

Expand Down
3 changes: 3 additions & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,9 @@ def closed(self) -> bool:
# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]

# read_json engines
JSONEngine = Literal["ujson", "pyarrow"]

# read_xml parsers
XMLParsers = Literal["lxml", "etree"]

Expand Down
96 changes: 76 additions & 20 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@

import numpy as np

from pandas._config import using_nullable_dtypes
from pandas._config import (
get_option,
using_nullable_dtypes,
)

from pandas._libs import lib
from pandas._libs.json import (
Expand All @@ -34,11 +37,13 @@
DtypeArg,
FilePath,
IndexLabel,
JSONEngine,
JSONSerializable,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc

Expand Down Expand Up @@ -401,6 +406,7 @@ def read_json(
nrows: int | None = ...,
storage_options: StorageOptions = ...,
use_nullable_dtypes: bool = ...,
engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
...

Expand All @@ -425,6 +431,7 @@ def read_json(
nrows: int | None = ...,
storage_options: StorageOptions = ...,
use_nullable_dtypes: bool = ...,
engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
...

Expand All @@ -449,6 +456,7 @@ def read_json(
nrows: int | None = ...,
storage_options: StorageOptions = ...,
use_nullable_dtypes: bool = ...,
engine: JSONEngine = ...,
) -> Series:
...

Expand All @@ -473,6 +481,7 @@ def read_json(
nrows: int | None = ...,
storage_options: StorageOptions = ...,
use_nullable_dtypes: bool = ...,
engine: JSONEngine = ...,
) -> DataFrame:
...

Expand Down Expand Up @@ -500,6 +509,7 @@ def read_json(
nrows: int | None = None,
storage_options: StorageOptions = None,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:
"""
Convert a JSON string to pandas object.
Expand Down Expand Up @@ -653,6 +663,12 @@ def read_json(

.. versionadded:: 2.0

engine : {{"ujson", "pyarrow"}}, default "ujson"
Parser engine to use. The ``"pyarrow"`` engine is only available when
``lines=True``.

.. versionadded:: 2.0

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -771,6 +787,7 @@ def read_json(
storage_options=storage_options,
encoding_errors=encoding_errors,
use_nullable_dtypes=use_nullable_dtypes,
engine=engine,
)

if chunksize:
Expand Down Expand Up @@ -807,6 +824,7 @@ def __init__(
storage_options: StorageOptions = None,
encoding_errors: str | None = "strict",
use_nullable_dtypes: bool = False,
engine: JSONEngine = "ujson",
) -> None:

self.orient = orient
Expand All @@ -818,6 +836,7 @@ def __init__(
self.precise_float = precise_float
self.date_unit = date_unit
self.encoding = encoding
self.engine = engine
self.compression = compression
self.storage_options = storage_options
self.lines = lines
Expand All @@ -828,17 +847,32 @@ def __init__(
self.handles: IOHandles[str] | None = None
self.use_nullable_dtypes = use_nullable_dtypes

if self.engine not in {"pyarrow", "ujson"}:
raise ValueError(
f"The engine type {self.engine} is currently not supported."
)
if self.chunksize is not None:
self.chunksize = validate_integer("chunksize", self.chunksize, 1)
if not self.lines:
raise ValueError("chunksize can only be passed if lines=True")
if self.engine == "pyarrow":
raise ValueError(
"currently pyarrow engine doesn't support chunksize parameter"
)
if self.nrows is not None:
self.nrows = validate_integer("nrows", self.nrows, 0)
if not self.lines:
raise ValueError("nrows can only be passed if lines=True")

data = self._get_data_from_filepath(filepath_or_buffer)
self.data = self._preprocess_data(data)
if self.engine == "pyarrow":
if not self.lines:
raise ValueError(
"currently pyarrow engine only supports "
"the line-delimited JSON format"
)
self.data = filepath_or_buffer
elif self.engine == "ujson":
data = self._get_data_from_filepath(filepath_or_buffer)
self.data = self._preprocess_data(data)

def _preprocess_data(self, data):
"""
Expand Down Expand Up @@ -923,23 +957,45 @@ def read(self) -> DataFrame | Series:
"""
obj: DataFrame | Series
with self:
if self.lines:
if self.chunksize:
obj = concat(self)
elif self.nrows:
lines = list(islice(self.data, self.nrows))
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
if self.engine == "pyarrow":
pyarrow_json = import_optional_dependency("pyarrow.json")
pa_table = pyarrow_json.read_json(self.data)
if self.use_nullable_dtypes:
if get_option("mode.dtype_backend") == "pyarrow":
from pandas.arrays import ArrowExtensionArray

return DataFrame(
{
col_name: ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
elif get_option("mode.dtype_backend") == "pandas":
from pandas.io._util import _arrow_dtype_mapping

mapping = _arrow_dtype_mapping()
return pa_table.to_pandas(types_mapper=mapping.get)
return pa_table.to_pandas()
elif self.engine == "ujson":
if self.lines:
if self.chunksize:
obj = concat(self)
elif self.nrows:
lines = list(islice(self.data, self.nrows))
lines_json = self._combine_lines(lines)
obj = self._get_object_parser(lines_json)
else:
data = ensure_str(self.data)
data_lines = data.split("\n")
obj = self._get_object_parser(self._combine_lines(data_lines))
else:
data = ensure_str(self.data)
data_lines = data.split("\n")
obj = self._get_object_parser(self._combine_lines(data_lines))
else:
obj = self._get_object_parser(self.data)
if self.use_nullable_dtypes:
return obj.convert_dtypes(infer_objects=False)
else:
return obj
obj = self._get_object_parser(self.data)
if self.use_nullable_dtypes:
return obj.convert_dtypes(infer_objects=False)
else:
return obj

def _get_object_parser(self, json) -> DataFrame | Series:
"""
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/json/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,10 @@ def orient(request):
Fixture for orients excluding the table format.
"""
return request.param


@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    """Fixture yielding each read_json engine; skips pyarrow when unavailable."""
    engine_name = request.param
    # pyarrow is an optional dependency — skip rather than fail if missing.
    if engine_name == "pyarrow":
        pytest.importorskip("pyarrow.json")
    return engine_name
16 changes: 16 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1956,3 +1956,19 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))

tm.assert_series_equal(result, expected)


def test_invalid_engine():
    # GH 48893: an unrecognized engine name must raise ValueError.
    json_data = Series(range(1)).to_json()
    with pytest.raises(ValueError, match="The engine type foo"):
        read_json(json_data, engine="foo")


def test_pyarrow_engine_lines_false():
    # GH 48893: the pyarrow engine requires line-delimited JSON (lines=True).
    json_data = Series(range(1)).to_json()
    with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
        read_json(json_data, engine="pyarrow", lines=False)
Loading