Skip to content

Commit 7a8d165

Browse files
SFuller4 and mroeschke authored
ENH: Added append functionality for DataFrame.to_json (#48540)
* Adding functionality for mode='a' when saving DataFrame.to_json. Only supported when lines=True and orient='records'. * Adding tests for append functionality, along with updated whatsnew, user_guide, and generic docstring. * pre-commit adjustments * Update pandas/io/json/_json.py Co-authored-by: Matthew Roeschke <[email protected]> * Fixing pytest cases per request from mroeschke. Switching whatsnew version per request from mroeschke. * removed repr() from ValueError msg * fixing bad formatting * Adjusting Typing from str to Literal["a", "w"] per request. * updating typing issues in the core file * Adding functionality for mode='a' when saving DataFrame.to_json. Only supported when lines=True and orient='records'. * Adding tests for append functionality, along with updated whatsnew, user_guide, and generic docstring. * pre-commit adjustments * Update pandas/io/json/_json.py Co-authored-by: Matthew Roeschke <[email protected]> * fixing typing issues by adding mode to the overloads of to_json. Also removing whatsnew 1.6.0 * moving enhancement information to whatsnew/v2.0.0 * removing extra space from old whatsnew Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 8b75fda commit 7a8d165

File tree

5 files changed

+166
-1
lines changed

5 files changed

+166
-1
lines changed

doc/source/user_guide/io.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1918,6 +1918,7 @@ with optional parameters:
19181918
* ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
19191919
* ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
19201920
* ``lines`` : If ``records`` orient, then will write each record per line as json.
1921+
* ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w'. 'a' is only supported when ``lines`` is True and ``orient`` is 'records'.
19211922

19221923
Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters.
19231924

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other enhancements
3737
- Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
3838
- :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
3939
- :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
40+
- :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. The default is 'w'; 'a' can be used when ``lines=True`` and ``orient='records'`` to append record-oriented JSON lines to an existing JSON file. (:issue:`35849`)
4041

4142
.. ---------------------------------------------------------------------------
4243
.. _whatsnew_200.notable_bug_fixes:

pandas/core/generic.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2412,6 +2412,7 @@ def to_json(
24122412
index: bool_t = True,
24132413
indent: int | None = None,
24142414
storage_options: StorageOptions = None,
2415+
mode: Literal["a", "w"] = "w",
24152416
) -> str | None:
24162417
"""
24172418
Convert the object to a JSON string.
@@ -2490,6 +2491,11 @@ def to_json(
24902491
24912492
.. versionadded:: 1.2.0
24922493
2494+
mode : str, default 'w' (writing)
2495+
Specify the IO mode for output when supplying a path_or_buf.
2496+
Accepted args are 'w' (writing) and 'a' (append) only.
2497+
mode='a' is only supported when lines is True and orient is 'records'.
2498+
24932499
Returns
24942500
-------
24952501
None or str
@@ -2673,6 +2679,7 @@ def to_json(
26732679
index=index,
26742680
indent=indent,
26752681
storage_options=storage_options,
2682+
mode=mode,
26762683
)
26772684

26782685
@final

pandas/io/json/_json.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def to_json(
9797
index: bool = ...,
9898
indent: int = ...,
9999
storage_options: StorageOptions = ...,
100+
mode: Literal["a", "w"] = ...,
100101
) -> None:
101102
...
102103

@@ -116,6 +117,7 @@ def to_json(
116117
index: bool = ...,
117118
indent: int = ...,
118119
storage_options: StorageOptions = ...,
120+
mode: Literal["a", "w"] = ...,
119121
) -> str:
120122
...
121123

@@ -134,6 +136,7 @@ def to_json(
134136
index: bool = True,
135137
indent: int = 0,
136138
storage_options: StorageOptions = None,
139+
mode: Literal["a", "w"] = "w",
137140
) -> str | None:
138141

139142
if not index and orient not in ["split", "table"]:
@@ -144,6 +147,20 @@ def to_json(
144147
if lines and orient != "records":
145148
raise ValueError("'lines' keyword only valid when 'orient' is records")
146149

150+
if mode not in ["a", "w"]:
151+
msg = (
152+
f"mode={mode} is not a valid option."
153+
"Only 'w' and 'a' are currently supported."
154+
)
155+
raise ValueError(msg)
156+
157+
if mode == "a" and (not lines or orient != "records"):
158+
msg = (
159+
"mode='a' (append) is only supported when"
160+
"lines is True and orient is 'records'"
161+
)
162+
raise ValueError(msg)
163+
147164
if orient == "table" and isinstance(obj, Series):
148165
obj = obj.to_frame(name=obj.name or "values")
149166

@@ -175,7 +192,7 @@ def to_json(
175192
if path_or_buf is not None:
176193
# apply compression and byte/text conversion
177194
with get_handle(
178-
path_or_buf, "w", compression=compression, storage_options=storage_options
195+
path_or_buf, mode, compression=compression, storage_options=storage_options
179196
) as handles:
180197
handles.handle.write(s)
181198
else:

pandas/tests/io/json/test_readlines.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,142 @@ def __iter__(self) -> Iterator:
296296
reader = MyReader(jsonl)
297297
assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
298298
assert reader.read_count > 10
299+
300+
301+
@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
    # GH 35849
    # mode="a" must be rejected with a ValueError for every orient other
    # than 'records'.
    expected_error = (
        r"mode='a' \(append\) is only supported when"
        "lines is True and orient is 'records'"
    )
    frame = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    with pytest.raises(ValueError, match=expected_error):
        frame.to_json(mode="a", orient=orient_)
312+
313+
314+
def test_to_json_append_lines():
    # GH 35849
    # mode="a" must be rejected with a ValueError when lines is False, even
    # though orient is 'records'.
    expected_error = (
        r"mode='a' \(append\) is only supported when"
        "lines is True and orient is 'records'"
    )
    frame = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    with pytest.raises(ValueError, match=expected_error):
        frame.to_json(mode="a", lines=False, orient="records")
324+
325+
326+
@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
    # GH 35849
    # Any mode other than 'w' or 'a' must be rejected with a ValueError.
    expected_error = (
        f"mode={mode_} is not a valid option."
        "Only 'w' and 'a' are currently supported."
    )
    frame = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    with pytest.raises(ValueError, match=expected_error):
        frame.to_json(mode=mode_, lines=False, orient="records")
337+
338+
339+
def test_to_json_append_output_consistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same columns, new rows
    # NOTE: the ``test_`` prefix is required for pytest to collect this
    # function; without it the test is silently never run.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file: the first call uses the default
        # write mode, the second appends its records after the first.
        df1.to_json(path, lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)
355+
356+
357+
def test_to_json_append_output_inconsistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing one new column, one old column, new rows
    # NOTE: the ``test_`` prefix is required for pytest to collect this
    # function; without it the test is silently never run.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})

    # Records that lack a key are read back as NaN, so the expected string
    # column uses NaN (not None) for its missing entries.
    nan = float("nan")
    expected = DataFrame(
        {
            "col1": [1, 2, None, None],
            "col2": ["a", "b", "e", "f"],
            "col3": [nan, nan, "!", "#"],
        }
    )
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)
379+
380+
381+
def test_to_json_append_output_different_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same, differing and new columns
    # NOTE: the ``test_`` prefix is required for pytest to collect this
    # function; without it the test is silently never run.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    # Records that lack a key are read back as NaN, so string columns use
    # NaN (not None) for their missing entries.
    nan = float("nan")
    # read_json infers dtypes on load: the boolean column padded with missing
    # values comes back as float (True/False -> 1.0/0.0), hence the astype.
    expected = DataFrame(
        {
            "col1": [1, 2, 3, 4, None, None, None, None],
            "col2": ["a", "b", "c", "d", "e", "f", nan, nan],
            "col3": [nan, nan, nan, nan, "!", "#", nan, nan],
            "col4": [None, None, None, None, None, None, True, False],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df4.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)
408+
409+
410+
def test_to_json_append_output_different_columns_reordered():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing specific result column order.
    # NOTE: the ``test_`` prefix is required for pytest to collect this
    # function; without it the test is silently never run.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    # Records that lack a key are read back as NaN, so string columns use
    # NaN (not None) for their missing entries.
    nan = float("nan")
    # df4, df3, df2, df1 (in that order): columns are expected in the order
    # their keys first appear in the file.
    # read_json infers dtypes on load: the boolean column padded with missing
    # values comes back as float (True/False -> 1.0/0.0), hence the astype.
    expected = DataFrame(
        {
            "col4": [True, False, None, None, None, None, None, None],
            "col2": [nan, nan, "e", "f", "c", "d", "a", "b"],
            "col3": [nan, nan, "!", "#", nan, nan, nan, nan],
            "col1": [None, None, None, None, 3, 4, 1, 2],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df4.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df1.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)