ENH: Added append functionality for DataFrame.to_json (#48540)

SFuller4 · mroeschke · web-flow · commit 7a8d1654f040 · 2022-10-20T10:58:24.000-07:00
* Adding functionality for mode='a' when saving DataFrame.to_json. Only supported when lines=True and orient='records'.

* Adding tests for append functionality, along with updated whatsnew, user_guide, and generic docstring.

* pre-commit adjustments

* Update pandas/io/json/_json.py

Co-authored-by: Matthew Roeschke &lt;10647082+mroeschke@users.noreply.github.com&gt;

* Fixing pytest cases per request from mroeschke. Switching whatsnew version per request from mroeschke.

* removed repr() from ValueError msg

* fixing bad formatting

* Adjusting Typing from str to Literal["a", "w"] per request.

* updating typing issues in the core file

* Adding functionality for mode='a' when saving DataFrame.to_json. Only supported when lines=True and orient='records'.

* Adding tests for append functionality, along with updated whatsnew, user_guide, and generic docstring.

* pre-commit adjustments

* Update pandas/io/json/_json.py

Co-authored-by: Matthew Roeschke &lt;10647082+mroeschke@users.noreply.github.com&gt;

* fixing typing issues by adding mode to the overloads of to_json. Also removing whatsnew 1.6.0

* moving enhancement information to whatsnew/v2.0.0

* removing extra space from old whatsnew

Co-authored-by: Matthew Roeschke &lt;10647082+mroeschke@users.noreply.github.com&gt;
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1918,6 +1918,7 @@ with optional parameters:
 * ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
 * ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
 * ``lines`` : If ``records`` orient, then will write each record per line as json.
+* ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w'
 
 Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters.
 
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -37,6 +37,7 @@ Other enhancements
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
+- :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2412,6 +2412,7 @@ def to_json(
         index: bool_t = True,
         indent: int | None = None,
         storage_options: StorageOptions = None,
+        mode: Literal["a", "w"] = "w",
     ) -> str | None:
         """
         Convert the object to a JSON string.
@@ -2490,6 +2491,11 @@ def to_json(
 
             .. versionadded:: 1.2.0
 
+        mode : str, default 'w' (writing)
+            Specify the IO mode for output when supplying a path_or_buf.
+            Accepted args are 'w' (writing) and 'a' (append) only.
+            mode='a' is only supported when lines is True and orient is 'records'.
+
         Returns
         -------
         None or str
@@ -2673,6 +2679,7 @@ def to_json(
             index=index,
             indent=indent,
             storage_options=storage_options,
+            mode=mode,
         )
 
     @final
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -97,6 +97,7 @@ def to_json(
     index: bool = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
+    mode: Literal["a", "w"] = ...,
 ) -> None:
     ...
 
@@ -116,6 +117,7 @@ def to_json(
     index: bool = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
+    mode: Literal["a", "w"] = ...,
 ) -> str:
     ...
 
@@ -134,6 +136,7 @@ def to_json(
     index: bool = True,
     indent: int = 0,
     storage_options: StorageOptions = None,
+    mode: Literal["a", "w"] = "w",
 ) -> str | None:
 
     if not index and orient not in ["split", "table"]:
@@ -144,6 +147,20 @@ def to_json(
     if lines and orient != "records":
         raise ValueError("'lines' keyword only valid when 'orient' is records")
 
+    if mode not in ["a", "w"]:
+        msg = (
+            f"mode={mode} is not a valid option."
+            "Only 'w' and 'a' are currently supported."
+        )
+        raise ValueError(msg)
+
+    if mode == "a" and (not lines or orient != "records"):
+        msg = (
+            "mode='a' (append) is only supported when"
+            "lines is True and orient is 'records'"
+        )
+        raise ValueError(msg)
+
     if orient == "table" and isinstance(obj, Series):
         obj = obj.to_frame(name=obj.name or "values")
 
@@ -175,7 +192,7 @@ def to_json(
     if path_or_buf is not None:
         # apply compression and byte/text conversion
         with get_handle(
-            path_or_buf, "w", compression=compression, storage_options=storage_options
+            path_or_buf, mode, compression=compression, storage_options=storage_options
         ) as handles:
             handles.handle.write(s)
     else:
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
@@ -296,3 +296,142 @@ def __iter__(self) -> Iterator:
     reader = MyReader(jsonl)
     assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
     assert reader.read_count > 10
+
+
+@pytest.mark.parametrize("orient_", ["split", "index", "table"])
+def test_to_json_append_orient(orient_):
+    # GH 35849
+    # Test ValueError when orient is not 'records'
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        r"mode='a' \(append\) is only supported when"
+        "lines is True and orient is 'records'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode="a", orient=orient_)
+
+
+def test_to_json_append_lines():
+    # GH 35849
+    # Test ValueError when lines is not True
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        r"mode='a' \(append\) is only supported when"
+        "lines is True and orient is 'records'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode="a", lines=False, orient="records")
+
+
+@pytest.mark.parametrize("mode_", ["r", "x"])
+def test_to_json_append_mode(mode_):
+    # GH 35849
+    # Test ValueError when mode is not supported option
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        f"mode={mode_} is not a valid option."
+        "Only 'w' and 'a' are currently supported."
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode=mode_, lines=False, orient="records")
+
+
+def to_json_append_output_consistent_columns():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing same columns, new rows
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+
+    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+
+def to_json_append_output_inconsistent_columns():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing one new column, one old column, new rows
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+
+    expected = DataFrame(
+        {
+            "col1": [1, 2, None, None],
+            "col2": ["a", "b", "e", "f"],
+            "col3": [None, None, "!", "#"],
+        }
+    )
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+
+def to_json_append_output_different_columns():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing same, differing and new columns
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+    df4 = DataFrame({"col4": [True, False]})
+
+    expected = DataFrame(
+        {
+            "col1": [1, 2, 3, 4, None, None, None, None],
+            "col2": ["a", "b", "c", "d", "e", "f", None, None],
+            "col3": [None, None, None, None, "!", "#", None, None],
+            "col4": [None, None, None, None, None, None, True, False],
+        }
+    )
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, mode="a", lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+        df4.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+
+def to_json_append_output_different_columns_reordered():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing specific result column order.
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+    df4 = DataFrame({"col4": [True, False]})
+
+    # df4, df3, df2, df1 (in that order)
+    expected = DataFrame(
+        {
+            "col4": [True, False, None, None, None, None, None, None],
+            "col2": [None, None, "e", "f", "c", "d", "a", "b"],
+            "col3": [None, None, "!", "#", None, None, None, None],
+            "col1": [None, None, None, None, 3, 4, 1, 2],
+        }
+    )
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df4.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+        df1.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)