pandas-dev · wooseogchoi · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 11, 2024
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -247,7 +247,11 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
     .. versionadded:: 1.4.0
 
         The 'pyarrow' engine was added as an *experimental* engine, and some features
-        are unsupported, or may not work correctly, with this engine.
+        are unsupported, or may not work correctly, with this engine. For example,
+        pyarrow's ``ParseOptions(newlines_in_values=True)`` allows newline
+        characters within values when parsing CSV files, but this option is not
+        currently exposed by pandas. In that case, use the ``pyarrow.csv`` module
+        directly instead. For more information, refer to the Examples section.
 converters : dict of {{Hashable : Callable}}, optional
     Functions for converting values in specified columns. Keys can either
     be column labels or column indices.
@@ -545,12 +549,26 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
 ...     parse_dates=[1, 2],
 ...     date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}},
 ... )  # doctest: +SKIP
-
 >>> df.dtypes  # doctest: +SKIP
 col 1             int64
 col 2    datetime64[ns]
 col 3    datetime64[ns]
 dtype: object
+
+Use the ``pyarrow.csv`` module directly if values in the file contain
+newline characters.
+
+>>> from pyarrow import csv  # doctest: +SKIP
+>>> parse_options = csv.ParseOptions(newlines_in_values=True)  # doctest: +SKIP
+>>> table = csv.read_csv("example.csv", parse_options=parse_options)  # doctest: +SKIP
+>>> df = table.to_pandas()  # doctest: +SKIP
+>>> df.head()  # doctest: +SKIP
+     text  idx
+0  ab\ncd    0
+1  ab\ncd    1
+2  ab\ncd    2
+3  ab\ncd    3
+4  ab\ncd    4
 """  # noqa: E501
 
 

diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -13,9 +13,11 @@
 
 import pytest
 
+from pandas.compat.pyarrow import pa_version_under18p0
 from pandas.errors import ParserError
 
 import pandas._testing as tm
+from pandas.core.frame import DataFrame
 
 from pandas.io.parsers import read_csv
 import pandas.io.parsers.readers as parsers
@@ -150,6 +152,22 @@ def test_pyarrow_engine(self):
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)
 
+    @pytest.mark.skipif(not pa_version_under18p0, reason="No ParserError raised")
+    def test_pyarrow_newlines_in_values(self):
+        # Runs only on pyarrow < 18; newer versions are skipped (no ParserError).
+        pytest.importorskip("pyarrow")
+        msg = (
+            "CSV parser got out of sync with chunker. "
+            "This can mean the data file contains cell values spanning multiple "
+            "lines; please consider enabling the option 'newlines_in_values'."
+        )
+        # NOTE(review): large row count presumably needed to trigger the error.
+        rows = [{"text": "ab\ncd", "idx": idx} for idx in range(1_000_000)]
+        with tm.ensure_clean("test.csv") as path:  # removed even on failure
+            DataFrame(rows).to_csv(path, index=False)
+            with pytest.raises(ParserError, match=msg):
+                read_csv(path, engine="pyarrow")
+
     def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
         # GH 5686
         # GH 54643