pandas-dev · wooseogchoi · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 11, 2024
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -247,7 +247,11 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
     .. versionadded:: 1.4.0
 
         The 'pyarrow' engine was added as an *experimental* engine, and some features
-        are unsupported, or may not work correctly, with this engine.
+        are unsupported, or may not work correctly, with this engine. For example,
+        pyarrow's ``ParseOptions(newlines_in_values=True)`` allows newline
+        characters inside values when parsing CSV files, but pandas does not
+        currently support this option. In that case, use the ``pyarrow.csv``
+        module directly instead; for more information, see the example below.
 converters : dict of {{Hashable : Callable}}, optional
     Functions for converting values in specified columns. Keys can either
     be column labels or column indices.
@@ -545,12 +549,26 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
 ...     parse_dates=[1, 2],
 ...     date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}},
 ... )  # doctest: +SKIP
-
 >>> df.dtypes  # doctest: +SKIP
 col 1             int64
 col 2    datetime64[ns]
 col 3    datetime64[ns]
 dtype: object
+
+Use the ``pyarrow.csv`` module directly if values in the file contain
+newline characters.
+
+>>> from pyarrow import csv
+>>> parse_options = csv.ParseOptions(newlines_in_values=True)
+>>> table = csv.read_csv("example.csv", parse_options=parse_options)
->>> from pyarrow import csv
->>> parse_options = csv.ParseOptions(newlines_in_values=True)
->>> table = csv.read_csv("example.csv", parse_options=parse_options)
+>>> import io
+>>> from pyarrow import csv
+>>> rows = [{{"text": "ab\ncd", "idx": idx}} for idx in range(1_000_000)]
+>>> df = pd.DataFrame(rows)
+>>> source = io.BytesIO(df.to_csv(index=False).encode())
+>>> parse_options = csv.ParseOptions(newlines_in_values=True)
+>>> table = csv.read_csv(source, parse_options=parse_options)
->>> from pyarrow import csv
->>> parse_options = csv.ParseOptions(newlines_in_values=True)
->>> table = csv.read_csv("example.csv", parse_options=parse_options)
+>>> import io
+>>> from pyarrow import csv
+>>> rows = [{{"text": "ab\ncd", "idx": idx}} for idx in range(1_000_000)]
+>>> df = pd.DataFrame(rows)
+>>> source = io.BytesIO(df.to_csv(index=False).encode())
+>>> parse_options = csv.ParseOptions(newlines_in_values=True)
+>>> table = csv.read_csv(source, parse_options=parse_options)
+>>> df = table.to_pandas()
+>>> df.head()
+     text  idx
+0  ab\ncd    0
+1  ab\ncd    1
+2  ab\ncd    2
+3  ab\ncd    3
+4  ab\ncd    4
 """  # noqa: E501
 
 

diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -16,6 +16,7 @@
 from pandas.errors import ParserError
 
 import pandas._testing as tm
+from pandas.core.frame import DataFrame
 
 from pandas.io.parsers import read_csv
 import pandas.io.parsers.readers as parsers
@@ -150,6 +151,20 @@ def test_pyarrow_engine(self):
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)
 
+    def test_pyarrow_newlines_in_values(self):
+        # pyarrow engine must raise when quoted cell values span several lines.
+        pytest.importorskip("pyarrow")
+        msg = (
+            "CSV parser got out of sync with chunker. "
+            "This can mean the data file contains cell values spanning multiple "
+            "lines; please consider enabling the option 'newlines_in_values'."
+        )
+        rows = [{"text": "ab\ncd", "idx": idx} for idx in range(1_000_000)]
+        with tm.ensure_clean("test.csv") as path:  # removed even if assert fails
+            DataFrame(rows).to_csv(path, index=False)
+            with pytest.raises(ValueError, match=msg):
+                read_csv(path, engine="pyarrow")
+
     def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
         # GH 5686
         # GH 54643