
ENH: add arrow engine to read_csv #31817

Closed
wants to merge 51 commits
Changes from 38 commits
Commits (51)
f22ff46
add arrow engine to read_csv
lithomas1 Feb 9, 2020
8ae43e4
fix failing test
lithomas1 Feb 9, 2020
09074df
formatting and revert unnecessary change
lithomas1 Feb 9, 2020
6be276d
remove bloat and more formatting changes
lithomas1 Feb 9, 2020
df4fa7e
Whatsnew
lithomas1 Feb 9, 2020
9cd9a6f
Merge remote-tracking branch 'upstream/master' into add-arrow-engine
lithomas1 Feb 9, 2020
ecaf3fd
Get tests up and running
lithomas1 Feb 10, 2020
b3c3287
Some fixes
lithomas1 Feb 10, 2020
474baf4
Add asvs and xfail some tests
lithomas1 Feb 11, 2020
2cd9937
address comments
lithomas1 Feb 20, 2020
48ff255
Merge branch 'master' into add-arrow-engine
lithomas1 Feb 20, 2020
3d15a56
fix typo
lithomas1 Feb 20, 2020
c969373
Merge branch 'add-arrow-engine' of github-other.com:lithomas1/pandas …
lithomas1 Feb 20, 2020
98aa134
some fixes
lithomas1 Feb 29, 2020
b9c6d2c
Fix bug
lithomas1 Apr 5, 2020
67c5db6
Fix merge conflicts
lithomas1 Apr 5, 2020
7f891a6
New benchmark and fix more tests
lithomas1 Apr 10, 2020
11fc737
Merge branch 'master' into add-arrow-engine
lithomas1 Apr 10, 2020
23425f7
More cleanups
lithomas1 Apr 10, 2020
d9b7a1f
Merge master
lithomas1 Apr 10, 2020
b8adf3c
Merge branch 'add-arrow-engine' of github-other.com:lithomas1/pandas …
lithomas1 Apr 11, 2020
01c0394
Formatting fixes and typo correction
lithomas1 Apr 11, 2020
ba5620f
skip pyarrow tests if not installed
lithomas1 Apr 12, 2020
2570c82
Address comments
lithomas1 Apr 12, 2020
b3a1f66
Get some more tests to pass
lithomas1 Apr 14, 2020
d46ceed
Fix some bugs and cleanups
lithomas1 Apr 17, 2020
d67925c
Merge branch 'master' into add-arrow-engine
lithomas1 Apr 17, 2020
6378459
Perform version checks for submodule imports too
lithomas1 May 20, 2020
9d64882
Refresh with newer pyarrow
lithomas1 May 20, 2020
852ecf9
Merge branch 'master' into add-arrow-engine
lithomas1 May 20, 2020
93382b4
Start xfailing tests
lithomas1 May 21, 2020
f1bb4e2
Get all tests to run & some fixes
lithomas1 May 27, 2020
14c13ab
Merge branch 'master' into add-arrow-engine
lithomas1 May 27, 2020
7876b4e
Lint and CI
lithomas1 May 29, 2020
4426642
Merge branch 'master' into add-arrow-engine
lithomas1 May 29, 2020
008acab
parse_dates support and fixups of some tests
lithomas1 Jun 3, 2020
2dddae7
Date parsing fixes and address comments
lithomas1 Jun 13, 2020
261ef6a
Merge branch 'master' into add-arrow-engine
lithomas1 Jun 13, 2020
88e200a
Clean/Address comments/Update docs
lithomas1 Jun 29, 2020
bf063ab
Merge branch 'master' into add-arrow-engine
lithomas1 Jun 29, 2020
ede2799
Fix typo
lithomas1 Jun 29, 2020
e8eff08
Fix doc failures
lithomas1 Jul 8, 2020
87cfcf5
Merge remote-tracking branch 'upstream/master' into add-arrow-engine
simonjayhawkins Oct 22, 2020
55139ee
wip
simonjayhawkins Oct 22, 2020
c1aeecf
more xfails and skips
simonjayhawkins Oct 22, 2020
62fc9d6
Merge branch 'master' into add-arrow-engine
lithomas1 Oct 28, 2020
b53a620
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2020
f13113d
Fix typos
lithomas1 Oct 28, 2020
f9ce2e4
Doc fixes and more typo fixes
lithomas1 Oct 28, 2020
4158d6a
Green?
lithomas1 Nov 2, 2020
d34e75f
Merge branch 'master' into add-arrow-engine
lithomas1 Nov 17, 2020
64 changes: 46 additions & 18 deletions asv_bench/benchmarks/io/csv.py
@@ -1,4 +1,4 @@
from io import StringIO
from io import BytesIO, StringIO
import random
import string

@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

fname = "__test__.csv"
params = [None, 10000]
param_names = ["skiprows"]
params = ([None, 10000], ["c", "pyarrow"])
param_names = ["skiprows", "engine"]

def setup(self, skiprows):
def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
)
df.to_csv(self.fname)

def time_skipprows(self, skiprows):
read_csv(self.fname, skiprows=skiprows)
def time_skipprows(self, skiprows, engine):
read_csv(self.fname, skiprows=skiprows, engine=engine)


class ReadUint64Integers(StringIORewind):
@@ -254,9 +254,30 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
names=list("abc"),
)

def time_read_csv_arrow(self, sep, decimal, float_precision):
read_csv(
self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"),
)

class ReadCSVCategorical(BaseIO):

class ReadCSVEngine(StringIORewind):
params = ["c", "python", "pyarrow"]
param_names = ["engine"]

def setup(self, engine):
data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
self.StringIO_input = StringIO("\n".join(data))
# simulate reading from file
self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))

def time_read_stringcsv(self, engine):
read_csv(self.data(self.StringIO_input), engine=engine)

def time_read_bytescsv(self, engine):
read_csv(self.data(self.BytesIO_input), engine=engine)


class ReadCSVCategorical(BaseIO):
fname = "__test__.csv"

def setup(self):
@@ -273,7 +294,10 @@ def time_convert_direct(self):


class ReadCSVParseDates(StringIORewind):
def setup(self):
params = ["c", "pyarrow", "python"]
param_names = ["engine"]

def setup(self, engine):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +308,20 @@ def setup(self):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self):
def time_multiple_date(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=list(string.digits[:9]),
parse_dates=[[1, 2], [1, 3]],
)

def time_baseline(self):
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
parse_dates=[1],
@@ -304,17 +330,18 @@ def time_baseline(self):


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ["do_cache"]
params = ([True, False], ["c", "pyarrow", "python"])
param_names = ["do_cache", "engine"]

def setup(self, do_cache):
def setup(self, do_cache, engine):
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
def time_read_csv_cached(self, do_cache, engine):
try:
read_csv(
self.data(self.StringIO_input),
engine=engine,
header=None,
parse_dates=[0],
cache_dates=do_cache,
@@ -344,22 +371,23 @@ def mem_parser_chunks(self):


class ReadCSVParseSpecialDate(StringIORewind):
params = (["mY", "mdY", "hm"],)
param_names = ["value"]
params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"])
param_names = ["value", "engine"]
objects = {
"mY": "01-2019\n10-2019\n02/2000\n",
"mdY": "12/02/2010\n",
"hm": "21:34\n",
}

def setup(self, value):
def setup(self, value, engine):
count_elem = 10000
data = self.objects[value] * count_elem
self.StringIO_input = StringIO(data)

def time_read_special_date(self, value):
def time_read_special_date(self, value, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=["Date"],
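For orientation, here is a rough standalone timing sketch (not part of the diff) of what the new ReadCSVEngine benchmark above measures: the same in-memory CSV parsed with each engine. It assumes a pandas build that includes this branch and, for the "pyarrow" case, pyarrow>=0.15 installed.

# Standalone sketch, not part of this PR: compare engines on one in-memory CSV.
from io import StringIO
from timeit import timeit

from pandas import read_csv

data = "\n".join(["A,B,C,D,E"] + ["1,2,3,4,5"] * 100_000)

for engine in ["c", "python", "pyarrow"]:  # "pyarrow" requires pyarrow>=0.15
    # Rebuild the StringIO on every call so each read starts from the top.
    elapsed = timeit(lambda: read_csv(StringIO(data), engine=engine), number=5)
    print(f"{engine:>8}: {elapsed / 5:.3f} s per read")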
8 changes: 5 additions & 3 deletions doc/source/user_guide/io.rst
@@ -160,9 +160,11 @@ dtype : Type name or dict of column -> type, default ``None``
(unsupported with ``engine='python'``). Use `str` or `object` together
with suitable ``na_values`` settings to preserve and
not interpret dtype.
engine : {``'c'``, ``'python'``}
Parser engine to use. The C engine is faster while the Python engine is
currently more feature-complete.
engine : {``'c'``, ``'pyarrow'``,``'python'``}
Parser engine to use. In terms of performance, the pyarrow engine,
which requires pyarrow>=0.15.0, is faster than the C engine, which
is faster than the python engine. However, the pyarrow and C engines
are currently less feature complete than their Python counterpart.

[Review comment from a Contributor: add a versionchanged tag here 1.2]
converters : dict, default ``None``
Dict of functions for converting values in certain columns. Keys can either be
integers or column labels.
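As a quick illustration of the engine option described in this doc change, a hedged usage sketch (the filename is a placeholder; assumes a pandas build with this branch and pyarrow>=0.15 installed):

import pandas as pd

try:
    # Prefer the pyarrow engine for speed.
    df = pd.read_csv("data.csv", engine="pyarrow")  # "data.csv" is hypothetical
except (ImportError, ValueError):
    # Fall back to the default C engine if pyarrow is not installed or an
    # option the pyarrow engine does not support was requested.
    df = pd.read_csv("data.csv")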
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -293,6 +293,9 @@ Other enhancements
- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
- :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing
if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or
"python" counterparts. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)

.. ---------------------------------------------------------------------------

11 changes: 9 additions & 2 deletions pandas/compat/_optional.py
@@ -1,5 +1,6 @@
import distutils.version
import importlib
import sys
import types
import warnings

@@ -92,10 +93,16 @@ def import_optional_dependency(
raise ImportError(msg) from None
else:
return None

# Grab parent module if submodule being imported
parent = name.split(".")[0]
if parent != name:
name = parent
module_to_get = sys.modules[name]
else:
module_to_get = module
minimum_version = VERSIONS.get(name)
if minimum_version:
version = _get_version(module)
version = _get_version(module_to_get)
if distutils.version.LooseVersion(version) < minimum_version:
assert on_version in {"warn", "raise", "ignore"}
msg = (
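To make the _optional.py change above concrete, here is a minimal standalone sketch (not pandas' actual implementation; the function name and VERSIONS table are illustrative) of the submodule handling: when a submodule such as "pyarrow.csv" is requested, the minimum-version check runs against the parent "pyarrow" package, because only the top-level module exposes __version__.

import importlib
import sys
from distutils.version import LooseVersion

VERSIONS = {"pyarrow": "0.15.0"}  # assumed minimum, for illustration only


def import_with_parent_version_check(name: str):
    # e.g. name == "pyarrow.csv"
    module = importlib.import_module(name)
    parent = name.split(".")[0]  # -> "pyarrow"
    # Submodules rarely define __version__, so check the parent package instead.
    module_to_check = sys.modules[parent] if parent != name else module
    minimum_version = VERSIONS.get(parent)
    if minimum_version is not None:
        version = getattr(module_to_check, "__version__", "0")
        if LooseVersion(version) < LooseVersion(minimum_version):
            raise ImportError(
                f"pandas requires {parent}>={minimum_version}, found {version}"
            )
    return module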