
ENH: add arrow engine to read_csv #31817

Closed
wants to merge 51 commits
Changes from 38 commits
Commits (51)
f22ff46
add arrow engine to read_csv
lithomas1 Feb 9, 2020
8ae43e4
fix failing test
lithomas1 Feb 9, 2020
09074df
formatting and revert unnecessary change
lithomas1 Feb 9, 2020
6be276d
remove bloat and more formatting changes
lithomas1 Feb 9, 2020
df4fa7e
Whatsnew
lithomas1 Feb 9, 2020
9cd9a6f
Merge remote-tracking branch 'upstream/master' into add-arrow-engine
lithomas1 Feb 9, 2020
ecaf3fd
Get tests up and running
lithomas1 Feb 10, 2020
b3c3287
Some fixes
lithomas1 Feb 10, 2020
474baf4
Add asvs and xfail some tests
lithomas1 Feb 11, 2020
2cd9937
address comments
lithomas1 Feb 20, 2020
48ff255
Merge branch 'master' into add-arrow-engine
lithomas1 Feb 20, 2020
3d15a56
fix typo
lithomas1 Feb 20, 2020
c969373
Merge branch 'add-arrow-engine' of github-other.com:lithomas1/pandas …
lithomas1 Feb 20, 2020
98aa134
some fixes
lithomas1 Feb 29, 2020
b9c6d2c
Fix bug
lithomas1 Apr 5, 2020
67c5db6
Fix merge conflicts
lithomas1 Apr 5, 2020
7f891a6
New benchmark and fix more tests
lithomas1 Apr 10, 2020
11fc737
Merge branch 'master' into add-arrow-engine
lithomas1 Apr 10, 2020
23425f7
More cleanups
lithomas1 Apr 10, 2020
d9b7a1f
Merge master
lithomas1 Apr 10, 2020
b8adf3c
Merge branch 'add-arrow-engine' of github-other.com:lithomas1/pandas …
lithomas1 Apr 11, 2020
01c0394
Formatting fixes and typo correction
lithomas1 Apr 11, 2020
ba5620f
skip pyarrow tests if not installed
lithomas1 Apr 12, 2020
2570c82
Address comments
lithomas1 Apr 12, 2020
b3a1f66
Get some more tests to pass
lithomas1 Apr 14, 2020
d46ceed
Fix some bugs and cleanups
lithomas1 Apr 17, 2020
d67925c
Merge branch 'master' into add-arrow-engine
lithomas1 Apr 17, 2020
6378459
Perform version checks for submodule imports too
lithomas1 May 20, 2020
9d64882
Refresh with newer pyarrow
lithomas1 May 20, 2020
852ecf9
Merge branch 'master' into add-arrow-engine
lithomas1 May 20, 2020
93382b4
Start xfailing tests
lithomas1 May 21, 2020
f1bb4e2
Get all tests to run & some fixes
lithomas1 May 27, 2020
14c13ab
Merge branch 'master' into add-arrow-engine
lithomas1 May 27, 2020
7876b4e
Lint and CI
lithomas1 May 29, 2020
4426642
Merge branch 'master' into add-arrow-engine
lithomas1 May 29, 2020
008acab
parse_dates support and fixups of some tests
lithomas1 Jun 3, 2020
2dddae7
Date parsing fixes and address comments
lithomas1 Jun 13, 2020
261ef6a
Merge branch 'master' into add-arrow-engine
lithomas1 Jun 13, 2020
88e200a
Clean/Address comments/Update docs
lithomas1 Jun 29, 2020
bf063ab
Merge branch 'master' into add-arrow-engine
lithomas1 Jun 29, 2020
ede2799
Fix typo
lithomas1 Jun 29, 2020
e8eff08
Fix doc failures
lithomas1 Jul 8, 2020
87cfcf5
Merge remote-tracking branch 'upstream/master' into add-arrow-engine
simonjayhawkins Oct 22, 2020
55139ee
wip
simonjayhawkins Oct 22, 2020
c1aeecf
more xfails and skips
simonjayhawkins Oct 22, 2020
62fc9d6
Merge branch 'master' into add-arrow-engine
lithomas1 Oct 28, 2020
b53a620
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2020
f13113d
Fix typos
lithomas1 Oct 28, 2020
f9ce2e4
Doc fixes and more typo fixes
lithomas1 Oct 28, 2020
4158d6a
Green?
lithomas1 Nov 2, 2020
d34e75f
Merge branch 'master' into add-arrow-engine
lithomas1 Nov 17, 2020
64 changes: 46 additions & 18 deletions asv_bench/benchmarks/io/csv.py
@@ -1,4 +1,4 @@
from io import StringIO
from io import BytesIO, StringIO
import random
import string

@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

fname = "__test__.csv"
params = [None, 10000]
param_names = ["skiprows"]
params = ([None, 10000], ["c", "pyarrow"])
param_names = ["skiprows", "engine"]

def setup(self, skiprows):
def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
)
df.to_csv(self.fname)

def time_skipprows(self, skiprows):
read_csv(self.fname, skiprows=skiprows)
def time_skipprows(self, skiprows, engine):
read_csv(self.fname, skiprows=skiprows, engine=engine)


class ReadUint64Integers(StringIORewind):
@@ -254,9 +254,30 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
names=list("abc"),
)

def time_read_csv_arrow(self, sep, decimal, float_precision):
read_csv(
self.data(self.StringIO_input), sep=sep, header=None, names=list("abc"),
)

class ReadCSVCategorical(BaseIO):

class ReadCSVEngine(StringIORewind):
params = ["c", "python", "pyarrow"]
param_names = ["engine"]

def setup(self, engine):
data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
self.StringIO_input = StringIO("\n".join(data))
# simulate reading from file
self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))

def time_read_stringcsv(self, engine):
read_csv(self.data(self.StringIO_input), engine=engine)

def time_read_bytescsv(self, engine):
read_csv(self.data(self.BytesIO_input), engine=engine)


class ReadCSVCategorical(BaseIO):
fname = "__test__.csv"

def setup(self):
@@ -273,7 +294,10 @@ def time_convert_direct(self):


class ReadCSVParseDates(StringIORewind):
def setup(self):
params = ["c", "pyarrow", "python"]
param_names = ["engine"]

def setup(self, engine):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +308,20 @@ def setup(self):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self):
def time_multiple_date(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=list(string.digits[:9]),
parse_dates=[[1, 2], [1, 3]],
)

def time_baseline(self):
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
parse_dates=[1],
@@ -304,17 +330,18 @@ def time_baseline(self):


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ["do_cache"]
params = ([True, False], ["c", "pyarrow", "python"])
param_names = ["do_cache", "engine"]

def setup(self, do_cache):
def setup(self, do_cache, engine):
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
def time_read_csv_cached(self, do_cache, engine):
try:
read_csv(
self.data(self.StringIO_input),
engine=engine,
header=None,
parse_dates=[0],
cache_dates=do_cache,
@@ -344,22 +371,23 @@ def mem_parser_chunks(self):


class ReadCSVParseSpecialDate(StringIORewind):
params = (["mY", "mdY", "hm"],)
param_names = ["value"]
params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"])
param_names = ["value", "engine"]
objects = {
"mY": "01-2019\n10-2019\n02/2000\n",
"mdY": "12/02/2010\n",
"hm": "21:34\n",
}

def setup(self, value):
def setup(self, value, engine):
count_elem = 10000
data = self.objects[value] * count_elem
self.StringIO_input = StringIO(data)

def time_read_special_date(self, value):
def time_read_special_date(self, value, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=["Date"],
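For orientation, here is a rough standalone timing sketch (not part of the diff) of what the new ReadCSVEngine benchmark above measures: the same in-memory CSV parsed with each engine. It assumes a pandas build that includes this branch and, for the "pyarrow" case, pyarrow>=0.15 installed.

# Standalone sketch, not part of this PR: compare engines on one in-memory CSV.
from io import StringIO
from timeit import timeit

from pandas import read_csv

data = "\n".join(["A,B,C,D,E"] + ["1,2,3,4,5"] * 100_000)

for engine in ["c", "python", "pyarrow"]:  # "pyarrow" requires pyarrow>=0.15
    # Rebuild the StringIO on every call so each read starts from the top.
    elapsed = timeit(lambda: read_csv(StringIO(data), engine=engine), number=5)
    print(f"{engine:>8}: {elapsed / 5:.3f} s per read")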
8 changes: 5 additions & 3 deletions doc/source/user_guide/io.rst
@@ -160,9 +160,11 @@ dtype : Type name or dict of column -> type, default ``None``
(unsupported with ``engine='python'``). Use `str` or `object` together
with suitable ``na_values`` settings to preserve and
not interpret dtype.
engine : {``'c'``, ``'python'``}
Parser engine to use. The C engine is faster while the Python engine is
currently more feature-complete.
engine : {``'c'``, ``'pyarrow'``,``'python'``}
Parser engine to use. In terms of performance, the pyarrow engine,
which requires pyarrow>=0.15.0, is faster than the C engine, which
is faster than the python engine. However, the pyarrow and C engines
are currently less feature complete than their Python counterpart.

[Review comment from a Contributor: add a versionchanged tag here 1.2]
converters : dict, default ``None``
Dict of functions for converting values in certain columns. Keys can either be
integers or column labels.
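As a quick illustration of the engine option described in this doc change, a hedged usage sketch (the filename is a placeholder; assumes a pandas build with this branch and pyarrow>=0.15 installed):

import pandas as pd

try:
    # Prefer the pyarrow engine for speed.
    df = pd.read_csv("data.csv", engine="pyarrow")  # "data.csv" is hypothetical
except (ImportError, ValueError):
    # Fall back to the default C engine if pyarrow is not installed or an
    # option the pyarrow engine does not support was requested.
    df = pd.read_csv("data.csv")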
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -293,6 +293,9 @@ Other enhancements
- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
- :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
- :func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing
if pyarrow>=0.15 is installed. However, the pyarrow engine is less feature-complete than its "c" or
"python" counterparts. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)

.. ---------------------------------------------------------------------------

11 changes: 9 additions & 2 deletions pandas/compat/_optional.py
@@ -1,5 +1,6 @@
import distutils.version
import importlib
import sys
import types
import warnings

@@ -92,10 +93,16 @@ def import_optional_dependency(
raise ImportError(msg) from None
else:
return None

# Grab parent module if submodule being imported
parent = name.split(".")[0]
if parent != name:
name = parent
module_to_get = sys.modules[name]
else:
module_to_get = module
minimum_version = VERSIONS.get(name)
if minimum_version:
version = _get_version(module)
version = _get_version(module_to_get)
if distutils.version.LooseVersion(version) < minimum_version:
assert on_version in {"warn", "raise", "ignore"}
msg = (
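To make the _optional.py change above concrete, here is a minimal standalone sketch (not pandas' actual implementation; the function name and VERSIONS table are illustrative) of the submodule handling: when a submodule such as "pyarrow.csv" is requested, the minimum-version check runs against the parent "pyarrow" package, because only the top-level module exposes __version__.

import importlib
import sys
from distutils.version import LooseVersion

VERSIONS = {"pyarrow": "0.15.0"}  # assumed minimum, for illustration only


def import_with_parent_version_check(name: str):
    # e.g. name == "pyarrow.csv"
    module = importlib.import_module(name)
    parent = name.split(".")[0]  # -> "pyarrow"
    # Submodules rarely define __version__, so check the parent package instead.
    module_to_check = sys.modules[parent] if parent != name else module
    minimum_version = VERSIONS.get(parent)
    if minimum_version is not None:
        version = getattr(module_to_check, "__version__", "0")
        if LooseVersion(version) < LooseVersion(minimum_version):
            raise ImportError(
                f"pandas requires {parent}>={minimum_version}, found {version}"
            )
    return module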