
Commit 6e1cb95

CLN: ASV io bench
1 parent b5f1e71 commit 6e1cb95

File tree

5 files changed: 268 additions & 287 deletions


asv_bench/benchmarks/io_bench.py

Lines changed: 0 additions & 225 deletions
This file was deleted.

asv_bench/benchmarks/io_csv.py

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
import timeit

import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, date_range, read_csv
from pandas.compat import PY2, StringIO

from .pandas_vb_common import setup, BaseIO  # noqa


class ToCSV(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = ['wide', 'long', 'mixed']
    param_names = ['kind']

    def setup(self, kind):
        wide_frame = DataFrame(np.random.randn(3000, 30))
        long_frame = DataFrame({'A': np.arange(50000),
                                'B': np.arange(50000) + 1.,
                                'C': np.arange(50000) + 2.,
                                'D': np.arange(50000) + 3.})
        mixed_frame = DataFrame({'float': np.random.randn(5000),
                                 'int': np.random.randn(5000).astype(int),
                                 'bool': (np.arange(5000) % 2) == 0,
                                 'datetime': date_range('2001',
                                                        freq='s',
                                                        periods=5000),
                                 'object': ['foo'] * 5000})
        mixed_frame.loc[30:500, 'float'] = np.nan
        data = {'wide': wide_frame,
                'long': long_frame,
                'mixed': mixed_frame}
        self.df = data[kind]

    def time_frame(self, kind):
        self.df.to_csv(self.fname)


class ToCSVDatetime(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        rng = date_range('1/1/2000', periods=1000)
        self.data = DataFrame(rng, index=rng)

    def time_frame_date_formatting(self):
        self.data.to_csv(self.fname, date_format='%Y%m%d')


class ReadCSVDInferDatetimeFormat(object):

    goal_time = 0.2
    params = ([True, False], ['custom', 'iso8601', 'ymd'])
    param_names = ['infer_datetime_format', 'format']

    def setup(self, infer_datetime_format, format):
        rng = date_range('1/1/2000', periods=1000)
        formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
                   'iso8601': '%Y-%m-%d %H:%M:%S',
                   'ymd': '%Y%m%d'}
        dt_format = formats[format]
        self.data = StringIO('\n'.join(rng.strftime(dt_format).tolist()))

    def time_read_csv(self, infer_datetime_format, format):
        read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'],
                 infer_datetime_format=infer_datetime_format)


class ReadCSVSkipRows(BaseIO):

    goal_time = 0.2
    fname = '__test__.csv'
    params = [None, 10000]
    param_names = ['skiprows']

    def setup(self, skiprows):
        N = 20000
        index = tm.makeStringIndex(N)
        df = DataFrame({'float1': np.random.randn(N),
                        'float2': np.random.randn(N),
                        'string1': ['foo'] * N,
                        'bool1': [True] * N,
                        'int1': np.random.randint(0, N, size=N)},
                       index=index)
        df.to_csv(self.fname)

    def time_skipprows(self, skiprows):
        read_csv(self.fname, skiprows=skiprows)


class ReadUint64Integers(object):

    goal_time = 0.2

    def setup(self):
        self.na_values = [2**63 + 500]
        arr = np.arange(10000).astype('uint64') + 2**63
        self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
        arr = arr.astype(object)
        arr[500] = -1
        self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))

    def time_read_uint64(self):
        read_csv(self.data1, header=None, names=['foo'])

    def time_read_uint64_neg_values(self):
        read_csv(self.data2, header=None, names=['foo'])

    def time_read_uint64_na_values(self):
        read_csv(self.data1, header=None, names=['foo'],
                 na_values=self.na_values)


class S3(object):
    # Make sure that we can read part of a file from S3 without
    # needing to download the entire thing. Use the timeit.default_timer
    # to measure wall time instead of CPU time -- we want to see
    # how long it takes to download the data.
    timer = timeit.default_timer
    params = ([None, "gzip", "bz2"], ["python", "c"])
    param_names = ["compression", "engine"]

    def setup(self, compression, engine):
        if compression == "bz2" and engine == "c" and PY2:
            # The Python 2 C parser can't read bz2 from open files.
            raise NotImplementedError
        try:
            import s3fs
        except ImportError:
            # Skip these benchmarks if `s3fs` is not installed.
            raise NotImplementedError

        ext = ""
        if compression == "gzip":
            ext = ".gz"
        elif compression == "bz2":
            ext = ".bz2"
        self.big_fname = "s3://pandas-test/large_random.csv" + ext

    def time_read_csv_10_rows(self, compression, engine):
        # Read a small number of rows from a huge (100,000 x 50) table.
        read_csv(self.big_fname, nrows=10, compression=compression,
                 engine=engine)
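
The new module follows ASV's parameterized-benchmark convention: class-level params/param_names, a setup() that receives one value per parameter, and time_* methods that asv times. The sketch below (not part of the commit) is a minimal hand-driven smoke test of that calling pattern, assuming the module can be imported as asv_bench.benchmarks.io_csv (its relative import needs the package context); asv itself adds repetition, environment management, and teardown() cleanup on top of this.

import os
import timeit

from asv_bench.benchmarks.io_csv import ToCSV  # assumed import path


def smoke_test(cls):
    # Mimic asv's calling convention: one setup()/time_*() pair per
    # declared parameter value.
    for kind in cls.params:
        bench = cls()
        bench.setup(kind)
        elapsed = timeit.timeit(lambda: bench.time_frame(kind), number=1)
        print('{}[{!r}]: {:.4f}s'.format(cls.__name__, kind, elapsed))
        if os.path.exists(bench.fname):
            # asv would normally clean this up by calling teardown()
            os.remove(bench.fname)


if __name__ == '__main__':
    smoke_test(ToCSV)

In normal use the suite is run through asv from the asv_bench/ directory (for example asv dev -b io_csv to select just these benchmarks), which also handles environments, repetition, and result storage; the sketch is only a quick way to confirm the classes execute.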

0 commit comments
