Commit 5f957db

Migrate benchmarks for parser_vb
1 parent 6e1cb95

File tree

3 files changed (+132, -145 lines)


asv_bench/benchmarks/io_csv.py

Lines changed: 104 additions & 2 deletions
@@ -1,9 +1,12 @@
+import random
 import timeit
+import string
 
 import numpy as np
 import pandas.util.testing as tm
-from pandas import DataFrame, date_range, read_csv
-from pandas.compat import PY2, StringIO
+from pandas import DataFrame, Categorical, date_range, read_csv
+from pandas.compat import PY2
+from pandas.compat import cStringIO as StringIO
 
 from .pandas_vb_common import setup, BaseIO  # noqa
 
@@ -145,3 +148,102 @@ def time_read_csv_10_rows(self, compression, engine):
         # Read a small number of rows from a huge (100,000 x 50) table.
         read_csv(self.big_fname, nrows=10, compression=compression,
                  engine=engine)
+
+
+class ReadCSVThousands(object):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+    params = ([',', '|'], [None, ','])
+    param_names = ['sep', 'thousands']
+
+    def setup(self, sep, thousands):
+        N = 10000
+        K = 8
+        data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
+        df = DataFrame(data)
+        if thousands is not None:
+            fmt = ':{}'.format(thousands)
+            fmt = '{' + fmt + '}'
+            df = df.applymap(lambda x: fmt.format(x))
+        df.to_csv(self.fname, sep=sep)
+
+    def time_thousands(self, sep, thousands):
+        read_csv(self.fname, sep=sep, thousands=thousands)
+
+
+class ReadCSVComment(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
+        self.s_data = StringIO('\n'.join(data))
+
+    def time_comment(self):
+        read_csv(self.s_data, comment='#', header=None, names=list('abc'))
+
+
+class ReadCSVFloatPrecision(object):
+
+    goal_time = 0.2
+    params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
+    param_names = ['sep', 'decimal', 'float_precision']
+
+    def setup(self, sep, decimal, float_precision):
+        floats = [''.join(random.choice(string.digits) for _ in range(28))
+                  for _ in range(15)]
+        rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
+        data = rows * 5
+        data = data.format(*floats) * 200  # 1000 x 3 strings csv
+        self.s_data = StringIO(data)
+
+    def time_read_csv(self, sep, decimal, float_precision):
+        read_csv(self.s_data, sep=sep, header=None, names=list('abc'),
+                 float_precision=float_precision)
+
+    def time_read_csv_python_engine(self, sep, decimal, float_precision):
+        read_csv(self.s_data, sep=sep, header=None, engine='python',
+                 float_precision=None, names=list('abc'))
+
+
+class ReadCSVCategorical(BaseIO):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+
+    def setup(self):
+        N = 100000
+        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
+        df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
+        df.to_csv(self.fname, index=False)
+
+    def time_convert_post(self):
+        read_csv(self.fname).apply(Categorical)
+
+    def time_convert_direct(self):
+        read_csv(self.fname, dtype='category')
+
+
+class ReadCSVParseDates(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
+        {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
+        {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
+        {},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
+        {},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
+        """
+        two_cols = ['KORD,19990127'] * 5
+        data = data.format(*two_cols)
+        self.s_data = StringIO(data)
+
+    def time_multiple_date(self):
+        read_csv(self.s_data, sep=',', header=None,
+                 names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])
+
+    def time_baseline(self):
+        read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
+                 names=list(string.digits[:9]))
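
For reference, a minimal standalone sketch of the round trip the new ReadCSVThousands case times; the 1,000-row size and scratch file name here are illustrative, not the benchmark's own:

import numpy as np
from pandas import DataFrame, read_csv

# Build a small frame of digit-grouped strings such as '12,345'.
df = DataFrame(np.random.randn(1000, 3) * 10000)
df = df.applymap(lambda x: '{:,}'.format(int(x)))
df.to_csv('scratch.csv', sep='|')  # '|' keeps ',' free for grouping

# thousands=',' tells the parser to strip the separator before the numeric cast.
parsed = read_csv('scratch.csv', sep='|', thousands=',', index_col=0)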

asv_bench/benchmarks/io_json.py

Lines changed: 28 additions & 22 deletions
@@ -9,45 +9,51 @@ class ReadJSON(BaseIO):
 
     goal_time = 0.2
     fname = "__test__.json"
-    params = (['records', 'split'], [None, 25000], ['int', 'datetime'])
-    param_names = ['orient', 'chunksize', 'index']
+    params = (['split', 'index', 'records'], ['int', 'datetime'])
+    param_names = ['orient', 'index']
 
-    def setup(self, orient, chunksize, index):
+    def setup(self, orient, index):
         N = 100000
         indexes = {'int': np.arange(N),
                    'datetime': date_range('20000101', periods=N, freq='H')}
         df = DataFrame(np.random.randn(N, 5),
                        columns=['float_{}'.format(i) for i in range(5)],
                        index=indexes[index])
-        df.to_json(self.fname, orient=lines_orient[1], lines=lines_orient[0])
+        df.to_json(self.fname, orient=orient)
 
-    def time_read_json(self, orient, chunksize, index):
-        read_json(self.fname, orient=orient, chunksize=chunksize)
+    def time_read_json(self, orient, index):
+        read_json(self.fname, orient=orient)
 
-    def time_read_json_concat(self, orient, chunksize, index):
-        concat(read_json(self.fname, orient=orient, chunksize=chunksize))
 
-    def peakmem_read_json(self, orient, chunksize, index):
-        read_json(self.fname, orient=orient, chunksize=chunksize)
+class ReadJSONLines(BaseIO):
 
-    def peakmem_read_json_concat(self, orient, chunksize, index):
-        concat(read_json(self.fname, orient=orient, chunksize=chunksize))
+    goal_time = 0.2
+    fname = "__test_lines__.json"
+    params = ['int', 'datetime']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 100000
+        indexes = {'int': np.arange(N),
+                   'datetime': date_range('20000101', periods=N, freq='H')}
+        df = DataFrame(np.random.randn(N, 5),
+                       columns=['float_{}'.format(i) for i in range(5)],
+                       index=indexes[index])
+        df.to_json(self.fname, orient='records', lines=True)
 
-    def time_read_json_lines(self, orient, chunksize, index):
-        read_json(self.fname, orient='records', lines=True,
-                  chunksize=chunksize)
+    def time_read_json_lines(self, index):
+        read_json(self.fname, orient='records', lines=True)
 
-    def time_read_json_lines_concat(self, orient, chunksize, index):
+    def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient='records', lines=True,
-                         chunksize=chunksize))
+                         chunksize=25000))
 
-    def peakmem_read_json_lines(self, orient, chunksize, index):
-        read_json(self.fname, orient='records', lines=True,
-                  chunksize=chunksize)
+    def peakmem_read_json_lines(self, index):
+        read_json(self.fname, orient='records', lines=True)
 
-    def peakmem_read_json_lines_concat(self, orient, chunksize, index):
+    def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient='records', lines=True,
-                         chunksize=chunksize))
+                         chunksize=25000))
 
 
 class ToJSON(BaseIO):
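
To make the split concrete, a minimal sketch of the two round trips the suites now cover separately; the file names and 100-row size are illustrative:

import numpy as np
from pandas import DataFrame, read_json

df = DataFrame(np.random.randn(100, 2), columns=['a', 'b'])

# ReadJSON path: one whole JSON document, orient fixed when writing.
df.to_json('doc.json', orient='split')
read_json('doc.json', orient='split')

# ReadJSONLines path: newline-delimited records. chunksize is only valid
# with lines=True, which is why the migrated concat cases pin it to 25000.
df.to_json('lines.json', orient='records', lines=True)
read_json('lines.json', orient='records', lines=True)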

asv_bench/benchmarks/parser_vb.py

Lines changed: 0 additions & 121 deletions
This file was deleted.
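
With parser_vb.py removed, the migrated cases run as part of the io_csv suite. An example asv invocation for comparing them across two commits (the refs are placeholders) might be:

asv continuous -b io_csv master HEAD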
