|
| 1 | +import random |
1 | 2 | import timeit
|
| 3 | +import string |
2 | 4 |
|
3 | 5 | import numpy as np
|
4 | 6 | import pandas.util.testing as tm
|
5 |
| -from pandas import DataFrame, date_range, read_csv |
6 |
| -from pandas.compat import PY2, StringIO |
| 7 | +from pandas import DataFrame, Categorical, date_range, read_csv |
| 8 | +from pandas.compat import PY2 |
| 9 | +from pandas.compat import cStringIO as StringIO |
7 | 10 |
|
8 | 11 | from .pandas_vb_common import setup, BaseIO # noqa
|
9 | 12 |
|
@@ -145,3 +148,102 @@ def time_read_csv_10_rows(self, compression, engine):
|
145 | 148 | # Read a small number of rows from a huge (100,000 x 50) table.
|
146 | 149 | read_csv(self.big_fname, nrows=10, compression=compression,
|
147 | 150 | engine=engine)
|
| 151 | + |
| 152 | + |
class ReadCSVThousands(object):
    """Benchmark ``read_csv`` with and without a ``thousands`` separator.

    Parametrized over the field separator (``sep``) and the ``thousands``
    argument. ``setup`` writes a 10000 x 8 table of floats to ``fname``;
    ``teardown`` deletes that file again so repeated runs do not leave
    fixtures behind.
    """

    goal_time = 0.2
    fname = '__test__.csv'
    params = ([',', '|'], [None, ','])
    param_names = ['sep', 'thousands']

    def setup(self, sep, thousands):
        """Write the CSV fixture that ``time_thousands`` reads.

        When ``thousands`` is not None every value is rendered as a string
        using that character as the format spec (``','`` gives ``'{:,}'``).
        """
        N = 10000
        K = 8
        data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
        df = DataFrame(data)
        if thousands is not None:
            fmt = '{{:{}}}'.format(thousands)
            # Element-wise formatting, one column at a time.
            df = df.apply(lambda col: col.map(fmt.format))
        df.to_csv(self.fname, sep=sep)

    def time_thousands(self, sep, thousands):
        """Time reading the fixture with the given ``sep``/``thousands``."""
        read_csv(self.fname, sep=sep, thousands=thousands)

    def teardown(self, sep, thousands):
        """Remove the fixture file written by ``setup``."""
        import os

        try:
            os.remove(self.fname)
        except OSError:
            # Best effort: the file may be missing (setup failed) or still
            # held open by the OS; neither should fail the benchmark run.
            pass
| 173 | + |
| 174 | + |
class ReadCSVComment(object):
    """Benchmark ``read_csv`` stripping trailing ``#`` comments.

    The input is an in-memory buffer holding one ``A,B,C`` line followed by
    100000 copies of ``1,2,3 # comment``.
    """

    goal_time = 0.2

    def setup(self):
        """Build the in-memory CSV buffer shared by all timing calls."""
        data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
        self.s_data = StringIO('\n'.join(data))

    def time_comment(self):
        """Time parsing the buffer with ``comment='#'``."""
        # The buffer is created once in setup() but the timing method can be
        # called repeatedly; rewind so every call parses the full input
        # instead of an already-exhausted stream.
        self.s_data.seek(0)
        # NOTE(review): header=None means the leading 'A,B,C' line is parsed
        # as a data row, not as a header — confirm this is intended.
        read_csv(self.s_data, comment='#', header=None, names=list('abc'))
| 185 | + |
| 186 | + |
class ReadCSVFloatPrecision(object):
    """Benchmark float parsing in ``read_csv``.

    Parametrized over the field separator, the decimal character and the
    ``float_precision`` converter. The input is a 1000 x 3 in-memory CSV
    whose fields are ``0<decimal>`` followed by 28 random digits.
    """

    goal_time = 0.2
    params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
    param_names = ['sep', 'decimal', 'float_precision']

    def setup(self, sep, decimal, float_precision):
        """Build the in-memory CSV buffer for the given parameters."""
        floats = [''.join(random.choice(string.digits) for _ in range(28))
                  for _ in range(15)]
        # One template row: three '0<decimal>{}' fields joined by ``sep``.
        rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
        # Five rows consume the 15 digit strings; repeated 200 times.
        data = rows * 5
        data = data.format(*floats) * 200  # 1000 x 3 strings csv
        self.s_data = StringIO(data)

    def time_read_csv(self, sep, decimal, float_precision):
        """Time the default engine with the requested ``float_precision``."""
        # Rewind: the buffer is built once in setup() but read on every call.
        self.s_data.seek(0)
        # ``decimal`` must be forwarded, otherwise the '_' variants are read
        # as plain strings and no float conversion is measured.
        read_csv(self.s_data, sep=sep, decimal=decimal, header=None,
                 names=list('abc'), float_precision=float_precision)

    def time_read_csv_python_engine(self, sep, decimal, float_precision):
        """Time the python engine on the same input.

        ``float_precision`` is deliberately fixed to None here.
        NOTE(review): presumably because that option only selects the C
        engine's converter — confirm against the ``read_csv`` docs.
        """
        self.s_data.seek(0)
        read_csv(self.s_data, sep=sep, decimal=decimal, header=None,
                 engine='python', float_precision=None, names=list('abc'))
| 208 | + |
| 209 | + |
class ReadCSVCategorical(BaseIO):
    """Compare two ways of obtaining categorical columns from a CSV file.

    ``setup`` writes a 100000 x 3 table of short strings drawn from five
    labels; one benchmark converts after reading, the other asks
    ``read_csv`` for ``dtype='category'`` directly.
    """

    goal_time = 0.2
    fname = '__test__.csv'

    def setup(self):
        """Write the string-valued CSV fixture to ``fname``."""
        n_rows = 100000
        labels = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
        values = np.random.choice(labels, (n_rows, 3))
        frame = DataFrame(values, columns=list('abc'))
        frame.to_csv(self.fname, index=False)

    def time_convert_post(self):
        """Read with default dtypes, then convert each column."""
        frame = read_csv(self.fname)
        frame.apply(Categorical)

    def time_convert_direct(self):
        """Let ``read_csv`` produce categorical columns itself."""
        read_csv(self.fname, dtype='category')
| 226 | + |
| 227 | + |
class ReadCSVParseDates(object):
    """Benchmark ``read_csv`` date parsing on a small in-memory table.

    The input has five rows and nine columns: a station code, a date, two
    times and five numeric fields. Columns are named ``'0'`` .. ``'8'``.
    """

    goal_time = 0.2

    def setup(self):
        """Build the in-memory CSV buffer shared by all timing calls."""
        # Rows are kept in a list and joined explicitly so the buffer holds
        # exactly five lines, with no blank lines or leading indentation.
        templates = [
            '{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000',
            '{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000',
            '{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000',
            '{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000',
            '{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000',
        ]
        # The placeholder expands to the first two columns (code, date).
        two_cols = 'KORD,19990127'
        data = '\n'.join(row.format(two_cols) for row in templates) + '\n'
        self.s_data = StringIO(data)

    def time_multiple_date(self):
        """Time combining column pairs (1, 2) and (1, 3) into dates."""
        # Rewind: the buffer is built once in setup() but read on every call.
        self.s_data.seek(0)
        read_csv(self.s_data, sep=',', header=None,
                 names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])

    def time_baseline(self):
        """Time parsing only column 1 as a date."""
        self.s_data.seek(0)
        read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
                 names=list(string.digits[:9]))
0 commit comments