Skip to content

Commit 200227e

Browse files
mroeschke authored and jreback committed
CLN: ASV categoricals benchmark (#18465)
1 parent 5cd4cb2 commit 200227e

File tree

1 file changed

+82
-51
lines changed

1 file changed

+82
-51
lines changed

asv_bench/benchmarks/categoricals.py

Lines changed: 82 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
1-
from .pandas_vb_common import *
1+
import numpy as np
2+
import pandas as pd
3+
import pandas.util.testing as tm
24
try:
35
from pandas.api.types import union_categoricals
46
except ImportError:
@@ -8,107 +10,136 @@
810
pass
911

1012

11-
class Categoricals(object):
13+
class Concat(object):
14+
1215
goal_time = 0.2
1316

1417
def setup(self):
15-
N = 100000
16-
self.s = pd.Series((list('aabbcd') * N)).astype('category')
18+
N = 10**5
19+
self.s = pd.Series(list('aabbcd') * N).astype('category')
20+
21+
self.a = pd.Categorical(list('aabbcd') * N)
22+
self.b = pd.Categorical(list('bbcdjk') * N)
23+
24+
def time_concat(self):
25+
pd.concat([self.s, self.s])
26+
27+
def time_union(self):
28+
union_categoricals([self.a, self.b])
29+
1730

18-
self.a = pd.Categorical((list('aabbcd') * N))
19-
self.b = pd.Categorical((list('bbcdjk') * N))
31+
class Constructor(object):
2032

33+
goal_time = 0.2
34+
35+
def setup(self):
36+
N = 10**5
2137
self.categories = list('abcde')
22-
self.cat_idx = Index(self.categories)
38+
self.cat_idx = pd.Index(self.categories)
2339
self.values = np.tile(self.categories, N)
2440
self.codes = np.tile(range(len(self.categories)), N)
2541

26-
self.datetimes = pd.Series(pd.date_range(
27-
'1995-01-01 00:00:00', periods=10000, freq='s'))
42+
self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00',
43+
periods=N / 10,
44+
freq='s'))
45+
self.datetimes_with_nat = self.datetimes.copy()
46+
self.datetimes_with_nat.iloc[-1] = pd.NaT
2847

2948
self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
3049
self.values_all_nan = [np.nan] * len(self.values)
3150

32-
def time_concat(self):
33-
concat([self.s, self.s])
34-
35-
def time_union(self):
36-
union_categoricals([self.a, self.b])
51+
def time_regular(self):
52+
pd.Categorical(self.values, self.categories)
3753

38-
def time_constructor_regular(self):
39-
Categorical(self.values, self.categories)
54+
def time_fastpath(self):
55+
pd.Categorical(self.codes, self.cat_idx, fastpath=True)
4056

41-
def time_constructor_fastpath(self):
42-
Categorical(self.codes, self.cat_idx, fastpath=True)
57+
def time_datetimes(self):
58+
pd.Categorical(self.datetimes)
4359

44-
def time_constructor_datetimes(self):
45-
Categorical(self.datetimes)
60+
def time_datetimes_with_nat(self):
61+
pd.Categorical(self.datetimes_with_nat)
4662

47-
def time_constructor_datetimes_with_nat(self):
48-
t = self.datetimes
49-
t.iloc[-1] = pd.NaT
50-
Categorical(t)
63+
def time_with_nan(self):
64+
pd.Categorical(self.values_some_nan)
5165

52-
def time_constructor_with_nan(self):
53-
Categorical(self.values_some_nan)
66+
def time_all_nan(self):
67+
pd.Categorical(self.values_all_nan)
5468

55-
def time_constructor_all_nan(self):
56-
Categorical(self.values_all_nan)
5769

70+
class ValueCounts(object):
5871

59-
class Categoricals2(object):
6072
goal_time = 0.2
6173

62-
def setup(self):
63-
n = 500000
74+
params = [True, False]
75+
param_names = ['dropna']
76+
77+
def setup(self, dropna):
78+
n = 5 * 10**5
6479
np.random.seed(2718281)
6580
arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
66-
self.ts = Series(arr).astype('category')
81+
self.ts = pd.Series(arr).astype('category')
82+
83+
def time_value_counts(self, dropna):
84+
self.ts.value_counts(dropna=dropna)
85+
6786

68-
self.sel = self.ts.loc[[0]]
87+
class Repr(object):
6988

70-
def time_value_counts(self):
71-
self.ts.value_counts(dropna=False)
89+
goal_time = 0.2
7290

73-
def time_value_counts_dropna(self):
74-
self.ts.value_counts(dropna=True)
91+
def setup(self):
92+
self.sel = pd.Series(['s1234']).astype('category')
7593

7694
def time_rendering(self):
7795
str(self.sel)
7896

97+
98+
class SetCategories(object):
99+
100+
goal_time = 0.2
101+
102+
def setup(self):
103+
n = 5 * 10**5
104+
np.random.seed(2718281)
105+
arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
106+
self.ts = pd.Series(arr).astype('category')
107+
79108
def time_set_categories(self):
80109
self.ts.cat.set_categories(self.ts.cat.categories[::2])
81110

82111

83-
class Categoricals3(object):
112+
class Rank(object):
113+
84114
goal_time = 0.2
85115

86116
def setup(self):
87-
N = 100000
117+
N = 10**5
88118
ncats = 100
119+
np.random.seed(1234)
89120

90-
self.s1 = Series(np.array(tm.makeCategoricalIndex(N, ncats)))
91-
self.s1_cat = self.s1.astype('category')
92-
self.s1_cat_ordered = self.s1.astype('category', ordered=True)
121+
self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
122+
self.s_str_cat = self.s_str.astype('category')
123+
self.s_str_cat_ordered = self.s_str.astype('category', ordered=True)
93124

94-
self.s2 = Series(np.random.randint(0, ncats, size=N))
95-
self.s2_cat = self.s2.astype('category')
96-
self.s2_cat_ordered = self.s2.astype('category', ordered=True)
125+
self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
126+
self.s_int_cat = self.s_int.astype('category')
127+
self.s_int_cat_ordered = self.s_int.astype('category', ordered=True)
97128

98129
def time_rank_string(self):
99-
self.s1.rank()
130+
self.s_str.rank()
100131

101132
def time_rank_string_cat(self):
102-
self.s1_cat.rank()
133+
self.s_str_cat.rank()
103134

104135
def time_rank_string_cat_ordered(self):
105-
self.s1_cat_ordered.rank()
136+
self.s_str_cat_ordered.rank()
106137

107138
def time_rank_int(self):
108-
self.s2.rank()
139+
self.s_int.rank()
109140

110141
def time_rank_int_cat(self):
111-
self.s2_cat.rank()
142+
self.s_int_cat.rank()
112143

113144
def time_rank_int_cat_ordered(self):
114-
self.s2_cat_ordered.rank()
145+
self.s_int_cat_ordered.rank()

0 commit comments

Comments
 (0)