Skip to content

CLN: ASV categoricals benchmark #18465

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 25, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 82 additions & 51 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .pandas_vb_common import *
import numpy as np
import pandas as pd
import pandas.util.testing as tm
try:
from pandas.api.types import union_categoricals
except ImportError:
Expand All @@ -8,107 +10,136 @@
pass


class Concat(object):
    """Benchmarks for combining categorical data.

    Times ``pd.concat`` on two categorical Series and
    ``union_categoricals`` on two Categoricals.
    """

    goal_time = 0.2

    def setup(self):
        """Build the categorical inputs used by the timed methods."""
        N = 10**5
        self.s = pd.Series(list('aabbcd') * N).astype('category')

        # Two Categoricals built from different letter sets ('aabbcd' vs
        # 'bbcdjk'), so their categories only partly overlap.
        self.a = pd.Categorical(list('aabbcd') * N)
        self.b = pd.Categorical(list('bbcdjk') * N)

    def time_concat(self):
        """Concatenate the categorical Series with itself."""
        pd.concat([self.s, self.s])

    def time_union(self):
        """Union the two Categoricals."""
        union_categoricals([self.a, self.b])


class Constructor(object):
    """Benchmarks for building ``pd.Categorical`` from various inputs."""

    goal_time = 0.2

    def setup(self):
        """Prepare every input up front so timed methods only construct."""
        N = 10**5
        self.categories = list('abcde')
        self.cat_idx = pd.Index(self.categories)
        self.values = np.tile(self.categories, N)
        self.codes = np.tile(range(len(self.categories)), N)

        # Floor division keeps ``periods`` an integer; ``N / 10`` is a
        # float under true division.
        self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00',
                                                 periods=N // 10,
                                                 freq='s'))
        # The NaT variant is a separate copy, so the timed method does not
        # mutate ``self.datetimes``.
        self.datetimes_with_nat = self.datetimes.copy()
        self.datetimes_with_nat.iloc[-1] = pd.NaT

        # Plain list repetition keeps NaN as a real float.  ``np.tile`` on a
        # mixed str/float list coerces everything to a string array, turning
        # NaN into the text 'nan'.
        self.values_some_nan = (self.categories + [np.nan]) * N
        self.values_all_nan = [np.nan] * len(self.values)

    def time_regular(self):
        """Construct from values plus an explicit category list."""
        pd.Categorical(self.values, self.categories)

    def time_fastpath(self):
        """Construct from integer codes and an Index with ``fastpath=True``."""
        pd.Categorical(self.codes, self.cat_idx, fastpath=True)

    def time_datetimes(self):
        """Construct from a datetime Series."""
        pd.Categorical(self.datetimes)

    def time_datetimes_with_nat(self):
        """Construct from a datetime Series whose last value is NaT."""
        pd.Categorical(self.datetimes_with_nat)

    def time_with_nan(self):
        """Construct from a list mixing string labels and NaN."""
        pd.Categorical(self.values_some_nan)

    def time_all_nan(self):
        """Construct from a list containing only NaN."""
        pd.Categorical(self.values_all_nan)

class ValueCounts(object):
    """Benchmark ``value_counts`` on a categorical Series.

    Parametrised over ``dropna`` (True and False) through the ``params``
    and ``param_names`` class attributes.
    """

    goal_time = 0.2

    params = [True, False]
    param_names = ['dropna']

    def setup(self, dropna):
        """Build a seeded categorical Series of 's%04d'-formatted labels.

        ``dropna`` is not used here; it is only consumed by the timed
        method.
        """
        n = 5 * 10**5
        np.random.seed(2718281)
        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
        self.ts = pd.Series(arr).astype('category')

    def time_value_counts(self, dropna):
        """Count values, passing ``dropna`` through to ``value_counts``."""
        self.ts.value_counts(dropna=dropna)


class Repr(object):
    """Benchmark rendering a one-element categorical Series as a string."""

    goal_time = 0.2

    def setup(self):
        """Create the single-element categorical Series to render."""
        self.sel = pd.Series(['s1234']).astype('category')

    def time_rendering(self):
        """Render the Series with ``str``."""
        str(self.sel)


class SetCategories(object):
    """Benchmark ``Series.cat.set_categories`` with half the categories."""

    goal_time = 0.2

    def setup(self):
        """Build a seeded categorical Series of 's%04d'-formatted labels."""
        n = 5 * 10**5
        np.random.seed(2718281)
        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
        self.ts = pd.Series(arr).astype('category')

    def time_set_categories(self):
        """Reset the categories to every second existing category."""
        self.ts.cat.set_categories(self.ts.cat.categories[::2])


class Rank(object):
    """Benchmark ``Series.rank`` on string and integer data.

    Each kind of data is ranked in three forms: as a plain Series, as an
    unordered categorical, and as an ordered categorical.
    """

    goal_time = 0.2

    def setup(self):
        """Build the six Series that the timed methods rank."""
        N = 10**5
        ncats = 100
        np.random.seed(1234)

        # NOTE(review): ``tm`` is the file-level pandas testing module;
        # makeCategoricalIndex(N, ncats) presumably yields N labels drawn
        # from ncats categories -- confirm against the helper.
        self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
        self.s_str_cat = self.s_str.astype('category')
        # ``.cat.as_ordered()`` replaces ``astype('category', ordered=True)``,
        # whose ``ordered`` keyword is deprecated in pandas.
        self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered()

        self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
        self.s_int_cat = self.s_int.astype('category')
        self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered()

    def time_rank_string(self):
        self.s_str.rank()

    def time_rank_string_cat(self):
        self.s_str_cat.rank()

    def time_rank_string_cat_ordered(self):
        self.s_str_cat_ordered.rank()

    def time_rank_int(self):
        self.s_int.rank()

    def time_rank_int_cat(self):
        self.s_int_cat.rank()

    def time_rank_int_cat_ordered(self):
        self.s_int_cat_ordered.rank()