Skip to content

CLN: ASV reshape #18944

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
3 commits merged on Dec 26, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 53 additions & 58 deletions asv_bench/benchmarks/reshape.py
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,16 @@
from .pandas_vb_common import *
from pandas import melt, wide_to_long
from itertools import product

import numpy as np
from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long

from .pandas_vb_common import setup # noqa


class Melt(object):

class melt_dataframe(object):
goal_time = 0.2

def setup(self):
self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C'])
self.df['id1'] = np.random.randint(0, 10, 10000)
self.df['id2'] = np.random.randint(100, 1000, 10000)
Expand All
@@ -16,110 +19,102 @@ def time_melt_dataframe(self):
melt(self.df, id_vars=['id1', 'id2'])


class reshape_pivot_time_series(object):
class Pivot(object):

goal_time = 0.2

def setup(self):
self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
self.index = date_range('1/1/2000', periods=10000, freq='h')
self.df = DataFrame(randn(10000, 50), index=self.index, columns=range(50))
self.pdf = self.unpivot(self.df)
self.f = (lambda : self.pdf.pivot('date', 'variable', 'value'))
N = 10000
index = date_range('1/1/2000', periods=N, freq='h')
data = {'value': np.random.randn(N * 50),
'variable': np.arange(50).repeat(N),
'date': np.tile(index.values, 50)}
self.df = DataFrame(data)

def time_reshape_pivot_time_series(self):
self.f()
self.df.pivot('date', 'variable', 'value')

def unpivot(self, frame):
(N, K) = frame.shape
self.data = {'value': frame.values.ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K), }
return DataFrame(self.data, columns=['date', 'variable', 'value'])

class SimpleReshape(object):

class reshape_stack_simple(object):
goal_time = 0.2

def setup(self):
self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
arrays = [np.arange(100).repeat(100),
np.roll(np.tile(np.arange(100), 100), 25)]
index = MultiIndex.from_arrays(arrays)
self.df = DataFrame(np.random.randn(10000, 4), index=index)
self.udf = self.df.unstack(1)

def time_reshape_stack_simple(self):
def time_stack(self):
self.udf.stack()


class reshape_unstack_simple(object):
goal_time = 0.2

def setup(self):
self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
self.df = DataFrame(np.random.randn(10000, 4), index=self.index)

def time_reshape_unstack_simple(self):
def time_unstack(self):
self.df.unstack(1)


class reshape_unstack_large_single_dtype(object):
class Unstack(object):

goal_time = 0.2

def setup(self):
m = 100
n = 1000

levels = np.arange(m)
index = pd.MultiIndex.from_product([levels]*2)
index = MultiIndex.from_product([levels] * 2)
columns = np.arange(n)
values = np.arange(m*m*n).reshape(m*m, n)
self.df = pd.DataFrame(values, index, columns)
values = np.arange(m * m * n).reshape(m * m, n)
self.df = DataFrame(values, index, columns)
self.df2 = self.df.iloc[:-1]

def time_unstack_full_product(self):
def time_full_product(self):
self.df.unstack()

def time_unstack_with_mask(self):
def time_without_last_row(self):
self.df2.unstack()


class unstack_sparse_keyspace(object):
class SparseIndex(object):

goal_time = 0.2

def setup(self):
self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
self.NUM_ROWS = 1000
for iter in range(10):
self.df = DataFrame({'A': np.random.randint(50, size=self.NUM_ROWS), 'B': np.random.randint(50, size=self.NUM_ROWS), 'C': np.random.randint((-10), 10, size=self.NUM_ROWS), 'D': np.random.randint((-10), 10, size=self.NUM_ROWS), 'E': np.random.randint(10, size=self.NUM_ROWS), 'F': np.random.randn(self.NUM_ROWS), })
self.idf = self.df.set_index(['A', 'B', 'C', 'D', 'E'])
if (len(self.idf.index.unique()) == self.NUM_ROWS):
break
NUM_ROWS = 1000
self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS),
'B': np.random.randint(50, size=NUM_ROWS),
'C': np.random.randint(-10, 10, size=NUM_ROWS),
'D': np.random.randint(-10, 10, size=NUM_ROWS),
'E': np.random.randint(10, size=NUM_ROWS),
'F': np.random.randn(NUM_ROWS)})
self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E'])

def time_unstack(self):
self.df.unstack()

def time_unstack_sparse_keyspace(self):
self.idf.unstack()

class WideToLong(object):

class wide_to_long_big(object):
goal_time = 0.2

def setup(self):
vars = 'ABCD'
nyrs = 20
nidvars = 20
N = 5000
yrvars = []
for var in vars:
for yr in range(1, nyrs + 1):
yrvars.append(var + str(yr))
self.letters = list('ABCD')
yrvars = [l + str(num)
for l, num in product(self.letters, range(1, nyrs + 1))]

self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)),
columns=list(range(nidvars)) + yrvars)
self.vars = vars
self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)),
columns=list(range(nidvars)) + yrvars)
self.df['id'] = self.df.index

def time_wide_to_long_big(self):
self.df['id'] = self.df.index
wide_to_long(self.df, list(self.vars), i='id', j='year')
wide_to_long(self.df, self.letters, i='id', j='year')


class PivotTable(object):

goal_time = 0.2

def setup(self):
Expand Down