1
- from .pandas_vb_common import *
2
- from pandas import melt , wide_to_long
1
+ from itertools import product
3
2
3
+ import numpy as np
4
+ from pandas import DataFrame , MultiIndex , date_range , melt , wide_to_long
5
+
6
+ from .pandas_vb_common import setup # noqa
7
+
8
+
9
class Melt(object):
    """Benchmark ``melt`` on a frame with three value columns and two id columns."""

    goal_time = 0.2

    def setup(self):
        """Build a 10000-row frame: random floats in A/B/C plus integer id1/id2."""
        n_rows = 10000
        frame = DataFrame(np.random.randn(n_rows, 3), columns=list('ABC'))
        frame['id1'] = np.random.randint(0, 10, n_rows)
        frame['id2'] = np.random.randint(100, 1000, n_rows)
        self.df = frame

    def time_melt_dataframe(self):
        """Time melting the frame while keeping id1/id2 as identifier columns."""
        melt(self.df, id_vars=['id1', 'id2'])
17
20
18
21
19
class Pivot(object):
    """Benchmark ``DataFrame.pivot`` on long-format hourly time-series data."""

    goal_time = 0.2

    def setup(self):
        """Build a long-format frame of 50 variables times 10000 hourly dates.

        The frame has three columns: ``value`` (random floats), ``variable``
        (integers 0..49, each repeated N times) and ``date`` (the hourly
        range tiled once per variable).
        """
        N = 10000
        index = date_range('1/1/2000', periods=N, freq='h')
        data = {'value': np.random.randn(N * 50),
                'variable': np.arange(50).repeat(N),
                'date': np.tile(index.values, 50)}
        self.df = DataFrame(data)

    def time_reshape_pivot_time_series(self):
        """Time pivoting the long frame to one column per variable."""
        # Pass the arguments by keyword: the positional form is not accepted
        # by every pandas release, while keywords work on all of them.
        self.df.pivot(index='date', columns='variable', values='value')
37
37
38
class SimpleReshape(object):
    """Benchmark ``stack``/``unstack`` on a small two-level MultiIndex frame."""

    goal_time = 0.2

    def setup(self):
        """Create a 10000x4 frame indexed by two integer levels, plus its unstacked form."""
        outer = np.arange(100).repeat(100)
        # The inner level is the tiled 0..99 sequence rotated by 25 positions.
        inner = np.roll(np.tile(np.arange(100), 100), 25)
        idx = MultiIndex.from_arrays([outer, inner])
        values = np.random.randn(10000, 4)
        self.df = DataFrame(values, index=idx)
        self.udf = self.df.unstack(1)

    def time_stack(self):
        """Time stacking the pre-unstacked frame."""
        self.udf.stack()

    def time_unstack(self):
        """Time unstacking index level 1 of the original frame."""
        self.df.unstack(1)
60
54
61
55
62
class Unstack(object):
    """Benchmark ``unstack`` on a large single-dtype frame with a product MultiIndex."""

    goal_time = 0.2

    def setup(self):
        """Create a (100*100) x 1000 integer frame and a copy missing its last row."""
        n_levels, n_cols = 100, 1000
        n_rows = n_levels * n_levels

        level_values = np.arange(n_levels)
        row_index = MultiIndex.from_product([level_values, level_values])
        col_labels = np.arange(n_cols)
        data = np.arange(n_rows * n_cols).reshape(n_rows, n_cols)

        self.df = DataFrame(data, row_index, col_labels)
        # Dropping the final row leaves the index one entry short of the full product.
        self.df2 = self.df.iloc[:-1]

    def time_full_product(self):
        """Time unstacking the frame whose index is the complete product."""
        self.df.unstack()

    def time_without_last_row(self):
        """Time unstacking the frame with the last row removed."""
        self.df2.unstack()
81
76
82
77
83
class SparseIndex(object):
    """Benchmark ``unstack`` on a frame whose five-level index sparsely covers its key space."""

    goal_time = 0.2

    def setup(self):
        """Build a 1000-row frame indexed by five random integer key columns.

        The key columns are drawn at random, so a draw can contain the same
        key tuple twice, and ``unstack`` cannot reshape a duplicated index.
        The frame is therefore redrawn, up to 10 times, until the index is
        unique. When the first draw is already unique the result is exactly
        the single-draw frame.
        """
        NUM_ROWS = 1000
        keys = ['A', 'B', 'C', 'D', 'E']
        for _ in range(10):
            frame = DataFrame({'A': np.random.randint(50, size=NUM_ROWS),
                               'B': np.random.randint(50, size=NUM_ROWS),
                               'C': np.random.randint(-10, 10, size=NUM_ROWS),
                               'D': np.random.randint(-10, 10, size=NUM_ROWS),
                               'E': np.random.randint(10, size=NUM_ROWS),
                               'F': np.random.randn(NUM_ROWS)})
            self.df = frame.set_index(keys)
            if self.df.index.is_unique:
                break

    def time_unstack(self):
        """Time unstacking the innermost index level."""
        self.df.unstack()
98
95
96
class WideToLong(object):
    """Benchmark ``wide_to_long`` on a wide frame of yearly stub columns."""

    goal_time = 0.2

    def setup(self):
        """Build a 5000-row wide frame.

        Columns are 20 filler columns labelled '0'..'19', then the stub
        columns 'A1'..'A20', 'B1'..'B20', 'C1'..'C20', 'D1'..'D20', and
        finally an 'id' column holding a copy of the row index.
        """
        nyrs = 20
        nidvars = 20
        N = 5000
        self.letters = list('ABCD')
        yrvars = [letter + str(num)
                  for letter, num in product(self.letters, range(1, nyrs + 1))]

        # Use string labels for the filler columns so every column label is a
        # string when wide_to_long matches stub names against the columns.
        # NOTE(review): mixed int/str labels appear to be rejected by newer
        # pandas releases -- confirm against the targeted version.
        columns = [str(i) for i in range(nidvars)] + yrvars
        self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)),
                            columns=columns)
        self.df['id'] = self.df.index

    def time_wide_to_long_big(self):
        """Time reshaping the stub columns to long format keyed by ('id', 'year')."""
        wide_to_long(self.df, self.letters, i='id', j='year')
120
114
121
115
122
116
class PivotTable (object ):
117
+
123
118
goal_time = 0.2
124
119
125
120
def setup (self ):
0 commit comments