Closed
Description
I have reached a corner case in the wonderful groupby().apply() method. A groupby with only one group causes the apply method to return the wrong output shape. Instead of a Series with a multi-index, the result is returned as a DataFrame with the last row index level as columns. 😦
In [1]: import numpy as np
...: import pandas as pd
...: from sklearn.cluster import DBSCAN as DBSCAN
...:
...: print pd.__version__
...:
...: # Generate Test DataFrame
...: NUM_ROWS = 1000
...: NUM_COLS = 10
...: col_names = ['A'+num for num in map(str,np.arange(NUM_COLS).tolist())]
...: index_cols = col_names[:5]
...:
...: # Set DataFrame to have 5 level Hierarchical Index.
...: # Sort the index!
...: df = pd.DataFrame(np.random.randint(5, size=(NUM_ROWS,NUM_COLS)), dtype=np.int64, columns=col_names)
...: df = df.set_index(index_cols).sort_index()
...: df
...:
...: # Group by first 4 index columns.
...: grp = df.groupby(level=index_cols[:4])
...:
...: # Find index of largest group.
...: big_loc = grp.size().idxmax()
...:
...: # Create function to apply clustering on groups
...: def grp_func(df):
...: """Run clustering on subgroup and return series of results."""
...: db = DBSCAN(eps=1, min_samples=1, metric='euclidean').fit(df.values)
...: return pd.Series(db.labels_, name='cluster_id', index=df.index.get_level_values(4))
...:
0.12.0
In [2]: # Apply clustering on each subgroup of DataFrame
...: out_good = grp.apply(grp_func)
...: out_good
...: out_good.shape
Out[2]: (1000L,)
In [3]: # Select out biggest group while keeping index levels and try same apply
...: out_bad = df[[big_loc == a[:4] for a in df.index.values]].groupby(level=index_cols[:4]).apply(grp_func)
...: out_bad
...: out_bad.shape
Out[3]: (1, 7)
In [4]: out_good
Out[4]:
A0 A1 A2 A3 A4
0 0 0 0 0 1
3 0
1 3 0
2 1 1
3 2
3 0
3 3 1
3 0
1 1 1 0
2 0 0
2 1
4 2
3 4 0
4 1
4 0 0
...
4 4 3 0 2 0
1 1 1
3 2
4 0
3 1 0
2 1
4 4 0
4 0 4 0
1 1 3
1 1
2 2
2 0
3 1 1
3 0
4 4 0
Name: cluster_id, Length: 1000, dtype: float64
In [5]: out_bad
Out[5]:
A4 1 1 2 3 3 3 4
A0 A1 A2 A3
3 0 0 3 6 5 3 0 1 4 2
# If you stack the bad result it comes out looking OK, but now I need a workaround for this corner case to use apply.
In [17]: out_bad.stack()
Out[17]: A0 A1 A2 A3 A4
3 0 0 3 1 3
1 6
2 5
3 1
3 4
3 0
4 2
dtype: float64