Commit ca2a46b

Author: Patrick Park
Removed changes to gotchas.rst and simplified example in groupby.rst
1 parent fa94960 commit ca2a46b

2 files changed: +11 −114 lines changed


doc/source/gotchas.rst (−93 lines)

@@ -337,96 +337,3 @@ See `the NumPy documentation on byte order
 <https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
 details.
-
-Alternative to storing lists in Pandas DataFrame Cells
-------------------------------------------------------
-
-Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead they should be "exploded" into a flat DataFrame structure.
-
-Example of exploding nested lists into a DataFrame:
-
-.. ipython:: python
-
-   from collections import OrderedDict
-   df = (pd.DataFrame(OrderedDict([('name', ['A.J. Price']*3),
-                                   ('opponent', ['76ers', 'blazers', 'bobcats']),
-                                   ('attribute x', ['A','B','C'])
-                                   ])
-         ))
-   df
-
-   nn = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3
-   nn
-
-   # Step 1: Create an index with the "parent" columns to be included in the final DataFrame
-   df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nn)], axis=1)
-   df2
-
-   # Step 2: Transform the column with lists into series, which become columns in a new DataFrame.
-   # Note that only the index from the original df is retained -
-   # any other columns in the original df are not part of the new df
-   df3 = df2.set_index(['name', 'opponent'])
-   df3
-
-   # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
-   # Note that at this point we have a Series, not a DataFrame
-   ser = df3.stack()
-   ser
-
-   # Step 4: Drop the extraneous index level created by the stack
-   ser.reset_index(level=2, drop=True, inplace=True)
-   ser
-
-   # Step 5: Create a DataFrame from the Series
-   df4 = ser.to_frame('nearest_neighbors')
-   df4
-
-   # All steps in one chain
-   df4 = (df2.set_index(['name', 'opponent'])
-          .stack()
-          .reset_index(level=2, drop=True)
-          .to_frame('nearest_neighbors'))
-   df4
-
-Example of exploding a list embedded in a DataFrame:
-
-.. ipython:: python
-
-   df = (pd.DataFrame(OrderedDict([('name', ['A.J. Price']*3),
-                                   ('opponent', ['76ers', 'blazers', 'bobcats']),
-                                   ('attribute x', ['A','B','C']),
-                                   ('nearest_neighbors', [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3)
-                                   ])
-         ))
-
-   df
-
-   # Step 1: Create an index with the "parent" columns to be included in the final DataFrame
-   df2 = df.set_index(['name', 'opponent'])
-   df2
-
-   # Step 2: Transform the column with lists into series, which become columns in a new DataFrame.
-   # Note that only the index from the original df is retained -
-   # any other columns in the original df are not part of the new df
-   df3 = df2.nearest_neighbors.apply(pd.Series)
-   df3
-
-   # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
-   # Note that at this point we have a Series, not a DataFrame
-   ser = df3.stack()
-   ser
-
-   # Step 4: Drop the extraneous index level created by the stack
-   ser.reset_index(level=2, drop=True, inplace=True)
-   ser
-
-   # Step 5: Create a DataFrame from the Series
-   df4 = ser.to_frame('nearest_neighbors')
-   df4
-
-   # All steps in one chain
-   df4 = (df.set_index(['name', 'opponent'])
-          .nearest_neighbors.apply(pd.Series)
-          .stack()
-          .reset_index(level=2, drop=True)
-          .to_frame('nearest_neighbors'))
-   df4
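The explode pattern removed above can be run outside the Sphinx docs build as a plain script. A minimal sketch, assuming a recent pandas install; it uses a plain dict in place of OrderedDict (dicts preserve insertion order in Python 3.7+), and `flat` is an illustrative name, not one from the original docs:

```python
import pandas as pd

# Build a frame where each row embeds a list -- the pattern the
# removed gotchas section advises against.
df = pd.DataFrame({
    'name': ['A.J. Price'] * 3,
    'opponent': ['76ers', 'blazers', 'bobcats'],
    'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin',
                           'Nate Robinson', 'Isaia']] * 3,
})

# "Explode" the lists into a flat frame: index on the parent columns,
# expand each list into its own columns, stack those columns into rows,
# drop the synthetic index level added by stack, and name the result.
flat = (df.set_index(['name', 'opponent'])
          .nearest_neighbors.apply(pd.Series)
          .stack()
          .reset_index(level=2, drop=True)
          .to_frame('nearest_neighbors'))

print(flat.shape)  # 3 rows x 4 neighbors -> (12, 1)
```

This is the same chained form as the "All steps in one" version in the removed section, just condensed into a standalone script.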

doc/source/groupby.rst (+11 −21 lines)

@@ -1017,39 +1017,29 @@ The returned dtype of the grouped will *always* include *all* of the categories
    s.index.dtype

 .. note::
-    Decimal columns are also "nuisance" columns. They are excluded from aggregate functions automatically in groupby.
+    Decimal and object columns are also "nuisance" columns. They are excluded from aggregate functions automatically in groupby.

-If you do wish to include decimal columns in the aggregation, you must do so explicitly:
+If you do wish to include decimal or object columns in an aggregation with other non-nuisance data types, you must do so explicitly.

 .. ipython:: python

    from decimal import Decimal
    dec = pd.DataFrame(
-       {'name': ['foo', 'bar', 'foo', 'bar'],
-        'title': ['boo', 'far', 'boo', 'far'],
-        'id': [123, 456, 123, 456],
-        'int_column': [1, 2, 3, 4],
-        'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')],
-        'dec_column2': [Decimal('0.20'), Decimal('0.30'), Decimal('0.55'), Decimal('0.60')]
-       },
-       columns=['name','title','id','int_column','dec_column1','dec_column2']
-   )
-
-   dec.head()
-
-   dec.dtypes
-
-   # Decimal columns excluded from sum by default
-   dec.groupby(['name', 'title', 'id'], as_index=False).sum()
+       {'id': [123, 456, 123, 456],
+        'int_column': [1, 2, 3, 4],
+        'dec_column': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')]
+       },
+       columns=['id','int_column','dec_column']
+   )

    # Decimal columns can be sum'd explicitly by themselves...
-   dec.groupby(['name', 'title', 'id'], as_index=False)['dec_column1','dec_column2'].sum()
+   dec.groupby(['id'], as_index=False)['dec_column'].sum()

    # ...but cannot be combined with standard data types or they will be excluded
-   dec.groupby(['name', 'title', 'id'], as_index=False)['int_column','dec_column1','dec_column2'].sum()
+   dec.groupby(['id'], as_index=False)['int_column','dec_column'].sum()

    # Use .agg function to aggregate over standard and "nuisance" data types at the same time
-   dec.groupby(['name', 'title', 'id'], as_index=False).agg({'int_column': 'sum', 'dec_column1': 'sum', 'dec_column2': 'sum'})
+   dec.groupby(['id'], as_index=False).agg({'int_column': 'sum', 'dec_column': 'sum'})

 .. _groupby.missing:
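The diff's claim that nuisance columns are excluded silently reflects pandas behavior at the time this commit was written; later releases deprecated the silent exclusion. The explicit-selection and `.agg` approaches, however, remain portable. A minimal sketch, mirroring the simplified example's names and assuming a reasonably recent pandas:

```python
from decimal import Decimal

import pandas as pd

dec = pd.DataFrame({
    'id': [123, 456, 123, 456],
    'int_column': [1, 2, 3, 4],
    'dec_column': [Decimal('0.50'), Decimal('0.15'),
                   Decimal('0.25'), Decimal('0.40')],
})

# Selecting the Decimal column explicitly makes groupby aggregate it,
# even though it is stored with object dtype.
out = dec.groupby('id', as_index=False)['dec_column'].sum()

# .agg aggregates the integer and Decimal columns in one call.
both = dec.groupby('id', as_index=False).agg(
    {'int_column': 'sum', 'dec_column': 'sum'})
```

Here `out` holds one summed Decimal per `id` group, and `both` carries the integer and Decimal sums side by side.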
