@@ -337,96 +337,3 @@ See `the NumPy documentation on byte order
337
337
<https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
338
338
details.
339
339
340
-
341
- Alternative to storing lists in Pandas DataFrame Cells
342
- ------------------------------------------------------
343
- Storing nested lists/arrays inside a pandas object should be avoided for performance and memory-use reasons. Instead, they should be "exploded" into a flat DataFrame structure.
344
-
345
- Example of exploding nested lists into a DataFrame:
346
-
347
- .. ipython:: python
348
-
349
- from collections import OrderedDict
350
- df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
351
- (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
352
- (' attribute x' , [' A' ,' B' ,' C' ])
353
- ])
354
- ))
355
- df
356
-
357
- nn = [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3
358
- nn
359
-
360
- # Step 1: Create an index with the "parent" columns to be included in the final DataFrame
361
- df2 = pd.concat([df[[' name' ,' opponent' ]], pd.DataFrame(nn)], axis = 1 )
362
- df2
363
-
364
- # Step 2: Transform the column with lists into Series, which become columns in a new DataFrame.
365
- # Note that only the index from the original df is retained -
366
- # any other columns in the original df are not part of the new df
367
- df3 = df2.set_index([' name' , ' opponent' ])
368
- df3
369
-
370
- # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
371
- # Note that at this point we have a Series, not a DataFrame
372
- ser = df3.stack()
373
- ser
374
-
375
- # Step 4: Drop the extraneous index level created by the stack
376
- ser.reset_index(level = 2 , drop = True , inplace = True )
377
- ser
378
-
379
- # Step 5: Create a DataFrame from the Series
380
- df4 = ser.to_frame(' nearest_neighbors' )
381
- df4
382
-
383
- # All steps chained into a single expression
384
- df4 = (df2.set_index([' name' , ' opponent' ])
385
- .stack()
386
- .reset_index(level = 2 , drop = True )
387
- .to_frame(' nearest_neighbors' ))
388
- df4
389
-
390
- Example of exploding a list embedded in a DataFrame:
391
-
392
- .. ipython:: python
393
-
394
- df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
395
- (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
396
- (' attribute x' , [' A' ,' B' ,' C' ]),
397
- (' nearest_neighbors' , [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3 )
398
- ])
399
- ))
400
-
401
- df
402
-
403
- # Step 1: Create an index with the "parent" columns to be included in the final DataFrame
404
- df2 = df.set_index([' name' , ' opponent' ])
405
- df2
406
-
407
- # Step 2: Transform the column with lists into Series, which become columns in a new DataFrame.
408
- # Note that only the index from the original df is retained -
409
- # any other columns in the original df are not part of the new df
410
- df3 = df2.nearest_neighbors.apply(pd.Series)
411
- df3
412
-
413
- # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
414
- # Note that at this point we have a Series, not a DataFrame
415
- ser = df3.stack()
416
- ser
417
-
418
- # Step 4: Drop the extraneous index level created by the stack
419
- ser.reset_index(level = 2 , drop = True , inplace = True )
420
- ser
421
-
422
- # Step 5: Create a DataFrame from the Series
423
- df4 = ser.to_frame(' nearest_neighbors' )
424
- df4
425
-
426
- # All steps chained into a single expression
427
- df4 = (df.set_index([' name' , ' opponent' ])
428
- .nearest_neighbors.apply(pd.Series)
429
- .stack()
430
- .reset_index(level = 2 , drop = True )
431
- .to_frame(' nearest_neighbors' ))
432
- df4
0 commit comments