Skip to content

PERF: concat slow, manual concat through reindexing enhances performance #50652

Closed
@bckim1318

Description

@bckim1318

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this issue exists on the latest version of pandas.

  • I have confirmed this issue exists on the main branch of pandas.

Reproducible Example

from itertools import product
import time
import numpy as np
import pandas as pd

NUM_ROWS = [100, 150, 200]
NUM_COLUMNS = [1000, 10000, 50000]
NUM_DFS = [3, 5, 7, 9]
test_cases = product(NUM_ROWS, NUM_COLUMNS, NUM_DFS)

def manual_concat(df_list):
    columns = [col for df in df_list for col in df.columns]
    columns = list(set(columns))
    index = np.hstack([df.index.values for df in df_list])
    df_list = [df.reindex(columns=columns) for df in df_list]
    values = np.vstack([df.values for df in df_list])
    return pd.DataFrame(values, index=index, columns=columns)

def compare_dataframes(df1, df2, margin=1e-4):
    return abs(df1.fillna(0) - df2.fillna(0)[df1.columns]).sum().sum() < margin

def generate_dataframes(num_dfs, num_rows, num_cols, all_cols):
    df_list = []
    for i in range(num_dfs):
        index = ['i%d'%i for i in range(i*num_rows, (i+1)*num_rows)]
        columns = np.random.choice(all_cols, num_cols, replace=False)
        values = np.random.uniform(-100, 100, [num_rows, num_cols])
        df_list.append(pd.DataFrame(values, index=index, columns=columns))
    return df_list

for ti, t in enumerate(test_cases):
    num_rows = t[0]
    num_cols = t[1]
    num_dfs = t[2]
    num_all_cols = num_cols * num_dfs * 4 // 5
    all_cols = ['c%i'%i for i in range(num_all_cols)]
    df_list = generate_dataframes(num_dfs, num_rows, num_cols, all_cols)
    
    t1 = time.time()
    df_pandas = pd.concat(df_list)
    t2 = time.time()
    df_manual = manual_concat(df_list)
    t3 = time.time()

    print('Testcase %d'%(ti + 1))
    print('NUM_ROWS: %d, NUM_COLS: %d, NUM_DFS: %d'%(num_rows, num_cols, num_dfs))
    print('Pandas: %.2f'%(t2-t1))
    print('Manual: %.2f'%(t3-t2))
    print(compare_dataframes(df_pandas, df_manual))

output:

Testcase 1
NUM_ROWS: 100, NUM_COLS: 1000, NUM_DFS: 3
Pandas: 0.07
Manual: 0.01
True
Testcase 2
NUM_ROWS: 100, NUM_COLS: 1000, NUM_DFS: 5
Pandas: 0.17
Manual: 0.01
True
Testcase 3
NUM_ROWS: 100, NUM_COLS: 1000, NUM_DFS: 7
Pandas: 0.28
Manual: 0.03
True
Testcase 4
NUM_ROWS: 100, NUM_COLS: 1000, NUM_DFS: 9
Pandas: 0.44
Manual: 0.05
True
Testcase 5
NUM_ROWS: 100, NUM_COLS: 10000, NUM_DFS: 3
Pandas: 0.76
Manual: 0.06
True
Testcase 6
NUM_ROWS: 100, NUM_COLS: 10000, NUM_DFS: 5
Pandas: 1.82
Manual: 0.13
True
Testcase 7
NUM_ROWS: 100, NUM_COLS: 10000, NUM_DFS: 7
Pandas: 3.21
Manual: 0.28
True
Testcase 8
NUM_ROWS: 100, NUM_COLS: 10000, NUM_DFS: 9
Pandas: 4.85
Manual: 0.44
True
Testcase 9
NUM_ROWS: 100, NUM_COLS: 50000, NUM_DFS: 3
Pandas: 4.21
Manual: 0.34
True
Testcase 10
NUM_ROWS: 100, NUM_COLS: 50000, NUM_DFS: 5
Pandas: 9.84
Manual: 0.77
True
Testcase 11
NUM_ROWS: 100, NUM_COLS: 50000, NUM_DFS: 7
Pandas: 16.49
Manual: 1.67
True
Testcase 12
NUM_ROWS: 100, NUM_COLS: 50000, NUM_DFS: 9
Pandas: 25.20
Manual: 2.77
True
Testcase 13
NUM_ROWS: 150, NUM_COLS: 1000, NUM_DFS: 3
Pandas: 0.20
Manual: 0.15
True
Testcase 14
NUM_ROWS: 150, NUM_COLS: 1000, NUM_DFS: 5
Pandas: 0.17
Manual: 0.02
True
Testcase 15
NUM_ROWS: 150, NUM_COLS: 1000, NUM_DFS: 7
Pandas: 0.30
Manual: 0.04
True
Testcase 16
NUM_ROWS: 150, NUM_COLS: 1000, NUM_DFS: 9
Pandas: 0.43
Manual: 0.06
True
Testcase 17
NUM_ROWS: 150, NUM_COLS: 10000, NUM_DFS: 3
Pandas: 0.73
Manual: 0.08
True
Testcase 18
NUM_ROWS: 150, NUM_COLS: 10000, NUM_DFS: 5
Pandas: 1.87
Manual: 0.21
True
Testcase 19
NUM_ROWS: 150, NUM_COLS: 10000, NUM_DFS: 7
Pandas: 3.25
Manual: 0.40
True
Testcase 20
NUM_ROWS: 150, NUM_COLS: 10000, NUM_DFS: 9
Pandas: 4.95
Manual: 0.62
True
Testcase 21
NUM_ROWS: 150, NUM_COLS: 50000, NUM_DFS: 3
Pandas: 4.26
Manual: 0.45
True
Testcase 22
NUM_ROWS: 150, NUM_COLS: 50000, NUM_DFS: 5
Pandas: 9.86
Manual: 1.23
True
Testcase 23
NUM_ROWS: 150, NUM_COLS: 50000, NUM_DFS: 7
Pandas: 17.29
Manual: 2.35
True
Testcase 24
NUM_ROWS: 150, NUM_COLS: 50000, NUM_DFS: 9
Pandas: 25.49
Manual: 3.79
True
Testcase 25
NUM_ROWS: 200, NUM_COLS: 1000, NUM_DFS: 3
Pandas: 0.22
Manual: 0.25
True
Testcase 26
NUM_ROWS: 200, NUM_COLS: 1000, NUM_DFS: 5
Pandas: 0.18
Manual: 0.03
True
Testcase 27
NUM_ROWS: 200, NUM_COLS: 1000, NUM_DFS: 7
Pandas: 0.34
Manual: 0.05
True
Testcase 28
NUM_ROWS: 200, NUM_COLS: 1000, NUM_DFS: 9
Pandas: 0.44
Manual: 0.08
True
Testcase 29
NUM_ROWS: 200, NUM_COLS: 10000, NUM_DFS: 3
Pandas: 0.74
Manual: 0.10
True
Testcase 30
NUM_ROWS: 200, NUM_COLS: 10000, NUM_DFS: 5
Pandas: 1.98
Manual: 0.26
True
Testcase 31
NUM_ROWS: 200, NUM_COLS: 10000, NUM_DFS: 7
Pandas: 3.28
Manual: 0.49
True
Testcase 32
NUM_ROWS: 200, NUM_COLS: 10000, NUM_DFS: 9
Pandas: 4.97
Manual: 0.78
True
Testcase 33
NUM_ROWS: 200, NUM_COLS: 50000, NUM_DFS: 3
Pandas: 4.44
Manual: 0.56
True
Testcase 34
NUM_ROWS: 200, NUM_COLS: 50000, NUM_DFS: 5
Pandas: 9.86
Manual: 1.43
True
Testcase 35
NUM_ROWS: 200, NUM_COLS: 50000, NUM_DFS: 7
Pandas: 17.43
Manual: 2.83
True
Testcase 36
NUM_ROWS: 200, NUM_COLS: 50000, NUM_DFS: 9
Pandas: 25.68
Manual: 5.25
True

Installed Versions

INSTALLED VERSIONS

commit : 8dab54d
python : 3.10.8.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.19045
machine : AMD64
processor : AMD64 Family 25 Model 33 Stepping 0, AuthenticAMD
byteorder : little
LC_ALL : None
LANG : None
LOCALE : Korean_Korea.949

pandas : 1.5.2
numpy : 1.24.1
pytz : 2022.7
dateutil : 2.8.2
setuptools : 63.2.0
pip : 22.3.1
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : None
pandas_datareader: None
bs4 : None
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
zstandard : None
tzdata : None

Prior Performance

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    PerformanceMemory or execution speed performanceReshapingConcat, Merge/Join, Stack/Unstack, Explode

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions