Performance comparison#

This notebook compares the performance of cuDF and pandas. The comparisons performed are on identical data sizes. This notebook primarily showcases the factor of speedups users can have when the similar pandas APIs are run on GPUs using cudf.

The hardware details used to run these performance comparisons are at the end of this page.

Note: This notebook is written to measure performance on NVIDIA GPUs with large memory. If running on hardware with lower memory, please consider lowering the num_rows values. Performance results may vary by data size, as well as the CPU and GPU used.

import os
import time
import timeit
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import cudf

print(f"{cudf.__version__=}")
cudf.__version__='24.04.00'
np.random.seed(0)

Concat, count & joins performance#

timeit_number = 30
num_rows = 300_000_000
sub_sample = int(num_rows / 30)
pdf = pd.DataFrame(
    {
        "numbers": np.random.randint(-1000, 1000, num_rows, dtype="int64"),
        "business": np.random.choice(
            ["McD", "Buckees", "Walmart", "Costco"], size=num_rows
        ),
    }
)
pdf
numbers business
0 -316 Costco
1 -441 Costco
2 653 Buckees
3 216 Buckees
4 -165 Walmart
... ... ...
299999995 -395 Walmart
299999996 -653 Buckees
299999997 364 Buckees
299999998 159 Buckees
299999999 -501 Walmart

300000000 rows × 2 columns

gdf = cudf.from_pandas(pdf)
gdf
numbers business
0 -316 Costco
1 -441 Costco
2 653 Buckees
3 216 Buckees
4 -165 Walmart
... ... ...
299999995 -395 Walmart
299999996 -653 Buckees
299999997 364 Buckees
299999998 159 Buckees
299999999 -501 Walmart

300000000 rows × 2 columns

def timeit_pandas_cudf(pd_obj, gd_obj, func, **kwargs):
    """
    A utility function to measure execution time of an
    API(`func`) in pandas & cudf.

    Parameters
    ----------
    pd_obj : Pandas object
    gd_obj : cuDF object
    func : callable
    """
    pandas_time = timeit.timeit(lambda: func(pd_obj), **kwargs)
    cudf_time = timeit.timeit(lambda: func(gd_obj), **kwargs)
    return pandas_time, cudf_time
pandas_value_counts, cudf_value_counts = timeit_pandas_cudf(
    pdf, gdf, lambda df: df.value_counts(), number=timeit_number
)
pdf = pdf.head(sub_sample)
gdf = gdf.head(sub_sample)
pandas_concat = timeit.timeit(
    lambda: pd.concat([pdf, pdf, pdf]), number=timeit_number
)
cudf_concat = timeit.timeit(
    lambda: cudf.concat([gdf, gdf, gdf]), number=timeit_number
)
pandas_groupby, cudf_groupby = timeit_pandas_cudf(
    pdf,
    gdf,
    lambda df: df.groupby("business").agg(["min", "max", "mean"]),
    number=timeit_number,
)
pdf = pd.DataFrame(
    {
        "numbers": np.random.randint(
            -1000, 1000, int(sub_sample / 10), dtype="int64"
        ),
        "business": np.random.choice(
            ["McD", "Buckees", "Walmart", "Costco"], size=int(sub_sample / 10)
        ),
    }
)
gdf = cudf.from_pandas(pdf)
pandas_merge, cudf_merge = timeit_pandas_cudf(
    pdf, gdf, lambda df: df.merge(df), number=10
)
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_value_counts / cudf_value_counts,
            pandas_concat / cudf_concat,
            pandas_groupby / cudf_groupby,
            pandas_merge / cudf_merge,
        ],
    },
    index=["value_counts", "concat", "groupby", "merge"],
)
performance_df
cudf speedup vs. pandas
value_counts 168.465151
concat 29.828922
groupby 46.671713
merge 45.633230
def performance_plot(df, xlabel=None):
    # ylim is 20% above max value
    ylim_max = df["cudf speedup vs. pandas"].max() + (
        df["cudf speedup vs. pandas"].max() / 20
    )
    ax = df.plot.bar(
        color="#7400ff",
        ylim=(1, ylim_max),
        rot=0,
        xlabel=xlabel,
        ylabel="Speedup factor",
    )
    ax.bar_label(ax.containers[0], fmt="%.0f")
    plt.show()
performance_plot(performance_df, xlabel="Operation")
../../../_images/8b91e72ac5db6239e2ec4cdb66e683446858a20f9dd995499fd3b06f8e8c6511.png
# Cleaning up used memory for later benchmarks
del pdf
del gdf
import gc

_ = gc.collect()

Strings Performance#

timeit_number = 20
num_rows = 300_000_000
pd_series = pd.Series(
    np.random.choice(
        ["123", "56.234", "Walmart", "Costco", "rapids ai"], size=num_rows
    )
)
gd_series = cudf.from_pandas(pd_series)
pandas_upper, cudf_upper = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.str.upper(), number=timeit_number
)
pandas_contains, cudf_contains = timeit_pandas_cudf(
    pd_series,
    gd_series,
    lambda s: s.str.contains(r"[0-9][a-z]"),
    number=timeit_number,
)
pandas_isalpha, cudf_isalpha = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.str.isalpha(), number=timeit_number
)
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_upper / cudf_upper,
            pandas_contains / cudf_contains,
            pandas_isalpha / cudf_isalpha,
        ],
    },
    index=["upper", "contains", "isalpha"],
)
performance_df
cudf speedup vs. pandas
upper 376.502445
contains 405.030084
isalpha 1974.166058
performance_plot(performance_df, xlabel="String method")
../../../_images/c35e2cb850a05bd5ae52d4ba306b921d9b48703f5338ab95a5e14b83ac81bf34.png

User-defined function (UDF) performance (with JIT overhead)#

The first UDF runs include JIT compilation overhead, due to which the performance of first run and average of next few runs are compared separately.

num_rows = 10_000_000
pdf_age = pd.DataFrame(
    {
        "age": np.random.randint(0, 100, num_rows),
    }
)
pdf_age
age
0 6
1 28
2 29
3 81
4 69
... ...
9999995 38
9999996 95
9999997 19
9999998 67
9999999 29

10000000 rows × 1 columns

gdf_age = cudf.from_pandas(pdf_age)
gdf_age
age
0 6
1 28
2 29
3 81
4 69
... ...
9999995 38
9999996 95
9999997 19
9999998 67
9999999 29

10000000 rows × 1 columns

def age_udf(row):
    if row["age"] < 18:
        return 0
    elif 18 <= row["age"] < 20:
        return 1
    elif 20 <= row["age"] < 30:
        return 2
    elif 30 <= row["age"] < 40:
        return 3
    elif 40 <= row["age"] < 50:
        return 4
    elif 50 <= row["age"] < 60:
        return 5
    elif 60 <= row["age"] < 70:
        return 6
    else:
        return 7
pandas_int_udf, cudf_int_udf = timeit_pandas_cudf(
    pdf_age, gdf_age, lambda df: df.apply(age_udf, axis=1), number=1
)
def str_isupper_udf(row):
    if row.isupper():
        return 0
    else:
        return 1
pd_series = pd.Series(
    np.random.choice(["ABC", "abc", "hello world", "AI"], size=num_rows),
    name="strings",
)
pd_series
0                  ABC
1          hello world
2          hello world
3                   AI
4                   AI
              ...     
9999995    hello world
9999996            abc
9999997            ABC
9999998            ABC
9999999             AI
Name: strings, Length: 10000000, dtype: object
gd_series = cudf.from_pandas(pd_series)
gd_series
0                  ABC
1          hello world
2          hello world
3                   AI
4                   AI
              ...     
9999995    hello world
9999996            abc
9999997            ABC
9999998            ABC
9999999             AI
Name: strings, Length: 10000000, dtype: object
pandas_str_udf, cudf_str_udf = timeit_pandas_cudf(
    pd_series, gd_series, lambda s: s.apply(str_isupper_udf), number=1
)
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_int_udf / cudf_int_udf,
            pandas_str_udf / cudf_str_udf,
        ]
    },
    index=["Numeric", "String"],
)
performance_df
cudf speedup vs. pandas
Numeric 20.335476
String 8.280955

Below is the plot showing performance speedup in case of Numeric UDFs & String UDFs on their first runs.

performance_plot(performance_df, xlabel="UDF Kind")
../../../_images/5c921564ed60f4449f0d2797bb605ded164b22858c472b822a11d96610d78f5b.png

User-defined function (UDF) performance (without JIT overhead)#

timeit_number = 10
pandas_int_udf, cudf_int_udf = timeit_pandas_cudf(
    pdf_age,
    gdf_age,
    lambda df: df.apply(age_udf, axis=1),
    number=timeit_number,
)
pandas_str_udf, cudf_str_udf = timeit_pandas_cudf(
    pd_series,
    gd_series,
    lambda s: s.apply(str_isupper_udf),
    number=timeit_number,
)
performance_df = pd.DataFrame(
    {
        "cudf speedup vs. pandas": [
            pandas_int_udf / cudf_int_udf,
            pandas_str_udf / cudf_str_udf,
        ]
    },
    index=["Numeric", "String"],
)
performance_df
cudf speedup vs. pandas
Numeric 21377.625003
String 37.422872

Below is the plot showing performance speedup in case of Numeric UDFs & String UDFs on their consequent runs. In this case the speedup is massive because of no JIT overhead present.

performance_plot(performance_df, xlabel="UDF Kind")
../../../_images/9bb5a2b15088df5d5365c77eb079bdd3121c82297c939495d7b956dc80a89bd2.png

UDF Performance in GroupBy#

num_rows = 100_000_000
timeit_number = 10
pdf = pd.DataFrame()
pdf["key"] = np.random.randint(0, 2, num_rows)
pdf["val"] = np.random.randint(0, 7, num_rows)


def custom_formula_udf(df):
    df["out"] = df["key"] * df["val"] - 10
    return df


gdf = cudf.from_pandas(pdf)
pandas_udf_groupby, cudf_udf_groupby = timeit_pandas_cudf(
    pdf,
    gdf,
    lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
    number=timeit_number,
)
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
/tmp/ipykernel_948063/2864685541.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  lambda df: df.groupby(["key"], group_keys=False).apply(custom_formula_udf),
performance_df = pd.DataFrame(
    {"cudf speedup vs. pandas": [pandas_udf_groupby / cudf_udf_groupby]},
    index=["Grouped UDF"],
)
performance_df
cudf speedup vs. pandas
Grouped UDF 88.879055
performance_plot(performance_df)
../../../_images/40af9fc9fe3ef82a4d281bf5aced40d77bec982aa4bbdf268e0ead709a21c7b9.png

System Configuration#

CPU Configuration#

!lscpu
Architecture:           x86_64
  CPU op-mode(s):       32-bit, 64-bit
  Address sizes:        52 bits physical, 57 bits virtual
  Byte Order:           Little Endian
CPU(s):                 224
  On-line CPU(s) list:  0-223
Vendor ID:              GenuineIntel
  Model name:           Intel(R) Xeon(R) Platinum 8480CL
    CPU family:         6
    Model:              143
    Thread(s) per core: 2
    Core(s) per socket: 56
    Socket(s):          2
    Stepping:           7
    CPU max MHz:        3800.0000
    CPU min MHz:        800.0000
    BogoMIPS:           4000.00
    Flags:              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
                         cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
                         tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art ar
                        ch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc 
                        cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 mon
                        itor ds_cpl smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pc
                        id dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_ti
                        mer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch 
                        cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single inte
                        l_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced fsg
                        sbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rd
                        t_a avx512f avx512dq rdseed adx smap avx512ifma clflusho
                        pt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsave
                        opt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_
                        total cqm_mbm_local split_lock_detect avx_vnni avx512_bf
                        16 wbnoinvd dtherm ida arat pln pts hwp hwp_act_window h
                        wp_epp hwp_pkg_req avx512vbmi umip pku ospke waitpkg avx
                        512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg
                         tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemot
                        e movdiri movdir64b enqcmd fsrm md_clear serialize tsxld
                        trk pconfig arch_lbr amx_bf16 avx512_fp16 amx_tile amx_i
                        nt8 flush_l1d arch_capabilities
Caches (sum of all):    
  L1d:                  5.3 MiB (112 instances)
  L1i:                  3.5 MiB (112 instances)
  L2:                   224 MiB (112 instances)
  L3:                   210 MiB (2 instances)
NUMA:                   
  NUMA node(s):         2
  NUMA node0 CPU(s):    0-55,112-167
  NUMA node1 CPU(s):    56-111,168-223
Vulnerabilities:        
  Gather data sampling: Not affected
  Itlb multihit:        Not affected
  L1tf:                 Not affected
  Mds:                  Not affected
  Meltdown:             Not affected
  Mmio stale data:      Not affected
  Retbleed:             Not affected
  Spec rstack overflow: Not affected
  Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl 
                        and seccomp
  Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer 
                        sanitization
  Spectre v2:           Mitigation; Enhanced IBRS, IBPB conditional, RSB filling
                        , PBRSB-eIBRS SW sequence
  Srbds:                Not affected
  Tsx async abort:      Not affected

GPU Configuration#

!nvidia-smi
Wed Mar  6 12:35:15 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA H100 80G...  On   | 00000000:1B:00.0 Off |                    0 |
| N/A   32C    P0   119W / 700W |  44191MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80G...  On   | 00000000:43:00.0 Off |                    0 |
| N/A   31C    P0    72W / 700W |      0MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA H100 80G...  On   | 00000000:52:00.0 Off |                    0 |
| N/A   34C    P0    70W / 700W |      0MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA H100 80G...  On   | 00000000:61:00.0 Off |                    0 |
| N/A   34C    P0    71W / 700W |      0MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   4  NVIDIA H100 80G...  On   | 00000000:9D:00.0 Off |                    0 |
| N/A   34C    P0   121W / 700W |   3473MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   5  NVIDIA H100 80G...  On   | 00000000:C3:00.0 Off |                    0 |
| N/A   30C    P0    72W / 700W |      0MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   6  NVIDIA H100 80G...  On   | 00000000:D1:00.0 Off |                    0 |
| N/A   32C    P0    73W / 700W |      0MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   7  NVIDIA H100 80G...  On   | 00000000:DF:00.0 Off |                    0 |
| N/A   35C    P0    73W / 700W |      0MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A   2218749      C   ...onserver/bin/tritonserver    22062MiB |
|    0   N/A  N/A   2343426      C   ...onserver/bin/tritonserver    22122MiB |
|    4   N/A  N/A    948063      C   ...i/envs/cudfdev/bin/python     3468MiB |
+-----------------------------------------------------------------------------+