RAPIDS GPU Acceleration
RAPIDS provides GPU-native replacements for common pandas and scikit-learn workflows.
| | |
|---|---|
| Data | Synthetic arrays and DataFrames |
| Task | DataFrame aggregation, classification, clustering |
| Framework | cuDF + cuML |
| Expected runtime | <1 min on GPU |
import numpy as np, time
from numba import cuda as numba_cuda
# Sanity check — the container's start.sh sets NUMBA_CUDA_DRIVER so numba-cuda
# can find libcuda.so injected by the NVIDIA runtime.
numba_cuda.select_device(0)
print(f"CUDA device : {numba_cuda.get_current_device().name.decode()}")
CUDA device : NVIDIA RTXA6000-24Q
import cudf, cuml
print(f"cuDF version : {cudf.__version__}")
cuDF version : 25.04.00
cuDF — pandas-compatible DataFrames on GPU
We compare a groupby().mean() on pandas vs cuDF across several dataset sizes.
import pandas as pd
import matplotlib.pyplot as plt
# Row counts to benchmark; cpu_t / gpu_t collect one wall-clock timing
# (in seconds) per entry of `sizes`, in the same order.
sizes = [100_000, 500_000, 2_000_000, 10_000_000]
cpu_t = []
gpu_t = []
value_cols = ["x", "y"]

for n in sizes:
    # Generate the synthetic columns once with NumPy and build both the
    # pandas and the cuDF frame from the same dict.
    data = dict(
        x=np.random.randn(n),
        y=np.random.randn(n),
        group=np.random.randint(0, 100, n),
    )
    df_pd = pd.DataFrame(data)
    df_gpu = cudf.DataFrame(data)

    # pandas: a single timed groupby-mean (no warm-up pass).
    t0 = time.perf_counter()
    _ = df_pd.groupby("group")[value_cols].mean()
    cpu_elapsed = time.perf_counter() - t0
    cpu_t.append(cpu_elapsed)

    # cuDF: run the identical expression once untimed as a warm-up, then
    # time a second run. Both runs end with .to_pandas(), so that
    # conversion is part of the measured interval.
    _ = df_gpu.groupby("group")[value_cols].mean().to_pandas()
    t0 = time.perf_counter()
    _ = df_gpu.groupby("group")[value_cols].mean().to_pandas()
    gpu_elapsed = time.perf_counter() - t0
    gpu_t.append(gpu_elapsed)

    # One summary row per size: both timings in milliseconds plus the ratio.
    print(f"n={n:>10,d} pandas={cpu_t[-1]*1000:8.1f} ms cuDF={gpu_t[-1]*1000:7.1f} ms speedup={cpu_t[-1]/gpu_t[-1]:5.1f}x")
# Two side-by-side panels: absolute timings on the left, speedup on the right.
fig, axes = plt.subplots(1, 2, figsize=(11, 4))
ax_time = axes[0]
ax_speedup = axes[1]

# Timings were recorded in seconds; plot them in milliseconds.
cpu_ms = [t * 1000 for t in cpu_t]
gpu_ms = [t * 1000 for t in gpu_t]
# Per-size ratio of pandas time to cuDF time.
speedups = [c / g for c, g in zip(cpu_t, gpu_t)]

# Left panel: log-log plot of groupby time against row count.
ax_time.loglog(sizes, cpu_ms, "o-", label="pandas")
ax_time.loglog(sizes, gpu_ms, "s-", label="cuDF")
ax_time.set_xlabel("# rows")
ax_time.set_ylabel("groupby time (ms)")
ax_time.set_title("groupby().mean()")
ax_time.legend()
ax_time.grid(True, which="both", alpha=0.3)

# Right panel: speedup against row count, log-scaled on the x axis only.
ax_speedup.semilogx(sizes, speedups, "o-", color="darkgreen")
ax_speedup.set_xlabel("# rows")
ax_speedup.set_ylabel("Speedup (pandas / cuDF)")
ax_speedup.set_title("GPU speedup vs CPU")
ax_speedup.grid(True, which="both", alpha=0.3)
plt.tight_layout(); plt.show()
n= 100,000 pandas= 4.0 ms cuDF= 2.9 ms speedup= 1.4x
n= 500,000 pandas= 11.4 ms cuDF= 4.8 ms speedup= 2.4x
n= 2,000,000 pandas= 37.7 ms cuDF= 4.7 ms speedup= 8.0x
n=10,000,000 pandas= 190.9 ms cuDF= 12.8 ms speedup= 15.0x
