scikit-learn vs cuML - DEDL Notebook Gallery

Compare CPU scikit-learn estimators with GPU-backed cuML equivalents using similar APIs.


Data	Synthetic classification and clustering datasets
Task	Random Forest, KNN, K-Means
Framework	scikit-learn + cuML
Expected runtime	~2 min (CPU baselines dominate; the cuML cells require an NVIDIA GPU)

import sklearn, cuml
import numpy as np, matplotlib.pyplot as plt, time
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    ConfusionMatrixDisplay, classification_report, confusion_matrix,
)

# Record the library versions this run was produced with, one aligned
# line per package, emitted in a single write.
version_report = (
    f"scikit-learn : {sklearn.__version__}",
    f"cuML         : {cuml.__version__}",
)
print("\n".join(version_report))

scikit-learn : 1.8.0
cuML         : 25.04.00

Dataset

# Synthetic 3-class problem: 200k rows, 20 features (10 informative),
# two clusters per class, fixed seed.
X, y = make_classification(
    n_samples=200_000,
    n_features=20,
    n_informative=10,
    n_classes=3,
    n_clusters_per_class=2,
    random_state=42,
)

# Hold out 20% of the rows for evaluation.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# Standardise using statistics learned on the training split only, then
# reuse the fitted scaler for the test split.
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr).astype(np.float32)
X_te_s = scaler.transform(X_te).astype(np.float32)

# float32 copies of the labels; these are what the cuML fits in this
# file are given, while the scikit-learn fits use the original labels.
y_tr_f = y_tr.astype(np.float32)
y_te_f = y_te.astype(np.float32)

print(f"Train: {X_tr_s.shape}  |  Test: {X_te_s.shape}")

Train: (160000, 20)  |  Test: (40000, 20)

1. Random Forest — sklearn (CPU) vs cuML (GPU)

from sklearn.ensemble import RandomForestClassifier
from cuml.ensemble import RandomForestClassifier as cuRF

# --- CPU baseline -----------------------------------------------------------
# The timed region covers estimator construction and fit(); scoring on the
# held-out split happens after the timer has stopped.
t0 = time.perf_counter()
rf_cpu = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf_cpu.fit(X_tr_s, y_tr)
t_cpu = time.perf_counter() - t0
acc_cpu = rf_cpu.score(X_te_s, y_te)

# --- GPU run ----------------------------------------------------------------
# Same timed region (construction + fit) so the two wall-clock numbers are
# directly comparable. The cuML estimator is given the float32 labels.
# NOTE(review): only n_estimators and random_state are matched between the
# two estimators; every other hyperparameter is left at each library's own
# default. Confirm those defaults are comparable before attributing the
# accuracy gap to CPU vs GPU.
# The recorded output shows cuML emitting a UserWarning here that recommends
# n_streams=1 for reproducible results when random_state is set.
t0 = time.perf_counter()
rf_gpu = cuRF(n_estimators=100, random_state=42)
rf_gpu.fit(X_tr_s, y_tr_f)
t_gpu = time.perf_counter() - t0
# float() guards the ":>10.4f" format below in case score() does not return
# a plain Python float -- TODO confirm the actual return type.
acc_gpu = float(rf_gpu.score(X_te_s, y_te_f))

# Small fixed-width results table; the speedup is CPU seconds / GPU seconds.
print(f"{'':20s}  {'Time':>8s}  {'Accuracy':>10s}")
print(f"{'sklearn (CPU)':20s}  {t_cpu:>7.2f}s  {acc_cpu:>10.4f}")
print(f"{'cuML    (GPU)':20s}  {t_gpu:>7.2f}s  {acc_gpu:>10.4f}  (speedup {t_cpu/t_gpu:.1f}x)")

/opt/conda/lib/python3.12/site-packages/cuml/internals/api_decorators.py:317: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return init_func(self, *args, **kwargs)

                          Time    Accuracy
sklearn (CPU)           15.50s      0.9508
cuML    (GPU)            1.20s      0.9338  (speedup 12.9x)

Random Forest scaling — speedup grows with rows

def _seconds_to_fit(make_estimator, features, labels):
    """Return wall-clock seconds spent constructing and fitting one estimator.

    ``make_estimator`` is a zero-argument callable so that construction is
    part of the timed region, followed by ``fit(features, labels)``.
    """
    started = time.perf_counter()
    make_estimator().fit(features, labels)
    return time.perf_counter() - started


sizes = [25_000, 50_000, 100_000, 250_000]
cpu_t, gpu_t = [], []
for n in sizes:
    # Fresh synthetic problem with n rows; features are cast to float32 and
    # a float32 copy of the labels is made for the cuML fit.
    Xs, ys = make_classification(
        n_samples=n, n_features=20, n_informative=10,
        n_classes=3, n_clusters_per_class=2, random_state=42,
    )
    Xs = Xs.astype("float32")
    ys_f = ys.astype("float32")

    # 50-tree forests on both sides; CPU first, then GPU.
    cpu_t.append(_seconds_to_fit(
        lambda: RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
        Xs, ys,
    ))
    gpu_t.append(_seconds_to_fit(
        lambda: cuRF(n_estimators=50, random_state=42),
        Xs, ys_f,
    ))
    print(f"n={n:>7,d}  CPU={cpu_t[-1]:6.2f}s  GPU={gpu_t[-1]:5.2f}s  speedup={cpu_t[-1]/gpu_t[-1]:5.1f}x")

# Per-size ratio of CPU fit time to GPU fit time.
speedups = [cpu_s / gpu_s for cpu_s, gpu_s in zip(cpu_t, gpu_t)]

fig, axes = plt.subplots(1, 2, figsize=(11, 4))
time_ax, speedup_ax = axes

# Left panel: absolute fit times on log-log axes.
time_ax.loglog(sizes, cpu_t, "o-", label="sklearn (CPU)")
time_ax.loglog(sizes, gpu_t, "s-", label="cuML (GPU)")
time_ax.set_xlabel("# rows")
time_ax.set_ylabel("Fit time (s)")
time_ax.set_title("RandomForest fit (50 trees)")
time_ax.legend()
time_ax.grid(True, which="both", alpha=0.3)

# Right panel: speedup against row count (log x-axis only).
speedup_ax.semilogx(sizes, speedups, "o-", color="darkgreen")
speedup_ax.set_xlabel("# rows")
speedup_ax.set_ylabel("Speedup")
speedup_ax.set_title("GPU speedup vs CPU")
speedup_ax.grid(True, which="both", alpha=0.3)

plt.tight_layout()
plt.show()

/opt/conda/lib/python3.12/site-packages/cuml/internals/api_decorators.py:317: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return init_func(self, *args, **kwargs)

n= 25,000  CPU=  0.96s  GPU= 0.20s  speedup=  4.7x

/opt/conda/lib/python3.12/site-packages/cuml/internals/api_decorators.py:317: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return init_func(self, *args, **kwargs)

n= 50,000  CPU=  2.13s  GPU= 0.23s  speedup=  9.4x

/opt/conda/lib/python3.12/site-packages/cuml/internals/api_decorators.py:317: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return init_func(self, *args, **kwargs)

n=100,000  CPU=  4.73s  GPU= 0.39s  speedup= 12.2x

/opt/conda/lib/python3.12/site-packages/cuml/internals/api_decorators.py:317: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
  return init_func(self, *args, **kwargs)

n=250,000  CPU= 14.00s  GPU= 0.79s  speedup= 17.7x

2. K-Nearest Neighbours — quadratic distance computation

from sklearn.neighbors import KNeighborsClassifier
from cuml.neighbors import KNeighborsClassifier as cuKNN

# CPU: the timed region spans construction, fit and prediction on the
# first 10,000 test rows; the predictions themselves are discarded.
t0 = time.perf_counter()
knn_cpu = KNeighborsClassifier(n_neighbors=15, n_jobs=-1)
knn_cpu.fit(X_tr_s, y_tr)
_ = knn_cpu.predict(X_te_s[:10000])
t_cpu = time.perf_counter() - t0

# GPU: identical sequence with the cuML estimator and the float32 labels.
t0 = time.perf_counter()
knn_gpu = cuKNN(n_neighbors=15)
knn_gpu.fit(X_tr_s, y_tr_f)
_ = knn_gpu.predict(X_te_s[:10000])
t_gpu = time.perf_counter() - t0

print(f"kNN  CPU: {t_cpu:6.2f}s   GPU: {t_gpu:6.2f}s   speedup: {t_cpu/t_gpu:.1f}x")

kNN  CPU:   1.61s   GPU:   0.19s   speedup: 8.5x

3. K-Means clustering

from cuml.cluster import KMeans as cuKMeans
from cuml.decomposition import PCA as cuPCA
from cuml.datasets import make_blobs as cu_make_blobs
from sklearn.cluster import KMeans as skKMeans

def _to_host(array):
    """Return ``array`` as a NumPy array.

    Objects exposing a ``.get()`` method are copied out through it;
    anything else goes through ``np.asarray``.
    """
    return array.get() if hasattr(array, "get") else np.asarray(array)


# Blobs come from cuML's generator; a host copy is made for scikit-learn.
X_gpu, y_gpu = cu_make_blobs(n_samples=200_000, centers=8, n_features=20, random_state=42)
X_cpu = _to_host(X_gpu)

# Build both estimators before starting any timer so that each timed region
# covers exactly one fit() call and the two numbers are measured the same way.
kmeans_cpu = skKMeans(n_clusters=8, n_init=3, random_state=42)
kmeans = cuKMeans(n_clusters=8, n_init=3, random_state=42)

t0 = time.perf_counter()
kmeans_cpu.fit(X_cpu)
t_cpu = time.perf_counter() - t0

t0 = time.perf_counter()
kmeans.fit(X_gpu)
t_gpu = time.perf_counter() - t0
print(f"KMeans  sklearn={t_cpu:6.2f}s   cuML={t_gpu:6.2f}s   speedup={t_cpu/t_gpu:.1f}x")

# 2D PCA projection for visualisation
pca = cuPCA(n_components=2, output_type="numpy")
X_2d = pca.fit_transform(X_gpu)
# Use the same guarded conversion as for X_gpu: an unconditional .get()
# fails if labels_ is already a host array.
labels = _to_host(kmeans.labels_)

plt.figure(figsize=(7, 5))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap="tab10", s=2, alpha=0.5)
plt.title("cuML KMeans — 8 clusters (PCA projection)")
plt.tight_layout()
plt.show()

KMeans  sklearn=  0.49s   cuML=  0.17s   speedup=2.9x