PyTorch GPU Example
Use CUDA with PyTorch for matrix multiplication and small neural-network training workloads.
| Data | Synthetic matrices and two-moons samples |
| Task | GPU check, matmul benchmark, MLP training |
| Framework | PyTorch + optional Lightning |
| Expected runtime | ~2 min on CPU |
import torch
# Report the PyTorch build and whether a CUDA device can be used.
print(f"PyTorch : {torch.__version__}")
print(f"CUDA : {torch.cuda.is_available()}")

# `device_cpu` always names the CPU; `device` starts there and is switched to
# the GPU (after describing GPU 0) only when CUDA is usable.
device_cpu = torch.device("cpu")
device = device_cpu
if torch.cuda.is_available():
    print(f"GPU : {torch.cuda.get_device_name(0)}")
    print(f"VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CUDA ver : {torch.version.cuda}")
    print(f"cuDNN : {torch.backends.cudnn.version()}")
    device = torch.device("cuda")
# Recorded cell output starts here:
# PyTorch : 2.10.0+cu128
CUDA : True
GPU : NVIDIA RTXA6000-24Q
VRAM : 25.5 GB
CUDA ver : 12.8
cuDNN : 91002
1. CPU vs GPU matrix multiply — single size
We compare a 4096×4096 dense matmul on CPU and GPU.
import time
def bench_matmul(device, N, repeats=5, warmup=2):
    """Return the best wall-clock time, in seconds, of an N x N matmul on ``device``.

    Args:
        device: A ``torch.device``, or anything ``torch.device`` accepts
            (for example ``"cpu"`` or ``"cuda:0"``), on which both operands
            are allocated.
        N: Side length of the two square ``torch.randn`` operands.
        repeats: Number of timed multiplications; must be at least 1.
        warmup: Number of untimed multiplications executed first.

    Returns:
        The smallest ``time.perf_counter`` interval over the timed runs.

    Raises:
        ValueError: If ``repeats`` is smaller than 1.
    """
    if repeats < 1:
        raise ValueError(f"repeats must be >= 1, got {repeats}")

    device = torch.device(device)  # also accepts "cpu" / "cuda:0" strings
    on_cuda = device.type == "cuda"

    def sync():
        # CUDA kernels are launched asynchronously, so the clock must only be
        # read after the work has finished. Pass the device explicitly:
        # synchronize() without an argument waits on the *current* device,
        # which need not be the one being benchmarked.
        if on_cuda:
            torch.cuda.synchronize(device)

    a = torch.randn(N, N, device=device)
    b = torch.randn(N, N, device=device)

    # warm-up (untimed)
    for _ in range(warmup):
        _ = a @ b
    sync()

    # timed runs
    times = []
    for _ in range(repeats):
        sync()
        t0 = time.perf_counter()
        _ = a @ b
        sync()
        times.append(time.perf_counter() - t0)

    return min(times)  # use best of N — the rest are usually slowed by jitter
# Time a single large matmul on the CPU and, when CUDA is usable, on the GPU.
N = 4096

t_cpu = bench_matmul(device=device_cpu, N=N)
print(f"CPU matmul {N}x{N}: {t_cpu*1000:8.1f} ms")

if torch.cuda.is_available():
    t_gpu = bench_matmul(device=device, N=N)
    print(f"GPU matmul {N}x{N}: {t_gpu*1000:8.1f} ms (speedup: {t_cpu/t_gpu:5.1f}x)")
# Recorded cell output starts here:
# CPU matmul 4096x4096: 406.2 ms
GPU matmul 4096x4096: 9.9 ms (speedup: 41.0x)
2. Scaling benchmark — how the gap widens with size
import matplotlib.pyplot as plt
import numpy as np
# Benchmark square matmuls of increasing size; all timings are stored in ms.
sizes = [256, 512, 1024, 2048]
cpu_times, gpu_times = [], []
for N in sizes:
    cpu_times.append(bench_matmul(device_cpu, N) * 1000)
    if torch.cuda.is_available():
        gpu_times.append(bench_matmul(device, N) * 1000)
    else:
        # NaN placeholder keeps both lists the same length without a GPU.
        gpu_times.append(float("nan"))
    print(f"N={N:5d} CPU={cpu_times[-1]:8.1f} ms GPU={gpu_times[-1]:8.2f} ms")

fig, axes = plt.subplots(1, 2, figsize=(11, 4))

# Left panel: absolute timings on log-log axes.
axes[0].loglog(sizes, cpu_times, "o-", label="CPU")
axes[0].loglog(sizes, gpu_times, "s-", label="GPU")
axes[0].set_xlabel("Matrix size N")
axes[0].set_ylabel("Time (ms, log)")
axes[0].set_title("Matmul time — CPU vs GPU")
axes[0].legend()
axes[0].grid(True, which="both", alpha=0.3)

# Right panel: CPU time divided by GPU time, with a reference line at 1x.
speedups = [cpu_ms / gpu_ms for cpu_ms, gpu_ms in zip(cpu_times, gpu_times)]
axes[1].semilogx(sizes, speedups, "o-", color="darkgreen")
axes[1].axhline(1, color="black", lw=0.5)
axes[1].set_xlabel("Matrix size N")
axes[1].set_ylabel("Speedup (CPU / GPU)")
axes[1].set_title("GPU speedup vs CPU")
axes[1].grid(True, which="both", alpha=0.3)

plt.tight_layout()
plt.show()
# Recorded cell output starts here:
# N= 256 CPU= 0.1 ms GPU= 0.04 ms
N= 512 CPU= 0.8 ms GPU= 0.05 ms
N= 1024 CPU= 6.3 ms GPU= 0.14 ms
N= 2048 CPU= 55.3 ms GPU= 0.75 ms

3. Toy classification: 2-layer MLP on two-moons
import torch.nn as nn
from sklearn.datasets import make_moons
# Two-moons samples, placed on the selected device as float32 inputs and
# int64 class labels.
X_np, y_np = make_moons(n_samples=2000, noise=0.2, random_state=42)
X = torch.tensor(X_np, dtype=torch.float32, device=device)
y = torch.tensor(y_np, dtype=torch.long, device=device)

# 2 -> 64 -> 64 -> 2 multilayer perceptron.
model = nn.Sequential(
    nn.Linear(2, 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(),
    nn.Linear(64, 2),
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Every "epoch" is one optimizer step on the whole dataset.
losses = []
for epoch in range(300):
    loss = criterion(model(X), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

plt.figure(figsize=(8, 3))
plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title(f"MLP training loss ({device.type.upper()})")
plt.tight_layout()
plt.show()

print(f"Final loss: {losses[-1]:.4f}")
Final loss: 0.0737
4. CPU vs GPU training time — bigger MLP
from sklearn.datasets import make_classification
# Synthetic 4-class problem: 200,000 samples with 128 features, 40 of them
# informative.
X_np, y_np = make_classification(
    n_samples=200_000,
    n_features=128,
    n_informative=40,
    n_classes=4,
    random_state=42,
)
# Cast once up front to float32 features and int64 labels.
X_np, y_np = X_np.astype("float32"), y_np.astype("int64")
def train_mlp(dev, epochs=20, batch_size=4096):
    """Train an MLP on the module-level ``X_np`` / ``y_np`` and time the loop.

    The network has two hidden layers of 512 units with ReLU activations and
    is optimized with Adam (lr=1e-3) on a cross-entropy loss. Input width and
    number of outputs are derived from the data (128 and 4 for the dataset
    generated in this notebook).

    Args:
        dev: A ``torch.device``, or anything ``torch.device`` accepts, on
            which the data and the model are placed.
        epochs: Number of passes over the dataset.
        batch_size: Mini-batch size; the last batch of an epoch may be smaller.

    Returns:
        Elapsed seconds for the training loop only. Data transfer, model
        construction and the warm-up pass are not timed.
    """
    dev = torch.device(dev)  # also accepts "cpu" / "cuda:0" strings
    on_cuda = dev.type == "cuda"

    Xt = torch.from_numpy(X_np).to(dev)
    yt = torch.from_numpy(y_np).to(dev)

    # Size the network from the data instead of hard-coding 128 / 4.
    n_features = Xt.shape[1]
    n_classes = int(y_np.max()) + 1

    net = nn.Sequential(
        nn.Linear(n_features, 512), nn.ReLU(),
        nn.Linear(512, 512), nn.ReLU(),
        nn.Linear(512, n_classes),
    ).to(dev)
    opt = torch.optim.Adam(net.parameters(), lr=1e-3)
    crit = nn.CrossEntropyLoss()

    # warm-up: one untimed forward/backward pass. Its gradients are discarded
    # by the zero_grad() call at the start of the first timed step.
    out = net(Xt[:batch_size])
    crit(out, yt[:batch_size]).backward()
    if on_cuda:
        # Wait on the device being benchmarked; synchronize() without an
        # argument only waits on the current CUDA device.
        torch.cuda.synchronize(dev)

    t0 = time.perf_counter()
    for _ in range(epochs):
        # Fresh shuffle every epoch, generated directly on the target device.
        perm = torch.randperm(Xt.shape[0], device=dev)
        for i in range(0, Xt.shape[0], batch_size):
            idx = perm[i:i + batch_size]
            opt.zero_grad()
            loss = crit(net(Xt[idx]), yt[idx])
            loss.backward()
            opt.step()
    if on_cuda:
        # Make sure all queued kernels have finished before reading the clock.
        torch.cuda.synchronize(dev)
    return time.perf_counter() - t0
# Time the same training run on the CPU and, when CUDA is usable, on the GPU.
t_cpu = train_mlp(dev=device_cpu)
print(f"CPU training : {t_cpu:6.2f} s")

if torch.cuda.is_available():
    t_gpu = train_mlp(dev=device)
    print(f"GPU training : {t_gpu:6.2f} s (speedup: {t_cpu/t_gpu:.1f}x)")
# Recorded cell output starts here:
# CPU training : 34.44 s
GPU training : 1.79 s (speedup: 19.3x)
5. Decision boundary (small MLP from step 3)
# Regenerate the two-moons samples under their own names, because X_np / y_np
# were reassigned to the larger classification dataset in step 4.
X_np_moons, y_np_moons = make_moons(n_samples=2000, noise=0.2, random_state=42)

# 200 x 200 evaluation grid covering [-3, 3] on both axes.
xx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
grid = torch.tensor(grid_points, dtype=torch.float32, device=device)

# Predicted class for every grid point, reshaped back to the grid layout.
with torch.no_grad():
    logits = model(grid)
preds = logits.argmax(1).cpu().numpy().reshape(xx.shape)

plt.figure(figsize=(6, 5))
plt.contourf(xx, yy, preds, alpha=0.4, cmap="RdBu")
plt.scatter(
    X_np_moons[:, 0],
    X_np_moons[:, 1],
    c=y_np_moons,
    cmap="RdBu",
    s=10,
    edgecolors="k",
    linewidths=0.3,
)
plt.title(f"Decision boundary — PyTorch MLP on {device.type.upper()}")
plt.tight_layout()
plt.show()
6. Extending with PyTorch Lightning
import lightning as L
from torch.utils.data import TensorDataset, DataLoader
class LitMLP(L.LightningModule):
    """One-hidden-layer MLP (2 -> 64 -> 2) packaged as a LightningModule."""

    def __init__(self):
        super().__init__()
        layers = [nn.Linear(2, 64), nn.ReLU(), nn.Linear(64, 2)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network's raw class scores for ``x``."""
        return self.net(x)

    def training_step(self, batch, _):
        """Compute, log and return the cross-entropy loss of one batch."""
        inputs, targets = batch
        logits = self(inputs)
        loss = nn.functional.cross_entropy(logits, targets)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Optimize all parameters with Adam at a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)
# Wrap the two-moons arrays in a shuffled DataLoader with batches of 256.
# The tensors are created without a device argument.
X_moons = torch.tensor(X_np_moons, dtype=torch.float32)
y_moons = torch.tensor(y_np_moons, dtype=torch.long)
ds = TensorDataset(X_moons, y_moons)
dl = DataLoader(ds, batch_size=256, shuffle=True)

# Train for 20 epochs on a single device: the GPU when CUDA is usable,
# otherwise the CPU.
trainer = L.Trainer(
    max_epochs=20,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    enable_progress_bar=True,
)
trainer.fit(LitMLP(), dl)

print("Lightning training complete!")