Skip to article frontmatterSkip to article content

TensorFlow GPU example

Device checks, benchmarks (GPU vs CPU), and CNN training.

TensorFlow GPU Example

Run TensorFlow on GPU for dense linear algebra and small neural-network training workloads.

DataSynthetic matrices, Fashion-MNIST, two-moons samples
TaskGPU check, matmul benchmark, CNN training
FrameworkTensorFlow + Keras
Expected runtime~3 min on CPU  
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 
import tensorflow as tf

print(f"TensorFlow : {tf.__version__}")
gpus = tf.config.list_physical_devices("GPU")
print(f"GPUs found : {len(gpus)}")
for g in gpus:
    print(" ", g)
build = tf.sysconfig.get_build_info()
print(f"CUDA       : {build.get('cuda_version', 'n/a')}")
print(f"cuDNN      : {build.get('cudnn_version', 'n/a')}")
TensorFlow : 2.21.0
GPUs found : 1
  PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
CUDA       : 12.5.1
cuDNN      : 9

1. CPU vs GPU matrix multiply

import time

def bench_tf_matmul(device_str, N, repeats=5, warmup=2):
    with tf.device(device_str):
        a = tf.random.normal([N, N])
        b = tf.random.normal([N, N])
        for _ in range(warmup):
            _ = tf.matmul(a, b).numpy()
        ts = []
        for _ in range(repeats):
            t0 = time.perf_counter()
            _ = tf.matmul(a, b).numpy()
            ts.append(time.perf_counter() - t0)
        return min(ts)

N = 4096
t_cpu = bench_tf_matmul("/CPU:0", N)
print(f"CPU matmul {N}x{N}: {t_cpu*1000:8.1f} ms")
if gpus:
    t_gpu = bench_tf_matmul("/GPU:0", N)
    print(f"GPU matmul {N}x{N}: {t_gpu*1000:8.1f} ms   (speedup: {t_cpu/t_gpu:5.1f}x)")
CPU matmul 4096x4096:    380.1 ms
GPU matmul 4096x4096:     61.3 ms   (speedup:   6.2x)

2. Scaling plot — CPU vs GPU matmul as N grows

import matplotlib.pyplot as plt
import numpy as np

sizes = [256, 512, 1024, 2048]
cpu_t = [bench_tf_matmul("/CPU:0", n) * 1000 for n in sizes]
gpu_t = [bench_tf_matmul("/GPU:0", n) * 1000 if gpus else float("nan") for n in sizes]
for n, c, g in zip(sizes, cpu_t, gpu_t):
    print(f"N={n:5d}  CPU={c:8.1f} ms  GPU={g:8.2f} ms")

fig, axes = plt.subplots(1, 2, figsize=(11, 4))
axes[0].loglog(sizes, cpu_t, "o-", label="CPU")
axes[0].loglog(sizes, gpu_t, "s-", label="GPU")
axes[0].set_xlabel("N"); axes[0].set_ylabel("Time (ms)")
axes[0].set_title("matmul time"); axes[0].legend(); axes[0].grid(True, which="both", alpha=0.3)
speedup = [c/g for c, g in zip(cpu_t, gpu_t)]
axes[1].semilogx(sizes, speedup, "o-", color="darkgreen")
axes[1].axhline(1, color="black", lw=0.5)
axes[1].set_xlabel("N"); axes[1].set_ylabel("Speedup (CPU/GPU)")
axes[1].set_title("GPU speedup"); axes[1].grid(True, which="both", alpha=0.3)
plt.tight_layout(); plt.show()
N=  256  CPU=     0.3 ms  GPU=    0.25 ms
N=  512  CPU=     1.0 ms  GPU=    0.42 ms
N= 1024  CPU=     6.4 ms  GPU=    0.99 ms
N= 2048  CPU=    46.3 ms  GPU=    3.27 ms
<Figure size 1100x400 with 2 Axes>

3. CPU vs GPU CNN training — Fashion-MNIST

We train the same small CNN on Fashion-MNIST for a few epochs, once on each device.

(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.fashion_mnist.load_data()
x_tr = (x_tr.astype("float32") / 255.0)[..., None]
x_te = (x_te.astype("float32") / 255.0)[..., None]
print("Train:", x_tr.shape, "  Test:", x_te.shape)

def make_cnn():
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(28, 28, 1)),
        tf.keras.layers.Conv2D(32, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(64, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(128, 3, activation="relu"),
        tf.keras.layers.GlobalAvgPool2D(),
        tf.keras.layers.Dense(10),
    ])

def train_on(device_str, epochs=3):
    with tf.device(device_str):
        m = make_cnn()
        m.compile(optimizer="adam",
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])
        # warm-up batch (avoids cuDNN kernel-search latency in the timing)
        m.train_on_batch(x_tr[:64], y_tr[:64])
        t0 = time.perf_counter()
        m.fit(x_tr, y_tr, batch_size=256, epochs=epochs, verbose=0)
        t = time.perf_counter() - t0
        acc = m.evaluate(x_te, y_te, verbose=0)[1]
    return t, acc

t_cpu, a_cpu = train_on("/CPU:0", epochs=2)
print(f"CPU  : {t_cpu:6.1f}s  acc={a_cpu:.4f}")
if gpus:
    t_gpu, a_gpu = train_on("/GPU:0", epochs=2)
    print(f"GPU  : {t_gpu:6.1f}s  acc={a_gpu:.4f}   (speedup: {t_cpu/t_gpu:.1f}x)")
Train: (60000, 28, 28, 1)   Test: (10000, 28, 28, 1)
CPU  :   54.6s  acc=0.8006
GPU  :   14.4s  acc=0.7938   (speedup: 3.8x)

4. Synthetic two-moons — dense net + decision boundary

from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler

X, y = make_moons(n_samples=3000, noise=0.2, random_state=42)
X = StandardScaler().fit_transform(X).astype("float32")
y = y.astype("float32")

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(2,)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(X, y, epochs=50, batch_size=256, validation_split=0.2, verbose=0)

xx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))
grid = np.c_[xx.ravel(), yy.ravel()].astype("float32")
preds = model.predict(grid, verbose=0).reshape(xx.shape)
plt.figure(figsize=(6, 5))
plt.contourf(xx, yy, preds, levels=50, cmap="RdBu", alpha=0.7)
plt.colorbar(label="P(class=1)")
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="RdBu", s=10, edgecolors="k", linewidths=0.3)
plt.title("Decision boundary — TensorFlow"); plt.tight_layout(); plt.show()
<Figure size 600x500 with 2 Axes>