TensorFlow GPU example - DEDL Notebook Gallery

TensorFlow GPU Example¶

Run TensorFlow on GPU for dense linear algebra and small neural-network training workloads.


Data	Synthetic matrices, Fashion-MNIST, two-moons samples
Task	GPU check, matmul benchmark, CNN training
Framework	TensorFlow + Keras
Expected runtime	~3 min on CPU

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 
import tensorflow as tf

print(f"TensorFlow : {tf.__version__}")
gpus = tf.config.list_physical_devices("GPU")
print(f"GPUs found : {len(gpus)}")
for g in gpus:
    print(" ", g)
build = tf.sysconfig.get_build_info()
print(f"CUDA       : {build.get('cuda_version', 'n/a')}")
print(f"cuDNN      : {build.get('cudnn_version', 'n/a')}")

TensorFlow : 2.21.0
GPUs found : 1
  PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
CUDA       : 12.5.1
cuDNN      : 9

1. CPU vs GPU matrix multiply¶

import time

def bench_tf_matmul(device_str, N, repeats=5, warmup=2):
    with tf.device(device_str):
        a = tf.random.normal([N, N])
        b = tf.random.normal([N, N])
        for _ in range(warmup):
            _ = tf.matmul(a, b).numpy()
        ts = []
        for _ in range(repeats):
            t0 = time.perf_counter()
            _ = tf.matmul(a, b).numpy()
            ts.append(time.perf_counter() - t0)
        return min(ts)

N = 4096
t_cpu = bench_tf_matmul("/CPU:0", N)
print(f"CPU matmul {N}x{N}: {t_cpu*1000:8.1f} ms")
if gpus:
    t_gpu = bench_tf_matmul("/GPU:0", N)
    print(f"GPU matmul {N}x{N}: {t_gpu*1000:8.1f} ms   (speedup: {t_cpu/t_gpu:5.1f}x)")

CPU matmul 4096x4096:    380.1 ms
GPU matmul 4096x4096:     61.3 ms   (speedup:   6.2x)

2. Scaling plot — CPU vs GPU matmul as N grows¶

import matplotlib.pyplot as plt
import numpy as np

sizes = [256, 512, 1024, 2048]
cpu_t = [bench_tf_matmul("/CPU:0", n) * 1000 for n in sizes]
gpu_t = [bench_tf_matmul("/GPU:0", n) * 1000 if gpus else float("nan") for n in sizes]
for n, c, g in zip(sizes, cpu_t, gpu_t):
    print(f"N={n:5d}  CPU={c:8.1f} ms  GPU={g:8.2f} ms")

fig, axes = plt.subplots(1, 2, figsize=(11, 4))
axes[0].loglog(sizes, cpu_t, "o-", label="CPU")
axes[0].loglog(sizes, gpu_t, "s-", label="GPU")
axes[0].set_xlabel("N"); axes[0].set_ylabel("Time (ms)")
axes[0].set_title("matmul time"); axes[0].legend(); axes[0].grid(True, which="both", alpha=0.3)
speedup = [c/g for c, g in zip(cpu_t, gpu_t)]
axes[1].semilogx(sizes, speedup, "o-", color="darkgreen")
axes[1].axhline(1, color="black", lw=0.5)
axes[1].set_xlabel("N"); axes[1].set_ylabel("Speedup (CPU/GPU)")
axes[1].set_title("GPU speedup"); axes[1].grid(True, which="both", alpha=0.3)
plt.tight_layout(); plt.show()

N=  256  CPU=     0.3 ms  GPU=    0.25 ms
N=  512  CPU=     1.0 ms  GPU=    0.42 ms
N= 1024  CPU=     6.4 ms  GPU=    0.99 ms
N= 2048  CPU=    46.3 ms  GPU=    3.27 ms

3. CPU vs GPU CNN training — Fashion-MNIST¶

We train the same small CNN on Fashion-MNIST for a few epochs, once on each device.

(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.fashion_mnist.load_data()
x_tr = (x_tr.astype("float32") / 255.0)[..., None]
x_te = (x_te.astype("float32") / 255.0)[..., None]
print("Train:", x_tr.shape, "  Test:", x_te.shape)

def make_cnn():
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(28, 28, 1)),
        tf.keras.layers.Conv2D(32, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(64, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(128, 3, activation="relu"),
        tf.keras.layers.GlobalAvgPool2D(),
        tf.keras.layers.Dense(10),
    ])

def train_on(device_str, epochs=3):
    with tf.device(device_str):
        m = make_cnn()
        m.compile(optimizer="adam",
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])
        # warm-up batch (avoids cuDNN kernel-search latency in the timing)
        m.train_on_batch(x_tr[:64], y_tr[:64])
        t0 = time.perf_counter()
        m.fit(x_tr, y_tr, batch_size=256, epochs=epochs, verbose=0)
        t = time.perf_counter() - t0
        acc = m.evaluate(x_te, y_te, verbose=0)[1]
    return t, acc

t_cpu, a_cpu = train_on("/CPU:0", epochs=2)
print(f"CPU  : {t_cpu:6.1f}s  acc={a_cpu:.4f}")
if gpus:
    t_gpu, a_gpu = train_on("/GPU:0", epochs=2)
    print(f"GPU  : {t_gpu:6.1f}s  acc={a_gpu:.4f}   (speedup: {t_cpu/t_gpu:.1f}x)")

Train: (60000, 28, 28, 1)   Test: (10000, 28, 28, 1)
CPU  :   54.6s  acc=0.8006
GPU  :   14.4s  acc=0.7938   (speedup: 3.8x)

4. Synthetic two-moons — dense net + decision boundary¶

from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler

X, y = make_moons(n_samples=3000, noise=0.2, random_state=42)
X = StandardScaler().fit_transform(X).astype("float32")
y = y.astype("float32")

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(2,)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(X, y, epochs=50, batch_size=256, validation_split=0.2, verbose=0)

xx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))
grid = np.c_[xx.ravel(), yy.ravel()].astype("float32")
preds = model.predict(grid, verbose=0).reshape(xx.shape)
plt.figure(figsize=(6, 5))
plt.contourf(xx, yy, preds, levels=50, cmap="RdBu", alpha=0.7)
plt.colorbar(label="P(class=1)")
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="RdBu", s=10, edgecolors="k", linewidths=0.3)
plt.title("Decision boundary — TensorFlow"); plt.tight_layout(); plt.show()