ML Implementations in NumPy
Five coding challenges that directly mirror what Google, Meta, and Amazon ask in ML engineer interviews. These are the building blocks of every neural network, implemented from scratch with numerically stable, fully vectorized NumPy code.
Challenge 1: Numerically Stable Softmax
Problem: Implement the softmax function for a batch of logits Z of shape (N, C). The naive implementation exp(z) / sum(exp(z)) overflows for large values. Implement the numerically stable version and handle both 1D and 2D inputs.
import numpy as np
# ---- CHALLENGE ----
def softmax(Z):
    """Challenge stub: numerically stable softmax for 1D or 2D `Z`.

    To be implemented by the reader; the solution below shows the
    max-subtraction trick that avoids overflow in np.exp.
    """
    pass
# ---- NAIVE (WRONG) SOLUTION ----
def softmax_naive(Z):
    """This OVERFLOWS for large values like Z = [1000, 1001, 1002].

    Kept as a deliberate anti-example: np.exp(1000) is inf, and
    inf / inf propagates NaN through the result.
    """
    weights = np.exp(Z)          # overflows to inf for large logits
    return weights / np.sum(weights)  # inf / inf = NaN
# ---- CORRECT SOLUTION ----
def softmax(Z):
    """Numerically stable softmax using the log-sum-exp (max-subtraction) trick.

    Generalized: normalizes along the LAST axis, which collapses the
    original 1D/2D branches into one path and additionally handles any
    higher-rank input (e.g. (batch, heads, seq) attention scores), where
    hard-coded `axis=1` would normalize the wrong dimension. For 1D and
    2D inputs the result is identical to the two-branch version.

    Parameters
    ----------
    Z : array_like
        Logits; converted with np.asarray, softmax taken along axis -1.

    Returns
    -------
    np.ndarray
        Same shape as Z, with each slice along the last axis summing to 1.
    """
    Z = np.asarray(Z)
    # Subtracting the per-row max makes every exponent <= 0, so np.exp
    # cannot overflow; the shift cancels in the ratio, leaving the
    # probabilities unchanged.
    Z_shifted = Z - Z.max(axis=-1, keepdims=True)
    exp_Z = np.exp(Z_shifted)
    return exp_Z / exp_Z.sum(axis=-1, keepdims=True)
# ---- VERIFICATION ----
# Exercises softmax() on the three cases that matter: a 1D vector, large
# logits that would overflow a naive implementation, and a 2D batch.
# Test 1D
z = np.array([1.0, 2.0, 3.0])
s = softmax(z)
assert np.allclose(s.sum(), 1.0), "Softmax should sum to 1"
assert s[2] > s[1] > s[0], "Larger logit should have larger probability"
# Test numerical stability with large values (naive exp gives inf/inf = NaN here)
z_large = np.array([1000.0, 1001.0, 1002.0])
s_large = softmax(z_large)
assert not np.any(np.isnan(s_large)), "Should not produce NaN for large values"
assert np.allclose(s_large.sum(), 1.0), "Should still sum to 1"
# Test batch (2D): each row must be normalized independently
Z = np.array([[1.0, 2.0, 3.0],
              [1000.0, 1001.0, 1002.0]])
S = softmax(Z)
assert S.shape == (2, 3), f"Expected (2, 3), got {S.shape}"
assert np.allclose(S.sum(axis=1), [1.0, 1.0]), "Each row should sum to 1"
# Verify: softmax of equal values should be uniform (shift invariance)
z_equal = np.array([5.0, 5.0, 5.0])
assert np.allclose(softmax(z_equal), [1/3, 1/3, 1/3])
print("Challenge 1 passed!")
Challenge 2: Cross-Entropy Loss
Problem: Implement cross-entropy loss for a batch of predictions. Given softmax probabilities probs of shape (N, C) and integer labels labels of shape (N,), compute the average cross-entropy loss: L = -1/N * sum(log(probs[i, labels[i]])). Handle the numerical issue where log(0) = -inf.
import numpy as np
# ---- CHALLENGE ----
def cross_entropy_loss(logits, labels):
    """Challenge stub: mean cross-entropy computed directly from logits.

    Combines softmax and negative log-likelihood in one numerically
    stable pass; to be implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def cross_entropy_loss(logits, labels):
    """Average cross-entropy over a batch, computed from raw logits.

    Uses the identity  -log softmax(z)[y] = log(sum_j exp(z_j)) - z_y,
    with the log-sum-exp stabilized by factoring out max(z):
        log(sum exp(z)) = max(z) + log(sum exp(z - max(z)))

    Parameters: logits (N, C) float array; labels (N,) integer array.
    Returns: scalar mean loss over the batch.
    """
    n_samples = logits.shape[0]
    rows = np.arange(n_samples)
    # Stable log-sum-exp along the class axis.
    z_max = logits.max(axis=1, keepdims=True)                       # (N, 1)
    lse = z_max[:, 0] + np.log(np.exp(logits - z_max).sum(axis=1))  # (N,)
    # Per-sample negative log-likelihood: log-sum-exp minus true-class logit.
    nll = lse - logits[rows, labels]                                # (N,)
    return nll.mean()
# ---- VERIFICATION ----
# Checks the loss on random data, then pins down three known regimes:
# near-perfect predictions (~0), uniform predictions (log C), and extreme
# logits (numerical stability).
np.random.seed(42)
N, C = 32, 10
logits = np.random.randn(N, C)
labels = np.random.randint(0, C, N)
loss = cross_entropy_loss(logits, labels)
assert np.isscalar(loss) or loss.ndim == 0, "Loss should be scalar"
assert loss > 0, "Cross-entropy loss should be positive"
assert not np.isnan(loss), "Loss should not be NaN"
# Perfect predictions should have low loss (+10 on the true class, -10 elsewhere)
perfect_logits = np.full((N, C), -10.0)
perfect_logits[np.arange(N), labels] = 10.0
perfect_loss = cross_entropy_loss(perfect_logits, labels)
assert perfect_loss < 0.001, f"Perfect predictions should have ~0 loss, got {perfect_loss}"
# Uniform predictions should have loss ~log(C)
uniform_logits = np.zeros((N, C))
uniform_loss = cross_entropy_loss(uniform_logits, labels)
assert abs(uniform_loss - np.log(C)) < 0.01, f"Uniform should give log(C)={np.log(C):.4f}, got {uniform_loss:.4f}"
# Test numerical stability with extreme values (naive log(softmax) would hit log(0) = -inf)
extreme_logits = np.random.randn(N, C) * 1000
extreme_loss = cross_entropy_loss(extreme_logits, labels)
assert not np.isnan(extreme_loss), "Should handle extreme values"
print("Challenge 2 passed!")
Challenge 3: Gradient Descent Step
Problem: Implement the gradient computation and parameter update for a single linear layer with softmax cross-entropy loss. Given X (N, D), W (D, C), b (C,), and labels, compute the gradients dW, db and perform one gradient descent step.
import numpy as np
# ---- CHALLENGE ----
def softmax_grad_step(X, W, b, labels, lr=0.01):
    """Challenge stub: one SGD step for a linear softmax classifier.

    Should return the updated W, updated b, and the loss; to be
    implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def softmax_grad_step(X, W, b, labels, lr=0.01):
    """One gradient descent step for a linear softmax classifier.

    Forward pass, backward pass, and SGD update in one call.

    Parameters
    ----------
    X : (N, D) input batch.
    W : (D, C) weight matrix.
    b : (C,) bias vector.
    labels : (N,) integer class labels.
    lr : learning rate.

    Returns
    -------
    (W_new, b_new, loss) — updated parameters (inputs are not mutated)
    and the mean cross-entropy loss at the pre-update parameters.
    """
    N = X.shape[0]
    rows = np.arange(N)
    # ---- Forward pass ----
    logits = X @ W + b                                    # (N, C)
    # Stable softmax: shift by the per-row max so exp cannot overflow.
    logits_shifted = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(logits_shifted)
    sum_exp = exp_logits.sum(axis=1, keepdims=True)       # (N, 1)
    probs = exp_logits / sum_exp                          # (N, C)
    # Exact, numerically stable cross-entropy via log-softmax:
    #   -log softmax(z)[y] = log(sum exp(z_shifted)) - z_shifted[y]
    # This replaces the former -log(probs + 1e-12) hack, which capped the
    # loss at ~27.6 and slightly biased it for confident predictions.
    log_probs = logits_shifted - np.log(sum_exp)          # (N, C)
    loss = -log_probs[rows, labels].mean()
    # ---- Backward pass ----
    # Gradient of softmax + cross-entropy w.r.t. logits: probs - one_hot(labels)
    dlogits = probs.copy()                                # (N, C)
    dlogits[rows, labels] -= 1.0                          # subtract 1 at true class
    dlogits /= N                                          # average over batch
    # Chain rule through logits = X @ W + b
    dW = X.T @ dlogits                                    # (D, N) @ (N, C) -> (D, C)
    db = dlogits.sum(axis=0)                              # (C,)
    # ---- Update ----
    return W - lr * dW, b - lr * db, loss
# ---- VERIFICATION ----
# Trains for 100 steps on a fixed random batch; a correct gradient must
# drive the (memorization) loss down toward zero.
np.random.seed(42)
N, D, C = 64, 784, 10  # MNIST-sized: 64 samples, 784 features, 10 classes
X = np.random.randn(N, D)
W = np.random.randn(D, C) * 0.01  # small init keeps initial logits near 0
b = np.zeros(C)
labels = np.random.randint(0, C, N)
# Run multiple steps and verify loss decreases
losses = []
for step in range(100):
    W, b, loss = softmax_grad_step(X, W, b, labels, lr=0.1)
    losses.append(loss)
assert losses[-1] < losses[0], f"Loss should decrease: {losses[0]:.4f} -> {losses[-1]:.4f}"
assert losses[-1] < 0.5, f"Loss should converge to low value, got {losses[-1]:.4f}"
print(f"Loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
print("Challenge 3 passed!")
Challenge 4: Batch Normalization
Problem: Implement batch normalization for a fully connected layer. Given activations X of shape (N, D), normalize across the batch dimension. Implement both training mode (compute batch stats) and inference mode (use running stats). Include learnable parameters gamma and beta.
import numpy as np
# ---- CHALLENGE ----
def batch_norm_forward(X, gamma, beta, running_mean, running_var,
                       training=True, momentum=0.1, eps=1e-5):
    """Challenge stub: batch normalization forward pass.

    Should return (output, cache for backward, updated running mean,
    updated running var); to be implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def batch_norm_forward(X, gamma, beta, running_mean, running_var,
                       training=True, momentum=0.1, eps=1e-5):
    """Batch normalization forward pass for a fully connected layer.

    X: (N, D) activations; gamma, beta: (D,) learnable scale and shift.
    Training mode normalizes with per-feature batch statistics and
    updates the running statistics by exponential moving average;
    inference mode normalizes with the running statistics unchanged.

    Returns (out, cache, running_mean, running_var); cache holds the
    intermediates needed for the backward pass and is None in inference.
    """
    if not training:
        # Inference: use the stored running statistics as-is.
        x_hat = (X - running_mean) / np.sqrt(running_var + eps)
        return gamma * x_hat + beta, None, running_mean, running_var
    # Training: per-feature batch statistics (population variance).
    mu = X.mean(axis=0)                       # (D,)
    var = X.var(axis=0)                       # (D,)
    x_hat = (X - mu) / np.sqrt(var + eps)     # (N, D), ~zero mean / unit var
    out = gamma * x_hat + beta                # learnable scale + shift
    # Exponential moving average of the batch statistics.
    running_mean = (1 - momentum) * running_mean + momentum * mu
    running_var = (1 - momentum) * running_var + momentum * var
    cache = (X, x_hat, mu, var, gamma, eps)   # for the backward pass
    return out, cache, running_mean, running_var
# ---- VERIFICATION ----
# Training mode must standardize each feature across the batch; inference
# mode must run on the provided running statistics without recomputing.
np.random.seed(42)
N, D = 64, 128
X = np.random.randn(N, D) * 5 + 3  # shifted and scaled so normalization has work to do
gamma = np.ones(D)   # identity scale
beta = np.zeros(D)   # identity shift
running_mean = np.zeros(D)
running_var = np.ones(D)
# Training mode
out, cache, rm, rv = batch_norm_forward(X, gamma, beta, running_mean, running_var, training=True)
# Output should have approximately zero mean and unit variance
assert np.allclose(out.mean(axis=0), 0, atol=1e-6), "Output mean should be ~0"
assert np.allclose(out.var(axis=0), 1, atol=1e-1), "Output var should be ~1"
assert out.shape == X.shape, "Shape should be preserved"
# Inference mode should use running stats
out_inf, _, _, _ = batch_norm_forward(X, gamma, beta, rm, rv, training=False)
assert out_inf.shape == X.shape, "Inference shape should be preserved"
print("Challenge 4 passed!")
Challenge 5: Cosine Similarity Matrix
Problem: Given two sets of vectors A of shape (M, D) and B of shape (N, D), compute the pairwise cosine similarity matrix of shape (M, N). This is the core operation in retrieval systems, contrastive learning (SimCLR, CLIP), and recommendation engines.
import numpy as np
# ---- CHALLENGE ----
def cosine_similarity_matrix(A, B):
    """Challenge stub: pairwise cosine similarity between rows of A and B.

    Should return an (M, N) matrix with result[i, j] = cosine_sim(A[i], B[j]);
    to be implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def cosine_similarity_matrix(A, B):
    """Pairwise cosine similarity between rows of A (M, D) and B (N, D).

    cosine_sim(a, b) = (a . b) / (||a|| * ||b||), vectorized: L2-normalize
    every row, then a single matrix multiply gives the full (M, N) grid.
    All-zero rows are given norm 1, so their similarities come out as 0,
    and the result is clipped to [-1, 1] to absorb float round-off.
    """
    # Row-wise L2 norms, kept as column vectors for broadcasting.
    norms_a = np.linalg.norm(A, axis=1, keepdims=True)  # (M, 1)
    norms_b = np.linalg.norm(B, axis=1, keepdims=True)  # (N, 1)
    # Avoid 0/0 on all-zero rows: dividing by 1 leaves them as zero vectors.
    norms_a = np.where(norms_a == 0, 1.0, norms_a)
    norms_b = np.where(norms_b == 0, 1.0, norms_b)
    unit_a = A / norms_a  # (M, D)
    unit_b = B / norms_b  # (N, D)
    # (M, D) @ (D, N) -> (M, N); dot of unit vectors = cosine similarity.
    return np.clip(unit_a @ unit_b.T, -1.0, 1.0)
# ---- VERIFICATION ----
# Checks shape and value range on random embeddings, then the three fixed
# points of cosine similarity: self (1), identical copy (1), negation (-1).
np.random.seed(42)
A = np.random.randn(100, 128) # 100 query embeddings
B = np.random.randn(500, 128) # 500 document embeddings
sim = cosine_similarity_matrix(A, B)
assert sim.shape == (100, 500), f"Expected (100, 500), got {sim.shape}"
assert np.all(sim >= -1.0 - 1e-6) and np.all(sim <= 1.0 + 1e-6), "Cosine sim should be in [-1, 1]"
# Self-similarity should be 1 on diagonal
self_sim = cosine_similarity_matrix(A, A)
assert np.allclose(np.diag(self_sim), 1.0, atol=1e-6), "Self-similarity should be 1"
# Identical vectors should have similarity 1
B_copy = A[:5].copy()
sim_copy = cosine_similarity_matrix(A[:5], B_copy)
assert np.allclose(np.diag(sim_copy), 1.0, atol=1e-6), "Identical vectors should have sim 1"
# Opposite vectors should have similarity -1 (cosine is direction-sensitive)
sim_neg = cosine_similarity_matrix(A[:5], -A[:5])
assert np.allclose(np.diag(sim_neg), -1.0, atol=1e-6), "Opposite vectors should have sim -1"
print("Challenge 5 passed!")
Key Takeaways
- Always subtract the max before exp in softmax — this is the log-sum-exp trick for numerical stability
- Combine softmax + cross-entropy into one operation to avoid computing log(exp(x)), which cancels out
- The gradient of softmax cross-entropy is beautifully simple: probs - one_hot(labels)
- Batch normalization normalizes across the batch (axis=0), while layer normalization normalizes across features (axis=1)
- Cosine similarity = dot product of L2-normalized vectors. Normalize first, then use matrix multiply for the full pairwise matrix
Lilly Tech Systems