ML Implementations in NumPy
Five coding challenges that directly mirror what Google, Meta, and Amazon ask in ML engineer interviews. These are the building blocks of every neural network, implemented from scratch with numerically stable, fully vectorized NumPy code.
Challenge 1: Numerically Stable Softmax
Problem: Implement the softmax function for a batch of logits Z of shape (N, C). The naive implementation exp(z) / sum(exp(z)) overflows for large values. Implement the numerically stable version and handle both 1D and 2D inputs.
import numpy as np
# ---- CHALLENGE ----
def softmax(Z):
    """Challenge stub: numerically stable softmax for 1D or 2D `Z`.

    To be implemented by the reader; the solution below shows the
    max-subtraction trick that avoids overflow in np.exp.
    """
    pass
# ---- NAIVE (WRONG) SOLUTION ----
def softmax_naive(Z):
    """This OVERFLOWS for large values like Z = [1000, 1001, 1002].

    Kept as a deliberate anti-example: np.exp(1000) is inf, and
    inf / inf propagates NaN through the result.
    """
    weights = np.exp(Z)          # overflows to inf for large logits
    return weights / np.sum(weights)  # inf / inf = NaN
# ---- CORRECT SOLUTION ----
def softmax(Z):
    """Numerically stable softmax using the log-sum-exp (max-subtraction) trick.

    Generalized: normalizes along the LAST axis, which collapses the
    original 1D/2D branches into one path and additionally handles any
    higher-rank input (e.g. (batch, heads, seq) attention scores), where
    hard-coded `axis=1` would normalize the wrong dimension. For 1D and
    2D inputs the result is identical to the two-branch version.

    Parameters
    ----------
    Z : array_like
        Logits; converted with np.asarray, softmax taken along axis -1.

    Returns
    -------
    np.ndarray
        Same shape as Z, with each slice along the last axis summing to 1.
    """
    Z = np.asarray(Z)
    # Subtracting the per-row max makes every exponent <= 0, so np.exp
    # cannot overflow; the shift cancels in the ratio, leaving the
    # probabilities unchanged.
    Z_shifted = Z - Z.max(axis=-1, keepdims=True)
    exp_Z = np.exp(Z_shifted)
    return exp_Z / exp_Z.sum(axis=-1, keepdims=True)
# ---- VERIFICATION ----
# Exercises softmax() on the three cases that matter: a 1D vector, large
# logits that would overflow a naive implementation, and a 2D batch.
# Test 1D
z = np.array([1.0, 2.0, 3.0])
s = softmax(z)
assert np.allclose(s.sum(), 1.0), "Softmax should sum to 1"
assert s[2] > s[1] > s[0], "Larger logit should have larger probability"
# Test numerical stability with large values (naive exp gives inf/inf = NaN here)
z_large = np.array([1000.0, 1001.0, 1002.0])
s_large = softmax(z_large)
assert not np.any(np.isnan(s_large)), "Should not produce NaN for large values"
assert np.allclose(s_large.sum(), 1.0), "Should still sum to 1"
# Test batch (2D): each row must be normalized independently
Z = np.array([[1.0, 2.0, 3.0],
              [1000.0, 1001.0, 1002.0]])
S = softmax(Z)
assert S.shape == (2, 3), f"Expected (2, 3), got {S.shape}"
assert np.allclose(S.sum(axis=1), [1.0, 1.0]), "Each row should sum to 1"
# Verify: softmax of equal values should be uniform (shift invariance)
z_equal = np.array([5.0, 5.0, 5.0])
assert np.allclose(softmax(z_equal), [1/3, 1/3, 1/3])
print("Challenge 1 passed!")
Challenge 2: Cross-Entropy Loss
Problem: Implement cross-entropy loss for a batch of predictions. Given softmax probabilities probs of shape (N, C) and integer labels labels of shape (N,), compute the average cross-entropy loss: L = -1/N * sum(log(probs[i, labels[i]])). Handle the numerical issue where log(0) = -inf.
import numpy as np
# ---- CHALLENGE ----
def cross_entropy_loss(logits, labels):
    """Challenge stub: mean cross-entropy computed directly from logits.

    Combines softmax and negative log-likelihood in one numerically
    stable pass; to be implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def cross_entropy_loss(logits, labels):
    """Average cross-entropy over a batch, computed from raw logits.

    Uses the identity  -log softmax(z)[y] = log(sum_j exp(z_j)) - z_y,
    with the log-sum-exp stabilized by factoring out max(z):
        log(sum exp(z)) = max(z) + log(sum exp(z - max(z)))

    Parameters: logits (N, C) float array; labels (N,) integer array.
    Returns: scalar mean loss over the batch.
    """
    n_samples = logits.shape[0]
    rows = np.arange(n_samples)
    # Stable log-sum-exp along the class axis.
    z_max = logits.max(axis=1, keepdims=True)                       # (N, 1)
    lse = z_max[:, 0] + np.log(np.exp(logits - z_max).sum(axis=1))  # (N,)
    # Per-sample negative log-likelihood: log-sum-exp minus true-class logit.
    nll = lse - logits[rows, labels]                                # (N,)
    return nll.mean()
# ---- VERIFICATION ----
# Checks the loss on random data, then pins down three known regimes:
# near-perfect predictions (~0), uniform predictions (log C), and extreme
# logits (numerical stability).
np.random.seed(42)
N, C = 32, 10
logits = np.random.randn(N, C)
labels = np.random.randint(0, C, N)
loss = cross_entropy_loss(logits, labels)
assert np.isscalar(loss) or loss.ndim == 0, "Loss should be scalar"
assert loss > 0, "Cross-entropy loss should be positive"
assert not np.isnan(loss), "Loss should not be NaN"
# Perfect predictions should have low loss (+10 on the true class, -10 elsewhere)
perfect_logits = np.full((N, C), -10.0)
perfect_logits[np.arange(N), labels] = 10.0
perfect_loss = cross_entropy_loss(perfect_logits, labels)
assert perfect_loss < 0.001, f"Perfect predictions should have ~0 loss, got {perfect_loss}"
# Uniform predictions should have loss ~log(C)
uniform_logits = np.zeros((N, C))
uniform_loss = cross_entropy_loss(uniform_logits, labels)
assert abs(uniform_loss - np.log(C)) < 0.01, f"Uniform should give log(C)={np.log(C):.4f}, got {uniform_loss:.4f}"
# Test numerical stability with extreme values (naive log(softmax) would hit log(0) = -inf)
extreme_logits = np.random.randn(N, C) * 1000
extreme_loss = cross_entropy_loss(extreme_logits, labels)
assert not np.isnan(extreme_loss), "Should handle extreme values"
print("Challenge 2 passed!")
Challenge 3: Gradient Descent Step
Problem: Implement the gradient computation and parameter update for a single linear layer with softmax cross-entropy loss. Given X (N, D), W (D, C), b (C,), and labels, compute the gradients dW, db and perform one gradient descent step.
import numpy as np
# ---- CHALLENGE ----
def softmax_grad_step(X, W, b, labels, lr=0.01):
    """Challenge stub: one SGD step for a linear softmax classifier.

    Should return the updated W, updated b, and the loss; to be
    implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def softmax_grad_step(X, W, b, labels, lr=0.01):
    """One gradient descent step for a linear softmax classifier.

    Forward pass, backward pass, and SGD update in one call.

    Parameters
    ----------
    X : (N, D) input batch.
    W : (D, C) weight matrix.
    b : (C,) bias vector.
    labels : (N,) integer class labels.
    lr : learning rate.

    Returns
    -------
    (W_new, b_new, loss) — updated parameters (inputs are not mutated)
    and the mean cross-entropy loss at the pre-update parameters.
    """
    N = X.shape[0]
    rows = np.arange(N)
    # ---- Forward pass ----
    logits = X @ W + b                                    # (N, C)
    # Stable softmax: shift by the per-row max so exp cannot overflow.
    logits_shifted = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(logits_shifted)
    sum_exp = exp_logits.sum(axis=1, keepdims=True)       # (N, 1)
    probs = exp_logits / sum_exp                          # (N, C)
    # Exact, numerically stable cross-entropy via log-softmax:
    #   -log softmax(z)[y] = log(sum exp(z_shifted)) - z_shifted[y]
    # This replaces the former -log(probs + 1e-12) hack, which capped the
    # loss at ~27.6 and slightly biased it for confident predictions.
    log_probs = logits_shifted - np.log(sum_exp)          # (N, C)
    loss = -log_probs[rows, labels].mean()
    # ---- Backward pass ----
    # Gradient of softmax + cross-entropy w.r.t. logits: probs - one_hot(labels)
    dlogits = probs.copy()                                # (N, C)
    dlogits[rows, labels] -= 1.0                          # subtract 1 at true class
    dlogits /= N                                          # average over batch
    # Chain rule through logits = X @ W + b
    dW = X.T @ dlogits                                    # (D, N) @ (N, C) -> (D, C)
    db = dlogits.sum(axis=0)                              # (C,)
    # ---- Update ----
    return W - lr * dW, b - lr * db, loss
# ---- VERIFICATION ----
# Trains for 100 steps on a fixed random batch; a correct gradient must
# drive the (memorization) loss down toward zero.
np.random.seed(42)
N, D, C = 64, 784, 10  # MNIST-sized: 64 samples, 784 features, 10 classes
X = np.random.randn(N, D)
W = np.random.randn(D, C) * 0.01  # small init keeps initial logits near 0
b = np.zeros(C)
labels = np.random.randint(0, C, N)
# Run multiple steps and verify loss decreases
losses = []
for step in range(100):
    W, b, loss = softmax_grad_step(X, W, b, labels, lr=0.1)
    losses.append(loss)
assert losses[-1] < losses[0], f"Loss should decrease: {losses[0]:.4f} -> {losses[-1]:.4f}"
assert losses[-1] < 0.5, f"Loss should converge to low value, got {losses[-1]:.4f}"
print(f"Loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
print("Challenge 3 passed!")
Challenge 4: Batch Normalization
Problem: Implement batch normalization for a fully connected layer. Given activations X of shape (N, D), normalize across the batch dimension. Implement both training mode (compute batch stats) and inference mode (use running stats). Include learnable parameters gamma and beta.
import numpy as np
# ---- CHALLENGE ----
def batch_norm_forward(X, gamma, beta, running_mean, running_var,
                       training=True, momentum=0.1, eps=1e-5):
    """Challenge stub: batch normalization forward pass.

    Should return (output, cache for backward, updated running mean,
    updated running var); to be implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def batch_norm_forward(X, gamma, beta, running_mean, running_var,
                       training=True, momentum=0.1, eps=1e-5):
    """Batch normalization forward pass for a fully connected layer.

    X: (N, D) activations; gamma, beta: (D,) learnable scale and shift.
    Training mode normalizes with per-feature batch statistics and
    updates the running statistics by exponential moving average;
    inference mode normalizes with the running statistics unchanged.

    Returns (out, cache, running_mean, running_var); cache holds the
    intermediates needed for the backward pass and is None in inference.
    """
    if not training:
        # Inference: use the stored running statistics as-is.
        x_hat = (X - running_mean) / np.sqrt(running_var + eps)
        return gamma * x_hat + beta, None, running_mean, running_var
    # Training: per-feature batch statistics (population variance).
    mu = X.mean(axis=0)                       # (D,)
    var = X.var(axis=0)                       # (D,)
    x_hat = (X - mu) / np.sqrt(var + eps)     # (N, D), ~zero mean / unit var
    out = gamma * x_hat + beta                # learnable scale + shift
    # Exponential moving average of the batch statistics.
    running_mean = (1 - momentum) * running_mean + momentum * mu
    running_var = (1 - momentum) * running_var + momentum * var
    cache = (X, x_hat, mu, var, gamma, eps)   # for the backward pass
    return out, cache, running_mean, running_var
# ---- VERIFICATION ----
# Training mode must standardize each feature across the batch; inference
# mode must run on the provided running statistics without recomputing.
np.random.seed(42)
N, D = 64, 128
X = np.random.randn(N, D) * 5 + 3  # shifted and scaled so normalization has work to do
gamma = np.ones(D)   # identity scale
beta = np.zeros(D)   # identity shift
running_mean = np.zeros(D)
running_var = np.ones(D)
# Training mode
out, cache, rm, rv = batch_norm_forward(X, gamma, beta, running_mean, running_var, training=True)
# Output should have approximately zero mean and unit variance
assert np.allclose(out.mean(axis=0), 0, atol=1e-6), "Output mean should be ~0"
assert np.allclose(out.var(axis=0), 1, atol=1e-1), "Output var should be ~1"
assert out.shape == X.shape, "Shape should be preserved"
# Inference mode should use running stats
out_inf, _, _, _ = batch_norm_forward(X, gamma, beta, rm, rv, training=False)
assert out_inf.shape == X.shape, "Inference shape should be preserved"
print("Challenge 4 passed!")
Challenge 5: Cosine Similarity Matrix
Problem: Given two sets of vectors A of shape (M, D) and B of shape (N, D), compute the pairwise cosine similarity matrix of shape (M, N). This is the core operation in retrieval systems, contrastive learning (SimCLR, CLIP), and recommendation engines.
import numpy as np
# ---- CHALLENGE ----
def cosine_similarity_matrix(A, B):
    """Challenge stub: pairwise cosine similarity between rows of A and B.

    Should return an (M, N) matrix with result[i, j] = cosine_sim(A[i], B[j]);
    to be implemented by the reader.
    """
    pass
# ---- SOLUTION ----
def cosine_similarity_matrix(A, B):
    """Pairwise cosine similarity between rows of A (M, D) and B (N, D).

    cosine_sim(a, b) = (a . b) / (||a|| * ||b||), vectorized: L2-normalize
    every row, then a single matrix multiply gives the full (M, N) grid.
    All-zero rows are given norm 1, so their similarities come out as 0,
    and the result is clipped to [-1, 1] to absorb float round-off.
    """
    # Row-wise L2 norms, kept as column vectors for broadcasting.
    norms_a = np.linalg.norm(A, axis=1, keepdims=True)  # (M, 1)
    norms_b = np.linalg.norm(B, axis=1, keepdims=True)  # (N, 1)
    # Avoid 0/0 on all-zero rows: dividing by 1 leaves them as zero vectors.
    norms_a = np.where(norms_a == 0, 1.0, norms_a)
    norms_b = np.where(norms_b == 0, 1.0, norms_b)
    unit_a = A / norms_a  # (M, D)
    unit_b = B / norms_b  # (N, D)
    # (M, D) @ (D, N) -> (M, N); dot of unit vectors = cosine similarity.
    return np.clip(unit_a @ unit_b.T, -1.0, 1.0)
# ---- VERIFICATION ----
# Checks shape and value range on random embeddings, then the three fixed
# points of cosine similarity: self (1), identical copy (1), negation (-1).
np.random.seed(42)
A = np.random.randn(100, 128) # 100 query embeddings
B = np.random.randn(500, 128) # 500 document embeddings
sim = cosine_similarity_matrix(A, B)
assert sim.shape == (100, 500), f"Expected (100, 500), got {sim.shape}"
assert np.all(sim >= -1.0 - 1e-6) and np.all(sim <= 1.0 + 1e-6), "Cosine sim should be in [-1, 1]"
# Self-similarity should be 1 on diagonal
self_sim = cosine_similarity_matrix(A, A)
assert np.allclose(np.diag(self_sim), 1.0, atol=1e-6), "Self-similarity should be 1"
# Identical vectors should have similarity 1
B_copy = A[:5].copy()
sim_copy = cosine_similarity_matrix(A[:5], B_copy)
assert np.allclose(np.diag(sim_copy), 1.0, atol=1e-6), "Identical vectors should have sim 1"
# Opposite vectors should have similarity -1 (cosine is direction-sensitive)
sim_neg = cosine_similarity_matrix(A[:5], -A[:5])
assert np.allclose(np.diag(sim_neg), -1.0, atol=1e-6), "Opposite vectors should have sim -1"
print("Challenge 5 passed!")
Key Takeaways
- Always subtract the max before exp in softmax — this is the log-sum-exp trick for numerical stability
- Combine softmax + cross-entropy into one operation to avoid computing log(exp(x)), which cancels out
- The gradient of softmax cross-entropy is beautifully simple: probs - one_hot(labels)
- Batch normalization normalizes across the batch (axis=0), while layer normalization normalizes across features (axis=1)
- Cosine similarity = dot product of L2-normalized vectors. Normalize first, then use matrix multiply for the full pairwise matrix
Lilly Tech Systems