NumPy Interview Challenges
10 real NumPy coding challenges that appear in ML interviews at top tech companies. Each challenge includes the problem statement, constraints, a complete solution, and an explanation of what the interviewer is looking for.
Challenge 1: Softmax Function
import numpy as np
def softmax(X: np.ndarray) -> np.ndarray:
    """Row-wise softmax using the max-shift trick for numerical stability.

    Args:
        X: Array of shape (n, d) with raw logits.

    Returns:
        Array of shape (n, d); each row is a probability distribution.
    """
    # Shifting every row by its max leaves softmax unchanged but keeps exp() finite.
    shifted = X - X.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    totals = weights.sum(axis=1, keepdims=True)
    return weights / totals
# Test: the second row contains huge logits to exercise the stability trick.
logits = np.array([[1, 2, 3], [1000, 1001, 1002]])
probs = softmax(logits)
print(probs)
# Both rows print the same distribution, [0.0900, 0.2447, 0.6652],
# because the rows differ only by a constant shift (and nothing overflows).
print(probs.sum(axis=1))  # [1.0, 1.0]
Interviewer focus: The max-subtraction trick for numerical stability is the #1 thing they check. Without it, np.exp(1000) overflows to inf. Using keepdims=True for correct broadcasting is the second signal of NumPy fluency.
Challenge 2: Batch Matrix Multiplication
import numpy as np
def batch_matmul(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Multiply two stacks of matrices with einsum.

    Args:
        A: Shape (batch, m, k).
        B: Shape (batch, k, n).

    Returns:
        Shape (batch, m, n), where out[b] = A[b] @ B[b].
    """
    # 'b' indexes the batch; the shared index 'k' is summed over.
    return np.einsum('bmk,bkn->bmn', A, B)
# Alternative using @ operator (NumPy >= 1.16)
def batch_matmul_v2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Batched matrix product via @, which broadcasts over leading dimensions."""
    product = A @ B
    return product
# Test on 5 stacked matrix pairs with compatible inner dimensions.
A = np.random.randn(5, 3, 4)  # 5 batches of 3x4 matrices
B = np.random.randn(5, 4, 2)  # 5 batches of 4x2 matrices
C = batch_matmul(A, B)
print(C.shape)  # (5, 3, 2)
# Cross-check every batch element against a plain per-matrix product.
for i in range(5):
    assert np.allclose(C[i], A[i] @ B[i])
Interviewer focus: They want to see if you know np.einsum or that the @ operator handles batched dimensions. Bonus points for mentioning that einsum is more flexible and can express operations that @ cannot.
Challenge 3: One-Hot Encoding Without Loops
import numpy as np
def one_hot(labels: np.ndarray, num_classes: int) -> np.ndarray:
    """One-hot encode integer labels with advanced (fancy) indexing.

    Args:
        labels: 1D array of integers in [0, num_classes).
        num_classes: Total number of classes.

    Returns:
        Binary matrix of shape (len(labels), num_classes).
    """
    n = len(labels)
    encoded = np.zeros((n, num_classes))
    # Each (row, label) index pair addresses exactly one cell to flip to 1.
    encoded[np.arange(n), labels] = 1
    return encoded
# Alternative: using np.eye
def one_hot_v2(labels: np.ndarray, num_classes: int) -> np.ndarray:
    """One-hot encode by selecting rows of an identity matrix."""
    identity = np.eye(num_classes)
    return identity[labels]
# Test both implementations on the same labels.
labels = np.array([0, 2, 1, 3, 0])
encoded = one_hot(labels, num_classes=4)
print(encoded)
# [[1, 0, 0, 0],
#  [0, 0, 1, 0],
#  [0, 1, 0, 0],
#  [0, 0, 0, 1],
#  [1, 0, 0, 0]]
# The eye-indexing variant must agree exactly.
assert np.array_equal(one_hot(labels, 4), one_hot_v2(labels, 4))
Interviewer focus: The np.eye trick is elegant but less well-known. The advanced indexing approach with result[np.arange(n), labels] shows deep NumPy understanding. Both are acceptable.
Challenge 4: Moving Average Without Loops
Given a 1D array of length n and a window size, compute the moving average; the result has length n - window + 1. Use only NumPy (no pandas).
import numpy as np
def moving_average(arr: np.ndarray, window: int) -> np.ndarray:
    """O(n) moving average via prefix sums.

    Args:
        arr: 1D numeric array.
        window: Window size.

    Returns:
        Array of length len(arr) - window + 1 holding each window's mean.
    """
    # Prefix sums with a leading zero: sums[i] = arr[0] + ... + arr[i-1].
    sums = np.insert(np.cumsum(arr), 0, 0)
    # Each window total is a difference of two prefix sums; divide once for the mean.
    return (sums[window:] - sums[:-window]) / window
# Alternative: using np.convolve
def moving_average_v2(arr: np.ndarray, window: int) -> np.ndarray:
    """Moving average as a 'valid'-mode convolution with a uniform kernel."""
    uniform_kernel = np.ones(window) / window
    return np.convolve(arr, uniform_kernel, mode='valid')
# Test: window of 3 over an arithmetic sequence.
data = np.array([1, 3, 5, 7, 9, 11], dtype=float)
result = moving_average(data, window=3)
print(result)  # [3.0, 5.0, 7.0, 9.0]
# The convolution-based variant must agree.
assert np.allclose(moving_average(data, 3), moving_average_v2(data, 3))
Interviewer focus: The cumsum trick runs in O(n) time regardless of window size, while the naive approach is O(n*k). The convolution approach is also O(n) and shows breadth of NumPy knowledge. Mentioning time complexity is a strong signal.
Challenge 5: Cosine Similarity Matrix
import numpy as np
def cosine_similarity_matrix(X: np.ndarray) -> np.ndarray:
    """Pairwise cosine similarity between the rows of X.

    cos(a, b) = (a . b) / (||a|| * ||b||)

    Args:
        X: Shape (n, d).

    Returns:
        (n, n) matrix with entries in [-1, 1]; all-zero rows produce
        zero similarity instead of NaN.
    """
    row_norms = np.linalg.norm(X, axis=1, keepdims=True)  # (n, 1)
    # A zero norm would divide 0/0; dividing by 1 keeps those rows all-zero.
    safe_norms = np.where(row_norms == 0, 1, row_norms)
    unit_rows = X / safe_norms
    # For unit vectors, the dot product *is* the cosine of the angle.
    return unit_rows @ unit_rows.T
# Test: includes an all-zero row to exercise the zero-vector guard.
X = np.array([[1, 0, 0],
              [0, 1, 0],
              [1, 1, 0],
              [0, 0, 0]], dtype=float)
sim = cosine_similarity_matrix(X)
print(np.round(sim, 4))
# [[1.     0.     0.7071 0.    ]
#  [0.     1.     0.7071 0.    ]
#  [0.7071 0.7071 1.     0.    ]
#  [0.     0.     0.     0.    ]]
Interviewer focus: Normalizing first then using matrix multiplication is the optimal approach. Handling zero vectors (which would cause division by zero) is the edge case they specifically look for.
Challenge 6: Top-K Values Per Row
import numpy as np
def topk_per_row(X: np.ndarray, k: int) -> np.ndarray:
    """Column indices of the k largest entries in each row, largest first.

    Args:
        X: Array of shape (n, d).
        k: Number of top values per row.

    Returns:
        (n, k) integer array of column indices, ordered by descending value
        within each row.
    """
    # argpartition is O(d) per row vs O(d log d) for a full argsort;
    # after partitioning at -k, the last k slots hold the k largest entries.
    unordered = np.argpartition(X, -k, axis=1)[:, -k:]
    # Order those k candidates by value, descending (argsort on negated values).
    row_idx = np.arange(X.shape[0])[:, np.newaxis]
    order = np.argsort(-X[row_idx, unordered], axis=1)
    return unordered[row_idx, order]
# Test: all values distinct, so the expected ordering is unambiguous.
X = np.array([[5, 1, 9, 3, 7],
              [2, 8, 4, 6, 0]])
result = topk_per_row(X, k=3)
print(result)
# [[2, 4, 0],  -> values 9, 7, 5
#  [1, 3, 2]]  -> values 8, 6, 4
Interviewer focus: Using np.argpartition instead of np.argsort shows you understand the O(n) vs O(n log n) difference. This is a common ML operation (beam search, nearest neighbors) and the efficient approach matters at scale.
Challenge 7: Vectorized Sigmoid and Its Derivative
import numpy as np
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically stable elementwise sigmoid.

    Uses 1 / (1 + exp(-z)) where z >= 0 and exp(z) / (1 + exp(z)) where
    z < 0, so exp() is only ever evaluated on non-positive arguments and
    cannot overflow.
    """
    nonneg = z >= 0
    out = np.zeros_like(z, dtype=float)
    # Non-negative branch: exp(-z) is in (0, 1].
    out[nonneg] = 1.0 / (1.0 + np.exp(-z[nonneg]))
    # Negative branch: exp(z) is in (0, 1), avoiding exp of a large positive.
    small = np.exp(z[~nonneg])
    out[~nonneg] = small / (1.0 + small)
    return out
def sigmoid_derivative(z: np.ndarray) -> np.ndarray:
    """Derivative of the sigmoid, computed as sigma(z) * (1 - sigma(z))."""
    sig = sigmoid(z)
    return sig * (1 - sig)
def binary_cross_entropy(y_true: np.ndarray, logits: np.ndarray) -> float:
    """Numerically stable mean binary cross-entropy from raw logits.

    Uses the identity
        BCE = max(z, 0) - z*y + log(1 + exp(-|z|))
    so exp() never receives a large positive argument.
    """
    z = logits
    # log1p(exp(-|z|)) is the stable form of the log-sum-exp term.
    stable_log_term = np.log1p(np.exp(-np.abs(z)))
    per_example = np.maximum(z, 0) - z * y_true + stable_log_term
    return np.mean(per_example)
# Test sigmoid on extreme and moderate inputs.
z = np.array([-1000, -1, 0, 1, 1000])
print(sigmoid(z))             # [0, 0.2689, 0.5, 0.7311, 1.0]
print(sigmoid_derivative(z))  # [0, 0.1966, 0.25, 0.1966, 0]
# Test BCE on a small labelled batch.
y_true = np.array([1, 0, 1, 0])
logits = np.array([2.0, -1.0, 0.5, -2.0])
print(f"BCE: {binary_cross_entropy(y_true, logits):.4f}")  # ~0.2536
Interviewer focus: The piecewise sigmoid implementation avoids overflow for both large positive and large negative inputs. The BCE loss using the log-sum-exp identity is the production-grade approach used in PyTorch and TensorFlow internally.
Challenge 8: Normalize Rows to Unit Vectors
import numpy as np
def l2_normalize(X: np.ndarray) -> np.ndarray:
    """Scale every row of X to unit L2 norm; all-zero rows are left as zeros.

    Args:
        X: Shape (n, d).

    Returns:
        Array of the same shape with unit-norm (or zero) rows.
    """
    lengths = np.linalg.norm(X, axis=1, keepdims=True)
    # A zero norm would divide 0/0; dividing by 1 keeps the row at zero.
    safe_lengths = np.where(lengths == 0, 1, lengths)
    return X / safe_lengths
def l1_normalize(X: np.ndarray) -> np.ndarray:
    """Scale every row so its absolute values sum to 1; zero rows stay zero."""
    totals = np.abs(X).sum(axis=1, keepdims=True)
    # Guard against 0/0 for all-zero rows by dividing them by 1 instead.
    safe_totals = np.where(totals == 0, 1, totals)
    return X / safe_totals
# Test: includes a zero row plus positive, axis-aligned, and negative entries.
X = np.array([[3, 4],
              [0, 0],
              [1, 0],
              [-3, 4]], dtype=float)
X_l2 = l2_normalize(X)
print("L2 norms:", np.linalg.norm(X_l2, axis=1))
# [1.0, 0.0, 1.0, 1.0]
X_l1 = l1_normalize(X)
print("L1 norms:", np.sum(np.abs(X_l1), axis=1))
# [1.0, 0.0, 1.0, 1.0]
Interviewer focus: Simple but reveals whether you handle edge cases (zero vectors) and use keepdims=True correctly. This operation is fundamental in ML (embedding normalization, cosine similarity prep).
Challenge 9: Implement K-Nearest Neighbors Prediction
import numpy as np
def knn_predict(X_train: np.ndarray, y_train: np.ndarray,
                X_test: np.ndarray, k: int = 5) -> np.ndarray:
    """K-Nearest Neighbors classification without Python loops.

    Args:
        X_train: Training features, shape (n, d).
        y_train: Training labels, shape (n,), non-negative integer classes.
        X_test: Test features, shape (m, d).
        k: Number of neighbors that vote.

    Returns:
        Predicted labels for the test points, shape (m,). Ties in the vote
        resolve to the smallest class label (argmax takes the first maximum).
    """
    # Pairwise squared distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b
    train_sq = np.sum(X_train ** 2, axis=1)  # (n,)
    test_sq = np.sum(X_test ** 2, axis=1)    # (m,)
    cross = X_test @ X_train.T               # (m, n)
    dist_sq = test_sq[:, np.newaxis] + train_sq[np.newaxis, :] - 2 * cross
    # Floating-point cancellation can yield tiny negative values; clamp to 0.
    dist_sq = np.maximum(dist_sq, 0)
    # argpartition puts the k smallest distances in the first k slots: O(n) per row.
    knn_indices = np.argpartition(dist_sq, k, axis=1)[:, :k]  # (m, k)
    # Gather the labels of the k nearest neighbors.
    knn_labels = y_train[knn_indices]  # (m, k)
    # Majority vote without loops: one-hot the labels, then sum the votes.
    # (Removed an unused local `m = X_test.shape[0]` that was dead code.)
    num_classes = int(y_train.max()) + 1
    votes = np.eye(num_classes)[knn_labels]  # (m, k, num_classes)
    vote_counts = votes.sum(axis=1)          # (m, num_classes)
    return np.argmax(vote_counts, axis=1)
# Test on a linearly separable toy problem.
np.random.seed(42)
X_train = np.random.randn(100, 2)
y_train = (X_train[:, 0] + X_train[:, 1] > 0).astype(int)
X_test = np.array([[1, 1], [-1, -1], [0.5, -0.3]])
preds = knn_predict(X_train, y_train, X_test, k=5)
print(f"Predictions: {preds}")  # Expected: [1, 0, 1] or similar
Interviewer focus: The fully vectorized approach without any Python loops is what separates senior from junior candidates. Key signals: using the squared-distance identity, argpartition for efficiency, and the one-hot voting trick for majority vote without loops.
Challenge 10: Implement Mini-Batch Gradient Descent
import numpy as np
def mini_batch_gd(X: np.ndarray, y: np.ndarray,
                  lr: float = 0.01, epochs: int = 100,
                  batch_size: int = 32, rng=None) -> dict:
    """Mini-batch gradient descent for linear regression (MSE loss).

    Args:
        X: Features, shape (n, d).
        y: Targets, shape (n,).
        lr: Learning rate.
        epochs: Number of full passes over the data.
        batch_size: Samples per mini-batch; the final batch may be smaller
            when n is not divisible by batch_size.
        rng: Optional np.random.Generator used for shuffling, for
            reproducibility without touching global state. Defaults to None,
            which uses np.random.permutation (legacy global RNG) and thus
            preserves the previous behavior.

    Returns:
        Dict with 'weights' (shape (d,)), 'bias' (float), and
        'loss_history' (mean batch MSE per epoch, length = epochs).
    """
    n, d = X.shape
    w = np.zeros(d)
    b = 0.0
    loss_history = []
    for epoch in range(epochs):
        # Reshuffle every epoch so batches differ between passes.
        if rng is None:
            indices = np.random.permutation(n)
        else:
            indices = rng.permutation(n)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        epoch_loss = 0.0
        num_batches = 0
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            X_batch = X_shuffled[start:end]
            y_batch = y_shuffled[start:end]
            m = len(y_batch)  # may be < batch_size for the final batch
            # Forward pass and residuals.
            y_pred = X_batch @ w + b
            error = y_pred - y_batch
            # Track the pre-update MSE of this batch.
            epoch_loss += np.mean(error ** 2)
            num_batches += 1
            # Gradients of the mean squared error w.r.t. w and b.
            dw = (2 / m) * (X_batch.T @ error)
            db = (2 / m) * np.sum(error)
            # Parameter update.
            w -= lr * dw
            b -= lr * db
        loss_history.append(epoch_loss / num_batches)
    return {'weights': w, 'bias': b, 'loss_history': loss_history}
# Test: recover known weights and bias from lightly noisy linear data.
np.random.seed(42)
X = np.random.randn(200, 3)
true_w = np.array([2.0, -1.0, 0.5])
y = X @ true_w + 3.0 + np.random.randn(200) * 0.1
result = mini_batch_gd(X, y, lr=0.01, epochs=200, batch_size=32)
print(f"Learned weights: {result['weights'].round(3)}")  # ~[2, -1, 0.5]
print(f"Learned bias: {result['bias']:.3f}")  # ~3.0
print(f"Final loss: {result['loss_history'][-1]:.4f}")  # ~0.01
Interviewer focus: Proper shuffling per epoch, correct gradient formulas, batch handling when n is not divisible by batch_size, and loss tracking. Bonus: mention that you would add learning rate scheduling and early stopping in production.
Lilly Tech Systems