Distance & Similarity
Five coding challenges covering the distance and similarity metrics at the heart of retrieval systems, KNN classifiers, clustering algorithms, and recommendation engines. Every solution is fully vectorized — no loops over data points.
Challenge 1: Euclidean Distance
Problem: Given two sets of points A of shape (M, D) and B of shape (N, D), compute the pairwise Euclidean distance matrix of shape (M, N). Do this without any Python loops, using the expanded form: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a . b.
A naive approach, `np.linalg.norm(A[:, None] - B[None, :], axis=2)`, creates an intermediate array of shape (M, N, D), which can be enormous. The expanded form instead uses an (M, D) @ (D, N) matrix multiply, which is much more memory-efficient and faster.

import numpy as np
# ---- CHALLENGE ----
def pairwise_euclidean(A, B):
    """Challenge stub: return the (M, N) matrix of Euclidean distances
    between the rows of A (M, D) and the rows of B (N, D), loop-free."""
    pass
# ---- NAIVE SOLUTION (memory-intensive) ----
def pairwise_euclidean_naive(A, B):
    """Broadcast-based pairwise Euclidean distance, shape (M, N).

    Materializes an (M, N, D) temporary, so it is memory-hungry for
    large inputs -- kept as a reference implementation.
    """
    deltas = A[:, None, :] - B[None, :, :]      # (M, N, D) temporary
    return np.sqrt(np.sum(deltas * deltas, axis=-1))
# ---- OPTIMAL SOLUTION (expanded form) ----
def pairwise_euclidean(A, B):
    """Pairwise Euclidean distances via the algebraic expansion
    ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b.

    A: (M, D), B: (N, D) -> returns (M, N). Avoids the (M, N, D)
    intermediate of the naive broadcast approach; the only large
    temporary is the (M, N) result itself.
    """
    # Row-wise squared norms of each point set.
    sq_a = (A * A).sum(axis=1)                  # (M,)
    sq_b = (B * B).sum(axis=1)                  # (N,)
    # All pairwise dot products in a single matrix multiply.
    cross = A.dot(B.T)                          # (M, N)
    # Assemble squared distances by broadcasting the norm vectors.
    sq_dists = sq_a[:, None] + sq_b[None, :] - 2.0 * cross
    # Floating-point cancellation can yield tiny negatives; clamp them.
    np.maximum(sq_dists, 0.0, out=sq_dists)
    return np.sqrt(sq_dists)
# ---- VERIFICATION ----
# Sanity checks for pairwise_euclidean: output shape, non-negativity,
# zero self-distance, agreement with the naive broadcast version, and
# the triangle inequality on a small sample.
np.random.seed(42)  # fixed seed for reproducible checks
A = np.random.randn(100, 64)
B = np.random.randn(200, 64)
dist = pairwise_euclidean(A, B)
assert dist.shape == (100, 200), f"Expected (100, 200), got {dist.shape}"
assert np.all(dist >= 0), "Distances should be non-negative"
# Self-distance should be 0 on diagonal
self_dist = pairwise_euclidean(A, A)
assert np.allclose(np.diag(self_dist), 0, atol=1e-6), "Self-distance should be 0"
# Compare with naive (should be the same)
dist_naive = pairwise_euclidean_naive(A[:10], B[:20])
dist_opt = pairwise_euclidean(A[:10], B[:20])
assert np.allclose(dist_naive, dist_opt, atol=1e-6), "Should match naive implementation"
# Triangle inequality: d(a,c) <= d(a,b) + d(b,c)
d = pairwise_euclidean(A[:3], A[:3])
assert d[0, 2] <= d[0, 1] + d[1, 2] + 1e-10, "Triangle inequality violated"
print("Challenge 1 passed!")
Challenge 2: Manhattan Distance
Problem: Compute the pairwise Manhattan (L1) distance matrix. Unlike Euclidean, there is no expanded form trick for L1, so you need to use broadcasting carefully. Also implement the Minkowski distance as a generalization: d_p(a, b) = (sum(|a_i - b_i|^p))^{1/p}.
import numpy as np
# ---- CHALLENGE ----
def pairwise_manhattan(A, B):
    """Challenge stub: return the (M, N) matrix of L1 (Manhattan)
    distances between the rows of A (M, D) and the rows of B (N, D)."""
    pass
def pairwise_minkowski(A, B, p=2):
    """Challenge stub: return the (M, N) matrix of Minkowski distances
    d_p(a, b) = (sum |a_i - b_i|^p)^(1/p) between rows of A and B."""
    pass
# ---- SOLUTION ----
def pairwise_manhattan(A, B):
    """Pairwise L1 distance: sum_d |a_d - b_d|, returned as (M, N).

    No expanded-form trick exists for L1, so this broadcasts to an
    (M, N, D) temporary; chunk over rows of A for very large inputs.
    """
    deltas = A[:, None, :] - B[None, :, :]      # (M, N, D)
    return np.sum(np.abs(deltas), axis=-1)
def pairwise_minkowski(A, B, p=2):
    """General pairwise Minkowski distance (sum |a-b|^p)^(1/p), (M, N).

    p=1 is Manhattan, p=2 Euclidean; p=np.inf yields the Chebyshev
    (max coordinate-wise) distance as the limiting case.
    """
    abs_diff = np.abs(A[:, None, :] - B[None, :, :])    # (M, N, D)
    if p == np.inf:
        # Limit p -> inf: the largest coordinate difference dominates.
        return np.max(abs_diff, axis=-1)
    return np.power(np.sum(abs_diff ** p, axis=-1), 1.0 / p)
# ---- VERIFICATION ----
# Checks: hand-computed L1 values, Minkowski p=2 agreement with the
# Euclidean implementation (from Challenge 1), and the p=inf
# (Chebyshev) special case.
np.random.seed(42)  # fixed seed for reproducible checks
A = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
B = np.array([[1.0, 1.0], [2.0, 2.0]])
# Manhattan from (0,0) to (1,1) should be 2
dist_l1 = pairwise_manhattan(A, B)
assert dist_l1[0, 0] == 2.0, f"L1 from (0,0) to (1,1) should be 2, got {dist_l1[0, 0]}"
assert dist_l1.shape == (3, 2), f"Expected (3, 2), got {dist_l1.shape}"
# Minkowski with p=2 should match Euclidean
A_large = np.random.randn(50, 10)
B_large = np.random.randn(30, 10)
dist_mink2 = pairwise_minkowski(A_large, B_large, p=2)
dist_euc = pairwise_euclidean(A_large, B_large)
assert np.allclose(dist_mink2, dist_euc, atol=1e-6), "p=2 Minkowski should match Euclidean"
# Chebyshev distance (p=inf): max absolute difference
dist_inf = pairwise_minkowski(A, B, p=np.inf)
assert dist_inf[0, 0] == 1.0, "Chebyshev from (0,0) to (1,1) should be 1"
print("Challenge 2 passed!")
Challenge 3: Cosine Distance and Angular Distance
Problem: Implement cosine distance (1 - cosine_similarity) and angular distance (arccos(cosine_similarity) / pi). Given a query vector and a database of embeddings, return the K nearest neighbors by each metric. Discuss when cosine vs. Euclidean distance is preferred.
import numpy as np
# ---- CHALLENGE ----
def cosine_knn(query, database, k):
    """Challenge stub: return (indices, distances) of the k rows of
    `database` (N, D) closest to `query` (D,) by cosine distance,
    i.e. 1 - cosine_similarity."""
    pass
def angular_distance(A, B):
    """Challenge stub: return the (M, N) matrix of angular distances
    arccos(cosine_sim) / pi, which lies in the [0, 1] range."""
    pass
# ---- SOLUTION ----
def cosine_knn(query, database, k):
    """K-nearest neighbors by cosine distance.

    Parameters
    ----------
    query : (D,) array -- the query vector.
    database : (N, D) array -- the vectors to search.
    k : int -- number of neighbors to return (1 <= k <= N).

    Returns
    -------
    (indices, distances) : k indices into `database` sorted by ascending
    cosine distance (1 - cosine similarity), plus those distances.
    """
    # Normalize the query; epsilon guards a zero-norm query.
    q_norm = query / (np.linalg.norm(query) + 1e-12)
    # Normalize all database vectors; zero rows stay all-zero (norm
    # replaced by 1 to avoid division by zero).
    db_norms = np.linalg.norm(database, axis=1, keepdims=True)
    db_norms = np.where(db_norms == 0, 1.0, db_norms)
    db_normalized = database / db_norms
    # Cosine similarity of every row against the query via dot product.
    similarities = db_normalized @ q_norm  # (N,)
    # Cosine distance = 1 - similarity
    distances = 1.0 - similarities
    n = distances.shape[0]
    # BUGFIX: np.argpartition requires kth < n, so the original raised
    # ValueError when k == n. Fall back to a full sort in that case.
    if k >= n:
        top_k_idx = np.argsort(distances)[:k]
    else:
        # O(N) selection of the k smallest distances (unsorted) ...
        top_k_idx = np.argpartition(distances, k)[:k]
        # ... then sort only within those k.
        top_k_idx = top_k_idx[np.argsort(distances[top_k_idx])]
    return top_k_idx, distances[top_k_idx]
def angular_distance(A, B):
    """Pairwise angular distance arccos(cos_sim) / pi, normalized to [0, 1].

    Parameters
    ----------
    A : (M, D) array
    B : (N, D) array

    Returns
    -------
    (M, N) array of angular distances.
    """
    # BUGFIX: guard against zero-norm rows (division by zero -> NaN),
    # matching the normalization already used in cosine_knn. A zero row
    # keeps cos_sim = 0, i.e. angular distance 0.5 ("orthogonal").
    A_norms = np.linalg.norm(A, axis=1, keepdims=True)
    B_norms = np.linalg.norm(B, axis=1, keepdims=True)
    A_norm = A / np.where(A_norms == 0, 1.0, A_norms)
    B_norm = B / np.where(B_norms == 0, 1.0, B_norms)
    # Cosine similarity for all pairs.
    cos_sim = A_norm @ B_norm.T  # (M, N)
    # Clip so float error never pushes |cos| past 1 (arccos domain).
    cos_sim = np.clip(cos_sim, -1.0, 1.0)
    # Angular distance = arccos(cos_sim) / pi
    return np.arccos(cos_sim) / np.pi
# ---- VERIFICATION ----
# Plant a near-duplicate of the query at index 7777 and confirm the
# cosine KNN ranks it first; then check basic angular-distance
# properties (zero self-distance, [0, 1] range).
np.random.seed(42)  # fixed seed for reproducible checks
query = np.random.randn(128)
database = np.random.randn(10000, 128)
# Make entry 7777 very similar to query
database[7777] = query + np.random.randn(128) * 0.01
idx, dists = cosine_knn(query, database, k=5)
assert 7777 in idx, "Entry 7777 should be in top 5"
assert idx[0] == 7777, "Entry 7777 should be the nearest"
assert np.all(dists >= 0), "Cosine distances should be non-negative"
# Angular distance properties
A = np.random.randn(5, 64)
ang = angular_distance(A, A)
assert np.allclose(np.diag(ang), 0, atol=1e-6), "Self angular distance should be 0"
assert np.all(ang >= -1e-6) and np.all(ang <= 1 + 1e-6), "Angular distance should be in [0, 1]"
print("Challenge 3 passed!")
Challenge 4: Efficient Pairwise Distance Matrix
Problem: Implement a function that computes the pairwise distance matrix within a single set of points (self-distances). Since the result is symmetric with zeros on the diagonal, optimize by computing only the upper triangle. This is used in hierarchical clustering and t-SNE.
import numpy as np
# ---- CHALLENGE ----
def pdist_squared(X):
    """Challenge stub: return the symmetric (N, N) matrix of pairwise
    squared Euclidean distances within X (N, D), with a zero diagonal."""
    pass
def gaussian_kernel_matrix(X, sigma=1.0):
    """Challenge stub: return the (N, N) RBF kernel matrix
    K[i, j] = exp(-||x_i - x_j||^2 / (2*sigma^2)) for X (N, D)."""
    pass
# ---- SOLUTION ----
def pdist_squared(X):
    """Squared Euclidean self-distances for X (N, D), returned as a
    symmetric (N, N) matrix with an exactly-zero diagonal.

    Uses the Gram-matrix expansion
    ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 * x_i . x_j
    so the only large temporary is the (N, N) result.
    """
    # Row-wise squared norms.
    sq_norms = np.sum(X * X, axis=1)            # (N,)
    # All pairwise dot products at once.
    gram = X.dot(X.T)                           # (N, N)
    # Broadcast norms against each other, subtract the cross terms.
    dists = sq_norms[:, None] + sq_norms[None, :] - 2 * gram
    # Cancellation can produce tiny negatives; clamp, then force the
    # diagonal to exactly zero.
    dists = np.maximum(dists, 0.0)
    np.fill_diagonal(dists, 0.0)
    return dists
def gaussian_kernel_matrix(X, sigma=1.0):
    """Gaussian (RBF) kernel matrix K[i, j] = exp(-d2(i, j) / (2*sigma^2)).

    Computes the squared self-distances inline with the same Gram-matrix
    expansion used by pdist_squared, then maps them through the RBF.
    """
    sq = np.sum(X ** 2, axis=1)                     # (N,) squared norms
    d2 = sq[:, None] + sq[None, :] - 2 * (X @ X.T)  # (N, N) squared dists
    d2 = np.maximum(d2, 0.0)      # clamp float-error negatives
    np.fill_diagonal(d2, 0.0)     # exact zeros -> exact ones on diagonal
    return np.exp(-d2 / (2 * sigma ** 2))
# ---- VERIFICATION ----
# Check symmetry, zero diagonal, and non-negativity of the squared
# distance matrix, then the unit diagonal, [0, 1] range, and symmetry
# of the RBF kernel built from it.
np.random.seed(42)  # fixed seed for reproducible checks
X = np.random.randn(100, 10)
D = pdist_squared(X)
assert D.shape == (100, 100), f"Expected (100, 100), got {D.shape}"
assert np.allclose(D, D.T, atol=1e-10), "Distance matrix should be symmetric"
assert np.allclose(np.diag(D), 0), "Diagonal should be zero"
assert np.all(D >= -1e-10), "Distances should be non-negative"
K = gaussian_kernel_matrix(X, sigma=1.0)
assert K.shape == (100, 100), f"Expected (100, 100), got {K.shape}"
assert np.allclose(np.diag(K), 1.0), "Self-kernel should be 1"
assert np.all(K >= 0) and np.all(K <= 1), "Kernel values should be in [0, 1]"
assert np.allclose(K, K.T), "Kernel matrix should be symmetric"
print("Challenge 4 passed!")
Challenge 5: K-Nearest Neighbor Prediction
Problem: Implement a complete KNN classifier using only NumPy. Given training data X_train (N, D), labels y_train (N,), and test data X_test (M, D), predict the label for each test point using majority vote of the K nearest training points. Support both uniform and distance-weighted voting.
import numpy as np
# ---- CHALLENGE ----
def knn_predict(X_train, y_train, X_test, k=5, weighted=False):
    """Challenge stub: predict a label for each row of X_test (M, D) by
    majority vote among its k nearest rows of X_train (N, D), with
    optional inverse-distance weighting. Return an (M,) label array."""
    pass
# ---- SOLUTION ----
def knn_predict(X_train, y_train, X_test, k=5, weighted=False):
    """Full KNN classifier in pure NumPy.

    Parameters
    ----------
    X_train : (N, D) training points.
    y_train : (N,) integer class labels in [0, C).
    X_test : (M, D) query points.
    k : number of neighbors to vote (capped at N).
    weighted : if True, weight votes by inverse neighbor distance.

    Returns
    -------
    (M,) array of predicted integer labels.
    """
    n_train = X_train.shape[0]
    # BUGFIX: k > N made argpartition raise; cap it at the training size.
    k = min(k, n_train)
    # Step 1: pairwise squared distances (M, N) via the expanded form
    # ||t - x||^2 = ||t||^2 + ||x||^2 - 2 * t.x.
    test_sq = np.sum(X_test ** 2, axis=1, keepdims=True)    # (M, 1)
    train_sq = np.sum(X_train ** 2, axis=1, keepdims=True)  # (N, 1)
    dist_sq = test_sq + train_sq.T - 2 * (X_test @ X_train.T)
    dist_sq = np.maximum(dist_sq, 0.0)  # clamp float-error negatives
    # Step 2: k nearest neighbors per test point. argpartition is O(N)
    # per row vs O(N log N) for argsort, but needs kth < N.
    if k == n_train:
        knn_indices = np.argsort(dist_sq, axis=1)[:, :k]    # (M, k)
    else:
        knn_indices = np.argpartition(dist_sq, k, axis=1)[:, :k]
    # Step 3: labels and vote weights of those neighbors.
    knn_labels = y_train[knn_indices]  # (M, k)
    if weighted:
        knn_distances = np.sqrt(
            np.take_along_axis(dist_sq, knn_indices, axis=1)  # (M, k)
        )
        # Inverse-distance weights; epsilon avoids division by zero.
        weights = 1.0 / (knn_distances + 1e-12)  # (M, k)
    else:
        weights = np.ones_like(knn_labels, dtype=float)
    # Step 4: weighted majority vote, fully vectorized (replaces the
    # original Python loops over test points and neighbors). np.add.at
    # accumulates correctly even when a row repeats a label.
    num_classes = int(y_train.max()) + 1
    M = X_test.shape[0]
    votes = np.zeros((M, num_classes))
    np.add.at(votes, (np.arange(M)[:, None], knn_labels), weights)
    # argmax breaks ties toward the smaller class index, exactly as the
    # original per-row argmax did.
    return np.argmax(votes, axis=1)
# ---- VERIFICATION ----
# Two well-separated Gaussian clusters: KNN should classify points near
# each center correctly under both voting schemes, and reach >90%
# accuracy on a held-out sample drawn from the same clusters.
np.random.seed(42)  # fixed seed for reproducible checks
# Create simple 2-class dataset
N_train = 200
X_train = np.vstack([
    np.random.randn(N_train // 2, 2) + [2, 2],    # class 0: centered at (2, 2)
    np.random.randn(N_train // 2, 2) + [-2, -2],  # class 1: centered at (-2, -2)
])
y_train = np.array([0] * (N_train // 2) + [1] * (N_train // 2))
# Test points near each cluster center
X_test = np.array([[2.0, 2.0], [-2.0, -2.0], [0.0, 0.0], [3.0, 3.0]])
# Uniform voting
preds = knn_predict(X_train, y_train, X_test, k=5)
assert preds[0] == 0, "Point near (2,2) should be class 0"
assert preds[1] == 1, "Point near (-2,-2) should be class 1"
assert preds[3] == 0, "Point near (3,3) should be class 0"
# Distance-weighted voting
preds_w = knn_predict(X_train, y_train, X_test, k=5, weighted=True)
assert preds_w[0] == 0 and preds_w[1] == 1, "Weighted predictions should also work"
# Accuracy on larger test set
X_test_large = np.vstack([
    np.random.randn(100, 2) + [2, 2],
    np.random.randn(100, 2) + [-2, -2],
])
y_test_large = np.array([0] * 100 + [1] * 100)
preds_large = knn_predict(X_train, y_train, X_test_large, k=5)
accuracy = (preds_large == y_test_large).mean()
assert accuracy > 0.9, f"Accuracy should be > 90%, got {accuracy:.2%}"
print(f"KNN accuracy: {accuracy:.2%}")
print("Challenge 5 passed!")
Key Takeaways
- Use the expanded form `||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b` for memory-efficient pairwise Euclidean distance
- Cosine distance is preferred when magnitude does not matter (text embeddings, normalized features)
- Euclidean distance is preferred for spatial data and when magnitude carries information
- `np.argpartition` finds the k smallest/largest in O(n) vs O(n log n) for a full sort
- The Gaussian kernel `exp(-d^2 / (2*sigma^2))` converts distances to similarities for kernel methods
- Distance-weighted KNN gives closer neighbors more influence and often outperforms uniform voting
Lilly Tech Systems