Advanced

Distance & Similarity

Five coding challenges covering the distance and similarity metrics at the heart of retrieval systems, KNN classifiers, clustering algorithms, and recommendation engines. Every solution is fully vectorized — no loops over data points.

Challenge 1: Euclidean Distance

Problem: Given two sets of points A of shape (M, D) and B of shape (N, D), compute the pairwise Euclidean distance matrix of shape (M, N). Do this without any Python loops, using the expanded form: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a . b.

💡
Why the expanded form? The naive approach np.linalg.norm(A[:, None] - B[None, :], axis=2) creates an intermediate array of shape (M, N, D) which can be enormous. The expanded form uses (M, D) @ (D, N) matrix multiply which is much more memory-efficient and faster.
import numpy as np

# ---- CHALLENGE ----
def pairwise_euclidean(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Compute pairwise Euclidean distance matrix. Return shape (M, N).

    A is (M, D) and B is (N, D). Use the expanded form
    ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b to avoid materializing an
    (M, N, D) intermediate. Reference solution appears further down.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- NAIVE SOLUTION (memory-intensive) ----
def pairwise_euclidean_naive(A, B):
    """Reference implementation via full broadcasting.

    Materializes the entire (M, N, D) difference tensor, which is
    prohibitively large for big inputs -- kept only for comparison
    against the memory-efficient expanded-form version.
    """
    delta = A[:, None, :] - B[None, :, :]           # (M, N, D)
    return np.sqrt(np.sum(delta ** 2, axis=-1))     # (M, N)

# ---- OPTIMAL SOLUTION (expanded form) ----
def pairwise_euclidean(A, B):
    """Memory-efficient pairwise Euclidean distances.

    Uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so the
    only large intermediate is the (M, N) cross-product A @ B.T, never
    an (M, N, D) tensor.
    """
    row_sq_A = (A * A).sum(axis=1)[:, None]   # ||a||^2, shape (M, 1)
    row_sq_B = (B * B).sum(axis=1)[None, :]   # ||b||^2, shape (1, N)
    cross = A @ B.T                           # a.b for all pairs, (M, N)

    # Broadcast (M, 1) + (1, N) - (M, N) -> squared distances (M, N).
    sq_dists = row_sq_A + row_sq_B - 2 * cross

    # Floating-point cancellation can leave tiny negatives; clamp them
    # before the square root.
    return np.sqrt(np.maximum(sq_dists, 0.0))

# ---- VERIFICATION ----
# Checks: output shape, non-negativity, zero self-distance, agreement
# with the naive broadcasting version, and the triangle inequality.
np.random.seed(42)
A = np.random.randn(100, 64)
B = np.random.randn(200, 64)

dist = pairwise_euclidean(A, B)
assert dist.shape == (100, 200), f"Expected (100, 200), got {dist.shape}"
assert np.all(dist >= 0), "Distances should be non-negative"

# Self-distance should be 0 on diagonal
# (tolerance needed: the expanded form has floating-point cancellation)
self_dist = pairwise_euclidean(A, A)
assert np.allclose(np.diag(self_dist), 0, atol=1e-6), "Self-distance should be 0"

# Compare with naive (should be the same)
dist_naive = pairwise_euclidean_naive(A[:10], B[:20])
dist_opt = pairwise_euclidean(A[:10], B[:20])
assert np.allclose(dist_naive, dist_opt, atol=1e-6), "Should match naive implementation"

# Triangle inequality: d(a,c) <= d(a,b) + d(b,c)
# (spot check on one triple; the 1e-10 slack absorbs rounding)
d = pairwise_euclidean(A[:3], A[:3])
assert d[0, 2] <= d[0, 1] + d[1, 2] + 1e-10, "Triangle inequality violated"
print("Challenge 1 passed!")

Challenge 2: Manhattan Distance

Problem: Compute the pairwise Manhattan (L1) distance matrix. Unlike Euclidean, there is no expanded form trick for L1, so you need to use broadcasting carefully. Also implement the Minkowski distance as a generalization: d_p(a, b) = (sum(|a_i - b_i|^p))^{1/p}.

import numpy as np

# ---- CHALLENGE ----
def pairwise_manhattan(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Pairwise Manhattan (L1) distance. Return shape (M, N).

    A is (M, D) and B is (N, D). There is no expanded-form trick for
    L1, so broadcasting over an (M, N, D) intermediate is expected.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

def pairwise_minkowski(A: np.ndarray, B: np.ndarray, p: float = 2) -> np.ndarray:
    """Pairwise Minkowski distance with parameter p. Return shape (M, N).

    d_p(a, b) = (sum(|a_i - b_i|^p))^(1/p); p=1 is Manhattan, p=2 is
    Euclidean, p=np.inf is Chebyshev.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def pairwise_manhattan(A, B):
    """Pairwise L1 distance: sum over coordinates of |a_i - b_i|.

    Unlike the Euclidean case there is no dot-product shortcut, so this
    broadcasts to a full (M, N, D) intermediate before reducing.
    """
    pair_diffs = np.abs(A[:, None, :] - B[None, :, :])   # (M, N, D)
    return pair_diffs.sum(axis=-1)                       # (M, N)

def pairwise_minkowski(A, B, p=2):
    """Pairwise Minkowski distance (sum |a-b|^p)^(1/p).

    p=1 is Manhattan, p=2 is Euclidean, and p=np.inf is the Chebyshev
    limit where the largest coordinate gap dominates.
    """
    abs_diff = np.abs(A[:, None, :] - B[None, :, :])   # (M, N, D)
    if p == np.inf:
        # As p -> infinity the p-norm converges to the max coordinate.
        return abs_diff.max(axis=-1)
    return np.power(np.sum(abs_diff ** p, axis=-1), 1.0 / p)

# ---- VERIFICATION ----
np.random.seed(42)
A = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
B = np.array([[1.0, 1.0], [2.0, 2.0]])

# Manhattan from (0,0) to (1,1) should be 2
dist_l1 = pairwise_manhattan(A, B)
assert dist_l1[0, 0] == 2.0, f"L1 from (0,0) to (1,1) should be 2, got {dist_l1[0, 0]}"
assert dist_l1.shape == (3, 2), f"Expected (3, 2), got {dist_l1.shape}"

# Minkowski with p=2 should match Euclidean
# NOTE: relies on pairwise_euclidean defined in Challenge 1 above.
A_large = np.random.randn(50, 10)
B_large = np.random.randn(30, 10)
dist_mink2 = pairwise_minkowski(A_large, B_large, p=2)
dist_euc = pairwise_euclidean(A_large, B_large)
assert np.allclose(dist_mink2, dist_euc, atol=1e-6), "p=2 Minkowski should match Euclidean"

# Chebyshev distance (p=inf): max absolute difference
dist_inf = pairwise_minkowski(A, B, p=np.inf)
assert dist_inf[0, 0] == 1.0, "Chebyshev from (0,0) to (1,1) should be 1"
print("Challenge 2 passed!")

Challenge 3: Cosine Distance and Angular Distance

Problem: Implement cosine distance (1 - cosine_similarity) and angular distance (arccos(cosine_similarity) / pi). Given a query vector and a database of embeddings, return the K nearest neighbors by each metric. Discuss when cosine vs. Euclidean distance is preferred.

import numpy as np

# ---- CHALLENGE ----
def cosine_knn(query: np.ndarray, database: np.ndarray, k: int):
    """Find k nearest neighbors by cosine distance.
    Return indices and distances.

    query is (D,), database is (N, D); cosine distance is
    1 - cosine_similarity, so smaller means more similar.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

def angular_distance(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Angular distance: arccos(cosine_sim) / pi. In [0, 1] range.

    A is (M, D), B is (N, D); returns the pairwise (M, N) matrix.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def cosine_knn(query, database, k):
    """K-nearest neighbors using cosine distance.

    Parameters
    ----------
    query : (D,) array.
    database : (N, D) array of candidate vectors.
    k : number of neighbors to return (clipped to N if larger).

    Returns
    -------
    (indices, distances) : arrays sorted nearest-first, where
    distance = 1 - cosine_similarity, in [0, 2].
    """
    # Normalize query (epsilon guards against a zero query vector).
    q_norm = query / (np.linalg.norm(query) + 1e-12)

    # Normalize all database vectors; map zero rows to zero, not NaN.
    db_norms = np.linalg.norm(database, axis=1, keepdims=True)
    db_norms = np.where(db_norms == 0, 1.0, db_norms)
    db_normalized = database / db_norms

    # Cosine similarity via dot product, then distance = 1 - similarity.
    similarities = db_normalized @ q_norm  # (N,)
    distances = 1.0 - similarities

    # Find top-k (smallest distances). argpartition requires kth < N,
    # so clamp k and fall back to the trivial case when k covers the
    # whole database (the original raised ValueError for k >= N).
    n = distances.shape[0]
    k = min(k, n)
    if k < n:
        # O(N) selection of the k smallest, unordered.
        top_k_idx = np.argpartition(distances, k)[:k]
    else:
        top_k_idx = np.arange(n)
    # Sort only the selected neighbors (k log k, not N log N).
    top_k_idx = top_k_idx[np.argsort(distances[top_k_idx])]

    return top_k_idx, distances[top_k_idx]

def angular_distance(A, B):
    """Pairwise angular distance arccos(cos_sim) / pi, normalized to [0, 1].

    A is (M, D), B is (N, D); returns (M, N). Zero vectors are treated
    as having similarity 0 to everything (distance 0.5) instead of
    producing NaN -- the same zero-norm guard used in cosine_knn.
    """
    # Normalize rows; guard zero norms so we never divide by zero.
    A_norms = np.linalg.norm(A, axis=1, keepdims=True)
    A_norm = A / np.where(A_norms == 0, 1.0, A_norms)
    B_norms = np.linalg.norm(B, axis=1, keepdims=True)
    B_norm = B / np.where(B_norms == 0, 1.0, B_norms)

    # Cosine similarity, clipped to the valid arccos domain against
    # floating-point drift slightly outside [-1, 1].
    cos_sim = A_norm @ B_norm.T  # (M, N)
    cos_sim = np.clip(cos_sim, -1.0, 1.0)

    # arccos maps [-1, 1] to an angle in [0, pi]; dividing by pi
    # normalizes the result into [0, 1].
    return np.arccos(cos_sim) / np.pi

# ---- VERIFICATION ----
np.random.seed(42)
query = np.random.randn(128)
database = np.random.randn(10000, 128)
# Make entry 7777 very similar to query
# (a small noise perturbation keeps it the clear nearest neighbor)
database[7777] = query + np.random.randn(128) * 0.01

idx, dists = cosine_knn(query, database, k=5)
assert 7777 in idx, "Entry 7777 should be in top 5"
assert idx[0] == 7777, "Entry 7777 should be the nearest"
assert np.all(dists >= 0), "Cosine distances should be non-negative"

# Angular distance properties
# (self-distance zero on the diagonal, and values bounded in [0, 1])
A = np.random.randn(5, 64)
ang = angular_distance(A, A)
assert np.allclose(np.diag(ang), 0, atol=1e-6), "Self angular distance should be 0"
assert np.all(ang >= -1e-6) and np.all(ang <= 1 + 1e-6), "Angular distance should be in [0, 1]"
print("Challenge 3 passed!")

Challenge 4: Efficient Pairwise Distance Matrix

Problem: Implement a function that computes the pairwise distance matrix within a single set of points (self-distances). Since the result is symmetric with zeros on the diagonal, optimize by computing only the upper triangle. This is used in hierarchical clustering and t-SNE.

import numpy as np

# ---- CHALLENGE ----
def pdist_squared(X: np.ndarray) -> np.ndarray:
    """Compute pairwise squared Euclidean distance matrix for X (N, D).
    Return symmetric (N, N) matrix with zeros on diagonal.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

def gaussian_kernel_matrix(X: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """Compute Gaussian (RBF) kernel matrix: K[i,j] = exp(-||x_i - x_j||^2 / (2*sigma^2)).

    X is (N, D); sigma is the kernel bandwidth. Returns (N, N).
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def pdist_squared(X):
    """All-pairs squared Euclidean distances for the rows of X.

    Expands ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>, so
    the heavy lifting is one (N, N) Gram-matrix multiply. Returns a
    symmetric matrix with an exactly-zero diagonal.
    """
    sq_norms = (X * X).sum(axis=1)    # ||x_i||^2, shape (N,)
    gram = X @ X.T                    # inner products, shape (N, N)

    # (N, 1) + (1, N) - (N, N) broadcasts to the squared distances.
    result = sq_norms[:, None] + sq_norms[None, :] - 2 * gram

    # Cancellation may leave tiny negatives; clamp, then pin the
    # diagonal to exact zeros.
    result = np.maximum(result, 0.0)
    np.fill_diagonal(result, 0.0)
    return result

def gaussian_kernel_matrix(X, sigma=1.0):
    """Gaussian (RBF) kernel matrix: K[i,j] = exp(-||x_i - x_j||^2 / (2*sigma^2)).

    Computes the squared distances inline via the expanded form so the
    function is self-contained; diagonal entries are exactly 1.
    """
    sq_norms = np.sum(X ** 2, axis=1)                       # (N,)
    d2 = sq_norms[:, None] + sq_norms[None, :] - 2 * (X @ X.T)
    d2 = np.maximum(d2, 0.0)     # clamp float-cancellation negatives
    np.fill_diagonal(d2, 0.0)    # force exact zero self-distance
    return np.exp(-d2 / (2 * sigma ** 2))

# ---- VERIFICATION ----
# Checks: symmetry, zero diagonal, non-negativity of the distance
# matrix, and the corresponding kernel-matrix properties.
np.random.seed(42)
X = np.random.randn(100, 10)

D = pdist_squared(X)
assert D.shape == (100, 100), f"Expected (100, 100), got {D.shape}"
assert np.allclose(D, D.T, atol=1e-10), "Distance matrix should be symmetric"
assert np.allclose(np.diag(D), 0), "Diagonal should be zero"
assert np.all(D >= -1e-10), "Distances should be non-negative"

K = gaussian_kernel_matrix(X, sigma=1.0)
assert K.shape == (100, 100), f"Expected (100, 100), got {K.shape}"
assert np.allclose(np.diag(K), 1.0), "Self-kernel should be 1"
assert np.all(K >= 0) and np.all(K <= 1), "Kernel values should be in [0, 1]"
assert np.allclose(K, K.T), "Kernel matrix should be symmetric"
print("Challenge 4 passed!")

Challenge 5: K-Nearest Neighbor Prediction

Problem: Implement a complete KNN classifier using only NumPy. Given training data X_train (N, D), labels y_train (N,), and test data X_test (M, D), predict the label for each test point using majority vote of the K nearest training points. Support both uniform and distance-weighted voting.

import numpy as np

# ---- CHALLENGE ----
def knn_predict(X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray,
                k: int = 5, weighted: bool = False) -> np.ndarray:
    """KNN classification. Return predicted labels for X_test.

    X_train is (N, D), y_train is (N,) integer labels, X_test is (M, D).
    If weighted is True, use inverse-distance weighted voting instead of
    a uniform majority vote.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def knn_predict(X_train, y_train, X_test, k=5, weighted=False):
    """Full KNN classifier in pure NumPy -- no Python loops over data.

    Parameters
    ----------
    X_train : (N, D) training points.
    y_train : (N,) non-negative integer class labels.
    X_test : (M, D) query points.
    k : number of neighbors (clipped to N if larger).
    weighted : if True, votes are weighted by inverse distance.

    Returns
    -------
    (M,) array of predicted integer labels.
    """
    # Step 1: Compute pairwise squared distances (M, N) using the
    # expanded Euclidean form ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b.
    test_sq = np.sum(X_test ** 2, axis=1, keepdims=True)    # (M, 1)
    train_sq = np.sum(X_train ** 2, axis=1, keepdims=True)  # (N, 1)
    cross = X_test @ X_train.T                               # (M, N)
    dist_sq = test_sq + train_sq.T - 2 * cross               # (M, N)
    dist_sq = np.maximum(dist_sq, 0.0)

    # Step 2: Find k nearest neighbors for each test point.
    # argpartition is O(N) per row, vs O(N log N) for argsort, but it
    # requires kth < N -- clamp so k >= N degrades gracefully instead
    # of raising ValueError.
    n_train = X_train.shape[0]
    k = min(k, n_train)
    kth = min(k, n_train - 1)
    knn_indices = np.argpartition(dist_sq, kth, axis=1)[:, :k]  # (M, k)

    # Step 3: Get labels and per-neighbor vote weights.
    knn_labels = y_train[knn_indices]                          # (M, k)
    if weighted:
        # Distance-weighted voting
        knn_distances = np.sqrt(
            np.take_along_axis(dist_sq, knn_indices, axis=1)   # (M, k)
        )
        # Weights: inverse distance (epsilon avoids division by zero)
        weights = 1.0 / (knn_distances + 1e-12)                # (M, k)
    else:
        weights = np.ones_like(knn_labels, dtype=float)

    # Step 4: Weighted majority vote, fully vectorized. Scatter-add
    # each neighbor's weight into a (M, num_classes) vote table;
    # np.add.at is the unbuffered form that accumulates duplicates.
    num_classes = int(y_train.max()) + 1
    M = X_test.shape[0]
    votes = np.zeros((M, num_classes))
    np.add.at(votes, (np.arange(M)[:, None], knn_labels), weights)

    # Ties break toward the lowest class index, as with argmax before.
    return votes.argmax(axis=1)

# ---- VERIFICATION ----
np.random.seed(42)
# Create simple 2-class dataset
# (two well-separated Gaussian clusters, 100 points each)
N_train = 200
X_train = np.vstack([
    np.random.randn(N_train // 2, 2) + [2, 2],   # class 0: centered at (2, 2)
    np.random.randn(N_train // 2, 2) + [-2, -2],  # class 1: centered at (-2, -2)
])
y_train = np.array([0] * (N_train // 2) + [1] * (N_train // 2))

# Test points near each cluster center
# (the (0, 0) point is deliberately ambiguous; only checked implicitly)
X_test = np.array([[2.0, 2.0], [-2.0, -2.0], [0.0, 0.0], [3.0, 3.0]])

# Uniform voting
preds = knn_predict(X_train, y_train, X_test, k=5)
assert preds[0] == 0, "Point near (2,2) should be class 0"
assert preds[1] == 1, "Point near (-2,-2) should be class 1"
assert preds[3] == 0, "Point near (3,3) should be class 0"

# Distance-weighted voting
preds_w = knn_predict(X_train, y_train, X_test, k=5, weighted=True)
assert preds_w[0] == 0 and preds_w[1] == 1, "Weighted predictions should also work"

# Accuracy on larger test set
# (clusters are ~4 sigma apart, so >90% accuracy is comfortably expected)
X_test_large = np.vstack([
    np.random.randn(100, 2) + [2, 2],
    np.random.randn(100, 2) + [-2, -2],
])
y_test_large = np.array([0] * 100 + [1] * 100)
preds_large = knn_predict(X_train, y_train, X_test_large, k=5)
accuracy = (preds_large == y_test_large).mean()
assert accuracy > 0.9, f"Accuracy should be > 90%, got {accuracy:.2%}"
print(f"KNN accuracy: {accuracy:.2%}")
print("Challenge 5 passed!")

Key Takeaways

💡
  • Use the expanded form ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b for memory-efficient pairwise Euclidean distance
  • Cosine distance is preferred when magnitude does not matter (text embeddings, normalized features)
  • Euclidean distance is preferred for spatial data and when magnitude carries information
  • np.argpartition finds the k smallest/largest in O(n) vs O(n log n) for full sort
  • The Gaussian kernel exp(-d^2 / (2*sigma^2)) converts distances to similarities for kernel methods
  • Distance-weighted KNN gives closer neighbors more influence and often outperforms uniform voting