Advanced

Distance & Similarity

Five coding challenges covering the distance and similarity metrics at the heart of retrieval systems, KNN classifiers, clustering algorithms, and recommendation engines. Every solution is fully vectorized — no loops over data points.

Challenge 1: Euclidean Distance

Problem: Given two sets of points A of shape (M, D) and B of shape (N, D), compute the pairwise Euclidean distance matrix of shape (M, N). Do this without any Python loops, using the expanded form: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a . b.

💡
Why the expanded form? The naive approach np.linalg.norm(A[:, None] - B[None, :], axis=2) creates an intermediate array of shape (M, N, D) which can be enormous. The expanded form uses (M, D) @ (D, N) matrix multiply which is much more memory-efficient and faster.
import numpy as np

# ---- CHALLENGE ----
def pairwise_euclidean(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Compute pairwise Euclidean distance matrix. Return shape (M, N).

    A is (M, D) and B is (N, D). Use the expanded form
    ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b to avoid materializing an
    (M, N, D) intermediate. Reference solution appears further down.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- NAIVE SOLUTION (memory-intensive) ----
def pairwise_euclidean_naive(A, B):
    """Reference implementation via full broadcasting.

    Materializes the entire (M, N, D) difference tensor, which is
    prohibitively large for big inputs -- kept only for comparison
    against the memory-efficient expanded-form version.
    """
    delta = A[:, None, :] - B[None, :, :]           # (M, N, D)
    return np.sqrt(np.sum(delta ** 2, axis=-1))     # (M, N)

# ---- OPTIMAL SOLUTION (expanded form) ----
def pairwise_euclidean(A, B):
    """Memory-efficient pairwise Euclidean distances.

    Uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so the
    only large intermediate is the (M, N) cross-product A @ B.T, never
    an (M, N, D) tensor.
    """
    row_sq_A = (A * A).sum(axis=1)[:, None]   # ||a||^2, shape (M, 1)
    row_sq_B = (B * B).sum(axis=1)[None, :]   # ||b||^2, shape (1, N)
    cross = A @ B.T                           # a.b for all pairs, (M, N)

    # Broadcast (M, 1) + (1, N) - (M, N) -> squared distances (M, N).
    sq_dists = row_sq_A + row_sq_B - 2 * cross

    # Floating-point cancellation can leave tiny negatives; clamp them
    # before the square root.
    return np.sqrt(np.maximum(sq_dists, 0.0))

# ---- VERIFICATION ----
# Checks: output shape, non-negativity, zero self-distance, agreement
# with the naive broadcasting version, and the triangle inequality.
np.random.seed(42)
A = np.random.randn(100, 64)
B = np.random.randn(200, 64)

dist = pairwise_euclidean(A, B)
assert dist.shape == (100, 200), f"Expected (100, 200), got {dist.shape}"
assert np.all(dist >= 0), "Distances should be non-negative"

# Self-distance should be 0 on diagonal
# (tolerance needed: the expanded form has floating-point cancellation)
self_dist = pairwise_euclidean(A, A)
assert np.allclose(np.diag(self_dist), 0, atol=1e-6), "Self-distance should be 0"

# Compare with naive (should be the same)
dist_naive = pairwise_euclidean_naive(A[:10], B[:20])
dist_opt = pairwise_euclidean(A[:10], B[:20])
assert np.allclose(dist_naive, dist_opt, atol=1e-6), "Should match naive implementation"

# Triangle inequality: d(a,c) <= d(a,b) + d(b,c)
# (spot check on one triple; the 1e-10 slack absorbs rounding)
d = pairwise_euclidean(A[:3], A[:3])
assert d[0, 2] <= d[0, 1] + d[1, 2] + 1e-10, "Triangle inequality violated"
print("Challenge 1 passed!")

Challenge 2: Manhattan Distance

Problem: Compute the pairwise Manhattan (L1) distance matrix. Unlike Euclidean, there is no expanded form trick for L1, so you need to use broadcasting carefully. Also implement the Minkowski distance as a generalization: d_p(a, b) = (sum(|a_i - b_i|^p))^{1/p}.

import numpy as np

# ---- CHALLENGE ----
def pairwise_manhattan(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Pairwise Manhattan (L1) distance. Return shape (M, N).

    A is (M, D) and B is (N, D). There is no expanded-form trick for
    L1, so broadcasting over an (M, N, D) intermediate is expected.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

def pairwise_minkowski(A: np.ndarray, B: np.ndarray, p: float = 2) -> np.ndarray:
    """Pairwise Minkowski distance with parameter p. Return shape (M, N).

    d_p(a, b) = (sum(|a_i - b_i|^p))^(1/p); p=1 is Manhattan, p=2 is
    Euclidean, p=np.inf is Chebyshev.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def pairwise_manhattan(A, B):
    """Pairwise L1 distance: sum over coordinates of |a_i - b_i|.

    Unlike the Euclidean case there is no dot-product shortcut, so this
    broadcasts to a full (M, N, D) intermediate before reducing.
    """
    pair_diffs = np.abs(A[:, None, :] - B[None, :, :])   # (M, N, D)
    return pair_diffs.sum(axis=-1)                       # (M, N)

def pairwise_minkowski(A, B, p=2):
    """Pairwise Minkowski distance (sum |a-b|^p)^(1/p).

    p=1 is Manhattan, p=2 is Euclidean, and p=np.inf is the Chebyshev
    limit where the largest coordinate gap dominates.
    """
    abs_diff = np.abs(A[:, None, :] - B[None, :, :])   # (M, N, D)
    if p == np.inf:
        # As p -> infinity the p-norm converges to the max coordinate.
        return abs_diff.max(axis=-1)
    return np.power(np.sum(abs_diff ** p, axis=-1), 1.0 / p)

# ---- VERIFICATION ----
np.random.seed(42)
A = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
B = np.array([[1.0, 1.0], [2.0, 2.0]])

# Manhattan from (0,0) to (1,1) should be 2
dist_l1 = pairwise_manhattan(A, B)
assert dist_l1[0, 0] == 2.0, f"L1 from (0,0) to (1,1) should be 2, got {dist_l1[0, 0]}"
assert dist_l1.shape == (3, 2), f"Expected (3, 2), got {dist_l1.shape}"

# Minkowski with p=2 should match Euclidean
# NOTE: relies on pairwise_euclidean defined in Challenge 1 above.
A_large = np.random.randn(50, 10)
B_large = np.random.randn(30, 10)
dist_mink2 = pairwise_minkowski(A_large, B_large, p=2)
dist_euc = pairwise_euclidean(A_large, B_large)
assert np.allclose(dist_mink2, dist_euc, atol=1e-6), "p=2 Minkowski should match Euclidean"

# Chebyshev distance (p=inf): max absolute difference
dist_inf = pairwise_minkowski(A, B, p=np.inf)
assert dist_inf[0, 0] == 1.0, "Chebyshev from (0,0) to (1,1) should be 1"
print("Challenge 2 passed!")

Challenge 3: Cosine Distance and Angular Distance

Problem: Implement cosine distance (1 - cosine_similarity) and angular distance (arccos(cosine_similarity) / pi). Given a query vector and a database of embeddings, return the K nearest neighbors by each metric. Discuss when cosine vs. Euclidean distance is preferred.

import numpy as np

# ---- CHALLENGE ----
def cosine_knn(query: np.ndarray, database: np.ndarray, k: int):
    """Find k nearest neighbors by cosine distance.
    Return indices and distances.

    query is (D,), database is (N, D); cosine distance is
    1 - cosine_similarity, so smaller means more similar.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

def angular_distance(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Angular distance: arccos(cosine_sim) / pi. In [0, 1] range.

    A is (M, D), B is (N, D); returns the pairwise (M, N) matrix.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def cosine_knn(query, database, k):
    """K-nearest neighbors using cosine distance.

    Parameters
    ----------
    query : (D,) array.
    database : (N, D) array of candidate vectors.
    k : number of neighbors to return (clipped to N if larger).

    Returns
    -------
    (indices, distances) : arrays sorted nearest-first, where
    distance = 1 - cosine_similarity, in [0, 2].
    """
    # Normalize query (epsilon guards against a zero query vector).
    q_norm = query / (np.linalg.norm(query) + 1e-12)

    # Normalize all database vectors; map zero rows to zero, not NaN.
    db_norms = np.linalg.norm(database, axis=1, keepdims=True)
    db_norms = np.where(db_norms == 0, 1.0, db_norms)
    db_normalized = database / db_norms

    # Cosine similarity via dot product, then distance = 1 - similarity.
    similarities = db_normalized @ q_norm  # (N,)
    distances = 1.0 - similarities

    # Find top-k (smallest distances). argpartition requires kth < N,
    # so clamp k and fall back to the trivial case when k covers the
    # whole database (the original raised ValueError for k >= N).
    n = distances.shape[0]
    k = min(k, n)
    if k < n:
        # O(N) selection of the k smallest, unordered.
        top_k_idx = np.argpartition(distances, k)[:k]
    else:
        top_k_idx = np.arange(n)
    # Sort only the selected neighbors (k log k, not N log N).
    top_k_idx = top_k_idx[np.argsort(distances[top_k_idx])]

    return top_k_idx, distances[top_k_idx]

def angular_distance(A, B):
    """Pairwise angular distance arccos(cos_sim) / pi, normalized to [0, 1].

    A is (M, D), B is (N, D); returns (M, N). Zero vectors are treated
    as having similarity 0 to everything (distance 0.5) instead of
    producing NaN -- the same zero-norm guard used in cosine_knn.
    """
    # Normalize rows; guard zero norms so we never divide by zero.
    A_norms = np.linalg.norm(A, axis=1, keepdims=True)
    A_norm = A / np.where(A_norms == 0, 1.0, A_norms)
    B_norms = np.linalg.norm(B, axis=1, keepdims=True)
    B_norm = B / np.where(B_norms == 0, 1.0, B_norms)

    # Cosine similarity, clipped to the valid arccos domain against
    # floating-point drift slightly outside [-1, 1].
    cos_sim = A_norm @ B_norm.T  # (M, N)
    cos_sim = np.clip(cos_sim, -1.0, 1.0)

    # arccos maps [-1, 1] to an angle in [0, pi]; dividing by pi
    # normalizes the result into [0, 1].
    return np.arccos(cos_sim) / np.pi

# ---- VERIFICATION ----
np.random.seed(42)
query = np.random.randn(128)
database = np.random.randn(10000, 128)
# Make entry 7777 very similar to query
# (a small noise perturbation keeps it the clear nearest neighbor)
database[7777] = query + np.random.randn(128) * 0.01

idx, dists = cosine_knn(query, database, k=5)
assert 7777 in idx, "Entry 7777 should be in top 5"
assert idx[0] == 7777, "Entry 7777 should be the nearest"
assert np.all(dists >= 0), "Cosine distances should be non-negative"

# Angular distance properties
# (self-distance zero on the diagonal, and values bounded in [0, 1])
A = np.random.randn(5, 64)
ang = angular_distance(A, A)
assert np.allclose(np.diag(ang), 0, atol=1e-6), "Self angular distance should be 0"
assert np.all(ang >= -1e-6) and np.all(ang <= 1 + 1e-6), "Angular distance should be in [0, 1]"
print("Challenge 3 passed!")

Challenge 4: Efficient Pairwise Distance Matrix

Problem: Implement a function that computes the pairwise distance matrix within a single set of points (self-distances). Since the result is symmetric with zeros on the diagonal, optimize by computing only the upper triangle. This is used in hierarchical clustering and t-SNE.

import numpy as np

# ---- CHALLENGE ----
def pdist_squared(X: np.ndarray) -> np.ndarray:
    """Compute pairwise squared Euclidean distance matrix for X (N, D).
    Return symmetric (N, N) matrix with zeros on diagonal.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

def gaussian_kernel_matrix(X: np.ndarray, sigma: float = 1.0) -> np.ndarray:
    """Compute Gaussian (RBF) kernel matrix: K[i,j] = exp(-||x_i - x_j||^2 / (2*sigma^2)).

    X is (N, D); sigma is the kernel bandwidth. Returns (N, N).
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def pdist_squared(X):
    """All-pairs squared Euclidean distances for the rows of X.

    Expands ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>, so
    the heavy lifting is one (N, N) Gram-matrix multiply. Returns a
    symmetric matrix with an exactly-zero diagonal.
    """
    sq_norms = (X * X).sum(axis=1)    # ||x_i||^2, shape (N,)
    gram = X @ X.T                    # inner products, shape (N, N)

    # (N, 1) + (1, N) - (N, N) broadcasts to the squared distances.
    result = sq_norms[:, None] + sq_norms[None, :] - 2 * gram

    # Cancellation may leave tiny negatives; clamp, then pin the
    # diagonal to exact zeros.
    result = np.maximum(result, 0.0)
    np.fill_diagonal(result, 0.0)
    return result

def gaussian_kernel_matrix(X, sigma=1.0):
    """Gaussian (RBF) kernel matrix: K[i,j] = exp(-||x_i - x_j||^2 / (2*sigma^2)).

    Computes the squared distances inline via the expanded form so the
    function is self-contained; diagonal entries are exactly 1.
    """
    sq_norms = np.sum(X ** 2, axis=1)                       # (N,)
    d2 = sq_norms[:, None] + sq_norms[None, :] - 2 * (X @ X.T)
    d2 = np.maximum(d2, 0.0)     # clamp float-cancellation negatives
    np.fill_diagonal(d2, 0.0)    # force exact zero self-distance
    return np.exp(-d2 / (2 * sigma ** 2))

# ---- VERIFICATION ----
# Checks: symmetry, zero diagonal, non-negativity of the distance
# matrix, and the corresponding kernel-matrix properties.
np.random.seed(42)
X = np.random.randn(100, 10)

D = pdist_squared(X)
assert D.shape == (100, 100), f"Expected (100, 100), got {D.shape}"
assert np.allclose(D, D.T, atol=1e-10), "Distance matrix should be symmetric"
assert np.allclose(np.diag(D), 0), "Diagonal should be zero"
assert np.all(D >= -1e-10), "Distances should be non-negative"

K = gaussian_kernel_matrix(X, sigma=1.0)
assert K.shape == (100, 100), f"Expected (100, 100), got {K.shape}"
assert np.allclose(np.diag(K), 1.0), "Self-kernel should be 1"
assert np.all(K >= 0) and np.all(K <= 1), "Kernel values should be in [0, 1]"
assert np.allclose(K, K.T), "Kernel matrix should be symmetric"
print("Challenge 4 passed!")

Challenge 5: K-Nearest Neighbor Prediction

Problem: Implement a complete KNN classifier using only NumPy. Given training data X_train (N, D), labels y_train (N,), and test data X_test (M, D), predict the label for each test point using majority vote of the K nearest training points. Support both uniform and distance-weighted voting.

import numpy as np

# ---- CHALLENGE ----
def knn_predict(X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray,
                k: int = 5, weighted: bool = False) -> np.ndarray:
    """KNN classification. Return predicted labels for X_test.

    X_train is (N, D), y_train is (N,) integer labels, X_test is (M, D).
    If weighted is True, use inverse-distance weighted voting instead of
    a uniform majority vote.
    """
    # Challenge stub -- intentionally left unimplemented.
    pass

# ---- SOLUTION ----
def knn_predict(X_train, y_train, X_test, k=5, weighted=False):
    """Full KNN classifier in pure NumPy -- no Python loops over data.

    Parameters
    ----------
    X_train : (N, D) training points.
    y_train : (N,) non-negative integer class labels.
    X_test : (M, D) query points.
    k : number of neighbors (clipped to N if larger).
    weighted : if True, votes are weighted by inverse distance.

    Returns
    -------
    (M,) array of predicted integer labels.
    """
    # Step 1: Compute pairwise squared distances (M, N) using the
    # expanded Euclidean form ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b.
    test_sq = np.sum(X_test ** 2, axis=1, keepdims=True)    # (M, 1)
    train_sq = np.sum(X_train ** 2, axis=1, keepdims=True)  # (N, 1)
    cross = X_test @ X_train.T                               # (M, N)
    dist_sq = test_sq + train_sq.T - 2 * cross               # (M, N)
    dist_sq = np.maximum(dist_sq, 0.0)

    # Step 2: Find k nearest neighbors for each test point.
    # argpartition is O(N) per row, vs O(N log N) for argsort, but it
    # requires kth < N -- clamp so k >= N degrades gracefully instead
    # of raising ValueError.
    n_train = X_train.shape[0]
    k = min(k, n_train)
    kth = min(k, n_train - 1)
    knn_indices = np.argpartition(dist_sq, kth, axis=1)[:, :k]  # (M, k)

    # Step 3: Get labels and per-neighbor vote weights.
    knn_labels = y_train[knn_indices]                          # (M, k)
    if weighted:
        # Distance-weighted voting
        knn_distances = np.sqrt(
            np.take_along_axis(dist_sq, knn_indices, axis=1)   # (M, k)
        )
        # Weights: inverse distance (epsilon avoids division by zero)
        weights = 1.0 / (knn_distances + 1e-12)                # (M, k)
    else:
        weights = np.ones_like(knn_labels, dtype=float)

    # Step 4: Weighted majority vote, fully vectorized. Scatter-add
    # each neighbor's weight into a (M, num_classes) vote table;
    # np.add.at is the unbuffered form that accumulates duplicates.
    num_classes = int(y_train.max()) + 1
    M = X_test.shape[0]
    votes = np.zeros((M, num_classes))
    np.add.at(votes, (np.arange(M)[:, None], knn_labels), weights)

    # Ties break toward the lowest class index, as with argmax before.
    return votes.argmax(axis=1)

# ---- VERIFICATION ----
np.random.seed(42)
# Create simple 2-class dataset
# (two well-separated Gaussian clusters, 100 points each)
N_train = 200
X_train = np.vstack([
    np.random.randn(N_train // 2, 2) + [2, 2],   # class 0: centered at (2, 2)
    np.random.randn(N_train // 2, 2) + [-2, -2],  # class 1: centered at (-2, -2)
])
y_train = np.array([0] * (N_train // 2) + [1] * (N_train // 2))

# Test points near each cluster center
# (the (0, 0) point is deliberately ambiguous; only checked implicitly)
X_test = np.array([[2.0, 2.0], [-2.0, -2.0], [0.0, 0.0], [3.0, 3.0]])

# Uniform voting
preds = knn_predict(X_train, y_train, X_test, k=5)
assert preds[0] == 0, "Point near (2,2) should be class 0"
assert preds[1] == 1, "Point near (-2,-2) should be class 1"
assert preds[3] == 0, "Point near (3,3) should be class 0"

# Distance-weighted voting
preds_w = knn_predict(X_train, y_train, X_test, k=5, weighted=True)
assert preds_w[0] == 0 and preds_w[1] == 1, "Weighted predictions should also work"

# Accuracy on larger test set
# (clusters are ~4 sigma apart, so >90% accuracy is comfortably expected)
X_test_large = np.vstack([
    np.random.randn(100, 2) + [2, 2],
    np.random.randn(100, 2) + [-2, -2],
])
y_test_large = np.array([0] * 100 + [1] * 100)
preds_large = knn_predict(X_train, y_train, X_test_large, k=5)
accuracy = (preds_large == y_test_large).mean()
assert accuracy > 0.9, f"Accuracy should be > 90%, got {accuracy:.2%}"
print(f"KNN accuracy: {accuracy:.2%}")
print("Challenge 5 passed!")

Key Takeaways

💡
  • Use the expanded form ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b for memory-efficient pairwise Euclidean distance
  • Cosine distance is preferred when magnitude does not matter (text embeddings, normalized features)
  • Euclidean distance is preferred for spatial data and when magnitude carries information
  • np.argpartition finds the k smallest/largest in O(n) vs O(n log n) for full sort
  • The Gaussian kernel exp(-d^2 / (2*sigma^2)) converts distances to similarities for kernel methods
  • Distance-weighted KNN gives closer neighbors more influence and often outperforms uniform voting