Intermediate
Implement K-Means & KNN
Two classic algorithms that test your understanding of distance metrics, iterative optimization, and instance-based learning. Both are frequently asked in ML coding interviews because they are simple enough to implement in 30 minutes but reveal deep understanding through follow-up questions.
Distance Metrics You Must Know
Both K-Means and KNN rely on distance computations. Interviewers often ask you to implement or discuss these:
import numpy as np
def euclidean_distance(a, b):
    """L2 distance - most common, default for both K-Means and KNN."""
    diff = a - b
    # sqrt of the dot product of the difference with itself == L2 norm.
    return np.sqrt(diff @ diff)
def manhattan_distance(a, b):
    """L1 distance - robust to outliers, useful for high-dimensional data."""
    # Sum of absolute coordinate-wise differences.
    return np.abs(a - b).sum()
def cosine_distance(a, b):
    """1 - cosine_similarity. Used for text/embeddings, ignores magnitude."""
    # Epsilon in the denominator guards against zero-norm vectors.
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-15
    similarity = np.dot(a, b) / denom
    return 1 - similarity
# Sanity-check the three metrics on a small example.
a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
# Expected: Euclidean 5.1962, Manhattan 9.0, Cosine 0.0254.
for label, metric in (("Euclidean", euclidean_distance),
                      ("Manhattan", manhattan_distance),
                      ("Cosine", cosine_distance)):
    print(f"{label}: {metric(a, b):.4f}")
Interview tip: When asked “which distance metric would you use?”, always ask about the data. Euclidean for low-dimensional numeric data. Manhattan for sparse or high-dimensional data. Cosine for text embeddings where magnitude does not matter.
Interview Question #1: “Implement K-Means clustering from scratch. Support K-Means++ initialization. Include a method to compute the inertia (within-cluster sum of squares).”
K-Means: Complete Solution
import numpy as np
class KMeans:
    """
    K-Means clustering with random or K-Means++ initialization.

    Attributes set by fit():
        centroids : (k, n_features) array of cluster centers.
        labels    : (n_samples,) array of cluster assignments.
        inertia_  : within-cluster sum of squares (the objective K-Means
                    minimizes).
    """

    def __init__(self, n_clusters=3, max_iterations=100, tol=1e-4,
                 init='kmeans++'):
        self.k = n_clusters
        self.max_iter = max_iterations
        self.tol = tol      # convergence threshold on total squared centroid shift
        self.init = init    # 'kmeans++' or 'random'
        self.centroids = None
        self.labels = None
        self.inertia_ = None

    def _init_centroids(self, X):
        """Pick k starting centroids from the rows of X."""
        n_samples = X.shape[0]
        if self.init == 'random':
            indices = np.random.choice(n_samples, self.k, replace=False)
            return X[indices].copy()
        # K-Means++: first centroid uniform at random; each subsequent one is
        # drawn with probability proportional to its squared distance from the
        # nearest centroid chosen so far (spreads the seeds apart).
        centroids = [X[np.random.randint(n_samples)]]
        # closest_sq[i] = squared distance from X[i] to its nearest centroid.
        # Updated incrementally with only the newest centroid, so the whole
        # init is vectorized instead of a per-point Python loop.
        closest_sq = np.full(n_samples, np.inf)
        for _ in range(1, self.k):
            dist_to_new = np.sum((X - centroids[-1]) ** 2, axis=1)
            closest_sq = np.minimum(closest_sq, dist_to_new)
            total = closest_sq.sum()
            if total == 0:
                # Every point coincides with an existing centroid; the
                # weighted draw would divide by zero, so pick uniformly.
                next_idx = np.random.randint(n_samples)
            else:
                next_idx = np.random.choice(n_samples, p=closest_sq / total)
            centroids.append(X[next_idx])
        return np.array(centroids)

    def _assign_clusters(self, X):
        """Return the index of the nearest centroid for every row of X."""
        # (k, n_samples) matrix of point-to-centroid squared distances.
        # Squared distances give the same argmin, so the sqrt is skipped.
        distances = np.array([
            np.sum((X - centroid) ** 2, axis=1)
            for centroid in self.centroids
        ])
        return np.argmin(distances, axis=0)

    def _update_centroids(self, X, labels):
        """Recompute each centroid as the mean of its assigned points."""
        return np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i)
            else self.centroids[i]  # keep the old centroid if the cluster emptied
            for i in range(self.k)
        ])

    def fit(self, X):
        """Run Lloyd's algorithm until convergence or max_iter. Returns self."""
        self.centroids = self._init_centroids(X)
        for _ in range(self.max_iter):
            # Step 1: assign every point to its nearest centroid.
            self.labels = self._assign_clusters(X)
            # Step 2: move each centroid to the mean of its points.
            new_centroids = self._update_centroids(X, self.labels)
            # Converged when the total squared centroid movement is tiny.
            shift = np.sum((new_centroids - self.centroids) ** 2)
            self.centroids = new_centroids
            if shift < self.tol:
                break
        # Refresh labels against the FINAL centroids: inside the loop they
        # were computed before the last centroid update, so they (and the
        # inertia derived from them) could be stale otherwise.
        self.labels = self._assign_clusters(X)
        self.inertia_ = sum(
            np.sum((X[self.labels == i] - self.centroids[i]) ** 2)
            for i in range(self.k)
        )
        return self

    def predict(self, X):
        """Assign each row of X to the nearest learned centroid."""
        return self._assign_clusters(X)
# ---- Test with clearly separated clusters ----
np.random.seed(42)
# Synthetic data: three Gaussian blobs of 50 points each, centered
# at (0,0), (5,5) and (10,0).
blob_centers = ([0, 0], [5, 5], [10, 0])
X = np.vstack([np.random.randn(50, 2) + center for center in blob_centers])
kmeans = KMeans(n_clusters=3, init='kmeans++')
kmeans.fit(X)
print(f"Centroids:\n{kmeans.centroids}")
print(f"Inertia: {kmeans.inertia_:.2f}")
print(f"Labels (first 10): {kmeans.labels[:10]}")  # Should all be same cluster
What interviewers look for:
- K-Means++ initialization — choosing initial centroids proportional to squared distance. This is the most common follow-up question.
- Empty cluster handling — what happens if a cluster gets no points? Your code should not crash.
- Convergence check — stopping when centroids stop moving rather than always running max iterations.
- Inertia computation — understanding the objective function K-Means minimizes.
Follow-up question: “Why does K-Means sometimes converge to a bad solution?” Answer: K-Means is sensitive to initialization and converges to local optima. That is why K-Means++ was invented and why in practice we run K-Means multiple times with different seeds and keep the result with the lowest inertia.
Interview Question #2: “Implement a K-Nearest Neighbors classifier from scratch. Support both uniform and distance-weighted voting. Explain the time complexity of prediction.”
KNN Classifier: Complete Solution
import numpy as np
from collections import Counter
class KNNClassifier:
    """
    K-Nearest Neighbors classifier from scratch.

    A lazy learner: fit() only stores the data and all work happens at
    prediction time.  Supports uniform and inverse-distance-weighted voting.
    """

    def __init__(self, k=5, weights='uniform'):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.weights = weights  # 'uniform' or 'distance'
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        Store the training data; no actual training happens. Returns self.

        np.array(...) makes an owned copy (defends against caller mutation)
        and also accepts plain Python lists, not just ndarrays.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        return self

    def _compute_distances(self, x):
        """Euclidean distances from a single sample x to all training points."""
        return np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

    def _predict_single(self, x):
        """Predict the class for a single sample."""
        distances = self._compute_distances(x)
        # argsort yields nearest-first order; slicing past the end is safe
        # when k exceeds the number of training samples.
        k_indices = np.argsort(distances)[:self.k]
        k_labels = self.y_train[k_indices]
        if self.weights == 'uniform':
            # Simple majority vote; on a tie the label inserted first
            # (i.e. the nearest) wins because Counter keeps insertion order.
            return Counter(k_labels).most_common(1)[0][0]
        # Distance-weighted voting: closer neighbors count more.
        # Epsilon avoids division by zero when a query point coincides
        # with a training point.
        inv_weights = 1.0 / (distances[k_indices] + 1e-10)
        class_weights = {}
        for label, weight in zip(k_labels, inv_weights):
            class_weights[label] = class_weights.get(label, 0) + weight
        return max(class_weights, key=class_weights.get)

    def predict(self, X):
        """Predict a class label for every row of X."""
        return np.array([self._predict_single(x) for x in X])

    def accuracy(self, X, y):
        """Fraction of samples in X whose prediction matches y."""
        return np.mean(self.predict(X) == y)
# ---- Test ----
np.random.seed(42)
# Three Gaussian blobs of 30 points each, labelled 0 / 1 / 2.
X_train = np.vstack([
    np.random.randn(30, 2) + [0, 0],  # Class 0
    np.random.randn(30, 2) + [3, 3],  # Class 1
    np.random.randn(30, 2) + [6, 0],  # Class 2
])
y_train = np.repeat([0, 1, 2], 30)
X_test = np.array([[0.5, 0.5], [3.5, 3.5], [5.5, 0.5], [1.5, 1.5]])
y_test = np.array([0, 1, 2, 0])
# Uniform (majority) voting.
knn_uniform = KNNClassifier(k=5, weights='uniform').fit(X_train, y_train)
print(f"Uniform predictions: {knn_uniform.predict(X_test)}")
print(f"Expected: {y_test}")
# Distance-weighted voting.
knn_weighted = KNNClassifier(k=5, weights='distance').fit(X_train, y_train)
print(f"Weighted predictions: {knn_weighted.predict(X_test)}")
print(f"Training accuracy (k=5): {knn_uniform.accuracy(X_train, y_train):.2%}")
What interviewers look for:
- Understanding that KNN is a lazy learner — no training phase, all computation at prediction time
- Time complexity: O(n*d) per prediction to compute distances (n = training samples, d = features), plus O(n log n) to sort them as the implementation above does with argsort (np.argpartition would reduce this to O(n)). For the full test set of m samples: O(m*n*d)
- Distance-weighted voting — closer neighbors should have more influence
- Knowing when KNN fails: high dimensions (curse of dimensionality), large datasets (slow prediction), imbalanced classes
Interview Question #3: “How would you choose the optimal K for KNN? Implement the elbow method for K-Means.”
Choosing Hyperparameters
Elbow Method for K-Means
def elbow_method(X, k_range=range(1, 11)):
    """
    Fit K-Means at each candidate K and report the resulting inertias.

    Plot the returned (ks, inertias) pairs: the "elbow" of the curve -
    where inertia stops dropping sharply - suggests a good K.
    """
    ks = list(k_range)
    inertias = []
    for k in ks:
        model = KMeans(n_clusters=k, init='kmeans++').fit(X)
        inertias.append(model.inertia_)
        print(f"K={k}: inertia={model.inertia_:.2f}")
    return ks, inertias
# Sweep K over 1..10 on the 3-cluster dataset built above.
elbow_ks, elbow_inertias = elbow_method(X)
# The optimal K sits at the "elbow", where inertia stops decreasing rapidly.
# With three true clusters, the bend should appear at K=3.
Cross-Validation for KNN's K
def knn_cross_validate(X, y, k_range=range(1, 21), n_folds=5):
    """
    K-fold cross-validation to find the optimal K for KNN.

    Every sample is validated exactly once: when len(X) is not divisible by
    n_folds, the first (len(X) % n_folds) folds get one extra sample instead
    of the tail being silently dropped.

    Args:
        X, y: training features and labels (assumed already shuffled).
        k_range: candidate values of K to evaluate.
        n_folds: number of cross-validation folds.

    Returns:
        dict mapping each K to its mean validation accuracy.

    Raises:
        ValueError: if n_folds is not between 2 and len(X).
    """
    n = len(X)
    if not 2 <= n_folds <= n:
        raise ValueError(f"n_folds must be between 2 and {n}, got {n_folds}")
    # Fold boundaries, spreading the remainder over the leading folds so no
    # fold is ever empty and no sample is skipped.
    base, extra = divmod(n, n_folds)
    bounds = [0]
    for fold in range(n_folds):
        bounds.append(bounds[-1] + base + (1 if fold < extra else 0))
    results = {}
    for k in k_range:
        fold_accuracies = []
        for fold in range(n_folds):
            # Split into validation fold and remaining training data.
            val_start, val_end = bounds[fold], bounds[fold + 1]
            X_val, y_val = X[val_start:val_end], y[val_start:val_end]
            X_tr = np.vstack([X[:val_start], X[val_end:]])
            y_tr = np.concatenate([y[:val_start], y[val_end:]])
            # Train and evaluate on this fold.
            knn = KNNClassifier(k=k, weights='uniform').fit(X_tr, y_tr)
            fold_accuracies.append(knn.accuracy(X_val, y_val))
        mean_acc = np.mean(fold_accuracies)
        results[k] = mean_acc
        print(f"K={k}: mean accuracy = {mean_acc:.2%}")
    best_k = max(results, key=results.get)
    print(f"\nBest K: {best_k} (accuracy: {results[best_k]:.2%})")
    return results
# Shuffle first so each fold mixes all three classes, then sweep K=1..15.
shuffle_idx = np.random.permutation(len(X_train))
results = knn_cross_validate(X_train[shuffle_idx], y_train[shuffle_idx],
                             k_range=range(1, 16))
Interview tip: Always mention that K should be odd for binary classification to avoid ties. For multi-class, use distance-weighted voting to break ties. Also note that very small K (K=1) overfits to noise, while very large K underfits by over-smoothing decision boundaries.
Lilly Tech Systems