Intermediate
Implement Evaluation Metrics
Interviewers frequently ask you to implement metrics from scratch to verify you understand what they actually measure — not just how to call sklearn functions. Master precision, recall, F1, AUC-ROC, confusion matrix, and cross-validation.
Interview Question #1: “Implement a confusion matrix, then compute precision, recall, F1-score, and accuracy from it. Handle the edge case where a class has no predictions.”
Confusion Matrix & Core Metrics
import numpy as np
def confusion_matrix(y_true, y_pred, n_classes=None):
    """
    Build a confusion matrix from scratch.

    Args:
        y_true: iterable of non-negative integer true labels.
        y_pred: iterable of non-negative integer predicted labels.
        n_classes: number of classes; inferred from the data when None.

    Returns:
        np.ndarray of shape (n_classes, n_classes) where entry (i, j) is the
        count of samples with true label i that were predicted as label j.

    Raises:
        ValueError: if y_true and y_pred have different lengths.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same length")
    if n_classes is None:
        # Infer the label space from the data. Guarding on size avoids the
        # max()-of-empty-sequence crash for empty inputs (yields a 0x0 matrix).
        n_classes = int(max(y_true.max(), y_pred.max())) + 1 if y_true.size else 0
    matrix = np.zeros((n_classes, n_classes), dtype=int)
    # Vectorized accumulation: equivalent to matrix[t][p] += 1 per sample,
    # but runs at C speed instead of a Python-level loop.
    np.add.at(matrix, (y_true, y_pred), 1)
    return matrix
def binary_metrics(y_true, y_pred):
    """
    Compute precision, recall, F1, and accuracy for binary classification.

    Edge cases (no positive predictions, no positive labels) return 0.0
    for the affected metric rather than dividing by zero.
    """
    # Tally all four confusion-matrix cells in a single pass over the data.
    tp = fp = fn = tn = 0
    for truth, guess in zip(y_true, y_pred):
        if truth == 1 and guess == 1:
            tp += 1
        elif truth == 0 and guess == 1:
            fp += 1
        elif truth == 1 and guess == 0:
            fn += 1
        elif truth == 0 and guess == 0:
            tn += 1

    predicted_pos = tp + fp   # denominator for precision
    actual_pos = tp + fn      # denominator for recall
    total = tp + fp + fn + tn

    precision = tp / predicted_pos if predicted_pos > 0 else 0.0
    recall = tp / actual_pos if actual_pos > 0 else 0.0
    # F1 is the harmonic mean of precision and recall.
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom > 0 else 0.0
    accuracy = (tp + tn) / total if total > 0 else 0.0

    return {
        'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy
    }
# ---- Test ----
labels = [1, 1, 1, 0, 0, 0, 1, 0, 1, 0]
preds = [1, 0, 1, 0, 0, 1, 1, 0, 0, 0]

conf = confusion_matrix(labels, preds, n_classes=2)
print("Confusion Matrix:")
print(" Pred 0 Pred 1")
for row in (0, 1):
    print(f"Actual {row}: {conf[row, 0]} {conf[row, 1]}")

stats = binary_metrics(labels, preds)
print(f"\nTP={stats['tp']}, FP={stats['fp']}, FN={stats['fn']}, TN={stats['tn']}")
# Expected: P = 3/4 = 0.75, R = 3/5 = 0.60, F1 = 0.6667, Acc = 7/10 = 0.70
for title, key in [("Precision", "precision"), ("Recall", "recall"),
                   ("F1 Score", "f1"), ("Accuracy", "accuracy")]:
    print(f"{title}: {stats[key]:.4f}")
What interviewers look for:
- Clear definitions: precision = TP/(TP+FP), recall = TP/(TP+FN)
- Edge case handling: what if no positive predictions? (precision undefined)
- Understanding when accuracy is misleading (imbalanced datasets)
- Knowing that F1 is the harmonic mean (not arithmetic mean) — and why
Common follow-up: “When would you optimize for precision vs recall?” Answer: Precision when false positives are costly (spam filter, content moderation). Recall when false negatives are costly (cancer detection, fraud detection). F1 when you need to balance both.
Interview Question #2: “Implement AUC-ROC from scratch. Explain what the ROC curve represents and why AUC is threshold-independent.”
AUC-ROC: Complete Implementation
def roc_curve(y_true, y_scores):
    """
    Compute the ROC curve from scratch.

    Args:
        y_true: binary labels (0 or 1)
        y_scores: predicted probabilities for class 1

    Returns:
        fpr_list: false positive rates, starting at 0.0 and ending at 1.0
        tpr_list: true positive rates, starting at 0.0 and ending at 1.0
        thresholds: unique score thresholds in descending order

    The curve sweeps the decision threshold from high to low; at each
    threshold a sample is predicted positive when score >= threshold.
    """
    # Convert ONCE, outside the loop. The original rebuilt
    # np.array(y_true)/np.array(y_scores) on every iteration, which made
    # the sweep O(n^2) in conversions alone.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # Unique thresholds, strictest (highest) first.
    thresholds = np.sort(np.unique(y_scores))[::-1]

    total_positives = np.sum(y_true)
    total_negatives = len(y_true) - total_positives

    # Start at (0, 0): threshold above every score, nothing predicted positive.
    fpr_list = [0.0]
    tpr_list = [0.0]
    for threshold in thresholds:
        # Predict positive if score >= threshold
        y_pred = (y_scores >= threshold).astype(int)
        tp = np.sum((y_pred == 1) & (y_true == 1))
        fp = np.sum((y_pred == 1) & (y_true == 0))
        tpr = tp / total_positives if total_positives > 0 else 0
        fpr = fp / total_negatives if total_negatives > 0 else 0
        fpr_list.append(fpr)
        tpr_list.append(tpr)
    # Close the curve at (1, 1) — everything predicted positive. This may
    # duplicate the lowest-threshold point, which is harmless for the
    # trapezoidal AUC (zero-width segment).
    fpr_list.append(1.0)
    tpr_list.append(1.0)
    return np.array(fpr_list), np.array(tpr_list), thresholds
def auc_score(fpr, tpr):
    """
    Area under the ROC curve via the trapezoidal rule.

    Points are ordered by FPR first so the integration runs left to right.
    """
    order = np.argsort(fpr)
    xs = fpr[order]
    ys = tpr[order]
    # Each trapezoid contributes width * average height; summing them
    # left to right integrates the curve numerically.
    return sum((xs[i] - xs[i - 1]) * (ys[i] + ys[i - 1]) / 2
               for i in range(1, len(xs)))
# ---- Test ----
true_labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
model_scores = [0.1, 0.2, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.95]

curve_fpr, curve_tpr, _ = roc_curve(true_labels, model_scores)
# Scores perfectly separate the classes, so AUC should be 1.0.
print(f"AUC-ROC: {auc_score(curve_fpr, curve_tpr):.4f}")

# Sanity check: uninformative scores should hover around 0.5 in expectation.
noise_scores = np.random.rand(10)
fpr_n, tpr_n, _ = roc_curve(true_labels, noise_scores)
print(f"Random AUC: {auc_score(fpr_n, tpr_n):.4f}")
What interviewers look for:
- Understanding that ROC plots TPR (sensitivity) vs FPR (1 - specificity) at every threshold
- AUC = 0.5 means the ranking is no better than random chance; AUC = 1.0 means the model ranks every positive above every negative (perfect separation)
- AUC is threshold-independent because it evaluates the model across all thresholds
- Using the trapezoidal rule for numerical integration
Interview Question #3: “Implement k-fold cross-validation from scratch. Explain why it is better than a single train/test split and when stratified cross-validation is necessary.”
K-Fold Cross-Validation
def k_fold_split(X, y, k=5, shuffle=True, random_state=42):
    """
    Generate k-fold cross-validation splits from scratch.

    Yields:
        (X_train, y_train, X_val, y_val) for each of the k folds.
    """
    sample_ids = np.arange(len(X))
    if shuffle:
        np.random.RandomState(random_state).shuffle(sample_ids)
    # np.array_split hands one extra sample to each of the first
    # len(X) % k folds, i.e. the standard remainder distribution.
    folds = np.array_split(sample_ids, k)
    for held_out in range(k):
        val_idx = folds[held_out]
        train_idx = np.concatenate(
            [part for j, part in enumerate(folds) if j != held_out]
        )
        yield X[train_idx], y[train_idx], X[val_idx], y[val_idx]
def cross_validate(model_class, model_params, X, y, k=5, metric='accuracy'):
    """
    Run k-fold cross-validation and return the per-fold scores.

    Args:
        model_class: estimator class exposing fit(X, y) and predict(X).
        model_params: kwargs used to build a fresh model for each fold.
        X, y: full dataset as numpy arrays (indexable by integer arrays).
        k: number of folds.
        metric: 'accuracy' (higher is better) or 'mse' (lower is better).

    Returns:
        list of k per-fold scores.

    Raises:
        ValueError: for an unsupported metric name. (The original left
        `score` unbound in that case and crashed later with an opaque
        UnboundLocalError.)
    """
    if metric not in ('accuracy', 'mse'):
        raise ValueError(f"Unsupported metric: {metric!r}")
    scores = []
    for fold, (X_train, y_train, X_val, y_val) in enumerate(
        k_fold_split(X, y, k=k)
    ):
        # A fresh model per fold prevents state leaking between folds.
        model = model_class(**model_params)
        model.fit(X_train, y_train)
        # Evaluate on the held-out fold.
        y_pred = model.predict(X_val)
        if metric == 'accuracy':
            score = np.mean(y_pred == y_val)
        else:  # metric == 'mse', guaranteed by the check above
            score = np.mean((y_pred - y_val) ** 2)
        scores.append(score)
        print(f"Fold {fold+1}: {metric} = {score:.4f}")
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"\nMean {metric}: {mean_score:.4f} (+/- {std_score:.4f})")
    return scores
def stratified_k_fold_split(X, y, k=5, random_state=42):
    """
    Stratified k-fold: each fold preserves the overall class distribution.

    Essential for imbalanced datasets, where plain k-fold can produce
    validation folds with few (or zero) minority-class samples.
    """
    rng = np.random.RandomState(random_state)
    fold_members = [[] for _ in range(k)]
    # Shuffle each class's indices independently, then deal them out so
    # every fold receives its proportional share of that class
    # (np.array_split gives the first len % k folds one extra sample).
    for label in np.unique(y):
        member_idx = np.where(y == label)[0]
        rng.shuffle(member_idx)
        for fold_id, chunk in enumerate(np.array_split(member_idx, k)):
            fold_members[fold_id].extend(chunk)
    for held_out in range(k):
        val_idx = np.array(fold_members[held_out])
        train_idx = np.concatenate([np.array(fold_members[j])
                                    for j in range(k) if j != held_out])
        yield X[train_idx], y[train_idx], X[val_idx], y[val_idx]
# ---- Test stratified split ----
# Imbalanced toy data: 90% class 0, 10% class 1.
skewed_labels = np.array([0] * 90 + [1] * 10)
dummy_features = np.random.randn(100, 2)

print("Stratified K-Fold class distributions:")
split_iter = stratified_k_fold_split(dummy_features, skewed_labels, k=5)
for fold_no, (_, train_y, _, val_y) in enumerate(split_iter, start=1):
    print(f"Fold {fold_no}: train positive rate = {np.mean(train_y == 1):.2%}, "
          f"val positive rate = {np.mean(val_y == 1):.2%}")
# Every fold should sit near the overall 10% positive rate.
Interview tip: Always mention stratified K-fold when dealing with imbalanced datasets. Regular K-fold might create folds with no positive samples, leading to unreliable evaluation. Also, K-fold is better than a single split because it uses all data for both training and validation, giving a more reliable performance estimate with variance information.
Interview Question #4: “Implement multi-class precision, recall, and F1 using both macro and weighted averaging. Explain the difference.”
Multi-Class Metrics
def multiclass_metrics(y_true, y_pred, average='macro'):
    """
    Compute precision, recall, F1 for multi-class classification,
    treating each class one-vs-rest.

    Args:
        y_true, y_pred: array-likes of class labels.
        average:
            'macro'     - unweighted mean across classes (all classes equal)
            'weighted'  - mean weighted by class support (accounts for imbalance)
            'per_class' - list of per-class metric dicts

    Returns:
        dict of averaged metrics, or a list of per-class dicts for 'per_class'.

    Raises:
        ValueError: for an unrecognized `average`. (The original silently
        returned None.)
    """
    if average not in ('macro', 'weighted', 'per_class'):
        raise ValueError(f"Unknown average: {average!r}")
    # asarray so plain Python lists work too: `some_list == scalar` is a
    # single bool, not an elementwise mask, which silently zeroed every
    # count in the original when lists were passed.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    classes = np.unique(np.concatenate([y_true, y_pred]))
    per_class = []
    for cls in classes:
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))
        support = np.sum(y_true == cls)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall / (precision + recall)
              if (precision + recall) > 0 else 0.0)
        per_class.append({
            'class': cls,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': support
        })
    if average == 'per_class':
        return per_class
    if average == 'macro':
        # Simple unweighted average: rare and common classes count equally.
        return {
            'precision': np.mean([m['precision'] for m in per_class]),
            'recall': np.mean([m['recall'] for m in per_class]),
            'f1': np.mean([m['f1'] for m in per_class]),
        }
    # 'weighted': each class contributes in proportion to its true count.
    total = sum(m['support'] for m in per_class)
    return {
        'precision': sum(m['precision'] * m['support']
                         for m in per_class) / total,
        'recall': sum(m['recall'] * m['support']
                      for m in per_class) / total,
        'f1': sum(m['f1'] * m['support']
                  for m in per_class) / total,
    }
# ---- Test ----
actual = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])
guessed = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 0])

print("Per-class metrics:")
for row in multiclass_metrics(actual, guessed, average='per_class'):
    print(f" Class {row['class']}: P={row['precision']:.3f}, "
          f"R={row['recall']:.3f}, F1={row['f1']:.3f}, "
          f"Support={row['support']}")

# Compare the two averaging schemes side by side.
for heading, avg_kind in [("\nMacro avg:", "macro"), ("Weighted avg:", "weighted")]:
    summary = multiclass_metrics(actual, guessed, average=avg_kind)
    print(f"{heading} P={summary['precision']:.3f}, "
          f"R={summary['recall']:.3f}, F1={summary['f1']:.3f}")
Macro vs Weighted: Use macro when all classes are equally important regardless of size. Use weighted when you want the metric to reflect the overall performance proportional to class frequency.
Lilly Tech Systems