Intermediate

Implement Evaluation Metrics

Interviewers frequently ask you to implement metrics from scratch to verify you understand what they actually measure — not just how to call sklearn functions. Master precision, recall, F1, AUC-ROC, confusion matrix, and cross-validation.

📝
Interview Question #1: “Implement a confusion matrix, then compute precision, recall, F1-score, and accuracy from it. Handle the edge case where a class has no predictions.”

Confusion Matrix & Core Metrics

import numpy as np

def confusion_matrix(y_true, y_pred, n_classes=None):
    """
    Construct a confusion matrix without relying on sklearn.

    Args:
        y_true: iterable of integer true labels
        y_pred: iterable of integer predicted labels
        n_classes: number of classes; inferred from the labels when None

    Returns:
        np.ndarray of shape (n_classes, n_classes) where entry (i, j)
        counts samples whose true label is i and predicted label is j.
    """
    # Infer the class count from the largest label seen in either vector.
    if n_classes is None:
        n_classes = max(max(y_true), max(y_pred)) + 1

    counts = np.zeros((n_classes, n_classes), dtype=int)
    for actual, predicted in zip(y_true, y_pred):
        counts[actual, predicted] += 1

    return counts


def binary_metrics(y_true, y_pred):
    """
    Precision, recall, F1, and accuracy for binary classification.

    All four confusion counts are tallied in a single pass. Degenerate
    denominators (e.g. no positive predictions, no positive labels)
    yield 0.0 instead of raising ZeroDivisionError.

    Returns:
        dict with keys 'tp', 'fp', 'fn', 'tn', 'precision', 'recall',
        'f1', 'accuracy'.
    """
    tp = fp = fn = tn = 0
    for actual, predicted in zip(y_true, y_pred):
        if actual == 1 and predicted == 1:
            tp += 1
        elif actual == 0 and predicted == 1:
            fp += 1
        elif actual == 1 and predicted == 0:
            fn += 1
        elif actual == 0 and predicted == 0:
            tn += 1

    def _ratio(numerator, denominator):
        # Guard against 0/0 on degenerate inputs.
        return numerator / denominator if denominator > 0 else 0.0

    # Precision: of all positive predictions, how many were correct?
    precision = _ratio(tp, tp + fp)
    # Recall: of all actual positives, how many did we find?
    recall = _ratio(tp, tp + fn)
    # F1: harmonic mean of precision and recall.
    f1 = _ratio(2 * precision * recall, precision + recall)
    # Accuracy: fraction of all predictions that were correct.
    accuracy = _ratio(tp + tn, tp + fp + fn + tn)

    return {
        'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy
    }


# ---- Test ----
y_true = [1, 1, 1, 0, 0, 0, 1, 0, 1, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0, 0, 0]

# Print the 2x2 matrix with actual labels on rows, predictions on columns.
conf_mat = confusion_matrix(y_true, y_pred, n_classes=2)
print("Confusion Matrix:")
print("            Pred 0  Pred 1")
print(f"Actual 0:     {conf_mat[0, 0]}       {conf_mat[0, 1]}")
print(f"Actual 1:     {conf_mat[1, 0]}       {conf_mat[1, 1]}")

stats = binary_metrics(y_true, y_pred)
print(f"\nTP={stats['tp']}, FP={stats['fp']}, "
      f"FN={stats['fn']}, TN={stats['tn']}")
print(f"Precision: {stats['precision']:.4f}")  # 3/4 = 0.75
print(f"Recall:    {stats['recall']:.4f}")     # 3/5 = 0.60
print(f"F1 Score:  {stats['f1']:.4f}")         # 2*0.75*0.6/(0.75+0.6) = 0.6667
print(f"Accuracy:  {stats['accuracy']:.4f}")   # 7/10 = 0.70

What interviewers look for:

  • Clear definitions: precision = TP/(TP+FP), recall = TP/(TP+FN)
  • Edge case handling: what if no positive predictions? (precision undefined)
  • Understanding when accuracy is misleading (imbalanced datasets)
  • Knowing that F1 is the harmonic mean (not arithmetic mean) — and why
💡
Common follow-up: “When would you optimize for precision vs recall?” Answer: Precision when false positives are costly (spam filter, content moderation). Recall when false negatives are costly (cancer detection, fraud detection). F1 when you need to balance both.

📝
Interview Question #2: “Implement AUC-ROC from scratch. Explain what the ROC curve represents and why AUC is threshold-independent.”

AUC-ROC: Complete Implementation

def roc_curve(y_true, y_scores):
    """
    Compute the ROC curve from scratch.

    Args:
        y_true: binary labels (0 or 1)
        y_scores: predicted probabilities/scores for class 1

    Returns:
        fpr_list: np.ndarray of false positive rates, anchored at 0.0
            and 1.0 at the two ends
        tpr_list: np.ndarray of true positive rates, aligned with fpr_list
        thresholds: distinct thresholds in descending order
    """
    # Convert once up front; the original rebuilt np.array(y_true) and
    # np.array(y_scores) on every loop iteration and also sorted a copy
    # of y_true that was never used.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # Distinct thresholds, highest first: the sweep goes from the
    # strictest classifier (nothing predicted positive) to the loosest.
    # np.unique already returns sorted values, so reversing suffices.
    thresholds = np.unique(y_scores)[::-1]

    total_positives = np.sum(y_true)
    total_negatives = len(y_true) - total_positives

    # Anchor the curve at (0, 0): threshold above every score.
    fpr_list = [0.0]
    tpr_list = [0.0]

    for threshold in thresholds:
        # Predict positive if score >= threshold.
        y_pred = (y_scores >= threshold).astype(int)

        tp = np.sum((y_pred == 1) & (y_true == 1))
        fp = np.sum((y_pred == 1) & (y_true == 0))

        # Guard against all-positive / all-negative label vectors.
        tpr_list.append(tp / total_positives if total_positives > 0 else 0)
        fpr_list.append(fp / total_negatives if total_negatives > 0 else 0)

    # Anchor at (1, 1): threshold below every score.
    fpr_list.append(1.0)
    tpr_list.append(1.0)

    return np.array(fpr_list), np.array(tpr_list), thresholds


def auc_score(fpr, tpr):
    """
    Area under the ROC curve via the trapezoidal rule.

    Args:
        fpr: np.ndarray of false positive rates
        tpr: np.ndarray of true positive rates (same length as fpr)

    Returns:
        The integral of TPR over FPR on [0, 1].
    """
    # Integrate in order of increasing FPR; argsort is stable, so tied
    # FPR values keep their original relative order.
    order = np.argsort(fpr)
    x = fpr[order]
    y = tpr[order]

    # Vectorized trapezoidal rule: each segment contributes
    # width * mean height.
    widths = x[1:] - x[:-1]
    mean_heights = (y[1:] + y[:-1]) / 2
    return np.sum(widths * mean_heights)


# ---- Test ----
y_true = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
y_scores = [0.1, 0.2, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.95]

fpr, tpr, thresholds = roc_curve(y_true, y_scores)
auc = auc_score(fpr, tpr)
print(f"AUC-ROC: {auc:.4f}")  # Should be close to 1.0 for this data

# Test with random predictions. Seeded so the demo output is
# reproducible (the original used the unseeded global RNG).
rng = np.random.RandomState(42)
y_random_scores = rng.rand(10)
fpr_r, tpr_r, _ = roc_curve(y_true, y_random_scores)
auc_random = auc_score(fpr_r, tpr_r)
# ~0.5 in expectation; with only 10 samples one draw can be far off.
print(f"Random AUC: {auc_random:.4f}")

What interviewers look for:

  • Understanding that ROC plots TPR (sensitivity) vs FPR (1 - specificity) at every threshold
  • AUC = 0.5 means random ranking, AUC = 1.0 means perfect separation — equivalently, AUC is the probability that a randomly chosen positive example is scored higher than a randomly chosen negative one
  • AUC is threshold-independent because it evaluates the model across all thresholds
  • Using the trapezoidal rule for numerical integration

📝
Interview Question #3: “Implement k-fold cross-validation from scratch. Explain why it is better than a single train/test split and when stratified cross-validation is necessary.”

K-Fold Cross-Validation

def k_fold_split(X, y, k=5, shuffle=True, random_state=42):
    """
    Generate k-fold cross-validation splits from scratch.

    Args:
        X, y: arrays indexable by integer index arrays (e.g. np.ndarray)
        k: number of folds
        shuffle: whether to shuffle indices before slicing into folds
        random_state: seed for the shuffle RNG

    Yields:
        (X_train, y_train, X_val, y_val) for each of the k folds.
    """
    n = len(X)
    indices = np.arange(n)

    if shuffle:
        np.random.RandomState(random_state).shuffle(indices)

    # Fold sizes: the first (n % k) folds absorb one extra sample each.
    base, extra = divmod(n, k)
    sizes = [base + 1 if i < extra else base for i in range(k)]

    # Cumulative boundaries turn sizes into contiguous index slices.
    boundaries = np.cumsum([0] + sizes)
    folds = [indices[boundaries[i]:boundaries[i + 1]] for i in range(k)]

    for held_out in range(k):
        val_idx = folds[held_out]
        train_idx = np.concatenate(
            [fold for j, fold in enumerate(folds) if j != held_out])
        yield X[train_idx], y[train_idx], X[val_idx], y[val_idx]


def cross_validate(model_class, model_params, X, y, k=5, metric='accuracy'):
    """
    Run k-fold cross-validation and return per-fold scores.

    Args:
        model_class: estimator class exposing fit(X, y) and predict(X)
        model_params: dict of keyword args used to construct each model
        X, y: arrays indexable by integer index arrays
        k: number of folds
        metric: 'accuracy' or 'mse'

    Returns:
        list of per-fold scores (mean/std are printed, not returned).

    Raises:
        ValueError: if metric is not a supported name.
    """
    # Validate up front: previously an unknown metric left `score`
    # unbound and crashed with UnboundLocalError mid-loop.
    if metric not in ('accuracy', 'mse'):
        raise ValueError(f"Unsupported metric: {metric!r}")

    scores = []

    for fold, (X_train, y_train, X_val, y_val) in enumerate(
        k_fold_split(X, y, k=k)
    ):
        # A fresh model per fold prevents state leaking across folds.
        model = model_class(**model_params)
        model.fit(X_train, y_train)

        # Evaluate on the held-out fold.
        y_pred = model.predict(X_val)

        if metric == 'accuracy':
            score = np.mean(y_pred == y_val)
        else:  # metric == 'mse'
            score = np.mean((y_pred - y_val) ** 2)

        scores.append(score)
        print(f"Fold {fold+1}: {metric} = {score:.4f}")

    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"\nMean {metric}: {mean_score:.4f} (+/- {std_score:.4f})")

    return scores


def stratified_k_fold_split(X, y, k=5, random_state=42):
    """
    Stratified k-fold splitter: each fold mirrors the overall class
    distribution. Critical for imbalanced datasets, where plain k-fold
    can produce folds with few or no minority-class samples.

    Yields:
        (X_train, y_train, X_val, y_val) for each of the k folds.
    """
    rng = np.random.RandomState(random_state)

    # Shuffle each class's indices independently, then deal them into
    # the k folds so every fold gets its proportional share.
    folds = [[] for _ in range(k)]
    for cls in np.unique(y):
        cls_idx = np.where(y == cls)[0]
        rng.shuffle(cls_idx)

        # The first (count % k) folds take one extra sample of this class.
        base, extra = divmod(len(cls_idx), k)
        start = 0
        for i in range(k):
            size = base + (1 if i < extra else 0)
            folds[i].extend(cls_idx[start:start + size])
            start += size

    for held_out in range(k):
        val_idx = np.array(folds[held_out])
        train_idx = np.concatenate(
            [np.array(folds[j]) for j in range(k) if j != held_out])
        yield X[train_idx], y[train_idx], X[val_idx], y[val_idx]


# ---- Test stratified split ----
# Create imbalanced dataset: 90% class 0, 10% class 1
y_imbalanced = np.array([0]*90 + [1]*10)
X_dummy = np.random.randn(100, 2)

print("Stratified K-Fold class distributions:")
fold_gen = stratified_k_fold_split(X_dummy, y_imbalanced, k=5)
for fold, (X_tr, y_tr, X_val, y_val) in enumerate(fold_gen):
    # Positive-class fraction should land near 10% in both partitions.
    train_ratio = np.mean(y_tr == 1)
    val_ratio = np.mean(y_val == 1)
    print(f"Fold {fold+1}: train positive rate = {train_ratio:.2%}, "
          f"val positive rate = {val_ratio:.2%}")
💡
Interview tip: Always mention stratified K-fold when dealing with imbalanced datasets. Regular K-fold might create folds with no positive samples, leading to unreliable evaluation. Also, K-fold is better than a single split because it uses all data for both training and validation, giving a more reliable performance estimate with variance information.

📝
Interview Question #4: “Implement multi-class precision, recall, and F1 using both macro and weighted averaging. Explain the difference.”

Multi-Class Metrics

def multiclass_metrics(y_true, y_pred, average='macro'):
    """
    Compute precision, recall, F1 for multi-class classification.

    Args:
        y_true: true labels (array-like; coerced to np.ndarray)
        y_pred: predicted labels (array-like; coerced to np.ndarray)
        average:
            'macro' - unweighted mean across classes (treats all classes equally)
            'weighted' - weighted by class support (accounts for imbalance)
            'per_class' - return per-class metrics

    Returns:
        dict of averaged metrics, or a list of per-class dicts when
        average='per_class'.

    Raises:
        ValueError: if average is not a recognized mode (previously an
            unknown mode silently returned None).
    """
    if average not in ('macro', 'weighted', 'per_class'):
        raise ValueError(f"Unknown average mode: {average!r}")

    # Coerce to arrays so elementwise comparisons work for plain lists
    # too; with a list, `y_pred == cls` is a scalar False and every
    # count silently comes out zero.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    classes = np.unique(np.concatenate([y_true, y_pred]))

    per_class = []
    for cls in classes:
        # One-vs-rest counts for this class.
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))
        support = np.sum(y_true == cls)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall / (precision + recall)
              if (precision + recall) > 0 else 0.0)

        per_class.append({
            'class': cls,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': support
        })

    if average == 'per_class':
        return per_class

    if average == 'macro':
        # Every class counts equally, however rare it is.
        return {
            'precision': np.mean([m['precision'] for m in per_class]),
            'recall': np.mean([m['recall'] for m in per_class]),
            'f1': np.mean([m['f1'] for m in per_class]),
        }

    # average == 'weighted': each class contributes in proportion to its
    # number of true instances (support).
    total = sum(m['support'] for m in per_class)
    return {
        'precision': sum(m['precision'] * m['support']
                         for m in per_class) / total,
        'recall': sum(m['recall'] * m['support']
                      for m in per_class) / total,
        'f1': sum(m['f1'] * m['support']
                  for m in per_class) / total,
    }


# ---- Test ----
y_true = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])
y_pred = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 0])

# Show the per-class breakdown first, then both averaging schemes.
print("Per-class metrics:")
for row in multiclass_metrics(y_true, y_pred, average='per_class'):
    print(f"  Class {row['class']}: P={row['precision']:.3f}, "
          f"R={row['recall']:.3f}, F1={row['f1']:.3f}, "
          f"Support={row['support']}")

macro = multiclass_metrics(y_true, y_pred, average='macro')
print(f"\nMacro avg: P={macro['precision']:.3f}, "
      f"R={macro['recall']:.3f}, F1={macro['f1']:.3f}")

weighted = multiclass_metrics(y_true, y_pred, average='weighted')
print(f"Weighted avg: P={weighted['precision']:.3f}, "
      f"R={weighted['recall']:.3f}, F1={weighted['f1']:.3f}")

Macro vs Weighted: Use macro when all classes are equally important regardless of size. Use weighted when you want the metric to reflect the overall performance proportional to class frequency.