Step 6: Advanced Evaluation & A/B Testing

A recommendation engine is only as good as its evaluation. This lesson covers the standard offline metrics used to compare recommendation algorithms, then moves to online A/B testing for measuring real-world impact on user engagement and business outcomes.

Offline Metrics

We implement the three most important ranking metrics for recommendation evaluation:

Precision@K and Recall@K

Python
import numpy as np
from collections import defaultdict

def precision_at_k(recommended, relevant, k):
    """Precision@K: fraction of the top-K recommendations that are relevant.

    Args:
        recommended: list of recommended item indices (ordered by score)
        relevant: set of relevant item indices (ground truth)
        k: cutoff position
    Returns:
        float: precision score in [0, 1]; 0.0 for a non-positive k
    """
    if k <= 0:
        # Degenerate cutoff: nothing is recommended, so nothing can be a hit
        # (also avoids ZeroDivisionError for k == 0).
        return 0.0
    top_k = recommended[:k]
    hits = len(set(top_k) & set(relevant))
    return hits / k


def recall_at_k(recommended, relevant, k):
    """Recall@K: fraction of the relevant items captured in the top-K.

    Args:
        recommended: list of recommended item indices (ordered by score)
        relevant: set of relevant item indices (ground truth)
        k: cutoff position
    Returns:
        float: recall score in [0, 1]
    """
    ground_truth = set(relevant)
    # No relevant items means recall is undefined; report 0.0 by convention.
    if not ground_truth:
        return 0.0
    retrieved = set(recommended[:k])
    return len(retrieved & ground_truth) / len(ground_truth)

Normalized Discounted Cumulative Gain (NDCG)

Python
def dcg_at_k(scores, k):
    """Discounted Cumulative Gain at position K.

    DCG = sum(relevance_i / log2(i + 1)) over positions i = 1..k,
    so position 1 is undiscounted (log2(2) == 1) and later positions
    contribute progressively less.
    """
    truncated = np.asarray(scores[:k], dtype=float)
    # Discounts: log2(2), log2(3), ... — one per ranked position.
    discounts = np.log2(np.arange(2, truncated.size + 2))
    return np.sum(truncated / discounts)


def ndcg_at_k(recommended, relevant_with_scores, k):
    """Normalized DCG@K.

    Args:
        recommended: list of recommended item indices (ordered)
        relevant_with_scores: dict of {item_idx: relevance_score}
        k: cutoff position
    Returns:
        float: NDCG score in [0, 1]
    """
    # DCG of the ranking the model actually produced; items with no
    # ground-truth score contribute zero gain.
    observed_gains = [relevant_with_scores.get(item, 0) for item in recommended[:k]]

    # DCG of the best achievable ordering (scores sorted descending).
    ideal_gains = sorted(relevant_with_scores.values(), reverse=True)
    ideal_dcg = dcg_at_k(ideal_gains, k)

    # No positive relevance anywhere -> normalization is undefined; return 0.
    if ideal_dcg == 0:
        return 0.0

    return dcg_at_k(observed_gains, k) / ideal_dcg

Full Evaluation Pipeline

Python
def evaluate_recommender(model, test_df, train_matrix, k_values=(5, 10, 20),
                             relevance_threshold=4):
    """Evaluate a recommender model across multiple metrics and K values.

    Args:
        model: recommender with a .recommend(user_idx, n) method returning
            (item_idx, score) pairs ordered by score
        test_df: test ratings DataFrame with user_id / item_id / rating columns
        train_matrix: training user-item matrix (unused here; kept so the
            signature matches the training-side helpers)
        k_values: iterable of K cutoffs. Default is a tuple rather than a
            list to avoid the shared-mutable-default-argument pitfall.
        relevance_threshold: minimum rating for an item to count as "relevant"
    Returns:
        dict: {k: {"precision": [...], "recall": [...], "ndcg": [...]}}
            with one entry per evaluated user.
    """
    # Group test ratings by user index.
    # NOTE(review): user_map / item_map look like module-level id->index
    # mappings built in an earlier step — confirm they are in scope.
    user_test = defaultdict(dict)
    for _, row in test_df.iterrows():
        user_id = row["user_id"]
        item_id = row["item_id"]
        if user_id in user_map and item_id in item_map:
            user_idx = user_map[user_id]
            item_idx = item_map[item_id]
            user_test[user_idx][item_idx] = row["rating"]

    results = {k: {"precision": [], "recall": [], "ndcg": []} for k in k_values}

    # One recommendation call at the largest K serves every smaller cutoff.
    max_k = max(k_values)
    evaluated_users = 0

    for user_idx, test_items in user_test.items():
        # Only evaluate users with at least one relevant item in the test set.
        relevant = {i for i, r in test_items.items() if r >= relevance_threshold}
        if not relevant:
            continue

        recs = model.recommend(user_idx, n=max_k)
        rec_items = [item_idx for item_idx, _ in recs]

        # Compute metrics at each K. NDCG gets the raw ratings as graded
        # relevance; precision/recall use the thresholded binary set.
        for k in k_values:
            results[k]["precision"].append(precision_at_k(rec_items, relevant, k))
            results[k]["recall"].append(recall_at_k(rec_items, relevant, k))
            results[k]["ndcg"].append(ndcg_at_k(rec_items, test_items, k))

        evaluated_users += 1

    # Aggregate results
    print(f"\nEvaluated {evaluated_users} users\n")
    if evaluated_users == 0:
        # Guard: np.mean([]) would emit a RuntimeWarning and print nan rows.
        print("No users with relevant test items; nothing to report.")
        return results

    print(f"{'K':>4} | {'Precision':>10} | {'Recall':>10} | {'NDCG':>10}")
    print("-" * 45)
    for k in k_values:
        p = np.mean(results[k]["precision"])
        r = np.mean(results[k]["recall"])
        n = np.mean(results[k]["ndcg"])
        print(f"{k:>4} | {p:>10.4f} | {r:>10.4f} | {n:>10.4f}")

    return results


# Evaluate all models
# NOTE(review): user_cf / item_cf, test_df, and train_matrix come from
# earlier steps of the tutorial; this cell assumes they are already in scope.
print("=== User-Based CF ===")
evaluate_recommender(user_cf, test_df, train_matrix)

print("\n=== Item-Based CF ===")
evaluate_recommender(item_cf, test_df, train_matrix)

# Example output (values depend on the train/test split):
#    K |  Precision |     Recall |       NDCG
# ---------------------------------------------
#    5 |     0.1280 |     0.0623 |     0.1456
#   10 |     0.1040 |     0.0987 |     0.1312
#   20 |     0.0820 |     0.1534 |     0.1198

A/B Testing Design

Offline metrics tell you which model is better in theory. A/B testing tells you which model actually drives better user outcomes in production.

Experiment Framework

Python
import hashlib
from dataclasses import dataclass
from typing import Dict

@dataclass
class ABExperiment:
    """A/B test configuration for recommendation models."""
    name: str
    control_model: str      # baseline model key, e.g. "item_cf"
    treatment_model: str    # challenger model key, e.g. "ncf"
    traffic_split: float    # fraction of users routed to treatment (e.g. 0.5)
    metrics: list           # names of the metrics to track

    def assign_group(self, user_id: int) -> str:
        """Deterministically assign a user to "control" or "treatment".

        Hashing "<experiment name>:<user id>" yields a stable bucket in
        [0, 1), so the same user always lands in the same group and
        different experiments bucket users independently.
        """
        digest = hashlib.md5(f"{self.name}:{user_id}".encode()).hexdigest()
        bucket = (int(digest, 16) % 1000) / 1000
        if bucket < self.traffic_split:
            return "treatment"
        return "control"


# Define experiment
# NOTE: traffic_split=0.5 routes half of users to the NCF treatment;
# assign_group hashes "<name>:<user_id>", so assignment is stable per user.
experiment = ABExperiment(
    name="ncf_vs_itemcf_v1",
    control_model="item_cf",
    treatment_model="ncf",
    traffic_split=0.5,
    metrics=["click_through_rate", "watch_time", "conversion_rate"]
)

Statistical Significance

Python
from scipy import stats

def check_significance(control_metric, treatment_metric, alpha=0.05):
    """Two-sample t-test for A/B experiment results.

    Args:
        control_metric: array of metric values for control group
        treatment_metric: array of metric values for treatment group
        alpha: significance level
    Returns:
        dict with test results
    """
    t_stat, p_value = stats.ttest_ind(control_metric, treatment_metric)

    control_mean = np.mean(control_metric)
    treatment_mean = np.mean(treatment_metric)
    lift = (treatment_mean - control_mean) / control_mean * 100

    return {
        "control_mean": round(control_mean, 4),
        "treatment_mean": round(treatment_mean, 4),
        "lift_pct": round(lift, 2),
        "t_statistic": round(t_stat, 4),
        "p_value": round(p_value, 6),
        "significant": p_value < alpha,
        "recommendation": (
            "Deploy treatment" if p_value < alpha and lift > 0
            else "Keep control" if p_value < alpha
            else "Continue experiment (not yet significant)"
        )
    }


def required_sample_size(baseline_rate, minimum_detectable_effect,
                         alpha=0.05, power=0.8):
    """Calculate the required per-group sample size for an A/B test.

    Standard two-proportion z-test approximation with a pooled variance
    estimate; minimum_detectable_effect is a *relative* lift.
    """
    rate_control = baseline_rate
    rate_treatment = baseline_rate * (1 + minimum_detectable_effect)
    pooled_rate = (rate_control + rate_treatment) / 2

    z_alpha = stats.norm.ppf(1 - alpha / 2)   # two-sided critical value
    z_beta = stats.norm.ppf(power)            # quantile for desired power

    effect = rate_control - rate_treatment
    n = 2 * pooled_rate * (1 - pooled_rate) * (z_alpha + z_beta) ** 2 / effect ** 2
    return int(np.ceil(n))


# Example: How many users do we need?
# A 5% baseline CTR with a 10% relative lift means detecting 5.0% -> 5.5%,
# which (at alpha=0.05, 80% power) needs roughly 31k users per group.
n = required_sample_size(
    baseline_rate=0.05,          # 5% baseline CTR
    minimum_detectable_effect=0.10  # detect 10% relative lift
)
print(f"Required sample size per group: {n:,}")
# Output: Required sample size per group: ~30,000

Metrics Summary Table

| Metric      | Type    | What It Measures                                 | Range    |
|-------------|---------|--------------------------------------------------|----------|
| Precision@K | Offline | How many recommended items are relevant          | [0, 1]   |
| Recall@K    | Offline | How many relevant items are recommended          | [0, 1]   |
| NDCG@K      | Offline | Ranking quality (position-aware)                 | [0, 1]   |
| CTR         | Online  | Click-through rate on recommendations            | [0, 1]   |
| Watch Time  | Online  | Engagement depth (seconds/minutes)               | [0, inf) |
| Conversion  | Online  | Purchase/subscription driven by a recommendation | [0, 1]   |
Offline vs Online Gap: A model that wins on offline metrics does not always win online. Netflix found that RMSE improvements of less than 1% can sometimes translate to measurable engagement gains, but the correlation is not guaranteed. Always validate with A/B tests before full deployment.

Next: Enhancements & Next Steps

Learn about real-time updates, diversity, cold start solutions, and production scaling strategies.

Enhancements & Next Steps →