Step 3: Content-Based Filtering (Intermediate)

Content-based filtering recommends items similar to what a user has liked before, based on item features rather than user behavior patterns. This lesson builds a content-based recommender using TF-IDF vectorization of movie genres and metadata, then combines it with collaborative filtering into a hybrid model.

Build Genre Feature Vectors

MovieLens 100K encodes genres as binary columns. We create a text representation of each movie's features for TF-IDF processing:

Python
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Genre columns in MovieLens 100K
# The 19 genre flag columns used below, in the order their names are emitted.
genre_cols = [
    "unknown",
    "Action", "Adventure", "Animation",
    "Children", "Comedy", "Crime",
    "Documentary", "Drama",
    "Fantasy", "Film-Noir",
    "Horror",
    "Musical", "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War", "Western",
]


def create_movie_descriptions(movies_df):
    """Build one TF-IDF input string per row of ``movies_df``.

    Each string is the row's title with "(" and ")" removed, a single space,
    and then the names of the ``genre_cols`` columns whose value equals 1,
    space-separated in ``genre_cols`` order. A genre column that is absent
    from the frame counts as not set.

    Args:
        movies_df: DataFrame with a "title" column and (optionally) the
            binary genre columns.
    Returns:
        list of str, one entry per row, in row order.
    """
    # One translation table removes both parenthesis characters in one pass.
    drop_parens = str.maketrans("", "", "()")

    def describe(record):
        active_genres = " ".join(
            name for name in genre_cols if record.get(name, 0) == 1
        )
        cleaned_title = str(record["title"]).translate(drop_parens)
        return f"{cleaned_title} {active_genres}"

    return [describe(record) for _, record in movies_df.iterrows()]

# Attach one description string per row. `movies` is not defined in this
# snippet; it must already exist and contain a "title" column.
movies["description"] = create_movie_descriptions(movies)
# Print the first rows (head() defaults to five) to sanity-check the text.
print(movies[["title", "description"]].head())

# Example output:
# Toy Story (1995)  ->  "Toy Story 1995 Animation Children Comedy"
# GoldenEye (1995)  ->  "GoldenEye 1995 Action Adventure Thriller"

TF-IDF Vectorization

Python
class ContentBasedRecommender:
    """Content-Based Filtering using TF-IDF and cosine similarity.

    Items are addressed by their 0-based row position in the DataFrame given
    to ``fit`` (the index is reset there), not by any id column.
    """

    def __init__(self, max_features=5000):
        """Configure the TF-IDF vectorizer; nothing is computed until ``fit``.

        Args:
            max_features: cap on the TF-IDF vocabulary size.
        """
        self.tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words="english",
            ngram_range=(1, 2)
        )
        self.tfidf_matrix = None  # items x features TF-IDF matrix, set by fit()
        self.item_sim = None      # items x items cosine similarity, set by fit()
        self.movies_df = None     # fitted frame re-indexed to 0..n-1, set by fit()

    def fit(self, movies_df, description_col="description"):
        """Fit TF-IDF and compute item-item content similarity.

        Args:
            movies_df: DataFrame with one row per item.
            description_col: name of the text column to vectorize; missing
                values are treated as empty strings.
        """
        self.movies_df = movies_df.reset_index(drop=True)

        # Fit TF-IDF
        self.tfidf_matrix = self.tfidf.fit_transform(
            self.movies_df[description_col].fillna("")
        )

        # Pairwise cosine similarity between all item vectors. The diagonal is
        # zeroed so an item's similarity to itself never adds to any score.
        self.item_sim = cosine_similarity(self.tfidf_matrix)
        np.fill_diagonal(self.item_sim, 0)

        print(f"ContentBased fitted: {self.tfidf_matrix.shape[0]} items, "
              f"{self.tfidf_matrix.shape[1]} features")

    def _require_fitted(self):
        """Raise a clear error if the similarity matrix has not been built."""
        if self.item_sim is None:
            raise RuntimeError(
                "ContentBasedRecommender is not fitted; call fit() first."
            )

    @staticmethod
    def _top_n(scores, n, exclude):
        """Return up to ``n`` (index, score) pairs, best first, skipping ``exclude``.

        Excluded indices are removed from the candidate set rather than given
        a sentinel score, so they can never appear in the result, even when
        ``n`` is larger than the number of remaining candidates.
        """
        # Guard explicitly: slicing with [-0:] would select everything.
        if n <= 0:
            return []
        eligible = np.ones(scores.shape[0], dtype=bool)
        eligible[np.asarray(list(exclude), dtype=int)] = False
        candidates = np.flatnonzero(eligible)
        # Stable sort on negated scores: descending score, ties by ascending index.
        order = np.argsort(-scores[candidates], kind="stable")[:n]
        return [(idx, scores[idx]) for idx in candidates[order]]

    def get_similar_items(self, item_idx, n=10):
        """Find the n most similar items to a given item.

        Args:
            item_idx: row position of the query item.
            n: maximum number of neighbours; ``n <= 0`` yields an empty list.
        Returns:
            list of (item_idx, similarity) tuples, most similar first. The
            query item itself is never included.
        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        self._require_fitted()
        return self._top_n(self.item_sim[item_idx], n, exclude=[item_idx])

    def recommend_for_user(self, user_ratings, n=10):
        """Recommend items based on a user's rating history.

        Args:
            user_ratings: dict of {item_idx: rating}
            n: number of recommendations
        Returns:
            list of (item_idx, score) tuples, best first. Already-rated items
            are never returned, so the list may be shorter than ``n``. An
            empty ``user_ratings`` or ``n <= 0`` yields an empty list.
        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        self._require_fitted()
        # With no history there is no profile to build; return nothing rather
        # than an arbitrary set of zero-score items.
        if not user_ratings or n <= 0:
            return []

        rated_indices = list(user_ratings)
        rated_scores = np.array(
            [user_ratings[i] for i in rated_indices], dtype=float
        )

        # Z-score the ratings and keep only above-average ones as weights.
        weights = (rated_scores - rated_scores.mean()) / (rated_scores.std() + 1e-8)
        weights = np.maximum(weights, 0)  # Only use positively-rated items

        total = weights.sum()
        if total == 0:
            # All ratings identical: fall back to equal weights.
            weights = np.full(len(rated_indices), 1.0 / len(rated_indices))
        else:
            weights = weights / total

        # Weighted sum of the rated items' similarity rows, as one product.
        scores = weights @ self.item_sim[rated_indices]

        return self._top_n(scores, n, exclude=rated_indices)


# Usage: `movies` must already carry the "description" column, which fit()
# reads by default.
cb = ContentBasedRecommender()
cb.fit(movies)

# Find movies similar to "Toy Story (1995)" (index 0).
# Returned indices are row positions (fit() resets the index), so titles are
# looked up with .iloc rather than by label.
similar = cb.get_similar_items(item_idx=0, n=5)
for idx, score in similar:
    print(f"  {movies.iloc[idx]['title']}: {score:.3f}")

Hybrid Recommender

Combining collaborative filtering with content-based signals often gives better results than either approach alone. Our hybrid rescales the scores from item-based CF and content-based filtering to [0, 1] and blends them with a tunable weight parameter, alpha:

Python
class HybridRecommender:
    """Blend collaborative-filtering and content-based candidate scores.

    Each model's candidate scores are min-max scaled to [0, 1] and mixed as

        score = alpha * cf_score + (1 - alpha) * content_score

    A candidate proposed by only one model contributes 0 for the other.
    """

    def __init__(self, cf_model, cb_model, alpha=0.7):
        """Keep references to both models and the CF weight ``alpha``."""
        self.cf, self.cb, self.alpha = cf_model, cb_model, alpha

    @staticmethod
    def _rescale(scores_dict):
        """Min-max scale the dict's values to [0, 1].

        An empty dict is returned as-is. When every value is equal the range
        is taken as 1, so all values map to 0.
        """
        if not scores_dict:
            return scores_dict
        lo = min(scores_dict.values())
        hi = max(scores_dict.values())
        span = 1 if hi == lo else hi - lo
        return {key: (value - lo) / span for key, value in scores_dict.items()}

    def recommend(self, user_idx, user_ratings, n=10):
        """Generate hybrid recommendations.

        Args:
            user_idx: user index passed to the CF model's ``recommend``.
            user_ratings: dict of {item_idx: rating} passed to the content
                model's ``recommend_for_user``.
            n: number of recommendations to return.
        Returns:
            list of (item_idx, hybrid_score) tuples, highest score first.
        """
        # Ask each model for three times as many candidates as requested so
        # the two candidate pools have a chance to overlap.
        pool_size = n * 3
        cf_candidates = self.cf.recommend(user_idx, n=pool_size)
        cb_candidates = self.cb.recommend_for_user(user_ratings, n=pool_size)

        cf_norm = self._rescale({item: value for item, value in cf_candidates})
        cb_norm = self._rescale({item: value for item, value in cb_candidates})

        # Score every item either model proposed; a missing side counts as 0.
        blended = {}
        for item in set(cf_norm.keys()) | set(cb_norm.keys()):
            cf_part = self.alpha * cf_norm.get(item, 0)
            cb_part = (1 - self.alpha) * cb_norm.get(item, 0)
            blended[item] = cf_part + cb_part

        ranked = list(blended.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:n]


# Usage
hybrid = HybridRecommender(item_cf, cb, alpha=0.7)

# Build user_ratings dict from training data.
# NOTE(review): train_matrix is assumed to be a SciPy sparse users x items
# matrix (it exposes toarray() and is indexed by user here) - confirm against
# the lesson that builds it.
user_idx = 0
# Slice the user's row first and densify only that row; calling toarray() on
# the whole matrix would allocate a full dense array just to read one row.
# tocsr() makes row indexing valid for formats that do not support it.
user_row = train_matrix.tocsr()[user_idx].toarray().ravel()
# Keep only the items this user actually rated (stored value > 0).
user_ratings = {i: r for i, r in enumerate(user_row) if r > 0}

recs = hybrid.recommend(user_idx, user_ratings, n=10)
for item_idx, score in recs:
    # Item indices are treated as row positions in `movies`.
    title = movies.iloc[item_idx]["title"]
    print(f"  {title}: {score:.3f}")
Tuning Alpha: The alpha parameter controls the blend. alpha=1.0 is pure CF, alpha=0.0 is pure content-based. Start with alpha=0.7 (CF-dominant) and tune using your evaluation metrics from Lesson 7.

Next: Neural Collaborative Filtering

Move beyond traditional methods with a deep learning approach using PyTorch embeddings and a two-tower architecture.

Step 4: Neural Collaborative Filtering →