Step 3: Content-Based Filtering (Intermediate)
Content-based filtering recommends items similar to what a user has liked before, based on item features rather than user behavior patterns. This lesson builds a content-based recommender using TF-IDF vectorization of movie genres and metadata, then combines it with collaborative filtering into a hybrid model.
Build Genre Feature Vectors
MovieLens 100K encodes genres as binary columns. We create a text representation of each movie's features for TF-IDF processing:
Python
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Names of the binary genre indicator columns used by this lesson
# (MovieLens 100K layout).
genre_cols = [
    "unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]


def create_movie_descriptions(movies_df):
    """Return one text description per movie for TF-IDF vectorization.

    Each description is the movie title with parentheses removed, a single
    space, and then the space-separated names of every genre column whose
    value equals 1 for that row. Genre columns missing from the frame are
    treated as 0.

    Args:
        movies_df: DataFrame with a "title" column and (optionally) the
            binary columns named in ``genre_cols``.

    Returns:
        list of str, in the same order as the rows of ``movies_df``.
    """
    drop_parens = str.maketrans("", "", "()")

    def describe(movie):
        # Genres keep the order in which they appear in genre_cols.
        active_genres = " ".join(
            genre for genre in genre_cols if movie.get(genre, 0) == 1
        )
        clean_title = str(movie["title"]).translate(drop_parens)
        return clean_title + " " + active_genres

    return [describe(movie) for _, movie in movies_df.iterrows()]


movies["description"] = create_movie_descriptions(movies)
print(movies[["title", "description"]].head())

# Example output:
# Toy Story (1995) -> "Toy Story 1995 Animation Children Comedy"
# GoldenEye (1995) -> "GoldenEye 1995 Action Adventure Thriller"
TF-IDF Vectorization
Python
class ContentBasedRecommender:
    """Content-Based Filtering using TF-IDF and cosine similarity.

    After ``fit``, ``item_sim`` holds the pairwise cosine similarity between
    the TF-IDF vectors of all items, with the diagonal set to 0. Item indices
    used by every method are row positions in ``movies_df`` after
    ``reset_index(drop=True)``.
    """

    def __init__(self, max_features=5000):
        """Configure the TF-IDF vectorizer (unigrams and bigrams).

        Args:
            max_features: upper bound on the TF-IDF vocabulary size.
        """
        self.tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words="english",
            ngram_range=(1, 2),
        )
        self.tfidf_matrix = None
        self.item_sim = None
        self.movies_df = None

    def fit(self, movies_df, description_col="description"):
        """Fit TF-IDF and compute item-item content similarity.

        Args:
            movies_df: DataFrame with one row per item.
            description_col: name of the text column to vectorize; missing
                values are treated as empty strings.
        """
        self.movies_df = movies_df.reset_index(drop=True)

        # Fit TF-IDF
        self.tfidf_matrix = self.tfidf.fit_transform(
            self.movies_df[description_col].fillna("")
        )

        # Compute pairwise cosine similarity; zero the diagonal so an item
        # contributes no similarity to itself.
        self.item_sim = cosine_similarity(self.tfidf_matrix)
        np.fill_diagonal(self.item_sim, 0)

        print(f"ContentBased fitted: {self.tfidf_matrix.shape[0]} items, "
              f"{self.tfidf_matrix.shape[1]} features")

    def _check_fitted(self):
        """Raise a clear error when the model is used before ``fit``."""
        if self.item_sim is None:
            raise RuntimeError(
                "ContentBasedRecommender is not fitted; call fit() first."
            )

    @staticmethod
    def _top_n(scores, eligible, n):
        """Return up to n (item_idx, score) pairs, best first.

        Only positions where ``eligible`` is True are considered, so excluded
        items can never leak into the result, even when n is larger than the
        number of eligible items. ``n <= 0`` yields an empty list (a plain
        ``[-n:]`` slice would return everything for n == 0).
        """
        if n <= 0:
            return []
        candidates = np.flatnonzero(eligible)
        if candidates.size == 0:
            return []
        order = np.argsort(scores[candidates])[::-1][:n]
        return [(idx, scores[idx]) for idx in candidates[order]]

    def get_similar_items(self, item_idx, n=10):
        """Find the n most similar items to a given item.

        Args:
            item_idx: row position of the query item.
            n: maximum number of neighbours to return.

        Returns:
            list of (item_idx, similarity) tuples sorted by decreasing
            similarity. The query item itself is never included.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        self._check_fitted()
        sim_scores = self.item_sim[item_idx]
        eligible = np.ones(sim_scores.shape[0], dtype=bool)
        eligible[item_idx] = False  # an item is not its own neighbour
        return self._top_n(sim_scores, eligible, n)

    def recommend_for_user(self, user_ratings, n=10):
        """Recommend items based on a user's rating history.

        Ratings are standardized per user and negative values are clipped to
        zero, so only items rated above the user's own mean contribute. If
        no item stands out (e.g. all ratings equal), every rated item gets
        the same weight.

        Args:
            user_ratings: dict of {item_idx: rating}
            n: number of recommendations

        Returns:
            list of (item_idx, score) tuples sorted by decreasing score.
            Already-rated items are never returned. The list is empty when
            ``user_ratings`` is empty or ``n <= 0``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        self._check_fitted()
        if n <= 0 or not user_ratings:
            # No history means no profile to match against.
            return []

        # Build user profile as weighted average of rated item vectors
        rated_indices = list(user_ratings)
        rated_scores = np.array(
            [user_ratings[i] for i in rated_indices], dtype=float
        )

        # Normalize ratings to weights (higher rating = higher weight)
        weights = (rated_scores - rated_scores.mean()) / (rated_scores.std() + 1e-8)
        weights = np.maximum(weights, 0)  # Only use positively-rated items
        total = weights.sum()
        if total == 0:
            weights = np.full(len(rated_indices), 1.0 / len(rated_indices))
        else:
            weights = weights / total

        # Weighted sum of the similarity rows of the rated items
        scores = weights @ self.item_sim[rated_indices]

        # Exclude already-rated items with a mask rather than a sentinel
        # score, so they cannot reappear when n exceeds the candidate count.
        eligible = np.ones(scores.shape[0], dtype=bool)
        eligible[rated_indices] = False
        return self._top_n(scores, eligible, n)


# Usage
cb = ContentBasedRecommender()
cb.fit(movies)

# Find movies similar to "Toy Story (1995)" (index 0)
similar = cb.get_similar_items(item_idx=0, n=5)
for idx, score in similar:
    print(f" {movies.iloc[idx]['title']}: {score:.3f}")
Hybrid Recommender
The best results come from combining collaborative filtering and content-based signals. Our hybrid blends scores from item-based CF and content-based filtering with a tunable weight parameter:
Python
class HybridRecommender:
    """Blend collaborative-filtering and content-based recommendations.

    Each model's candidate scores are min-max scaled to [0, 1] and combined:

        score = alpha * cf_score + (1 - alpha) * content_score

    An item proposed by only one model gets 0 for the other component.
    """

    def __init__(self, cf_model, cb_model, alpha=0.7):
        self.cf = cf_model
        self.cb = cb_model
        self.alpha = alpha

    @staticmethod
    def _min_max(raw_scores):
        """Scale the values of a {item: score} dict to [0, 1].

        An empty dict is returned unchanged. When all scores are equal, the
        range falls back to 1, so every item maps to 0.
        """
        if not raw_scores:
            return raw_scores
        values = list(raw_scores.values())
        low = min(values)
        high = max(values)
        span = high - low if high != low else 1
        return {item: (value - low) / span for item, value in raw_scores.items()}

    def recommend(self, user_idx, user_ratings, n=10):
        """Generate hybrid recommendations.

        Args:
            user_idx: user index for CF model
            user_ratings: dict of {item_idx: rating} for content model
            n: number of recommendations

        Returns:
            list of up to n (item_idx, hybrid_score) tuples, best first.
        """
        # Ask each model for a candidate pool three times the requested size.
        pool_size = n * 3
        cf_candidates = dict(self.cf.recommend(user_idx, n=pool_size))
        cb_candidates = dict(
            self.cb.recommend_for_user(user_ratings, n=pool_size)
        )

        # Bring both score sets onto the same [0, 1] scale.
        cf_scaled = self._min_max(cf_candidates)
        cb_scaled = self._min_max(cb_candidates)

        # Weighted blend over the union of both candidate pools.
        cf_weight = self.alpha
        cb_weight = 1 - self.alpha
        candidates = set(cf_scaled) | set(cb_scaled)
        blended = {
            item: cf_weight * cf_scaled.get(item, 0)
            + cb_weight * cb_scaled.get(item, 0)
            for item in candidates
        }

        # Highest blended score first, truncated to n.
        ranking = list(blended.items())
        ranking.sort(key=lambda pair: pair[1], reverse=True)
        return ranking[:n]


# Usage
hybrid = HybridRecommender(item_cf, cb, alpha=0.7)

# Build user_ratings dict from training data
user_idx = 0
user_row = train_matrix.toarray()[user_idx]
user_ratings = {i: r for i, r in enumerate(user_row) if r > 0}

recs = hybrid.recommend(user_idx, user_ratings, n=10)
for item_idx, score in recs:
    title = movies.iloc[item_idx]["title"]
    print(f" {title}: {score:.3f}")
Tuning Alpha: The alpha parameter controls the blend. alpha=1.0 is pure CF, alpha=0.0 is pure content-based. Start with alpha=0.7 (CF-dominant) and tune using your evaluation metrics from Lesson 7.
Next: Neural Collaborative Filtering
Move beyond traditional methods with a deep learning approach using PyTorch embeddings and a two-tower architecture.
Step 4: Neural Collaborative Filtering →