PCA & SVD
Dimensionality reduction is a core ML skill tested in interviews. PCA from eigendecomposition tests your linear algebra knowledge. SVD provides a more numerically stable alternative. We implement both from scratch, compute explained variance ratios, and cover t-SNE basics.
PCA from Eigendecomposition
PCA finds the directions of maximum variance in the data. The algorithm: (1) center the data, (2) compute the covariance matrix, (3) find eigenvectors, (4) project onto top-k eigenvectors.
import numpy as np
class PCA:
    """Principal Component Analysis via eigendecomposition of the covariance matrix.

    Fits principal axes by (1) centering the data, (2) forming the sample
    covariance, (3) eigendecomposing it, and (4) keeping the top-k
    eigenvectors (largest eigenvalues) as the components.
    """

    def __init__(self, n_components=2):
        self.n_components = n_components
        self.components = None            # (n_components, n_features) principal axes
        self.mean = None                  # (n_features,) per-feature means
        self.eigenvalues = None           # all eigenvalues, sorted descending
        self.explained_variance_ = None   # variance captured by each kept axis
        self.explained_variance_ratio_ = None  # fraction of total variance per kept axis

    def fit(self, X):
        """Learn the principal axes from X of shape (n_samples, n_features)."""
        n_samples = X.shape[0]
        # Center so the covariance measures spread around the mean.
        self.mean = X.mean(axis=0)
        centered = X - self.mean
        # Unbiased sample covariance: (1/(n-1)) * Xc^T Xc.
        cov = centered.T @ centered / (n_samples - 1)
        # eigh is the right routine for a symmetric matrix; it yields
        # eigenvalues in ascending order, so reorder them descending.
        evals, evecs = np.linalg.eigh(cov)
        order = np.argsort(evals)[::-1]
        evals, evecs = evals[order], evecs[:, order]
        self.eigenvalues = evals
        # Rows of `components` are the top-k principal directions: (k, d).
        self.components = evecs[:, : self.n_components].T
        self.explained_variance_ = evals[: self.n_components]
        self.explained_variance_ratio_ = self.explained_variance_ / evals.sum()
        return self

    def transform(self, X):
        """Project X onto the principal components -> (n_samples, n_components)."""
        return (X - self.mean) @ self.components.T

    def fit_transform(self, X):
        """Fit to X, then return its projection."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_reduced):
        """Reconstruct data in the original feature space from reduced coordinates."""
        return X_reduced @ self.components + self.mean
# ---- Test ----
np.random.seed(42)
# Create correlated data (most variance along one direction):
# features 1 and 2 are noisy linear functions of feature 0, so two
# principal components should capture nearly all variance of the 5 features.
X = np.random.randn(200, 5)
X[:, 1] = X[:, 0] * 2 + np.random.randn(200) * 0.1  # Feature 1 ~ 2*Feature 0
X[:, 2] = X[:, 0] * -1 + np.random.randn(200) * 0.1  # Feature 2 ~ -Feature 0
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(f"Original shape: {X.shape}")  # (200, 5)
print(f"Reduced shape: {X_reduced.shape}")  # (200, 2)
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained: {sum(pca.explained_variance_ratio_):.4f}")
# Reconstruction error: mean squared difference after projecting to 2D and back.
X_reconstructed = pca.inverse_transform(X_reduced)
recon_error = np.mean((X - X_reconstructed) ** 2)
print(f"Reconstruction error: {recon_error:.6f}")
PCA via SVD (More Stable)
SVD is numerically more stable than eigendecomposition of the covariance matrix, especially when features are highly correlated. This is what scikit-learn actually uses.
class PCA_SVD:
    """PCA computed with the SVD of the centered data matrix.

    Decomposing X_centered = U @ diag(S) @ Vt directly avoids forming the
    covariance matrix explicitly, which is numerically more stable. The rows
    of Vt are the principal axes, and S**2 / (n - 1) are the corresponding
    component variances.
    """

    def __init__(self, n_components=2):
        self.n_components = n_components
        self.components = None       # (n_components, n_features) principal axes
        self.mean = None             # (n_features,) per-feature means
        self.singular_values = None  # all singular values, descending
        self.explained_variance_ratio_ = None

    def fit(self, X):
        """Learn the principal axes from X of shape (n_samples, n_features)."""
        n_samples = X.shape[0]
        self.mean = X.mean(axis=0)
        centered = X - self.mean
        # Thin SVD (full_matrices=False):
        #   U: (n, r), S: (r,), Vt: (r, d) with r = min(n, d).
        # np.linalg.svd already returns S sorted in descending order.
        U, S, Vt = np.linalg.svd(centered, full_matrices=False)
        self.singular_values = S
        self.components = Vt[: self.n_components]
        # Each singular value s corresponds to a component variance s^2/(n-1).
        variances = S ** 2 / (n_samples - 1)
        self.explained_variance_ratio_ = variances[: self.n_components] / variances.sum()
        return self

    def transform(self, X):
        """Project X onto the principal components."""
        return (X - self.mean) @ self.components.T

    def fit_transform(self, X):
        """Fit to X, then return its projection."""
        return self.fit(X).transform(X)
# ---- Verify SVD-based PCA matches eigendecomposition PCA ----
pca_svd = PCA_SVD(n_components=2)
X_reduced_svd = pca_svd.fit_transform(X)
print(f"\nSVD-based explained variance: {pca_svd.explained_variance_ratio_}")
# Should match eigendecomposition PCA results (individual components may
# differ in sign, but explained-variance ratios are sign-invariant).
Choosing the Number of Components
def choose_n_components(X, target_variance=0.95):
    """Return the minimum number of components explaining target_variance.

    Fits a full PCA, accumulates the explained-variance ratios, and picks
    the smallest k whose cumulative ratio reaches the target.

    Args:
        X: data array of shape (n_samples, n_features).
        target_variance: fraction of total variance to retain, in (0, 1].

    Returns:
        Smallest k with cumulative explained variance >= target_variance,
        or n_features when the target can never be reached.
    """
    # Fit PCA with every component so the cumulative sum covers all variance.
    pca_full = PCA(n_components=X.shape[1])
    pca_full.fit(X)
    # Cumulative explained variance, ascending toward 1.0.
    cumulative = np.cumsum(pca_full.explained_variance_ratio_)
    # BUG FIX: np.argmax returns 0 when NO entry satisfies the condition,
    # which would wrongly report "1 component" for an unreachable target
    # (e.g. target_variance > 1 or float round-off just below it).
    if cumulative[-1] >= target_variance:
        n_components = int(np.argmax(cumulative >= target_variance)) + 1
    else:
        n_components = len(cumulative)
    print(f"Components needed for {target_variance*100}% variance: {n_components}")
    print(f"Cumulative explained variance: {cumulative[:n_components+2]}")
    return n_components
n_comp = choose_n_components(X, target_variance=0.95)
SVD for Matrix Approximation
SVD can approximate any matrix with a low-rank version, useful for compression and denoising.
def low_rank_approximation(X, rank):
    """Return the best rank-`rank` approximation of X via truncated SVD.

    By the Eckart-Young theorem, keeping the top `rank` singular triplets
    gives the optimal rank-k approximation in the Frobenius norm. Prints
    retained energy, compression ratio, and reconstruction error.
    """
    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    # Truncate each SVD factor to the leading `rank` columns/values/rows
    # and multiply back to obtain the approximation.
    X_approx = U[:, :rank] @ np.diag(S[:rank]) @ Vt[:rank, :]
    # "Energy" = sum of squared singular values = squared Frobenius norm.
    original_energy = np.sum(S ** 2)
    retained_energy = np.sum(S[:rank] ** 2)
    # Storage for the truncated factors (U_k, S_k, Vt_k) vs the full matrix.
    compression = (X.shape[0] * rank + rank + rank * X.shape[1]) / X.size
    print(f"Rank-{rank} approximation:")
    print(f" Energy retained: {retained_energy/original_energy:.4f}")
    print(f" Compression ratio: {compression:.4f}")
    print(f" Frobenius error: {np.linalg.norm(X - X_approx):.4f}")
    return X_approx
# ---- Test ----
np.random.seed(42)
# Create a matrix with low effective rank: a rank-3 product plus small noise,
# so a rank-3 truncation should retain almost all of the energy.
A = np.random.randn(50, 3) @ np.random.randn(3, 30) + np.random.randn(50, 30) * 0.1
A_approx = low_rank_approximation(A, rank=3)
# A rank-1 truncation keeps only the single strongest direction.
A_approx_1 = low_rank_approximation(A, rank=1)
t-SNE Basics
t-SNE is a non-linear dimensionality reduction technique for visualization. Unlike PCA, it preserves local structure. Here is a simplified implementation showing the core concepts.
def compute_pairwise_affinities(X, perplexity=30.0):
    """Compute pairwise affinities p_ij using a Gaussian kernel.

    For each point i, binary-search the bandwidth sigma_i so that the
    entropy of the conditional distribution p_{j|i} matches log(perplexity).
    Perplexity acts as a smooth "effective number of neighbors". The
    conditional affinities are then symmetrized and renormalized.

    Args:
        X: data array of shape (n_samples, n_features).
        perplexity: target perplexity (typical values 5-50).

    Returns:
        (n, n) symmetric affinity matrix with entries floored at 1e-12.
    """
    n = X.shape[0]
    # Squared Euclidean distance between every pair of rows: (n, n).
    distances = np.sum((X[:, np.newaxis] - X[np.newaxis]) ** 2, axis=2)
    P = np.zeros((n, n))
    # Perplexity = exp(entropy), so we compare entropy to log(perplexity).
    target_entropy = np.log(perplexity)
    for i in range(n):
        # Binary search for sigma that gives target perplexity.
        # Entropy grows monotonically with sigma, so bisection converges.
        lo, hi = 1e-10, 1e10
        for _ in range(50):  # Binary search iterations
            sigma = (lo + hi) / 2
            pi = np.exp(-distances[i] / (2 * sigma ** 2))
            pi[i] = 0  # a point is not its own neighbor
            sum_pi = np.sum(pi)
            if sum_pi == 0:
                # sigma so small that every kernel value underflowed; widen it.
                # NOTE(review): if this fires on the *final* iteration, P[i]
                # keeps the unnormalized (all-zero) pi — the 1e-12 floor below
                # papers over it, but confirm this is acceptable.
                lo = sigma
                continue
            pi /= sum_pi
            entropy = -np.sum(pi[pi > 0] * np.log(pi[pi > 0]))
            if entropy > target_entropy:
                hi = sigma  # too many effective neighbors -> shrink sigma
            else:
                lo = sigma  # too few -> grow sigma
        P[i] = pi
    # Symmetrize: P = (P + P.T) / (2n)
    P = (P + P.T) / (2 * n)
    P = np.maximum(P, 1e-12)  # Avoid numerical issues (e.g. log(0) downstream)
    return P
def tsne_simple(X, n_components=2, perplexity=30.0, n_iter=500, lr=200.0):
    """Simplified t-SNE implementation.

    Core idea:
    1. Compute pairwise similarities in high-D (Gaussian kernel)
    2. Initialize low-D embedding randomly
    3. Iteratively move points so low-D similarities (Student-t)
       match high-D similarities (Gaussian)

    Args:
        X: (n_samples, n_features) data.
        n_components: output dimensionality (2 for visualization).
        perplexity: effective neighborhood size for the affinities.
        n_iter: number of gradient-descent steps.
        lr: learning rate.

    Returns:
        (n_samples, n_components) embedding (random init, so non-deterministic).
    """
    n = X.shape[0]
    # Step 1: Compute high-dimensional affinities (symmetric, normalized).
    P = compute_pairwise_affinities(X, perplexity)
    # Step 2: Small random initialization keeps early gradients well-behaved.
    Y = np.random.randn(n, n_components) * 0.01
    for iteration in range(n_iter):
        # Low-dimensional affinities via the Student-t (Cauchy) kernel.
        dist_Y = np.sum((Y[:, np.newaxis] - Y[np.newaxis]) ** 2, axis=2)
        num = 1.0 / (1.0 + dist_Y)  # unnormalized kernel; also needed in the gradient
        np.fill_diagonal(num, 0)
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)
        # Gradient of KL(P || Q):
        #   dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^-1 * (y_i - y_j)
        # BUG FIX: the kernel factor is the *unnormalized* 1/(1+dist) (`num`).
        # The previous Q[i] * (1 + dist_Y[i]) algebraically collapses to the
        # constant 1/sum(num), which drops the Student-t weighting entirely.
        PQ_diff = P - Q
        grad = np.zeros_like(Y)
        for i in range(n):
            diff = Y[i] - Y  # (n, n_components)
            grad[i] = 4 * np.sum(
                (PQ_diff[i] * num[i])[:, np.newaxis] * diff,
                axis=0
            )
        # Plain gradient-descent update (no momentum in this simplified version).
        Y -= lr * grad
        # Re-center to remove the translation degree of freedom.
        Y -= np.mean(Y, axis=0)
    return Y
# Note: Full t-SNE also includes:
# - Early exaggeration (multiply P by 4 for first 250 iterations)
# - Momentum-based updates
# - Barnes-Hut approximation for O(n log n) instead of O(n^2)
PCA vs SVD vs t-SNE Comparison
| Property | PCA | SVD | t-SNE |
|---|---|---|---|
| Type | Linear | Linear | Non-linear |
| Preserves | Global variance | Global variance | Local structure |
| Inverse transform | Yes | Yes | No |
| Deterministic | Yes | Yes | No (random init) |
| Use case | Feature reduction | Matrix approximation | Visualization only |
| Complexity | O(nd^2) | O(nd min(n,d)) | O(n^2) per iteration |
Key Takeaways
- PCA steps: center data, compute covariance, eigendecompose, project onto top-k eigenvectors
- Use `np.linalg.eigh` (not `eig`) for symmetric matrices like covariance — it is faster and more stable
- SVD-based PCA avoids computing the covariance matrix explicitly, giving better numerical stability
- Explained variance ratio tells you how much information each component captures
- SVD provides the best rank-k approximation (Eckart-Young theorem)
- t-SNE preserves local structure but is non-parametric and non-invertible — use for visualization only
Lilly Tech Systems