Linear Regression
Linear regression is the first algorithm interviewers ask you to implement from scratch. It tests gradient descent understanding, matrix math fluency, and awareness of regularization. We build the complete implementation: gradient descent, normal equation, Ridge (L2), Lasso (L1), and polynomial features.
The Math
Linear regression models the relationship y = Xw + b where X is the feature matrix, w is the weight vector, and b is the bias. We minimize the Mean Squared Error (MSE) loss:
# Loss function (MSE):
# L(w, b) = (1/2n) * sum((X @ w + b - y)^2)
#
# Gradients:
# dL/dw = (1/n) * X.T @ (X @ w + b - y)
# dL/db = (1/n) * sum(X @ w + b - y)
#
# Update rule (gradient descent):
# w = w - lr * dL/dw
# b = b - lr * dL/db
Implementation: Gradient Descent
import numpy as np
class LinearRegressionGD:
    """Linear regression fitted with batch gradient descent.

    Minimizes the half-MSE loss L = (1/2n) * sum((Xw + b - y)^2) and stops
    early once the per-iteration loss improvement drops below `tol`.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, tol=1e-6):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Run gradient descent on (X, y); returns self for chaining."""
        n_samples, n_features = X.shape
        # Zero-initialized parameters; fine for a convex loss.
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_history = []
        for step in range(self.n_iter):
            predictions = X @ self.weights + self.bias
            residual = predictions - y  # (n_samples,)
            current_loss = 0.5 * np.mean(residual ** 2)
            self.loss_history.append(current_loss)
            # Early stopping once the loss improvement stalls.
            if step and abs(self.loss_history[-2] - current_loss) < self.tol:
                print(f"Converged at iteration {step}")
                break
            grad_w = X.T @ residual / n_samples  # (n_features,)
            grad_b = residual.mean()             # scalar
            self.weights = self.weights - self.lr * grad_w
            self.bias = self.bias - self.lr * grad_b
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias

    def score(self, X, y):
        """Coefficient of determination (R-squared)."""
        residual_ss = np.sum((y - self.predict(X)) ** 2)
        total_ss = np.sum((y - np.mean(y)) ** 2)
        return 1 - residual_ss / total_ss
# ---- Test it ----
# Synthetic data with known coefficients so recovery is easy to eyeball.
np.random.seed(42)
X = np.random.randn(200, 3)
true_w = np.array([3.0, -2.0, 5.0])
y = X @ true_w + 1.5 + np.random.randn(200) * 0.3
model = LinearRegressionGD(learning_rate=0.01, n_iterations=2000).fit(X, y)
print(f"Learned weights: {model.weights}")  # ~[3.0, -2.0, 5.0]
print(f"Learned bias: {model.bias:.4f}")  # ~1.5
print(f"R-squared: {model.score(X, y):.6f}")  # ~0.999
print(f"Converged in {len(model.loss_history)} iterations")
Implementation: Normal Equation
The normal equation gives the closed-form solution: w = (X.T @ X)^(-1) @ X.T @ y. No iteration needed, but it requires matrix inversion which is O(d^3) and can be numerically unstable.
class LinearRegressionNormal:
    """Closed-form linear regression via the normal equation.

    Solves (X_b.T @ X_b) w = X_b.T @ y with np.linalg.solve, which is more
    numerically stable (and faster) than forming the explicit inverse.
    """

    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Solve for bias and weights in one shot; returns self."""
        # Prepend a column of ones so the bias is solved jointly.
        design = np.column_stack([np.ones(X.shape[0]), X])
        gram = design.T @ design
        moment = design.T @ y
        solution = np.linalg.solve(gram, moment)
        self.bias, self.weights = solution[0], solution[1:]
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias
# ---- Test it ----
# Reuses X, y from the gradient-descent demo above.
model_normal = LinearRegressionNormal().fit(X, y)
print(f"Normal eq weights: {model_normal.weights}")  # ~[3.0, -2.0, 5.0]
print(f"Normal eq bias: {model_normal.bias:.4f}")  # ~1.5
Tip: use np.linalg.solve(A, b) instead of np.linalg.inv(A) @ b. The solve approach is numerically more stable and faster. Interviewers notice this.

Ridge Regression (L2 Regularization)
Ridge adds an L2 penalty term alpha * ||w||^2 to the loss. This prevents overfitting by shrinking weights toward zero.
class RidgeRegression:
    """Ridge regression (L2-penalized least squares) from scratch.

    Loss: (1/2n) * sum((Xw + b - y)^2) + (alpha/2) * ||w||^2.
    The bias term is deliberately excluded from the penalty.
    """

    def __init__(self, alpha=1.0, learning_rate=0.01, n_iterations=1000):
        self.alpha = alpha
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Gradient descent on the L2-penalized loss; returns self."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_history = []
        for _ in range(self.n_iter):
            residual = X @ self.weights + self.bias - y
            data_loss = 0.5 * np.mean(residual ** 2)
            penalty = 0.5 * self.alpha * np.sum(self.weights ** 2)
            self.loss_history.append(data_loss + penalty)
            # L2 contributes alpha * w to the weight gradient; the bias
            # gradient is the plain MSE term.
            grad_w = X.T @ residual / n_samples + self.alpha * self.weights
            grad_b = residual.mean()
            self.weights = self.weights - self.lr * grad_w
            self.bias = self.bias - self.lr * grad_b
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias

    @staticmethod
    def fit_closed_form(X, y, alpha=1.0):
        """Ridge closed form: w = (X.T X + alpha*I)^(-1) X.T y.

        Returns (bias, weights). The bias entry of the penalty matrix is
        zeroed so the intercept is not regularized.
        """
        n_samples, n_features = X.shape
        design = np.column_stack([np.ones(n_samples), X])
        penalty = alpha * np.eye(n_features + 1)
        penalty[0, 0] = 0.0  # don't regularize the bias
        solution = np.linalg.solve(design.T @ design + penalty, design.T @ y)
        return solution[0], solution[1:]
Lasso Regression (L1 Regularization)
Lasso uses L1 penalty alpha * ||w||_1 which encourages sparsity (some weights become exactly zero). The L1 gradient is not smooth, so we use the subgradient.
class LassoRegression:
    """Lasso (L1-penalized) regression fitted by cyclic coordinate descent.

    The L1 penalty alpha * ||w||_1 is non-smooth, so each coordinate is
    updated in closed form with the soft-thresholding operator, which can
    drive individual weights to exactly zero (sparsity).
    """

    def __init__(self, alpha=1.0, n_iterations=1000, tol=1e-6):
        self.alpha = alpha
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None
        self.bias = None

    def _soft_threshold(self, x, threshold):
        """Proximal operator of the L1 norm: shrink x toward 0 by threshold."""
        return np.sign(x) * np.maximum(np.abs(x) - threshold, 0)

    def fit(self, X, y):
        """Coordinate descent until the max weight change drops below tol."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = np.mean(y)
        for _ in range(self.n_iter):
            previous = self.weights.copy()
            for j in range(n_features):
                # Residual with feature j's own contribution removed.
                partial = y - self.bias - X @ self.weights + X[:, j] * self.weights[j]
                # Correlation of feature j with that partial residual.
                rho = X[:, j] @ partial / n_samples
                scale = np.sum(X[:, j] ** 2) / n_samples
                self.weights[j] = self._soft_threshold(rho, self.alpha) / scale
            # Re-center the intercept (it is never penalized).
            self.bias = np.mean(y - X @ self.weights)
            if np.max(np.abs(self.weights - previous)) < self.tol:
                break
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias
# ---- Test Lasso sparsity ----
# Ten features, but only the first three carry signal; Lasso should
# zero out most of the rest.
np.random.seed(42)
X_sparse = np.random.randn(100, 10)
true_w_sparse = np.array([5, -3, 2, 0, 0, 0, 0, 0, 0, 0], dtype=float)
y_sparse = X_sparse @ true_w_sparse + np.random.randn(100) * 0.5
lasso = LassoRegression(alpha=0.1).fit(X_sparse, y_sparse)
print(f"Lasso weights: {np.round(lasso.weights, 3)}")
# Many weights should be exactly 0, showing L1 sparsity
Polynomial Features
Polynomial regression fits non-linear relationships by creating polynomial features, then applying linear regression on the expanded feature set.
def polynomial_features(X, degree=2):
    """Expand X with per-feature powers and pairwise interaction terms.

    Column order for X with features [x1, x2] and degree=2:
        [x1, x2, x1^2, x2^2, x1*x2]
    i.e. every feature raised to each power 2..degree comes first, then
    the pairwise products. Note: only degree-2 (pairwise) interactions are
    generated even when degree > 2 — higher-order cross terms are
    intentionally omitted for simplicity.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    degree : int, highest per-feature power to generate (default 2)

    Returns
    -------
    ndarray of shape (n_samples, expanded_n_features)
    """
    n_features = X.shape[1]
    features = [X]  # original features always come first
    # Per-feature powers x^2 .. x^degree.
    for d in range(2, degree + 1):
        features.append(X ** d)
    # Pairwise interaction terms x_i * x_j for i < j.
    if degree >= 2 and n_features > 1:
        interactions = [
            (X[:, i] * X[:, j]).reshape(-1, 1)
            for i in range(n_features)
            for j in range(i + 1, n_features)
        ]
        features.append(np.hstack(interactions))
    return np.hstack(features)
# ---- Full pipeline: polynomial + linear regression ----
def polynomial_regression(X, y, degree=2, lr=0.01, n_iter=2000):
    """Polynomial-feature expansion + standardization + gradient descent.

    Returns (fitted_model, feature_means, feature_stds). Reuse the means
    and stds to scale any new data before calling the model's predict.
    """
    X_poly = polynomial_features(X, degree)
    # Standardize: gradient descent converges poorly on unscaled
    # polynomial features.
    mean = X_poly.mean(axis=0)
    std = X_poly.std(axis=0)
    std[std == 0] = 1  # constant columns: avoid division by zero
    model = LinearRegressionGD(learning_rate=lr, n_iterations=n_iter)
    model.fit((X_poly - mean) / std, y)
    return model, mean, std
# ---- Test with non-linear data ----
# Quadratic ground truth: y = 2x^2 - 3x + 1 plus noise.
np.random.seed(42)
X_nl = np.random.uniform(-3, 3, (100, 1))
y_nl = 2 * X_nl[:, 0]**2 - 3 * X_nl[:, 0] + 1 + np.random.randn(100) * 0.5
model_poly, mean, std = polynomial_regression(X_nl, y_nl, degree=2)
print(f"Polynomial regression R^2: {model_poly.score((polynomial_features(X_nl, 2) - mean) / std, y_nl):.4f}")
When predicting on new data, reuse the training statistics rather than recomputing them: X_scaled = (X - mean) / std.

Key Takeaways
- Linear regression gradient: dw = (1/n) * X.T @ (Xw + b - y)
- Normal equation: w = (X.T X)^(-1) X.T y — use np.linalg.solve, not np.linalg.inv
- Ridge adds alpha * w to the weight gradient (L2 penalty)
- Lasso uses coordinate descent with soft thresholding (L1 penalty encourages sparsity)
- Always scale features before gradient descent, especially with polynomial features
- Track loss history and implement early stopping for convergence
Lilly Tech Systems