Linear Regression
Linear regression is the first algorithm interviewers ask you to implement from scratch. It tests gradient descent understanding, matrix math fluency, and awareness of regularization. We build the complete implementation: gradient descent, normal equation, Ridge (L2), Lasso (L1), and polynomial features.
The Math
Linear regression models the relationship y = Xw + b where X is the feature matrix, w is the weight vector, and b is the bias. We minimize the Mean Squared Error (MSE) loss:
# Loss function (MSE):
# L(w, b) = (1/2n) * sum((X @ w + b - y)^2)
#
# Gradients:
# dL/dw = (1/n) * X.T @ (X @ w + b - y)
# dL/db = (1/n) * sum(X @ w + b - y)
#
# Update rule (gradient descent):
# w = w - lr * dL/dw
# b = b - lr * dL/db
Implementation: Gradient Descent
import numpy as np
class LinearRegressionGD:
    """Linear regression fitted with batch gradient descent.

    Minimizes the half-MSE loss L = (1/2n) * sum((Xw + b - y)^2) and stops
    early once the per-iteration loss improvement drops below `tol`.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, tol=1e-6):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Run gradient descent on (X, y); returns self for chaining."""
        n_samples, n_features = X.shape
        # Zero-initialized parameters; fine for a convex loss.
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_history = []
        for step in range(self.n_iter):
            predictions = X @ self.weights + self.bias
            residual = predictions - y  # (n_samples,)
            current_loss = 0.5 * np.mean(residual ** 2)
            self.loss_history.append(current_loss)
            # Early stopping once the loss improvement stalls.
            if step and abs(self.loss_history[-2] - current_loss) < self.tol:
                print(f"Converged at iteration {step}")
                break
            grad_w = X.T @ residual / n_samples  # (n_features,)
            grad_b = residual.mean()             # scalar
            self.weights = self.weights - self.lr * grad_w
            self.bias = self.bias - self.lr * grad_b
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias

    def score(self, X, y):
        """Coefficient of determination (R-squared)."""
        residual_ss = np.sum((y - self.predict(X)) ** 2)
        total_ss = np.sum((y - np.mean(y)) ** 2)
        return 1 - residual_ss / total_ss
# ---- Test it ----
# Synthetic data with known coefficients so recovery is easy to eyeball.
np.random.seed(42)
X = np.random.randn(200, 3)
true_w = np.array([3.0, -2.0, 5.0])
y = X @ true_w + 1.5 + np.random.randn(200) * 0.3
model = LinearRegressionGD(learning_rate=0.01, n_iterations=2000).fit(X, y)
print(f"Learned weights: {model.weights}")  # ~[3.0, -2.0, 5.0]
print(f"Learned bias: {model.bias:.4f}")  # ~1.5
print(f"R-squared: {model.score(X, y):.6f}")  # ~0.999
print(f"Converged in {len(model.loss_history)} iterations")
Implementation: Normal Equation
The normal equation gives the closed-form solution: w = (X.T @ X)^(-1) @ X.T @ y. No iteration needed, but it requires matrix inversion which is O(d^3) and can be numerically unstable.
class LinearRegressionNormal:
    """Closed-form linear regression via the normal equation.

    Solves (X_b.T @ X_b) w = X_b.T @ y with np.linalg.solve, which is more
    numerically stable (and faster) than forming the explicit inverse.
    """

    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Solve for bias and weights in one shot; returns self."""
        # Prepend a column of ones so the bias is solved jointly.
        design = np.column_stack([np.ones(X.shape[0]), X])
        gram = design.T @ design
        moment = design.T @ y
        solution = np.linalg.solve(gram, moment)
        self.bias, self.weights = solution[0], solution[1:]
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias
# ---- Test it ----
# Reuses X, y from the gradient-descent demo above.
model_normal = LinearRegressionNormal().fit(X, y)
print(f"Normal eq weights: {model_normal.weights}")  # ~[3.0, -2.0, 5.0]
print(f"Normal eq bias: {model_normal.bias:.4f}")  # ~1.5
Tip: use np.linalg.solve(A, b) instead of np.linalg.inv(A) @ b. The solve approach is numerically more stable and faster. Interviewers notice this.

Ridge Regression (L2 Regularization)
Ridge adds an L2 penalty term alpha * ||w||^2 to the loss. This prevents overfitting by shrinking weights toward zero.
class RidgeRegression:
    """Ridge regression (L2-penalized least squares) from scratch.

    Loss: (1/2n) * sum((Xw + b - y)^2) + (alpha/2) * ||w||^2.
    The bias term is deliberately excluded from the penalty.
    """

    def __init__(self, alpha=1.0, learning_rate=0.01, n_iterations=1000):
        self.alpha = alpha
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Gradient descent on the L2-penalized loss; returns self."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_history = []
        for _ in range(self.n_iter):
            residual = X @ self.weights + self.bias - y
            data_loss = 0.5 * np.mean(residual ** 2)
            penalty = 0.5 * self.alpha * np.sum(self.weights ** 2)
            self.loss_history.append(data_loss + penalty)
            # L2 contributes alpha * w to the weight gradient; the bias
            # gradient is the plain MSE term.
            grad_w = X.T @ residual / n_samples + self.alpha * self.weights
            grad_b = residual.mean()
            self.weights = self.weights - self.lr * grad_w
            self.bias = self.bias - self.lr * grad_b
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias

    @staticmethod
    def fit_closed_form(X, y, alpha=1.0):
        """Ridge closed form: w = (X.T X + alpha*I)^(-1) X.T y.

        Returns (bias, weights). The bias entry of the penalty matrix is
        zeroed so the intercept is not regularized.
        """
        n_samples, n_features = X.shape
        design = np.column_stack([np.ones(n_samples), X])
        penalty = alpha * np.eye(n_features + 1)
        penalty[0, 0] = 0.0  # don't regularize the bias
        solution = np.linalg.solve(design.T @ design + penalty, design.T @ y)
        return solution[0], solution[1:]
Lasso Regression (L1 Regularization)
Lasso uses L1 penalty alpha * ||w||_1 which encourages sparsity (some weights become exactly zero). The L1 gradient is not smooth, so we use the subgradient.
class LassoRegression:
    """Lasso (L1-penalized) regression fitted by cyclic coordinate descent.

    The L1 penalty alpha * ||w||_1 is non-smooth, so each coordinate is
    updated in closed form with the soft-thresholding operator, which can
    drive individual weights to exactly zero (sparsity).
    """

    def __init__(self, alpha=1.0, n_iterations=1000, tol=1e-6):
        self.alpha = alpha
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None
        self.bias = None

    def _soft_threshold(self, x, threshold):
        """Proximal operator of the L1 norm: shrink x toward 0 by threshold."""
        return np.sign(x) * np.maximum(np.abs(x) - threshold, 0)

    def fit(self, X, y):
        """Coordinate descent until the max weight change drops below tol."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = np.mean(y)
        for _ in range(self.n_iter):
            previous = self.weights.copy()
            for j in range(n_features):
                # Residual with feature j's own contribution removed.
                partial = y - self.bias - X @ self.weights + X[:, j] * self.weights[j]
                # Correlation of feature j with that partial residual.
                rho = X[:, j] @ partial / n_samples
                scale = np.sum(X[:, j] ** 2) / n_samples
                self.weights[j] = self._soft_threshold(rho, self.alpha) / scale
            # Re-center the intercept (it is never penalized).
            self.bias = np.mean(y - X @ self.weights)
            if np.max(np.abs(self.weights - previous)) < self.tol:
                break
        return self

    def predict(self, X):
        """Predict targets for the rows of X."""
        return X @ self.weights + self.bias
# ---- Test Lasso sparsity ----
# Ten features, but only the first three carry signal; Lasso should
# zero out most of the rest.
np.random.seed(42)
X_sparse = np.random.randn(100, 10)
true_w_sparse = np.array([5, -3, 2, 0, 0, 0, 0, 0, 0, 0], dtype=float)
y_sparse = X_sparse @ true_w_sparse + np.random.randn(100) * 0.5
lasso = LassoRegression(alpha=0.1).fit(X_sparse, y_sparse)
print(f"Lasso weights: {np.round(lasso.weights, 3)}")
# Many weights should be exactly 0, showing L1 sparsity
Polynomial Features
Polynomial regression fits non-linear relationships by creating polynomial features, then applying linear regression on the expanded feature set.
def polynomial_features(X, degree=2):
    """Expand X with per-feature powers and pairwise interaction terms.

    Column order for X with features [x1, x2] and degree=2:
        [x1, x2, x1^2, x2^2, x1*x2]
    i.e. every feature raised to each power 2..degree comes first, then
    the pairwise products. Note: only degree-2 (pairwise) interactions are
    generated even when degree > 2 — higher-order cross terms are
    intentionally omitted for simplicity.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    degree : int, highest per-feature power to generate (default 2)

    Returns
    -------
    ndarray of shape (n_samples, expanded_n_features)
    """
    n_features = X.shape[1]
    features = [X]  # original features always come first
    # Per-feature powers x^2 .. x^degree.
    for d in range(2, degree + 1):
        features.append(X ** d)
    # Pairwise interaction terms x_i * x_j for i < j.
    if degree >= 2 and n_features > 1:
        interactions = [
            (X[:, i] * X[:, j]).reshape(-1, 1)
            for i in range(n_features)
            for j in range(i + 1, n_features)
        ]
        features.append(np.hstack(interactions))
    return np.hstack(features)
# ---- Full pipeline: polynomial + linear regression ----
def polynomial_regression(X, y, degree=2, lr=0.01, n_iter=2000):
    """Polynomial-feature expansion + standardization + gradient descent.

    Returns (fitted_model, feature_means, feature_stds). Reuse the means
    and stds to scale any new data before calling the model's predict.
    """
    X_poly = polynomial_features(X, degree)
    # Standardize: gradient descent converges poorly on unscaled
    # polynomial features.
    mean = X_poly.mean(axis=0)
    std = X_poly.std(axis=0)
    std[std == 0] = 1  # constant columns: avoid division by zero
    model = LinearRegressionGD(learning_rate=lr, n_iterations=n_iter)
    model.fit((X_poly - mean) / std, y)
    return model, mean, std
# ---- Test with non-linear data ----
# Quadratic ground truth: y = 2x^2 - 3x + 1 plus noise.
np.random.seed(42)
X_nl = np.random.uniform(-3, 3, (100, 1))
y_nl = 2 * X_nl[:, 0]**2 - 3 * X_nl[:, 0] + 1 + np.random.randn(100) * 0.5
model_poly, mean, std = polynomial_regression(X_nl, y_nl, degree=2)
print(f"Polynomial regression R^2: {model_poly.score((polynomial_features(X_nl, 2) - mean) / std, y_nl):.4f}")
When predicting on new data, reuse the training statistics rather than recomputing them: X_scaled = (X - mean) / std.

Key Takeaways
- Linear regression gradient: dw = (1/n) * X.T @ (Xw + b - y)
- Normal equation: w = (X.T X)^(-1) X.T y — use np.linalg.solve, not np.linalg.inv
- Ridge adds alpha * w to the weight gradient (L2 penalty)
- Lasso uses coordinate descent with soft thresholding (L1 penalty encourages sparsity)
- Always scale features before gradient descent, especially with polynomial features
- Track loss history and implement early stopping for convergence
Lilly Tech Systems