Intermediate

Logistic Regression

Logistic regression is the most frequently asked "implement from scratch" algorithm in ML interviews. It combines the sigmoid function, cross-entropy loss, and gradient descent into a clean classification framework. We implement binary and multi-class versions with numerical stability.

Binary Logistic Regression

For binary classification, logistic regression applies the sigmoid function to the linear output to produce a probability between 0 and 1.

# Sigmoid function: sigma(z) = 1 / (1 + exp(-z))
# Maps any real number to (0, 1)
#
# Binary Cross-Entropy Loss:
# L = -(1/n) * sum(y * log(p) + (1-y) * log(1-p))
#
# Gradient (identical structure to linear regression!):
# dL/dw = (1/n) * X.T @ (sigmoid(Xw + b) - y)
# dL/db = (1/n) * sum(sigmoid(Xw + b) - y)
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Minimizes binary cross-entropy through the sigmoid link. Training
    stops early once the per-iteration loss improvement drops below
    ``tol``.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, tol=1e-6):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None        # (n_features,) after fit()
        self.bias = None           # scalar after fit()
        self.loss_history = []     # training loss per iteration

    def _sigmoid(self, z):
        """Logistic function with overflow protection on exp()."""
        # |z| > 500 saturates sigmoid anyway; clipping avoids exp overflow.
        safe_z = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-safe_z))

    def fit(self, X, y):
        """Fit on X of shape (n_samples, n_features) and labels y in {0, 1}."""
        num_rows, num_cols = X.shape
        self.weights = np.zeros(num_cols)
        self.bias = 0.0
        self.loss_history = []

        prev_loss = None
        for _ in range(self.n_iter):
            # Forward pass: linear score -> probability of class 1.
            probs = self._sigmoid(X @ self.weights + self.bias)

            # Binary cross-entropy; clipping keeps log() away from 0.
            safe_probs = np.clip(probs, 1e-15, 1 - 1e-15)
            loss = -np.mean(y * np.log(safe_probs)
                            + (1 - y) * np.log(1 - safe_probs))
            self.loss_history.append(loss)

            # Stop once the loss has effectively plateaued.
            if prev_loss is not None and abs(prev_loss - loss) < self.tol:
                break
            prev_loss = loss

            # Backward pass: BCE gradient w.r.t. weights and bias.
            residual = probs - y                    # (n_samples,)
            grad_w = (X.T @ residual) / num_rows    # (n_features,)
            grad_b = np.mean(residual)              # scalar

            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

        return self

    def predict_proba(self, X):
        """Probability of the positive class for each row of X."""
        return self._sigmoid(X @ self.weights + self.bias)

    def predict(self, X, threshold=0.5):
        """Hard 0/1 labels: 1 wherever P(class 1) >= threshold."""
        return np.where(self.predict_proba(X) >= threshold, 1, 0)

    def accuracy(self, X, y):
        """Fraction of samples whose predicted label matches y."""
        return np.mean(self.predict(X) == y)


# ---- Test it ----
np.random.seed(42)
class0_pts = np.random.randn(100, 2) + np.array([2, 2])    # cluster near (2, 2)  -> label 0
class1_pts = np.random.randn(100, 2) + np.array([-2, -2])  # cluster near (-2, -2) -> label 1
X = np.vstack([class0_pts, class1_pts])
y = np.array([0] * 100 + [1] * 100)

# Shuffle rows so the two classes are interleaved.
perm = np.random.permutation(200)
X, y = X[perm], y[perm]

model = LogisticRegression(learning_rate=0.1, n_iterations=500)
model.fit(X, y)

# Clusters are far apart, so accuracy should land near 1.0.
print(f"Accuracy: {model.accuracy(X, y):.4f}")
print(f"Weights: {model.weights}")
print(f"Final loss: {model.loss_history[-1]:.6f}")
Critical for numerical stability: always clip probabilities before taking the log: np.clip(probs, 1e-15, 1 - 1e-15). Without clipping, log(0) = -inf will destroy your loss computation. Also clip the sigmoid input to avoid exp() overflow. These two lines separate a passing interview from a failing one.

Multi-Class: Softmax Regression

For K classes, we use softmax to convert K raw scores into a probability distribution, then minimize cross-entropy loss.

class SoftmaxRegression:
    """Multi-class Logistic Regression using Softmax.

    Generalization over the original version: labels may be arbitrary
    values (e.g. [3, 7, 9] or strings), not only contiguous ints 0..K-1.
    The fitted label set is stored in ``classes_`` and ``predict`` returns
    labels in the caller's original label space. For labels that already
    are 0..K-1 the behavior is unchanged.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None   # (n_features, n_classes)
        self.bias = None      # (n_classes,)
        self.classes_ = None  # sorted unique labels seen in fit()
        self.loss_history = []

    def _softmax(self, z):
        """Numerically stable softmax.
        z shape: (n_samples, n_classes)

        Subtracting the row max before exp() (log-sum-exp trick) prevents
        overflow without changing the result, since softmax is
        shift-invariant.
        """
        z_shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z_shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _one_hot(self, y_idx, n_classes):
        """Convert integer class indices (0..n_classes-1) to one-hot rows."""
        one_hot = np.zeros((len(y_idx), n_classes))
        one_hot[np.arange(len(y_idx)), y_idx] = 1
        return one_hot

    def fit(self, X, y):
        """Train on X (n_samples, n_features) and class labels y."""
        n_samples, n_features = X.shape

        # Map arbitrary labels to indices 0..K-1. The previous version
        # assumed labels were already 0..K-1 and would mis-index (or raise)
        # for any other label set.
        self.classes_, y_idx = np.unique(np.asarray(y), return_inverse=True)
        n_classes = len(self.classes_)

        # Initialize parameters
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)
        self.loss_history = []

        # One-hot encode the index-mapped labels
        y_onehot = self._one_hot(y_idx, n_classes)

        for i in range(self.n_iter):
            # Forward pass: raw scores -> per-class probabilities
            z = X @ self.weights + self.bias        # (n_samples, n_classes)
            probs = self._softmax(z)                # (n_samples, n_classes)

            # Cross-entropy loss; clip keeps log() finite at probs ~ 0
            log_probs = np.log(np.clip(probs, 1e-15, 1.0))
            loss = -np.mean(np.sum(y_onehot * log_probs, axis=1))
            self.loss_history.append(loss)

            # Gradient of softmax cross-entropy has the same elegant
            # form as the binary case: (probs - one_hot)
            error = probs - y_onehot                 # (n_samples, n_classes)
            dw = (X.T @ error) / n_samples           # (n_features, n_classes)
            db = np.mean(error, axis=0)              # (n_classes,)

            # Update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X):
        """Class probability distribution, shape (n_samples, n_classes)."""
        z = X @ self.weights + self.bias
        return self._softmax(z)

    def predict(self, X):
        """Predicted labels, reported in the original label space of fit()."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def accuracy(self, X, y):
        """Fraction of samples whose predicted label matches y."""
        return np.mean(self.predict(X) == y)


# ---- Test with 3 classes ----
np.random.seed(42)
blob_right = np.random.randn(80, 2) + np.array([3, 0])   # class 0
blob_left = np.random.randn(80, 2) + np.array([-3, 0])   # class 1
blob_top = np.random.randn(80, 2) + np.array([0, 4])     # class 2
X_multi = np.vstack([blob_right, blob_left, blob_top])
y_multi = np.array([0] * 80 + [1] * 80 + [2] * 80)

# Interleave the three classes before training.
shuffle = np.random.permutation(240)
X_multi, y_multi = X_multi[shuffle], y_multi[shuffle]

model_sm = SoftmaxRegression(learning_rate=0.1, n_iterations=500)
model_sm.fit(X_multi, y_multi)

print(f"Multi-class accuracy: {model_sm.accuracy(X_multi, y_multi):.4f}")
print(f"Weight matrix shape: {model_sm.weights.shape}")  # (2, 3)
print(f"Final loss: {model_sm.loss_history[-1]:.4f}")
💡
Interview insight: The gradient of softmax cross-entropy has the same elegant form as binary logistic regression: error = probs - y_onehot. This is not a coincidence — the binary case is a special case of softmax with 2 classes. Mentioning this connection in an interview demonstrates deep understanding.

Decision Boundary Visualization

Understanding decision boundaries helps verify your implementation is correct. Here is how to compute the decision boundary analytically for binary logistic regression:

# For 2D binary logistic regression, the decision boundary is:
# w1*x1 + w2*x2 + b = 0
# Solving for x2: x2 = -(w1*x1 + b) / w2

def get_decision_boundary(model, x1_range):
    """Compute the decision boundary line for 2D binary logistic regression.

    The boundary is the set of points where w1*x1 + w2*x2 + b = 0, i.e.
    where the predicted probability equals 0.5.

    Parameters:
        model: fitted binary model exposing ``weights`` (length 2) and ``bias``.
        x1_range: array-like of x1 values to evaluate the boundary at.

    Returns:
        (x1_line, x2_line): float arrays suitable for plotting.

    Raises:
        ValueError: if both weights are numerically zero (no boundary exists).
    """
    w1, w2 = model.weights
    b = model.bias
    # Work in float so integer x1_range inputs can't truncate results.
    x1_range = np.asarray(x1_range, dtype=float)

    if abs(w2) < 1e-10:
        if abs(w1) < 1e-10:
            # Both weights ~0: the model is constant, no boundary to draw.
            raise ValueError("Both weights are ~0; decision boundary is undefined.")
        # w2 ~ 0: boundary is the vertical line x1 = -b / w1.
        # np.full (float) instead of np.full_like, which would truncate
        # the boundary value for an integer-dtype x1_range.
        x1_boundary = -b / w1
        return np.full(x1_range.shape, x1_boundary), x1_range
    else:
        x2_boundary = -(w1 * x1_range + b) / w2
        return x1_range, x2_boundary

# Usage:
# x1_vals = np.linspace(-5, 5, 100)
# x1_line, x2_line = get_decision_boundary(model, x1_vals)
# plt.plot(x1_line, x2_line, 'k--', label='Decision Boundary')

Key Takeaways

💡
  • Sigmoid: sigma(z) = 1 / (1 + exp(-z)) — always clip input to avoid overflow
  • Binary cross-entropy: -mean(y*log(p) + (1-y)*log(1-p)) — always clip probabilities
  • Gradient is elegantly simple: dw = X.T @ (probs - y) / n
  • Softmax: exp(z - max(z)) / sum(exp(z - max(z))) — subtract max for stability
  • Multi-class gradient has the same form: dw = X.T @ (probs - y_onehot) / n
  • Numerical stability (clipping, log-sum-exp trick) is the #1 thing interviewers check