Intermediate
Logistic Regression
Logistic regression is the most frequently asked "implement from scratch" algorithm in ML interviews. It combines the sigmoid function, cross-entropy loss, and gradient descent into a clean classification framework. We implement binary and multi-class versions with numerical stability.
Binary Logistic Regression
For binary classification, logistic regression applies the sigmoid function to the linear output to produce a probability between 0 and 1.
# Sigmoid function: sigma(z) = 1 / (1 + exp(-z))
# Maps any real number to (0, 1)
#
# Binary Cross-Entropy Loss:
# L = -(1/n) * sum(y * log(p) + (1-y) * log(1-p))
#
# Gradient (identical structure to linear regression!):
# dL/dw = (1/n) * X.T @ (sigmoid(Xw + b) - y)
# dL/db = (1/n) * sum(sigmoid(Xw + b) - y)
import numpy as np
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : step size for each gradient update.
    n_iterations : maximum number of gradient-descent steps.
    tol : early-stopping threshold — training halts once the loss
        improves by less than this amount between iterations.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, tol=1e-6):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None     # (n_features,) after fit
        self.bias = None        # scalar after fit
        self.loss_history = []  # BCE loss recorded each iteration

    def _sigmoid(self, z):
        """Numerically stable sigmoid: clip z so exp() cannot overflow."""
        safe_z = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-safe_z))

    def fit(self, X, y):
        """Fit the model to features X and binary labels y (0/1).

        Returns self to allow fluent chaining.
        """
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.loss_history = []
        for step in range(self.n_iter):
            # Forward pass: predicted probability of class 1.
            probs = self._sigmoid(X @ self.weights + self.bias)
            # Binary cross-entropy; clip probabilities so log(0) never occurs.
            p_safe = np.clip(probs, 1e-15, 1 - 1e-15)
            bce = -np.mean(y * np.log(p_safe) + (1 - y) * np.log(1 - p_safe))
            self.loss_history.append(bce)
            # Stop early once the loss has effectively plateaued.
            if step and abs(self.loss_history[-2] - bce) < self.tol:
                break
            # Gradient of BCE — same form as linear regression's gradient.
            residual = probs - y                          # (n_samples,)
            self.weights -= self.lr * (X.T @ residual) / n_samples
            self.bias -= self.lr * np.mean(residual)
        return self

    def predict_proba(self, X):
        """Return probability of class 1."""
        return self._sigmoid(X @ self.weights + self.bias)

    def predict(self, X, threshold=0.5):
        """Return class predictions (0 or 1)."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def accuracy(self, X, y):
        """Fraction of samples whose predicted class matches y."""
        return np.mean(self.predict(X) == y)
# ---- Test it ----
np.random.seed(42)
# Two Gaussian clusters, one per class, drawn in the same order as before.
cluster_pos = np.random.randn(100, 2) + np.array([2, 2])
cluster_neg = np.random.randn(100, 2) + np.array([-2, -2])
X = np.vstack([cluster_pos, cluster_neg])
y = np.array([0] * 100 + [1] * 100)
# Shuffle so the classes are interleaved before training.
order = np.random.permutation(200)
X, y = X[order], y[order]
model = LogisticRegression(learning_rate=0.1, n_iterations=500)
model.fit(X, y)
print(f"Accuracy: {model.accuracy(X, y):.4f}")  # ~1.0
print(f"Weights: {model.weights}")
print(f"Final loss: {model.loss_history[-1]:.6f}")
Critical numerical stability: Always clip probabilities before taking log:
np.clip(probs, 1e-15, 1 - 1e-15). Without clipping, log(0) = -inf will destroy your loss computation. Also clip the sigmoid input to avoid exp() overflow. These two lines separate a passing interview from a failing one.

Multi-Class: Softmax Regression
For K classes, we use softmax to convert K raw scores into a probability distribution, then minimize cross-entropy loss.
class SoftmaxRegression:
    """Multi-class logistic regression using softmax + cross-entropy.

    Parameters
    ----------
    learning_rate : step size for each gradient update.
    n_iterations : maximum number of gradient-descent steps.
    tol : early-stopping threshold on loss improvement (added for
        consistency with the binary LogisticRegression class).

    Labels may be any sortable values (not just contiguous 0..K-1);
    they are mapped to internal indices during fit and mapped back in
    predict.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, tol=1e-6):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.tol = tol
        self.weights = None      # (n_features, n_classes)
        self.bias = None         # (n_classes,)
        self.classes_ = None     # sorted original label values
        self.loss_history = []

    def _softmax(self, z):
        """Numerically stable softmax.

        z shape: (n_samples, n_classes)
        """
        # Subtract the row-wise max for numerical stability
        # (log-sum-exp trick); softmax is invariant to this shift.
        z_shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z_shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _one_hot(self, y, n_classes):
        """Convert class indices (0..n_classes-1) to one-hot encoding."""
        one_hot = np.zeros((len(y), n_classes))
        one_hot[np.arange(len(y)), y] = 1
        return one_hot

    def fit(self, X, y):
        """Fit the model to features X and class labels y. Returns self."""
        n_samples, n_features = X.shape
        # Map arbitrary labels to contiguous indices 0..K-1 so the
        # one-hot encoding cannot go out of bounds for labels like [0, 2, 5].
        self.classes_, y_idx = np.unique(y, return_inverse=True)
        n_classes = len(self.classes_)
        # Initialize parameters
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)
        self.loss_history = []
        # One-hot encode labels
        y_onehot = self._one_hot(y_idx, n_classes)
        for i in range(self.n_iter):
            # Forward pass
            z = X @ self.weights + self.bias       # (n_samples, n_classes)
            probs = self._softmax(z)               # (n_samples, n_classes)
            # Cross-entropy loss; clip to avoid log(0).
            log_probs = np.log(np.clip(probs, 1e-15, 1.0))
            loss = -np.mean(np.sum(y_onehot * log_probs, axis=1))
            self.loss_history.append(loss)
            # Early stop once the loss has effectively plateaued
            # (mirrors the convergence check in LogisticRegression).
            if i > 0 and abs(self.loss_history[-2] - loss) < self.tol:
                break
            # Gradient has the same elegant form as the binary case:
            # (probs - one_hot).
            error = probs - y_onehot               # (n_samples, n_classes)
            dw = (X.T @ error) / n_samples         # (n_features, n_classes)
            db = np.mean(error, axis=0)            # (n_classes,)
            # Update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
        return self

    def predict_proba(self, X):
        """Return the (n_samples, n_classes) probability matrix."""
        z = X @ self.weights + self.bias
        return self._softmax(z)

    def predict(self, X):
        """Return predicted labels, mapped back to the original values."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def accuracy(self, X, y):
        """Fraction of samples whose predicted label matches y."""
        return np.mean(self.predict(X) == y)
# ---- Test with 3 classes ----
np.random.seed(42)
# Three Gaussian clusters, drawn in the same order as before.
blob_a = np.random.randn(80, 2) + np.array([3, 0])
blob_b = np.random.randn(80, 2) + np.array([-3, 0])
blob_c = np.random.randn(80, 2) + np.array([0, 4])
X_multi = np.vstack([blob_a, blob_b, blob_c])
y_multi = np.array([0] * 80 + [1] * 80 + [2] * 80)
# Shuffle so the classes are interleaved before training.
shuffle = np.random.permutation(240)
X_multi, y_multi = X_multi[shuffle], y_multi[shuffle]
model_sm = SoftmaxRegression(learning_rate=0.1, n_iterations=500)
model_sm.fit(X_multi, y_multi)
print(f"Multi-class accuracy: {model_sm.accuracy(X_multi, y_multi):.4f}")
print(f"Weight matrix shape: {model_sm.weights.shape}")  # (2, 3)
print(f"Final loss: {model_sm.loss_history[-1]:.4f}")
Interview insight: The gradient of softmax cross-entropy has the same elegant form as binary logistic regression:
error = probs - y_onehot. This is not a coincidence — the binary case is a special case of softmax with 2 classes. Mentioning this connection in an interview demonstrates deep understanding.

Decision Boundary Visualization
Understanding decision boundaries helps verify your implementation is correct. Here is how to compute the decision boundary analytically for binary logistic regression:
# For 2D binary logistic regression, the decision boundary is:
# w1*x1 + w2*x2 + b = 0
# Solving for x2: x2 = -(w1*x1 + b) / w2
def get_decision_boundary(model, x1_range):
    """Compute the decision-boundary line for 2D binary logistic regression.

    The boundary is the set of points where w1*x1 + w2*x2 + b = 0.
    Returns a pair of arrays (x1 coords, x2 coords) suitable for plotting.
    """
    w1, w2 = model.weights
    intercept = model.bias
    if abs(w2) >= 1e-10:
        # General case: solve the boundary equation for x2 as a function of x1.
        return x1_range, -(w1 * x1_range + intercept) / w2
    # Degenerate case (w2 ~ 0): the boundary is the vertical line x1 = -b/w1.
    return np.full_like(x1_range, -intercept / w1), x1_range
# Usage:
# x1_vals = np.linspace(-5, 5, 100)
# x1_line, x2_line = get_decision_boundary(model, x1_vals)
# plt.plot(x1_line, x2_line, 'k--', label='Decision Boundary')
Key Takeaways
- Sigmoid: sigma(z) = 1 / (1 + exp(-z)) — always clip the input to avoid overflow
- Binary cross-entropy: -mean(y*log(p) + (1-y)*log(1-p)) — always clip probabilities
- The gradient is elegantly simple: dw = X.T @ (probs - y) / n
- Softmax: exp(z - max(z)) / sum(exp(z - max(z))) — subtract the max for stability
- The multi-class gradient has the same form: dw = X.T @ (probs - y_onehot) / n
- Numerical stability (clipping, log-sum-exp trick) is the #1 thing interviewers check
Lilly Tech Systems