Optimization Algorithms
Every machine learning model is trained by solving an optimization problem: minimize a loss function with respect to model parameters. This lesson implements six optimization algorithms from scratch, progressing from basic gradient descent to the advanced methods used in modern deep learning.
Problem 1: Gradient Descent Variants
Problem: Implement batch gradient descent and gradient descent with classical momentum. (The same updates become stochastic or mini-batch SGD when the gradient callback samples a subset of the training data.) These are the workhorses of ML training.
import math
import random
def gradient_descent(f, grad_f, x0, lr=0.01, max_iter=1000, tol=1e-8):
    """
    Minimize f with plain (batch) gradient descent.

    Update rule: x_{t+1} = x_t - lr * grad_f(x_t).

    Args:
        f: objective, list[float] -> float.
        grad_f: gradient of f, list[float] -> list[float].
        x0: starting point (left unmodified).
        lr: step size.
        max_iter: iteration cap.
        tol: stop once ||grad|| falls below this.

    Returns:
        (x, history): final iterate and loss trace (initial loss first,
        then one entry per iteration taken).

    Time: O(max_iter * cost(grad_f)), Space: O(n)
    """
    x = list(x0)
    history = [f(x)]
    for t in range(max_iter):
        g = grad_f(x)
        # Take one descent step (builds a fresh point; values identical
        # to an in-place update).
        x = [xi - lr * gi for xi, gi in zip(x, g)]
        loss = f(x)
        history.append(loss)
        # Stop when the gradient at the previous point was already tiny.
        if math.sqrt(sum(gi * gi for gi in g)) < tol:
            print(f"Converged at iteration {t+1}, loss={loss:.8f}")
            break
    return x, history
def sgd_with_momentum(f, grad_f, x0, lr=0.01, momentum=0.9,
                      max_iter=1000, tol=1e-8):
    """
    Gradient descent with classical (heavy-ball) momentum.

    v_t = momentum * v_{t-1} + grad_f(x_t)
    x_{t+1} = x_t - lr * v_t

    The velocity accumulates past gradients, damping oscillation and
    accelerating progress along consistent descent directions. Note this
    uses whatever gradient `grad_f` returns; it is stochastic only when
    `grad_f` itself samples a minibatch.

    Returns:
        (x, history): final iterate and loss trace.
    """
    x = list(x0)
    dim = len(x)
    velocity = [0.0] * dim
    history = [f(x)]
    for step in range(max_iter):
        g = grad_f(x)
        for j in range(dim):
            velocity[j] = momentum * velocity[j] + g[j]
            x[j] -= lr * velocity[j]
        loss = f(x)
        history.append(loss)
        # Convergence test on the gradient evaluated before this step.
        if math.sqrt(sum(c * c for c in g)) < tol:
            print(f"Converged at iteration {step+1}, loss={loss:.8f}")
            break
    return x, history
# Test: minimize Rosenbrock function f(x,y) = (1-x)^2 + 100*(y-x^2)^2
# Minimum at (1, 1)
def rosenbrock(x):
    """Rosenbrock banana function f(x, y) = (1-x)^2 + 100(y-x^2)^2; min 0 at (1, 1)."""
    a = 1 - x[0]
    b = x[1] - x[0] ** 2
    return a ** 2 + 100 * b ** 2

def grad_rosenbrock(x):
    """Analytic gradient of the Rosenbrock function."""
    b = x[1] - x[0] ** 2
    return [-2 * (1 - x[0]) - 400 * x[0] * b,
            200 * b]
# Compare plain GD against momentum on the Rosenbrock function (min at (1, 1)).
start = [-1.0, 1.0]
x_opt, hist = gradient_descent(rosenbrock, grad_rosenbrock, start,
                               lr=0.001, max_iter=50000)
print(f"GD result: x={x_opt[0]:.4f}, y={x_opt[1]:.4f}")
print(f"GD loss: {rosenbrock(x_opt):.8f}")
x_opt_m, hist_m = sgd_with_momentum(rosenbrock, grad_rosenbrock, start,
                                    lr=0.001, momentum=0.9, max_iter=50000)
print(f"Momentum result: x={x_opt_m[0]:.4f}, y={x_opt_m[1]:.4f}")
Problem 2: Newton's Method
Problem: Implement Newton's method for optimization, which uses second-order information (the Hessian) to converge much faster than gradient descent near the optimum. The update rule is x_{t+1} = x_t - H^(-1) * grad_f(x_t).
def newtons_method(f, grad_f, hessian_f, x0, max_iter=100, tol=1e-10):
    """
    Newton's method for unconstrained minimization.

    Update: x_{t+1} = x_t - H(x_t)^{-1} * grad(x_t), computed by solving
    H * delta = -grad with Gaussian elimination (more stable than
    forming H^{-1} explicitly). Quadratic convergence near the optimum.

    Args:
        f: objective (accepted for API symmetry with the other optimizers;
           not used by the update itself).
        grad_f: gradient, list[float] -> list[float].
        hessian_f: Hessian, list[float] -> list[list[float]].
        x0: starting point (left unmodified).
        max_iter: cap on Newton steps.
        tol: stop when ||grad|| < tol.

    Returns:
        The final iterate x.

    Time: O(max_iter * n^3) for the linear solves, Space: O(n^2)

    Fix vs. the original: convergence was checked AFTER taking a step,
    using the pre-step gradient. Once the optimum was reached that cost
    one extra O(n^3) solve with a ~zero right-hand side and reported the
    iteration count one too high. The check now runs before the solve.
    """
    x = x0[:]
    n = len(x)
    for t in range(max_iter):
        g = grad_f(x)
        grad_norm = math.sqrt(sum(gi ** 2 for gi in g))
        if grad_norm < tol:
            # t equals the number of Newton steps actually taken.
            print(f"Newton converged at iteration {t}")
            break
        H = hessian_f(x)
        # Solve H * delta = -g rather than inverting H.
        delta = solve_linear_system(H, [-gi for gi in g])
        for i in range(n):
            x[i] += delta[i]
    return x
def solve_linear_system(A, b):
    """
    Solve the square system A x = b by Gaussian elimination with
    partial pivoting.

    Args:
        A: n x n coefficient matrix (left unmodified).
        b: right-hand side of length n (left unmodified).

    Returns:
        Solution vector x of length n.

    Raises:
        ValueError: if a pivot is numerically zero (singular matrix).

    Time: O(n^3), Space: O(n^2) for the augmented copy.
    """
    n = len(A)
    # Work on an augmented copy [A | b] so the caller's data stays intact.
    aug = [list(row) + [rhs] for row, rhs in zip(A, b)]
    # Forward elimination.
    for col in range(n):
        # Partial pivoting: move the largest-magnitude entry of this
        # column (first one on ties, as with a strict > scan) onto the
        # diagonal for numerical stability.
        pivot_row = max(range(col, n), key=lambda r: abs(aug[r][col]))
        aug[col], aug[pivot_row] = aug[pivot_row], aug[col]
        pivot = aug[col][col]
        if abs(pivot) < 1e-15:
            raise ValueError("Singular system")
        # Zero out the entries below the pivot.
        for row in range(col + 1, n):
            factor = aug[row][col] / pivot
            for j in range(col, n + 1):
                aug[row][j] -= factor * aug[col][j]
    # Back substitution on the resulting upper-triangular system.
    x = [0.0] * n
    for i in reversed(range(n)):
        acc = aug[i][n]
        for j in range(i + 1, n):
            acc -= aug[i][j] * x[j]
        x[i] = acc / aug[i][i]
    return x
# Test: minimize f(x, y) = (x-3)^2 + (y-5)^2
# This is quadratic, so Newton converges in 1 step
def f_quad(x):
    """Convex quadratic with minimum 0 at (3, 5)."""
    dx = x[0] - 3
    dy = x[1] - 5
    return dx ** 2 + dy ** 2

def grad_quad(x):
    """Gradient of f_quad."""
    return [2 * (x[0] - 3), 2 * (x[1] - 5)]

def hessian_quad(x):
    """Constant Hessian of f_quad: 2 * identity."""
    return [[2, 0], [0, 2]]
# On a quadratic the first Newton step lands exactly on the minimizer
# [3.0000, 5.0000], since the quadratic model is exact.
result = newtons_method(f_quad, grad_quad, hessian_quad, [0.0, 0.0])
print(f"Newton result: [{result[0]:.4f}, {result[1]:.4f}]")
Problem 3: Adam Optimizer
Problem: Implement the Adam (Adaptive Moment Estimation) optimizer, the default optimizer for deep learning. Adam combines momentum (first moment) with RMSProp (second moment) and includes bias correction.
def adam(f, grad_f, x0, lr=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-8, max_iter=10000, tol=1e-8):
    """
    Adam optimizer (Kingma & Ba, 2015): momentum plus per-coordinate
    adaptive step sizes.

    m_t = beta1 * m_{t-1} + (1-beta1) * g_t          (first moment)
    v_t = beta2 * v_{t-1} + (1-beta2) * g_t^2        (second moment)
    m_hat = m_t / (1 - beta1^t)                      (bias correction)
    v_hat = v_t / (1 - beta2^t)                      (bias correction)
    x_{t+1} = x_t - lr * m_hat / (sqrt(v_hat) + epsilon)

    Returns:
        (x, history): final iterate and loss trace.

    Time: O(max_iter * cost(grad_f)), Space: O(n)
    """
    x = list(x0)
    dim = len(x)
    first = [0.0] * dim    # EMA of gradients
    second = [0.0] * dim   # EMA of squared gradients
    history = [f(x)]
    for step in range(1, max_iter + 1):
        g = grad_f(x)
        # Bias-correction denominators depend only on the step count,
        # so hoist them out of the coordinate loop.
        c1 = 1 - beta1 ** step
        c2 = 1 - beta2 ** step
        for i in range(dim):
            first[i] = beta1 * first[i] + (1 - beta1) * g[i]
            second[i] = beta2 * second[i] + (1 - beta2) * g[i] ** 2
            x[i] -= lr * (first[i] / c1) / (math.sqrt(second[i] / c2) + epsilon)
        loss = f(x)
        history.append(loss)
        if math.sqrt(sum(gi ** 2 for gi in g)) < tol:
            print(f"Adam converged at iteration {step}, loss={loss:.8f}")
            break
    return x, history
# Adam on the same Rosenbrock problem; a larger lr than GD is usable
# because the per-coordinate scaling tempers the steep valley walls.
x_adam, hist_adam = adam(rosenbrock, grad_rosenbrock, [-1.0, 1.0],
                         lr=0.01, max_iter=20000)
print(f"Adam result: x={x_adam[0]:.4f}, y={x_adam[1]:.4f}")
print(f"Adam loss: {rosenbrock(x_adam):.8f}")
Problem 4: L-BFGS (Limited-Memory BFGS)
Problem: Implement a simplified L-BFGS optimizer. L-BFGS approximates the inverse Hessian using a limited history of gradient differences, giving quasi-Newton convergence without storing the full Hessian matrix.
def lbfgs(f, grad_f, x0, m=10, max_iter=200, tol=1e-8):
    """
    L-BFGS (Limited-memory BFGS) optimizer.
    Approximates inverse Hessian using last m gradient pairs.

    Each iteration: (1) two-loop recursion turns the stored (s, y) pairs
    into an implicit inverse-Hessian product, (2) backtracking Armijo line
    search along the resulting direction, (3) store the new curvature pair
    if it satisfies the curvature condition s^T y > 0.

    Args:
        f: objective, list[float] -> float.
        grad_f: gradient of f, list[float] -> list[float].
        x0: starting point (left unmodified).
        m: history length (number of curvature pairs kept).
        max_iter: iteration cap.
        tol: stop when ||grad|| < tol.

    Returns:
        The final iterate x.

    Time: O(max_iter * m * n), Space: O(m * n)
    """
    n = len(x0)
    x = x0[:]
    g = grad_f(x)
    # Storage for curvature pairs (oldest first, newest last)
    s_history = []  # s_k = x_{k+1} - x_k
    y_history = []  # y_k = g_{k+1} - g_k
    rho_history = []  # rho_k = 1 / (y_k^T s_k)
    for iteration in range(max_iter):
        grad_norm = math.sqrt(sum(gi**2 for gi in g))
        if grad_norm < tol:
            print(f"L-BFGS converged at iteration {iteration}")
            break
        # Two-loop recursion to compute search direction d = -H_k * g_k
        q = g[:]
        alphas = []
        # First loop (backward through history, newest pair first)
        for i in range(len(s_history) - 1, -1, -1):
            alpha = rho_history[i] * sum(
                s_history[i][j] * q[j] for j in range(n))
            # insert(0, ...) keeps alphas index-aligned with the histories
            # so the forward loop below can reuse alphas[i] directly.
            alphas.insert(0, alpha)
            for j in range(n):
                q[j] -= alpha * y_history[i][j]
        # Initial Hessian approximation: H_0 = gamma * I, with gamma the
        # standard Barzilai-Borwein-style scaling from the newest pair.
        if len(s_history) > 0:
            sy = sum(s_history[-1][j] * y_history[-1][j] for j in range(n))
            yy = sum(y_history[-1][j] * y_history[-1][j] for j in range(n))
            # Guard against division by a ~zero y^T y.
            gamma = sy / yy if yy > 1e-15 else 1.0
        else:
            gamma = 1.0
        r = [gamma * qi for qi in q]
        # Second loop (forward through history, oldest pair first)
        for i in range(len(s_history)):
            beta = rho_history[i] * sum(
                y_history[i][j] * r[j] for j in range(n))
            for j in range(n):
                r[j] += (alphas[i] - beta) * s_history[i][j]
        d = [-ri for ri in r]  # search direction (descent if H is SPD)
        # Line search (simple backtracking Armijo):
        # halve the step until f decreases by at least 1e-4 * step * d^T g.
        alpha_step = 1.0
        fx = f(x)
        dg = sum(d[j] * g[j] for j in range(n))  # directional derivative
        while f([x[j] + alpha_step * d[j] for j in range(n)]) > \
                fx + 1e-4 * alpha_step * dg:
            alpha_step *= 0.5
            if alpha_step < 1e-15:
                # Give up: accept the (tiny) step rather than loop forever.
                break
        # Update x
        x_new = [x[j] + alpha_step * d[j] for j in range(n)]
        g_new = grad_f(x_new)
        # Store curvature pair
        s = [x_new[j] - x[j] for j in range(n)]
        y = [g_new[j] - g[j] for j in range(n)]
        sy = sum(s[j] * y[j] for j in range(n))
        if sy > 1e-15:  # curvature condition keeps the approximation SPD
            if len(s_history) >= m:
                # Evict the oldest pair to honor the memory budget m.
                s_history.pop(0)
                y_history.pop(0)
                rho_history.pop(0)
            s_history.append(s)
            y_history.append(y)
            rho_history.append(1.0 / sy)
        x = x_new
        g = g_new
    return x
# L-BFGS on Rosenbrock: quasi-Newton curvature information lets it reach
# high precision in far fewer iterations than the first-order methods.
x_lbfgs = lbfgs(rosenbrock, grad_rosenbrock, [-1.0, 1.0])
print(f"L-BFGS result: x={x_lbfgs[0]:.6f}, y={x_lbfgs[1]:.6f}")
print(f"L-BFGS loss: {rosenbrock(x_lbfgs):.10f}")
Problem 5: Constrained Optimization (Lagrange Multipliers)
Problem: Implement constrained optimization using the penalty method. Solve: minimize f(x) subject to g(x) = 0. The penalty method converts the constrained problem to an unconstrained one: minimize f(x) + (rho/2) * g(x)^2.
def penalty_method(f, grad_f, g, grad_g, x0,
                   rho_init=1.0, rho_max=1e6, rho_mult=10,
                   max_outer=20, max_inner=1000, tol=1e-6):
    """
    Quadratic-penalty method for equality-constrained optimization.

    Solves:  minimize f(x)  subject to  g(x) = 0  (g may be vector-valued)
    by repeatedly minimizing the unconstrained surrogate
        P_rho(x) = f(x) + (rho/2) * ||g(x)||^2
    and increasing rho until the constraint violation drops below tol.

    Args:
        f, grad_f: objective and its gradient.
        g, grad_g: constraint vector and its Jacobian (one row per constraint).
        x0: starting point (left unmodified).
        rho_init, rho_max, rho_mult: penalty schedule rho <- min(rho*rho_mult, rho_max).
        max_outer: number of penalty stages.
        max_inner: gradient steps per stage.
        tol: target on ||g(x)||; inner solves stop at ||grad P|| < tol/10.

    Returns:
        The final (approximately feasible and optimal) iterate x.

    Bug fixed vs. the original: the inner loop used a FIXED step size
    lr=0.001. The penalty Hessian grows like O(rho), so once rho exceeded
    ~1e3 the fixed step made the inner gradient descent diverge (iterates
    blew up before the constraint could be satisfied). The inner loop now
    uses backtracking (Armijo) line search, which is stable for every rho.
    """
    x = x0[:]
    rho = rho_init
    for outer in range(max_outer):
        # Augmented objective and gradient, closing over the current rho.
        def augmented_f(x):
            gx = g(x)
            penalty = sum(gi**2 for gi in gx)
            return f(x) + (rho / 2) * penalty

        def augmented_grad(x):
            gf = grad_f(x)
            gx = g(x)
            gg = grad_g(x)  # Jacobian of g, shape (m x n)
            n = len(x)
            grad = gf[:]
            for i in range(n):
                for k in range(len(gx)):
                    grad[i] += rho * gx[k] * gg[k][i]
            return grad

        # Solve the unconstrained subproblem with backtracking gradient
        # descent; the step adapts automatically as rho stiffens the problem.
        for inner in range(max_inner):
            g_val = augmented_grad(x)
            gn2 = sum(gi**2 for gi in g_val)
            if math.sqrt(gn2) < tol * 0.1:
                break
            fx = augmented_f(x)
            step = 1.0
            accepted = False
            while step > 1e-16:
                x_try = [x[i] - step * g_val[i] for i in range(len(x))]
                # Armijo sufficient-decrease test.
                if augmented_f(x_try) <= fx - 1e-4 * step * gn2:
                    x = x_try
                    accepted = True
                    break
                step *= 0.5
            if not accepted:
                break  # no descent step found; subproblem is as solved as it gets
        # Check constraint violation
        constraint = g(x)
        violation = math.sqrt(sum(ci**2 for ci in constraint))
        print(f"Outer {outer+1}: x=[{x[0]:.4f}, {x[1]:.4f}], "
              f"f={f(x):.4f}, violation={violation:.6f}")
        if violation < tol:
            print(f"Constraint satisfied at iteration {outer+1}")
            break
        rho = min(rho * rho_mult, rho_max)
    return x
# Test: minimize x + y subject to x^2 + y^2 = 1
# Analytical solution: x = y = -1/sqrt(2) ≈ -0.7071
def f_obj(x):
    """Objective f(x, y) = x + y."""
    return x[0] + x[1]

def grad_obj(x):
    """Constant gradient of f_obj."""
    return [1.0, 1.0]

def g_constraint(x):
    """Equality constraint g(x, y) = x^2 + y^2 - 1 (the unit circle)."""
    return [x[0]**2 + x[1]**2 - 1]

def grad_constraint(x):
    """Jacobian of g_constraint: a single row [2x, 2y]."""
    return [[2*x[0], 2*x[1]]]
# Penalty method on the circle-constrained problem; the analytical
# optimum is x = y = -1/sqrt(2).
result = penalty_method(f_obj, grad_obj, g_constraint, grad_constraint,
                        [0.5, 0.5])
print(f"\nOptimum: [{result[0]:.4f}, {result[1]:.4f}]")
print(f"Expected: [{-1/math.sqrt(2):.4f}, {-1/math.sqrt(2):.4f}]")
Optimizer Comparison
| Optimizer | Order | Per-Step Cost | Convergence | Best For |
|---|---|---|---|---|
| GD | 1st | O(n) | Linear | Simple, convex problems |
| SGD + Momentum | 1st | O(n) | Linear | Large-scale training |
| Adam | 1st | O(n) | Linear (adaptive) | Deep learning (default) |
| Newton | 2nd | O(n^3) | Quadratic | Small, smooth problems |
| L-BFGS | Quasi-2nd | O(mn) | Super-linear | Medium-scale, smooth |
Key Takeaways
- Gradient descent is simple but slow. Momentum accelerates convergence by accumulating past gradients.
- Newton's method converges quadratically but costs O(n^3) per step. Impractical for large models.
- Adam combines momentum with adaptive learning rates. It is the default optimizer for deep learning.
- L-BFGS approximates the inverse Hessian using gradient history, giving quasi-Newton convergence without the cubic cost.
- The penalty method converts constrained problems to unconstrained ones by adding a penalty term.
Lilly Tech Systems