Implement Distributions
Five problems that implement probability distributions from their mathematical definitions. You will build the PDF, PMF, CDF, and sampling functions for the four most important distributions in statistics, plus two classical sampling methods used in every Monte Carlo system.
Problem 1: Normal (Gaussian) Distribution
Problem: Implement the normal distribution PDF and CDF (using the error function approximation). Do not use scipy.stats.norm or random.gauss. (Sampling from the normal distribution is covered in Problem 5.)
import math
import random
# ---- CHALLENGE ----
def normal_pdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    """Compute the PDF of N(mu, sigma^2) at point x.

    Args:
        x: point at which to evaluate the density.
        mu: mean of the distribution (default 0).
        sigma: standard deviation (default 1); must be positive.

    Returns:
        The density value f(x).
    """
    pass
def normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    """Compute the CDF of N(mu, sigma^2) at point x.

    Args:
        x: point at which to evaluate P(X <= x).
        mu: mean of the distribution (default 0).
        sigma: standard deviation (default 1); must be positive.

    Returns:
        The cumulative probability P(X <= x).
    """
    pass
# ---- SOLUTION ----
def normal_pdf(x, mu=0, sigma=1):
    """Probability density of N(mu, sigma^2) evaluated at x.

    Formula: f(x) = exp(-(x - mu)^2 / (2 sigma^2)) / (sigma * sqrt(2 pi)).
    The bell curve peaks at x = mu with height 1 / (sigma * sqrt(2 pi)).

    Raises:
        ValueError: if sigma is not strictly positive.
    """
    if sigma <= 0:
        raise ValueError("sigma must be positive")
    z = (x - mu) / sigma
    return math.exp(-0.5 * z * z) / (sigma * math.sqrt(2 * math.pi))
def _erf_approx(x):
"""Approximate the error function using Abramowitz & Stegun formula 7.1.26.
Maximum error: 1.5e-7. This is how you implement erf from scratch.
erf(x) = 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2)
where t = 1 / (1 + 0.3275911 * |x|)
"""
sign = 1 if x >= 0 else -1
x = abs(x)
# Constants from Abramowitz & Stegun
a1 = 0.254829592
a2 = -0.284496736
a3 = 1.421413741
a4 = -1.453152027
a5 = 1.061405429
p = 0.3275911
t = 1.0 / (1.0 + p * x)
y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-x * x)
return sign * y
def normal_cdf(x, mu=0, sigma=1):
    """Cumulative probability P(X <= x) for X ~ N(mu, sigma^2).

    Uses the identity CDF(x) = 0.5 * (1 + erf((x - mu) / (sigma sqrt(2))))
    with erf supplied by the Abramowitz & Stegun approximation above.

    Raises:
        ValueError: if sigma is not strictly positive.
    """
    if sigma <= 0:
        raise ValueError("sigma must be positive")
    standardized = (x - mu) / (sigma * math.sqrt(2))
    return 0.5 * (1 + _erf_approx(standardized))
# ---- VERIFICATION ----
# Standard normal: PDF at 0 should be 1/sqrt(2*pi) ~ 0.3989
assert abs(normal_pdf(0) - 0.3989422804) < 1e-6
# CDF at 0 should be 0.5
assert abs(normal_cdf(0) - 0.5) < 1e-6
# CDF at 1.96 should be ~0.975 (the 97.5th percentile)
# Loose tolerance: 1.96 is itself a rounded quantile, and the erf
# approximation carries up to ~1.5e-7 error.
assert abs(normal_cdf(1.96) - 0.975) < 0.001
# CDF at -1.96 should be ~0.025
assert abs(normal_cdf(-1.96) - 0.025) < 0.001
# Non-standard normal: N(10, 4)
# Peak height of N(10, 2^2) at its mean is 1/(2*sqrt(2*pi)) ~ 0.19947
assert abs(normal_pdf(10, mu=10, sigma=2) - 0.19947) < 0.001
print("All tests passed!")
Problem 2: Binomial Distribution
Problem: Implement the binomial PMF and CDF from scratch. Compute P(X = k) for X ~ Binomial(n, p) and P(X <= k). Handle large n without overflow using log-probabilities.
import math
# ---- CHALLENGE ----
def binomial_pmf(k: int, n: int, p: float) -> float:
    """P(X = k) for X ~ Binomial(n, p).

    Args:
        k: number of successes.
        n: number of independent trials.
        p: success probability per trial, in [0, 1].

    Returns:
        The probability mass at k.
    """
    pass
def binomial_cdf(k: int, n: int, p: float) -> float:
    """P(X <= k) for X ~ Binomial(n, p).

    Args:
        k: upper bound on the number of successes.
        n: number of independent trials.
        p: success probability per trial, in [0, 1].

    Returns:
        The cumulative probability P(X <= k).
    """
    pass
# ---- SOLUTION ----
def _log_comb(n, k):
"""Compute log(C(n,k)) = log(n!) - log(k!) - log((n-k)!)
Using log-gamma for numerical stability with large n.
log(n!) = sum(log(i) for i in 1..n)
"""
if k < 0 or k > n:
return float('-inf')
if k == 0 or k == n:
return 0.0
# Use the smaller of k, n-k for efficiency
k = min(k, n - k)
result = 0.0
for i in range(k):
result += math.log(n - i) - math.log(i + 1)
return result
def binomial_pmf(k, n, p):
    """Probability mass P(X = k) for X ~ Binomial(n, p).

    Formula: P(X = k) = C(n, k) * p^k * (1-p)^(n-k).
    Evaluated in log-space to avoid overflow for large n:
    log P = log C(n, k) + k log p + (n - k) log(1 - p).
    """
    # Impossible outcomes carry no mass.
    if k < 0 or k > n:
        return 0.0
    # Degenerate success probabilities put all mass on a single outcome,
    # and would make log(p) or log(1 - p) blow up below.
    if p == 0:
        return 1.0 if k == 0 else 0.0
    if p == 1:
        return 1.0 if k == n else 0.0
    log_prob = _log_comb(n, k) + k * math.log(p) + (n - k) * math.log(1 - p)
    return math.exp(log_prob)
def binomial_cdf(k, n, p):
    """Cumulative probability P(X <= k) for X ~ Binomial(n, p).

    Computed as the running sum of the PMF from 0 through k.
    """
    if k < 0:
        return 0.0
    # The whole support is {0, ..., n}, so k >= n covers everything.
    if k >= n:
        return 1.0
    return sum(binomial_pmf(i, n, p) for i in range(k + 1))
# ---- VERIFICATION ----
# Fair coin, 10 flips, P(exactly 5 heads)
# Exact value: C(10,5) / 2^10 = 252 / 1024 = 0.24609375
pmf_5 = binomial_pmf(5, 10, 0.5)
assert abs(pmf_5 - 0.24609375) < 1e-6, f"Expected ~0.246, got {pmf_5}"
# P(X <= 5) for fair coin, 10 flips ~ 0.6230
cdf_5 = binomial_cdf(5, 10, 0.5)
assert abs(cdf_5 - 0.623046875) < 1e-6
# Edge cases
assert binomial_pmf(0, 10, 0.5) > 0
# k > n is impossible, so the PMF must be exactly 0
assert binomial_pmf(11, 10, 0.5) == 0.0
# CDF over the full support must total 1
assert abs(binomial_cdf(10, 10, 0.5) - 1.0) < 1e-10
# Large n (test log-space computation doesn't overflow)
pmf_large = binomial_pmf(500, 1000, 0.5)
assert pmf_large > 0 and pmf_large < 1
print("All tests passed!")
Problem 3: Poisson Distribution
Problem: Implement the Poisson PMF and CDF from scratch. The Poisson distribution models the number of events in a fixed interval when events occur independently at a constant average rate lambda.
import math
# ---- CHALLENGE ----
def poisson_pmf(k: int, lam: float) -> float:
    """P(X = k) for X ~ Poisson(lambda).

    Args:
        k: event count (non-negative integer).
        lam: average event rate; must be non-negative.

    Returns:
        The probability mass at k.
    """
    pass
def poisson_cdf(k: int, lam: float) -> float:
    """P(X <= k) for X ~ Poisson(lambda).

    Args:
        k: upper bound on the event count.
        lam: average event rate; must be non-negative.

    Returns:
        The cumulative probability P(X <= k).
    """
    pass
# ---- SOLUTION ----
def poisson_pmf(k, lam):
    """Poisson probability mass function.

    Formula: P(X = k) = (lambda^k * e^(-lambda)) / k!
    Computed in log-space for numerical stability:
        log P = k*log(lambda) - lambda - log(k!)
    with log(k!) = math.lgamma(k + 1), which is O(1). (The previous
    version summed k logarithms per call, making poisson_cdf O(k^2).)

    Args:
        k: event count; mass is 0 for k < 0.
        lam: rate parameter; must be non-negative.

    Returns:
        The probability mass at k.

    Raises:
        ValueError: if lam is negative.
    """
    if lam < 0:
        raise ValueError("lambda must be non-negative")
    if k < 0:
        return 0.0
    # Rate 0 means no events ever occur: all mass sits on k = 0,
    # and it also keeps log(lam) below well-defined.
    if lam == 0:
        return 1.0 if k == 0 else 0.0
    log_prob = k * math.log(lam) - lam - math.lgamma(k + 1)
    return math.exp(log_prob)
def poisson_cdf(k, lam):
    """Cumulative probability P(X <= k) for X ~ Poisson(lambda).

    Computed as the running sum of the PMF from 0 through k.
    """
    if k < 0:
        return 0.0
    return sum(poisson_pmf(i, lam) for i in range(k + 1))
# ---- VERIFICATION ----
# Poisson(3), P(X=2) ~ 0.2240 (exact: 3^2 * e^-3 / 2! = 4.5 * e^-3)
pmf_2 = poisson_pmf(2, 3)
assert abs(pmf_2 - 0.22404) < 0.001
# Poisson(3), P(X=0) = e^(-3) ~ 0.04979
assert abs(poisson_pmf(0, 3) - math.exp(-3)) < 1e-10
# CDF should approach 1 for large k
assert abs(poisson_cdf(20, 3) - 1.0) < 1e-6
# Sum of all PMFs should be 1
# (50 terms suffice: the Poisson(5) tail beyond k=50 is negligible)
total = sum(poisson_pmf(k, 5) for k in range(50))
assert abs(total - 1.0) < 1e-6
# Large lambda (test log-space stability)
pmf_large = poisson_pmf(100, 100)
assert pmf_large > 0 and pmf_large < 1
print("All tests passed!")
Problem 4: Exponential Distribution
Problem: Implement the exponential distribution PDF, CDF, and inverse CDF (quantile function) from scratch. The exponential models time between events in a Poisson process.
import math
import random
# ---- CHALLENGE ----
def exponential_pdf(x: float, lam: float) -> float:
    """PDF of Exponential(lambda) at point x.

    f(x) = lambda * exp(-lambda * x) for x >= 0, and 0 for x < 0.
    """
    pass
def exponential_cdf(x: float, lam: float) -> float:
    """CDF of Exponential(lambda) at point x.

    F(x) = 1 - exp(-lambda * x) for x >= 0, and 0 for x < 0.
    """
    pass
def exponential_inv_cdf(p: float, lam: float) -> float:
    """Inverse CDF (quantile function) of Exponential(lambda).

    F^(-1)(p) = -ln(1 - p) / lambda for p in [0, 1).
    """
    pass
def sample_exponential(lam: float, n: int = 1):
    """Generate n samples from Exponential(lambda) using inverse CDF method."""
    pass
# ---- SOLUTION ----
def exponential_pdf(x, lam):
    """Density of Exponential(lambda) at x.

    f(x) = lambda * e^(-lambda x) on x >= 0, zero on the negative axis.
    Mean is 1/lambda and variance 1/lambda^2.

    Raises:
        ValueError: if lam is not strictly positive.
    """
    if lam <= 0:
        raise ValueError("lambda must be positive")
    return lam * math.exp(-lam * x) if x >= 0 else 0.0
def exponential_cdf(x, lam):
    """Exponential distribution CDF.

    Formula: F(x) = 1 - exp(-lambda * x) for x >= 0, and 0 for x < 0.

    Implemented as -math.expm1(-lam * x): for tiny lam*x the literal
    1 - exp(-lam*x) suffers catastrophic cancellation (exp(-t) rounds to
    a value near 1 before the subtraction), while expm1 keeps full
    precision.

    Raises:
        ValueError: if lam is not strictly positive.
    """
    if lam <= 0:
        raise ValueError("lambda must be positive")
    if x < 0:
        return 0.0
    return -math.expm1(-lam * x)
def exponential_inv_cdf(p, lam):
    """Inverse CDF (quantile function) of Exponential(lambda).

    Formula: F^(-1)(p) = -ln(1 - p) / lambda.
    This is the key to sampling: if U ~ Uniform(0, 1), then
    X = -ln(1 - U) / lambda ~ Exponential(lambda).

    Implemented as -math.log1p(-p) / lam: for tiny p the literal
    log(1 - p) first rounds 1 - p to a nearby float (to exactly 1.0 when
    p < ~1e-16, returning quantile 0), while log1p keeps full precision.

    Raises:
        ValueError: if p is outside [0, 1) or lam is not positive.
    """
    if not (0 <= p < 1):
        raise ValueError("p must be in [0, 1)")
    if lam <= 0:
        raise ValueError("lambda must be positive")
    return -math.log1p(-p) / lam
def sample_exponential(lam, n=1):
    """Generate samples from Exponential(lambda) via the inverse CDF method.

    Algorithm:
      1. Generate U ~ Uniform(0, 1)
      2. Return X = -ln(U) / lambda
    (Using U instead of 1-U is valid since both are Uniform(0, 1).)

    Args:
        lam: rate parameter; must be positive.
        n: number of samples; must be >= 1.

    Returns:
        A single float when n == 1, otherwise a list of n floats
        (matching the original interface).

    Raises:
        ValueError: if lam <= 0 (consistent with the other exponential_*
            functions — previously negative rates silently produced
            wrong-signed samples) or if n < 1 (previously n == 0 raised
            a confusing IndexError).
    """
    if lam <= 0:
        raise ValueError("lambda must be positive")
    if n < 1:
        raise ValueError("n must be >= 1")
    samples = []
    for _ in range(n):
        u = random.random()
        while u == 0:  # log(0) is undefined; redraw
            u = random.random()
        samples.append(-math.log(u) / lam)
    return samples if n > 1 else samples[0]
# ---- VERIFICATION ----
# PDF at x=0 should be lambda
assert abs(exponential_pdf(0, 2.0) - 2.0) < 1e-10
# CDF at x=0 should be 0
assert abs(exponential_cdf(0, 2.0) - 0.0) < 1e-10
# Median = ln(2)/lambda, i.e. the point where the CDF reaches 0.5
median_x = math.log(2) / 2.0
assert abs(exponential_cdf(median_x, 2.0) - 0.5) < 1e-10
# Inverse CDF: F^(-1)(0.5) should equal median
assert abs(exponential_inv_cdf(0.5, 2.0) - median_x) < 1e-10
# Sampling: mean of many samples should approximate 1/lambda
# (seeded so the Monte Carlo check is reproducible)
random.seed(42)
samples = sample_exponential(2.0, n=10000)
sample_mean = sum(samples) / len(samples)
assert abs(sample_mean - 0.5) < 0.05, f"Sample mean {sample_mean} should be ~0.5"
print("All tests passed!")
Problem 5: Sampling Methods — Inverse CDF and Box-Muller
Problem: Implement the Box-Muller transform to generate standard normal samples from uniform random numbers. Also implement a general inverse CDF sampler. These are the two most important sampling methods in computational statistics.
import math
import random
# ---- CHALLENGE ----
def box_muller() -> tuple:
    """Generate two independent standard normal samples from two uniform samples.

    Returns:
        A tuple (z1, z2) of independent N(0, 1) draws.
    """
    pass
def sample_normal(mu: float = 0, sigma: float = 1, n: int = 1):
    """Generate n samples from N(mu, sigma^2) using Box-Muller.

    Args:
        mu: mean of the target normal.
        sigma: standard deviation of the target normal.
        n: number of samples to generate.
    """
    pass
def inverse_cdf_sample(inv_cdf_func, n: int = 1):
    """General inverse CDF sampler for any distribution.

    Args:
        inv_cdf_func: quantile function mapping p in (0, 1) to a value.
        n: number of samples to generate.
    """
    pass
# ---- SOLUTION ----
def box_muller():
    """One Box-Muller draw: two uniforms in, two independent N(0,1) out.

    Algorithm:
      1. Draw U1, U2 ~ Uniform(0, 1)
      2. Z1 = sqrt(-2 ln U1) * cos(2 pi U2)
      3. Z2 = sqrt(-2 ln U1) * sin(2 pi U2)

    Why it works: a 2-D standard normal expressed in polar coordinates
    has squared radius R^2 ~ Exponential(1/2) — so R = sqrt(-2 ln U1) —
    and angle Theta ~ Uniform(0, 2 pi) — so Theta = 2 pi U2. Converting
    that (R, Theta) point back to Cartesian coordinates yields two
    independent standard normal values.
    """
    u1 = random.random()
    u2 = random.random()
    while u1 == 0:  # log(0) is undefined; redraw the radius uniform
        u1 = random.random()
    radius = math.sqrt(-2.0 * math.log(u1))
    angle = 2.0 * math.pi * u2
    return radius * math.cos(angle), radius * math.sin(angle)
def sample_normal(mu=0, sigma=1, n=1):
    """Generate n samples from N(mu, sigma^2) using Box-Muller.

    Each standard normal Z is shifted and scaled: X = mu + sigma * Z.
    Box-Muller yields samples in pairs, so for odd n one extra draw is
    generated and discarded.

    Args:
        mu: mean of the target normal.
        sigma: standard deviation; must be positive.
        n: number of samples; must be >= 1.

    Returns:
        A single float when n == 1, otherwise a list of n floats
        (matching the original interface).

    Raises:
        ValueError: if sigma <= 0 (consistent with normal_pdf/normal_cdf)
            or if n < 1 (previously n == 0 raised a confusing IndexError).
    """
    if sigma <= 0:
        raise ValueError("sigma must be positive")
    if n < 1:
        raise ValueError("n must be >= 1")
    samples = []
    for _ in range((n + 1) // 2):  # Box-Muller produces pairs
        z1, z2 = box_muller()
        samples.append(mu + sigma * z1)
        samples.append(mu + sigma * z2)
    return samples[:n] if n > 1 else samples[0]
def inverse_cdf_sample(inv_cdf_func, n=1):
    """Sample any distribution through its quantile function.

    If U ~ Uniform(0, 1), then X = F^(-1)(U) is distributed with CDF F:
    P(X <= x) = P(F^(-1)(U) <= x) = P(U <= F(x)) = F(x).
    Works for ANY distribution with a known inverse CDF.

    Args:
        inv_cdf_func: function mapping a probability p in (0, 1) to the
            corresponding quantile.
        n: number of samples; returns a single value when n == 1,
            otherwise a list of n values.
    """
    draws = []
    for _ in range(n):
        u = random.random()
        while u == 0 or u == 1:  # keep u strictly inside (0, 1)
            u = random.random()
        draws.append(inv_cdf_func(u))
    return draws if n > 1 else draws[0]
# ---- VERIFICATION ----
# Box-Muller: generate many samples and check mean/std
# (seeded so the Monte Carlo checks are reproducible)
random.seed(42)
samples = sample_normal(mu=0, sigma=1, n=10000)
sample_mean = sum(samples) / len(samples)
# Sample variance with Bessel's correction (divide by n - 1)
sample_var = sum((x - sample_mean) ** 2 for x in samples) / (len(samples) - 1)
assert abs(sample_mean) < 0.05, f"Mean {sample_mean} should be ~0"
assert abs(sample_var - 1.0) < 0.1, f"Variance {sample_var} should be ~1"
# Non-standard normal: N(100, 25)
samples2 = sample_normal(mu=100, sigma=5, n=10000)
mean2 = sum(samples2) / len(samples2)
assert abs(mean2 - 100) < 1.0, f"Mean {mean2} should be ~100"
# Inverse CDF sampler: sample from Exponential(2)
# (quantile of Exponential(lam) is -ln(1-p)/lam; sample mean ~ 1/lam)
exp_inv = lambda p: -math.log(1 - p) / 2.0
exp_samples = inverse_cdf_sample(exp_inv, n=10000)
exp_mean = sum(exp_samples) / len(exp_samples)
assert abs(exp_mean - 0.5) < 0.05, f"Exp mean {exp_mean} should be ~0.5"
print("All tests passed!")
Every production random-number library (e.g. numpy.random.randn) uses one of these methods internally. Quant firms expect you to implement them from memory.
Key Takeaways
- Always compute in log-space for distributions with factorials (binomial, Poisson) to avoid overflow
- The error function approximation (Abramowitz & Stegun) is the standard way to implement normal CDF from scratch
- Box-Muller transform: two uniform samples become two independent normal samples
- Inverse CDF method works for any distribution with a known quantile function
- Know the relationships: Exponential is time between Poisson events; Binomial is sum of Bernoulli trials
Lilly Tech Systems