Lab 04: Probability & Bayesian Deep Learning
Source: Deep Learning Interviews by Shlomo Kashani — Chapter: Probabilistic Programming & Bayesian DL
This lab covers foundational probability theory and Bayesian methods essential for deep learning:
Expectation and Variance
Conditional Probability & Bayes' Rule
Maximum Likelihood Estimation (MLE)
Fisher Information & Cramer-Rao Bound
Conjugate Priors
Posterior Predictive Distributions
Bayesian Deep Learning Basics
# Core numerical and plotting libraries
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import beta as beta_dist
# Fix the RNG seed so every run reproduces the same figures and printouts
np.random.seed(42)
# Jupyter magic: render figures inline; then set notebook-wide plot defaults
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 5)
plt.rcParams['font.size'] = 12
PART 1: Expectation and Variance
For a discrete random variable \(X\) taking values \(x_1, x_2, \ldots, x_n\) with probabilities \(p(x_i)\):
Expectation (mean): $\(E[X] = \sum_{i} x_i \cdot p(x_i)\)$
Variance: $\(\text{Var}(X) = E[(X - E[X])^2] = E[X^2] - (E[X])^2\)$
Key properties:
\(E[aX + b] = a \cdot E[X] + b\)
\(\text{Var}(aX + b) = a^2 \cdot \text{Var}(X)\)
\(E[X + Y] = E[X] + E[Y]\) (always, even if dependent)
\(\text{Var}(X + Y) = \text{Var}(X) + \text{Var}(Y)\) (only if \(X, Y\) independent)
def discrete_expectation_variance(values, probs):
    """
    Compute E[X] and Var(X) for a discrete random variable.

    Uses E[X] = sum(x_i * p_i) and the shortcut Var(X) = E[X^2] - (E[X])^2.

    Args:
        values: array-like of possible values x_i
        probs: array-like of probabilities P(X = x_i); must be non-negative
            and sum to 1

    Returns:
        (expectation, variance)

    Raises:
        ValueError: if the shapes differ, any probability is negative, or
            the probabilities do not sum to 1.
    """
    values = np.asarray(values, dtype=float)
    probs = np.asarray(probs, dtype=float)
    # Validate with explicit exceptions instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if values.shape != probs.shape:
        raise ValueError(
            f"values and probs must have the same shape, got {values.shape} and {probs.shape}"
        )
    if np.any(probs < 0):
        raise ValueError("Probabilities must be non-negative")
    if not np.isclose(probs.sum(), 1.0):
        raise ValueError(f"Probabilities must sum to 1, got {probs.sum()}")
    E_X = np.sum(values * probs)
    E_X2 = np.sum(values**2 * probs)
    Var_X = E_X2 - E_X**2  # variance via the second-moment identity
    return E_X, Var_X
# Demo: fair six-sided die
die_values = np.array([1, 2, 3, 4, 5, 6])
die_probs = np.array([1/6] * 6)  # uniform: each face equally likely
E, V = discrete_expectation_variance(die_values, die_probs)
# Theory: E[X] = 3.5 and Var(X) = 35/12 ~ 2.9167
print(f"Fair die: E[X] = {E:.4f} (expected 3.5)")
print(f"Fair die: Var(X) = {V:.4f} (expected {35/12:.4f})")
# Two dice sum problem: enumerate all 36 outcomes, compute exact E and Var
# Enumerate all outcomes
outcomes = []
for d1 in range(1, 7):
    for d2 in range(1, 7):
        outcomes.append(d1 + d2)
print(f"Total outcomes: {len(outcomes)}")
# Build the probability distribution of the sum
sum_values = np.arange(2, 13) # possible sums: 2 through 12
sum_counts = np.array([outcomes.count(s) for s in sum_values])
sum_probs = sum_counts / 36.0  # all 36 ordered outcomes equally likely
print("\nSum distribution:")
for val, cnt, prob in zip(sum_values, sum_counts, sum_probs):
    # '#' bar gives a quick ASCII view of the triangular distribution
    print(f" Sum={val:2d}: count={cnt}, P={prob:.4f} {'|' + '#' * cnt}")
E_sum, Var_sum = discrete_expectation_variance(sum_values, sum_probs)
print(f"\nE[X1 + X2] = {E_sum:.4f} (expected 7.0)")
print(f"Var(X1 + X2) = {Var_sum:.4f} (expected {70/12:.4f})")
# E and Var of the sum are both twice the single-die values (dice independent)
print(f"\nVerification: 2 * E[single die] = 2 * 3.5 = {2 * 3.5}")
print(f"Verification: 2 * Var(single die) = 2 * {35/12:.4f} = {2 * 35/12:.4f}")
# Verify E[aX+b] = a*E[X]+b and Var(aX+b) = a^2*Var(X) with simulation (100k samples)
np.random.seed(42)  # reproducible draws
N = 100_000
# X ~ N(3, 2^2), so theory gives E[X] = 3 and Var(X) = 4
X = np.random.normal(loc=3.0, scale=2.0, size=N)
a, b = 2.5, -1.0
Y = a * X + b  # the affine transform under test
E_X_sim = np.mean(X)
Var_X_sim = np.var(X)
E_Y_sim = np.mean(Y)
Var_Y_sim = np.var(Y)
print("--- Verifying E[aX + b] = a*E[X] + b ---")
print(f" E[Y] (simulated) = {E_Y_sim:.4f}")
print(f" a*E[X] + b (computed) = {a * E_X_sim + b:.4f}")
print(f" Theoretical = {a * 3.0 + b:.4f}")
print("\n--- Verifying Var(aX + b) = a^2 * Var(X) ---")
print(f" Var(Y) (simulated) = {Var_Y_sim:.4f}")
print(f" a^2 * Var(X) (computed) = {a**2 * Var_X_sim:.4f}")
print(f" Theoretical = {a**2 * 4.0:.4f}")
# These identities are exact, so only floating-point error remains
print(f"\nRelative error in E: {abs(E_Y_sim - (a * E_X_sim + b)) / abs(E_Y_sim):.2e}")
print(f"Relative error in Var: {abs(Var_Y_sim - a**2 * Var_X_sim) / abs(Var_Y_sim):.2e}")
PART 2: Conditional Probability & Bayes' Rule
Conditional probability: $\(P(A|B) = \frac{P(A \cap B)}{P(B)}\)$
Bayes' Rule: $\(P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)}\)$
where $\(P(B) = P(B|A)P(A) + P(B|\neg A)P(\neg A)\)$
Bayes' rule allows us to invert conditional probabilities: given how likely the evidence is under each hypothesis, we can compute how likely each hypothesis is given the evidence.
# Disease test problem
# 1% prevalence, 95% sensitivity (true positive rate), 10% false positive rate
p_disease = 0.01
sensitivity = 0.95 # P(+|disease)
false_positive_rate = 0.10 # P(+|no disease)
# P(+) via law of total probability
p_positive = sensitivity * p_disease + false_positive_rate * (1 - p_disease)
# Bayes' Rule: P(disease|+)
p_disease_given_positive = (sensitivity * p_disease) / p_positive
print("=== Disease Test (Bayes' Rule) ===")
print(f" Prevalence P(disease) = {p_disease:.2%}")
print(f" Sensitivity P(+|disease) = {sensitivity:.2%}")
print(f" False positive P(+|healthy) = {false_positive_rate:.2%}")
print(f"")
print(f" P(+) = {p_positive:.4f}")
print(f" P(disease|+) = {p_disease_given_positive:.4f} = {p_disease_given_positive:.1%}")
print(f"")
# The posterior (~8.8%) is small because the few true positives are swamped
# by false positives from the much larger healthy population
print(f" Despite 95% sensitivity, a positive test only means ~{p_disease_given_positive*100:.1f}%")
print(f" chance of actually having the disease!")
print(f" This is the base-rate fallacy in action.")
# Bayesian update: coin fair(p=0.5) vs biased(p=0.7)
# Prior: P(fair) = 0.6, observe HHTTH, update after each observation
observations = [1, 1, 0, 0, 1] # H=1, T=0 -> HHTTH
p_fair_coin = 0.5
p_biased_coin = 0.7
p_fair = 0.6 # prior P(fair)
posteriors = [p_fair]  # track P(fair) after each update, starting at the prior
print("=== Bayesian Update: Fair vs Biased Coin ===")
print(f"Prior P(fair) = {p_fair:.4f}")
print()
for i, obs in enumerate(observations):
    # Likelihood of this observation under each hypothesis
    if obs == 1: # heads
        lik_fair = p_fair_coin
        lik_biased = p_biased_coin
    else: # tails
        lik_fair = 1 - p_fair_coin
        lik_biased = 1 - p_biased_coin
    # Evidence: P(obs) = P(obs|fair)*P(fair) + P(obs|biased)*P(biased)
    p_evidence = lik_fair * p_fair + lik_biased * (1 - p_fair)
    # Posterior: P(fair|obs) = P(obs|fair)*P(fair) / P(obs)
    # This posterior becomes the prior for the next observation (sequential update)
    p_fair = (lik_fair * p_fair) / p_evidence
    posteriors.append(p_fair)
    label = 'H' if obs == 1 else 'T'
    print(f" After obs {i+1} ({label}): P(fair) = {p_fair:.4f}, P(biased) = {1 - p_fair:.4f}")
# Plot evolution of P(fair)
plt.figure(figsize=(8, 5))
plt.plot(range(len(posteriors)), posteriors, 'bo-', linewidth=2, markersize=8)
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Undecided (0.5)')
plt.xlabel('Number of observations')
plt.ylabel('P(fair)')
plt.title('Bayesian Update: P(coin is fair) after each observation (HHTTH)')
# x tick labels: the prior point followed by each observed flip
obs_str = ['prior'] + ['H' if o == 1 else 'T' for o in observations]
plt.xticks(range(len(posteriors)), obs_str)
plt.ylim(0, 1)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Monty Hall via Bayes
# 3 doors, car behind one. You pick door 1. Monty opens door 3 (goat).
# Compute P(car at door 2 | Monty opens door 3)
# Priors: P(car at door i) = 1/3 for i = 1, 2, 3
p_car = {1: 1/3, 2: 1/3, 3: 1/3}
# Likelihoods: P(Monty opens door 3 | car at door i)
# If car at door 1 (your pick): Monty can open door 2 or 3 equally -> P = 1/2
# If car at door 2: Monty must open door 3 (can't open door 1 or 2) -> P = 1
# If car at door 3: Monty can't open door 3 -> P = 0
p_monty_opens_3 = {1: 1/2, 2: 1, 3: 0}
# Evidence: P(Monty opens 3)
p_evidence = sum(p_monty_opens_3[d] * p_car[d] for d in [1, 2, 3])
# Posterior for each door
print("=== Monty Hall Problem via Bayes' Rule ===")
print(f"You chose door 1. Monty opens door 3 (goat).")
print(f"")
print(f"P(Monty opens 3) = {p_evidence:.4f}")
print()
for door in [1, 2, 3]:
    posterior = (p_monty_opens_3[door] * p_car[door]) / p_evidence
    # NOTE(review): the trailing "{posterior}" repeats the raw float; a
    # fraction string (e.g. "1/3") was probably intended here -- confirm.
    print(f" P(car at door {door} | Monty opens 3) = {posterior:.4f} = {posterior} ")
p_car_door2 = (p_monty_opens_3[2] * p_car[2]) / p_evidence
print(f"\nConclusion: P(car at door 2 | Monty opens 3) = {p_car_door2:.4f} = 2/3")
print(f"You should SWITCH to door 2. Switching wins 2/3 of the time!")
PART 3: Maximum Likelihood Estimation (MLE)
Given i.i.d. data \(x_1, \ldots, x_n\) from a distribution with parameter \(\theta\):
Likelihood function: $\(L(\theta) = \prod_{i=1}^{n} p(x_i | \theta)\)$
MLE: $\(\hat{\theta}_{\text{MLE}} = \arg\max_{\theta} L(\theta)\)$
In practice, we usually maximize the log-likelihood (monotonic, converts products to sums): $\(\ell(\theta) = \sum_{i=1}^{n} \log p(x_i | \theta)\)$
Set \(\frac{d\ell}{d\theta} = 0\) and solve for \(\theta\).
def mle_bernoulli(data):
    """
    Maximum-likelihood estimate of p for a Bernoulli(p) distribution.

    The log-likelihood l(p) = k*log(p) + (n-k)*log(1-p) has its unique
    maximum at p_hat = k/n, i.e. the sample mean of the 0/1 observations.

    Args:
        data: array-like of 0s and 1s

    Returns:
        The MLE p_hat = (number of ones) / (number of observations)
    """
    samples = np.asarray(data, dtype=float)
    return samples.mean()
# Demo: the Bernoulli MLE is just the observed proportion of heads
coin_data = np.array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1]) # 7 heads, 3 tails
p_mle = mle_bernoulli(coin_data)
print(f"Coin data: {coin_data}")
print(f"MLE estimate of p: {p_mle:.2f} ({int(coin_data.sum())} heads out of {len(coin_data)} flips)")
def mle_gaussian(data):
    """
    Maximum-likelihood estimates of mu and sigma^2 for N(mu, sigma^2).

    mu_hat is the sample mean; sigma^2_hat is the *biased* variance
    (normalized by n rather than n-1), which is what maximizing the
    Gaussian likelihood yields.

    Args:
        data: array-like of real-valued observations

    Returns:
        (mu_hat, sigma2_hat)
    """
    observations = np.asarray(data, dtype=float)
    mu_hat = observations.mean()
    deviations = observations - mu_hat
    # Divide by n (not n-1): the MLE variance estimator is biased
    sigma2_hat = np.mean(deviations * deviations)
    return mu_hat, sigma2_hat
# Demo
np.random.seed(42)  # reproducible sample
true_mu, true_sigma = 5.0, 2.0
gauss_data = np.random.normal(true_mu, true_sigma, size=100)
mu_hat, sigma2_hat = mle_gaussian(gauss_data)
print(f"True parameters: mu={true_mu}, sigma^2={true_sigma**2}")
print(f"MLE estimates: mu_hat={mu_hat:.4f}, sigma^2_hat={sigma2_hat:.4f}")
# ddof=1 gives the unbiased (n-1) estimator, for comparison with the biased MLE
print(f"Unbiased var (n-1): {np.var(gauss_data, ddof=1):.4f}")
# Plot likelihood L(p) and log L(p) for coin data, mark the MLE
data = np.array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1]) # 7H, 3T
n_heads = data.sum()
n_tails = len(data) - n_heads
p_mle_val = n_heads / len(data)
# Stay strictly inside (0, 1) so log(p) and log(1-p) remain finite
p_range = np.linspace(0.01, 0.99, 500)
# Likelihood: L(p) = p^k * (1-p)^(n-k)
likelihood = p_range**n_heads * (1 - p_range)**n_tails
# Log-likelihood: l(p) = k*log(p) + (n-k)*log(1-p)
log_likelihood = n_heads * np.log(p_range) + n_tails * np.log(1 - p_range)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Plot Likelihood
axes[0].plot(p_range, likelihood, 'b-', linewidth=2)
axes[0].axvline(x=p_mle_val, color='r', linestyle='--', linewidth=2, label=f'MLE = {p_mle_val:.1f}')
axes[0].set_xlabel('p')
axes[0].set_ylabel('L(p)')
axes[0].set_title(f'Likelihood function (data: {int(n_heads)}H, {int(n_tails)}T)')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)
# Plot Log-Likelihood (same maximizer, since log is monotonic)
axes[1].plot(p_range, log_likelihood, 'b-', linewidth=2)
axes[1].axvline(x=p_mle_val, color='r', linestyle='--', linewidth=2, label=f'MLE = {p_mle_val:.1f}')
axes[1].set_xlabel('p')
axes[1].set_ylabel('log L(p)')
axes[1].set_title(f'Log-likelihood function (data: {int(n_heads)}H, {int(n_tails)}T)')
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"MLE: p_hat = {p_mle_val:.2f}")
print(f"Max likelihood: L({p_mle_val:.1f}) = {p_mle_val**n_heads * (1-p_mle_val)**n_tails:.6f}")
print(f"Max log-likelihood: l({p_mle_val:.1f}) = {n_heads*np.log(p_mle_val) + n_tails*np.log(1-p_mle_val):.4f}")
PART 4: Fisher Information
Fisher Information measures how much information a random variable \(X\) carries about an unknown parameter \(\theta\): $\(I(\theta) = E\left[\left(\frac{d}{d\theta} \log p(x|\theta)\right)^2\right] = -E\left[\frac{d^2}{d\theta^2} \log p(x|\theta)\right]\)$
Equivalently, since the score has mean zero: $\(I(\theta) = \text{Var}\left(\frac{d}{d\theta} \log p(x|\theta)\right)\)$
Cramer-Rao Lower Bound: For any unbiased estimator \(\hat{\theta}\): $\(\text{Var}(\hat{\theta}) \geq \frac{1}{n \cdot I(\theta)}\)$
Higher Fisher information means the parameter is easier to estimate (lower variance achievable).
def fisher_info_bernoulli(p):
    """
    Fisher information I(p) of a single Bernoulli(p) observation.

    Derivation: the score is d/dp log p(x|p) = x/p - (1-x)/(1-p), and the
    expectation of minus its derivative gives I(p) = 1/(p*(1-p)).

    Args:
        p: success probability, strictly between 0 and 1

    Returns:
        Fisher information I(p)
    """
    # I(p) is the reciprocal of the Bernoulli variance p(1-p)
    bernoulli_variance = p * (1 - p)
    return 1.0 / bernoulli_variance
def fisher_info_gaussian_mu(sigma):
    """
    Fisher information about mu for one draw from N(mu, sigma^2), sigma known.

    The second derivative of the log-density w.r.t. mu is the constant
    -1/sigma^2, so I(mu) = 1/sigma^2: tighter noise carries more
    information about the mean.

    Args:
        sigma: known standard deviation

    Returns:
        Fisher information I(mu) = 1/sigma^2
    """
    noise_variance = sigma**2
    return 1.0 / noise_variance
# Demo
print("=== Fisher Information Examples ===")
# I(p) = 1/(p(1-p)) is symmetric in p and minimized at p = 0.5
for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
    print(f" Bernoulli(p={p}): I(p) = {fisher_info_bernoulli(p):.4f}")
print()
# I(mu) = 1/sigma^2: smaller noise -> more information about the mean
for sigma in [0.5, 1.0, 2.0, 5.0]:
    print(f" Gaussian(sigma={sigma}): I(mu) = {fisher_info_gaussian_mu(sigma):.4f}")
print("\nNote: Bernoulli Fisher info is lowest at p=0.5 (hardest to distinguish small")
print("changes), and highest near 0 or 1. Gaussian Fisher info increases as sigma decreases.")
# Verify Cramer-Rao bound via simulation
# Bernoulli(p=0.3), N=50 samples, 10000 simulations
p_true = 0.3
N = 50
n_simulations = 10_000
np.random.seed(42)  # reproducible simulation
# Run simulations: each time draw N Bernoulli samples and compute MLE
mle_estimates = np.zeros(n_simulations)
for i in range(n_simulations):
    samples = np.random.binomial(1, p_true, size=N)
    mle_estimates[i] = np.mean(samples) # MLE for Bernoulli is sample mean
# Empirical variance of MLE
empirical_var = np.var(mle_estimates)
# Cramer-Rao lower bound: 1 / (N * I(p)) = p*(1-p)/N
cramer_rao_bound = 1.0 / (N * fisher_info_bernoulli(p_true))
# Equivalently: p*(1-p)/N
cr_bound_direct = p_true * (1 - p_true) / N
print("=== Cramer-Rao Bound Verification ===")
print(f" True p = {p_true}, N = {N}, simulations = {n_simulations}")
print(f"")
print(f" Empirical Var(p_hat) = {empirical_var:.6f}")
print(f" Cramer-Rao bound = {cramer_rao_bound:.6f}")
print(f" (= p*(1-p)/N = {cr_bound_direct:.6f})")
print(f"")
# The 0.95 factor tolerates Monte Carlo noise dipping slightly below the bound
print(f" Var(p_hat) >= CR bound? {empirical_var >= cramer_rao_bound * 0.95}")
print(f" Ratio Var/CR_bound = {empirical_var / cramer_rao_bound:.4f}")
print(f" (Should be close to 1.0 since MLE achieves the bound for Bernoulli)")
# Histogram of MLE estimates
plt.figure(figsize=(8, 5))
plt.hist(mle_estimates, bins=50, density=True, alpha=0.7, color='steelblue', edgecolor='white')
plt.axvline(x=p_true, color='r', linestyle='--', linewidth=2, label=f'True p = {p_true}')
plt.axvline(x=np.mean(mle_estimates), color='green', linestyle='-', linewidth=2,
            label=f'Mean of MLEs = {np.mean(mle_estimates):.4f}')
# Overlay the theoretical distribution: N(p, p(1-p)/N)
# (asymptotic normality of the MLE, with variance equal to the CR bound)
x_range = np.linspace(0, 0.6, 200)
theoretical_pdf = stats.norm.pdf(x_range, loc=p_true, scale=np.sqrt(cramer_rao_bound))
plt.plot(x_range, theoretical_pdf, 'k-', linewidth=2, label='Theoretical N(p, CR bound)')
plt.xlabel('MLE estimate of p')
plt.ylabel('Density')
plt.title(f'Distribution of MLE estimates (N={N}, {n_simulations} simulations)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
PART 5: Conjugate Priors
A conjugate prior is one where the posterior distribution has the same functional form as the prior.
Beta-Bernoulli Conjugacy
Prior: \(\theta \sim \text{Beta}(\alpha, \beta)\)
Likelihood: \(x_1, \ldots, x_n \sim \text{Bernoulli}(\theta)\)
Posterior: \(\theta | \text{data} \sim \text{Beta}(\alpha + k, \, \beta + n - k)\)
where \(k = \sum x_i\) (number of successes).
Gaussian-Gaussian Conjugacy
Prior: \(\mu \sim N(\mu_0, \sigma_0^2)\)
Likelihood: \(x_1, \ldots, x_n \sim N(\mu, \sigma^2)\) (known \(\sigma^2\))
Posterior: \(\mu | \text{data} \sim N(\mu_n, \sigma_n^2)\) with:
\(\sigma_n^2 = \left(\frac{1}{\sigma_0^2} + \frac{n}{\sigma^2}\right)^{-1}\)
\(\mu_n = \sigma_n^2 \left(\frac{\mu_0}{\sigma_0^2} + \frac{n \bar{x}}{\sigma^2}\right)\)
The prior "pseudo-counts" \(\alpha, \beta\) represent our prior belief strength.
def beta_bernoulli_update(alpha_prior, beta_prior, data):
    """
    Compute and plot prior, likelihood, and posterior for the Beta-Bernoulli model.

    Prior: Beta(alpha_prior, beta_prior)
    Data: sequence of 0s and 1s
    Posterior: Beta(alpha_prior + n_heads, beta_prior + n_tails)

    Args:
        alpha_prior: prior Beta alpha parameter (pseudo-count of heads)
        beta_prior: prior Beta beta parameter (pseudo-count of tails)
        data: array-like of 0/1 observations

    Returns:
        (alpha_post, beta_post): the posterior Beta parameters
    """
    data = np.asarray(data)
    n_heads = int(data.sum())
    n_tails = len(data) - n_heads
    # Conjugate update: observed counts add directly to the prior pseudo-counts
    alpha_post = alpha_prior + n_heads
    beta_post = beta_prior + n_tails
    x = np.linspace(0, 1, 500)
    # Prior
    prior_pdf = beta_dist.pdf(x, alpha_prior, beta_prior)
    # Likelihood (proportional to Beta(n_heads+1, n_tails+1) for visualization)
    likelihood_pdf = beta_dist.pdf(x, n_heads + 1, n_tails + 1)
    # Posterior
    posterior_pdf = beta_dist.pdf(x, alpha_post, beta_post)
    # Normalize each curve to peak at 1 so the three shapes are visually comparable
    prior_pdf_norm = prior_pdf / prior_pdf.max() if prior_pdf.max() > 0 else prior_pdf
    likelihood_pdf_norm = likelihood_pdf / likelihood_pdf.max() if likelihood_pdf.max() > 0 else likelihood_pdf
    posterior_pdf_norm = posterior_pdf / posterior_pdf.max() if posterior_pdf.max() > 0 else posterior_pdf
    plt.figure(figsize=(10, 6))
    plt.plot(x, prior_pdf_norm, 'b--', linewidth=2,
             label=f'Prior: Beta({alpha_prior}, {beta_prior})')
    plt.plot(x, likelihood_pdf_norm, 'g:', linewidth=2,
             label=f'Likelihood ({n_heads}H, {n_tails}T)')
    plt.plot(x, posterior_pdf_norm, 'r-', linewidth=2.5,
             label=f'Posterior: Beta({alpha_post}, {beta_post})')
    # Mark posterior mean
    post_mean = alpha_post / (alpha_post + beta_post)
    plt.axvline(x=post_mean, color='red', linestyle='--', alpha=0.5,
                label=f'Posterior mean = {post_mean:.3f}')
    plt.xlabel(r'$\theta$ (probability of heads)')
    plt.ylabel('Normalized density')
    plt.title('Beta-Bernoulli Conjugate Update')
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    print(f"Prior: Beta({alpha_prior}, {beta_prior}), mean = {alpha_prior/(alpha_prior+beta_prior):.3f}")
    print(f"Data: {n_heads} heads, {n_tails} tails (MLE = {n_heads/len(data):.3f})")
    print(f"Posterior: Beta({alpha_post}, {beta_post}), mean = {post_mean:.3f}")
    return alpha_post, beta_post
# Demo: weakly informative prior, observe 7H 3T
data = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
# Beta(2,2) prior (mean 0.5) + 7H/3T data -> Beta(9,5) posterior
alpha_post, beta_post = beta_bernoulli_update(alpha_prior=2, beta_prior=2, data=data)
# Prior strength demo: weak, medium, and strong priors all observe 7H/3T
data = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0]) # 7H, 3T
n_heads = int(data.sum())
n_tails = len(data) - n_heads
# (alpha, beta, label): prior pseudo-count strength grows left to right
priors = [
    (1, 1, 'Weak: Beta(1,1) = Uniform'),
    (5, 5, 'Medium: Beta(5,5)'),
    (50, 50, 'Strong: Beta(50,50)'),
]
x = np.linspace(0, 1, 500)
colors = ['steelblue', 'darkorange', 'green']
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for idx, (alpha, beta_val, label) in enumerate(priors):
    ax = axes[idx]
    # Conjugate update: add observed counts to the prior pseudo-counts
    alpha_post = alpha + n_heads
    beta_post = beta_val + n_tails
    prior_pdf = beta_dist.pdf(x, alpha, beta_val)
    posterior_pdf = beta_dist.pdf(x, alpha_post, beta_post)
    ax.plot(x, prior_pdf, '--', color=colors[idx], linewidth=2, label=f'Prior Beta({alpha},{beta_val})')
    ax.fill_between(x, posterior_pdf, alpha=0.3, color=colors[idx])
    ax.plot(x, posterior_pdf, '-', color=colors[idx], linewidth=2.5,
            label=f'Post Beta({alpha_post},{beta_post})')
    post_mean = alpha_post / (alpha_post + beta_post)
    ax.axvline(x=post_mean, color='red', linestyle=':', alpha=0.7,
               label=f'Post mean={post_mean:.3f}')
    ax.axvline(x=0.7, color='gray', linestyle='--', alpha=0.5, label='MLE=0.7')
    ax.set_title(label)
    ax.set_xlabel(r'$\theta$')
    ax.set_ylabel('Density')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)
plt.suptitle('Effect of Prior Strength (all see 7H/3T)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
print("Observation: With a weak prior, the posterior is dominated by the data (MLE=0.7).")
print("With a strong prior centered at 0.5, the posterior barely moves from 0.5.")
print("The prior 'pseudo-count' strength relative to data size determines the influence.")
PART 6: Posterior Predictive Distribution
The posterior predictive distribution integrates out the parameter uncertainty:
For the Beta-Bernoulli model: $\(P(x_{\text{new}} = 1 | \text{data}) = E[\theta | \text{data}] = \frac{\alpha_{\text{post}}}{\alpha_{\text{post}} + \beta_{\text{post}}}\)$
This is a weighted average of the prior mean and the MLE, where the weights depend on sample size relative to prior strength.
# Posterior predictive for Beta-Bernoulli
def posterior_predictive_bernoulli(alpha_post, beta_post):
    """
    Probability that the next observation is 1 (heads) under a
    Beta(alpha_post, beta_post) posterior in the Beta-Bernoulli model.

    Integrating theta out of the Bernoulli likelihood against the Beta
    posterior gives P(x_new = 1 | data) = E[theta | data], i.e. the
    posterior mean alpha / (alpha + beta).

    Args:
        alpha_post: posterior alpha parameter
        beta_post: posterior beta parameter

    Returns:
        P(x_new = 1 | data)
    """
    pseudo_count_total = alpha_post + beta_post
    return alpha_post / pseudo_count_total
# Scenario: observe coin data incrementally and track posterior predictive
alpha_prior, beta_prior_val = 2, 2 # weakly informative prior
data_sequence = [1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1] # 11H, 4T
alpha_running = alpha_prior
beta_running = beta_prior_val
predictives = [posterior_predictive_bernoulli(alpha_running, beta_running)]
print("=== Posterior Predictive After Each Observation ===")
print(f"Prior Beta({alpha_prior},{beta_prior_val}): P(next=H) = {predictives[0]:.4f}")
for i, obs in enumerate(data_sequence):
    # Conjugate update: heads increments alpha, tails increments beta
    if obs == 1:
        alpha_running += 1
    else:
        beta_running += 1
    pred = posterior_predictive_bernoulli(alpha_running, beta_running)
    predictives.append(pred)
    label = 'H' if obs == 1 else 'T'
    total = i + 1
    heads_so_far = alpha_running - alpha_prior  # heads observed so far
    print(f" After obs {total:2d} ({label}): Beta({alpha_running},{beta_running}), "
          f"P(next=H) = {pred:.4f}, MLE = {heads_so_far/total:.4f}")
# Plot: posterior predictive vs the running MLE (proportion of heads)
mle_running = []
heads_count = 0
for i, obs in enumerate(data_sequence):
    heads_count += obs
    mle_running.append(heads_count / (i + 1))
plt.figure(figsize=(10, 5))
plt.plot(range(len(predictives)), predictives, 'ro-', linewidth=2, markersize=6,
         label='Posterior predictive P(H|data)')
# MLE series starts at x=1: it is undefined before the first observation
plt.plot(range(1, len(predictives)), mle_running, 'bs--', linewidth=1.5, markersize=5,
         label='MLE (running proportion of H)', alpha=0.7)
plt.axhline(y=0.5, color='gray', linestyle=':', alpha=0.5)
plt.xlabel('Number of observations')
plt.ylabel('P(next flip = Heads)')
plt.title('Posterior Predictive vs MLE Over Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"\nFinal posterior: Beta({alpha_running}, {beta_running})")
print(f"Final posterior predictive P(next=H) = {predictives[-1]:.4f}")
print(f"Final MLE = {sum(data_sequence)/len(data_sequence):.4f}")
print("The posterior predictive is shrunk toward the prior mean (0.5) relative to the MLE.")
PART 7: Bayesian Deep Learning Basics
In standard deep learning, we find point estimates of weights via SGD. In Bayesian deep learning, we maintain a distribution over weights:
Types of Uncertainty
Epistemic uncertainty (model uncertainty): reducible with more data. Captured by the posterior distribution over weights.
Aleatoric uncertainty (data noise): irreducible, inherent in the data.
MC Dropout as Approximate Bayesian Inference
Gal & Ghahramani (2016) showed that dropout at test time approximates a Bayesian posterior:
Train model with dropout
At test time, run \(T\) forward passes with dropout still enabled
The mean of predictions approximates \(E[y | x, \mathcal{D}]\)
The variance of predictions captures epistemic uncertainty
Predictions far from training data will have higher variance (more uncertain).
# MC Dropout Demo
# Simple linear model y = 2x + 1 with dropout applied to weights
np.random.seed(42)
# True function: y = 2x + 1
x_train = np.linspace(0, 5, 20)
y_train = 2 * x_train + 1 + np.random.randn(20) * 0.5  # noisy training targets
# "Learned" weights (simulate having trained a model)
# w[0] = slope, w[1] = intercept
w = np.array([2.1, 0.95])
# Test points (including extrapolation)
x_test = np.array([3.0, 6.0, 10.0])  # 3.0 is in-range; 6.0 and 10.0 extrapolate
T = 100 # number of MC dropout forward passes
dropout_rate = 0.3
predictions = np.zeros((T, len(x_test)))
for t in range(T):
    # Apply dropout: randomly zero each weight with probability dropout_rate
    # Scale surviving weights by 1/(1-dropout_rate) to maintain expected output (inverted dropout)
    mask = (np.random.rand(len(w)) > dropout_rate).astype(float)
    w_dropped = w * mask / (1 - dropout_rate)
    # Forward pass: y = w[0]*x + w[1]
    predictions[t, :] = w_dropped[0] * x_test + w_dropped[1]
# Compute mean and std of predictions
# Mean approximates E[y|x]; std is the epistemic uncertainty estimate
pred_mean = predictions.mean(axis=0)
pred_std = predictions.std(axis=0)
print("=== MC Dropout Uncertainty Estimation ===")
print(f"Model: y = {w[0]}*x + {w[1]}")
print(f"Training range: x in [0, 5]")
print(f"Dropout rate: {dropout_rate}, T = {T} forward passes")
print()
print(f"{'x_test':>8} {'True y':>10} {'Mean pred':>12} {'Std pred':>10} {'95% CI':>20}")
print("-" * 65)
for i, x_val in enumerate(x_test):
    true_y = 2 * x_val + 1
    # 1.96-sigma interval, treating the MC predictions as roughly normal
    lo = pred_mean[i] - 1.96 * pred_std[i]
    hi = pred_mean[i] + 1.96 * pred_std[i]
    print(f"{x_val:8.1f} {true_y:10.2f} {pred_mean[i]:12.4f} {pred_std[i]:10.4f} [{lo:8.2f}, {hi:8.2f}]")
print("\nNote: Uncertainty (std) increases for x=10 (extrapolation beyond training range).")
# Plot MC dropout predictions with uncertainty bands
# Dense test grid for smooth plot
x_dense = np.linspace(-1, 12, 200)
T_dense = 100
preds_dense = np.zeros((T_dense, len(x_dense)))
np.random.seed(42)  # reproducible dropout masks
for t in range(T_dense):
    mask = (np.random.rand(len(w)) > dropout_rate).astype(float)
    w_dropped = w * mask / (1 - dropout_rate)
    preds_dense[t, :] = w_dropped[0] * x_dense + w_dropped[1]
mean_dense = preds_dense.mean(axis=0)
std_dense = preds_dense.std(axis=0)
plt.figure(figsize=(10, 6))
# Plot individual MC samples (faint)
for t in range(min(20, T_dense)):
    plt.plot(x_dense, preds_dense[t], color='lightblue', alpha=0.3, linewidth=0.5)
# Uncertainty bands
plt.fill_between(x_dense, mean_dense - 2*std_dense, mean_dense + 2*std_dense,
                 alpha=0.2, color='steelblue', label=r'$\pm 2\sigma$ uncertainty')
plt.fill_between(x_dense, mean_dense - std_dense, mean_dense + std_dense,
                 alpha=0.3, color='steelblue', label=r'$\pm 1\sigma$ uncertainty')
# Mean prediction
plt.plot(x_dense, mean_dense, 'b-', linewidth=2.5, label='MC Dropout mean')
# True function
plt.plot(x_dense, 2*x_dense + 1, 'k--', linewidth=2, label='True: y = 2x + 1')
# Training data
plt.scatter(x_train, y_train, c='red', s=40, zorder=5, label='Training data', edgecolors='darkred')
# Mark test points (green triangles at the MC-dropout mean predictions)
for i, x_val in enumerate(x_test):
    plt.plot(x_val, pred_mean[i], 'g^', markersize=12, zorder=6)
plt.axvspan(0, 5, alpha=0.05, color='green', label='Training range')
plt.xlabel('x')
plt.ylabel('y')
plt.title('MC Dropout: Predictions with Epistemic Uncertainty')
plt.legend(loc='upper left', fontsize=9)
plt.grid(True, alpha=0.3)
plt.xlim(-1, 12)
plt.tight_layout()
plt.show()
print("The uncertainty bands widen outside the training range [0, 5],")
print("reflecting greater epistemic uncertainty during extrapolation.")
Summary
This lab covered the core probability and Bayesian concepts underpinning modern deep learning:
Topic |
Key Takeaway |
|---|---|
Expectation & Variance |
Fundamental summary statistics; linearity of expectation holds universally |
Bayes' Rule |
Inverts conditional probabilities; reveals base-rate fallacy (disease test ~8.8%) |
Bayesian Updating |
Sequential evidence incorporation; posterior after one step becomes prior for the next |
MLE |
Point estimate maximizing data likelihood; sample mean for Bernoulli, (mean, biased var) for Gaussian |
Fisher Information |
Quantifies estimability; Cramer-Rao bounds the minimum achievable variance |
Conjugate Priors |
Beta-Bernoulli conjugacy makes updating analytical; prior strength controls data influence |
Posterior Predictive |
Integrates out parameter uncertainty; shrinks MLE toward prior |
Bayesian DL |
Distributions over weights capture epistemic uncertainty; MC Dropout provides a practical approximation |
Source: Deep Learning Interviews by Shlomo Kashani — Chapter: Probabilistic Programming & Bayesian DL