Lab 04: Probability & Bayesian Deep Learning

Source: Deep Learning Interviews by Shlomo Kashani – Chapter: Probabilistic Programming & Bayesian DL

This lab covers foundational probability theory and Bayesian methods essential for deep learning:

  1. Expectation and Variance

  2. Conditional Probability & Bayes’ Rule

  3. Maximum Likelihood Estimation (MLE)

  4. Fisher Information & Cramer-Rao Bound

  5. Conjugate Priors

  6. Posterior Predictive Distributions

  7. Bayesian Deep Learning Basics

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import beta as beta_dist

np.random.seed(42)
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 5)
plt.rcParams['font.size'] = 12

PART 1: Expectation and Variance

For a discrete random variable \(X\) taking values \(x_1, x_2, \ldots, x_n\) with probabilities \(p(x_i)\):

Expectation (mean): \(E[X] = \sum_{i} x_i \cdot p(x_i)\)

Variance: \(\text{Var}(X) = E[(X - E[X])^2] = E[X^2] - (E[X])^2\)

Key properties:

  • \(E[aX + b] = a \cdot E[X] + b\)

  • \(\text{Var}(aX + b) = a^2 \cdot \text{Var}(X)\)

  • \(E[X + Y] = E[X] + E[Y]\) (always, even if dependent)

  • \(\text{Var}(X + Y) = \text{Var}(X) + \text{Var}(Y)\) (only if \(X, Y\) independent)
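The last two properties are easy to check by simulation. A minimal sketch (not one of the lab's own cells) using a perfectly dependent pair \(Y = -X + 1\):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(0.0, 1.0, size=200_000)
Y = -X + 1.0  # perfectly (negatively) dependent on X

# Linearity of expectation holds even under dependence
print(f"E[X+Y]      = {np.mean(X + Y):.4f}")
print(f"E[X] + E[Y] = {np.mean(X) + np.mean(Y):.4f}")

# Additivity of variance fails: X + Y is the constant 1, so Var(X+Y) = 0,
# while Var(X) + Var(Y) is approximately 2
print(f"Var(X+Y)        = {np.var(X + Y):.4f}")
print(f"Var(X) + Var(Y) = {np.var(X) + np.var(Y):.4f}")
```

With independent draws instead of \(Y = -X + 1\), both identities would hold.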

def discrete_expectation_variance(values, probs):
    """
    Compute E[X] and Var(X) for a discrete random variable.
    
    Args:
        values: array of possible values x_i
        probs: array of probabilities P(X = x_i), must sum to 1
    Returns:
        (expectation, variance)
    """
    values = np.asarray(values, dtype=float)
    probs = np.asarray(probs, dtype=float)
    assert np.isclose(probs.sum(), 1.0), f"Probabilities must sum to 1, got {probs.sum()}"
    
    E_X = np.sum(values * probs)
    E_X2 = np.sum(values**2 * probs)
    Var_X = E_X2 - E_X**2
    return E_X, Var_X


# Demo: fair six-sided die
die_values = np.array([1, 2, 3, 4, 5, 6])
die_probs = np.array([1/6] * 6)

E, V = discrete_expectation_variance(die_values, die_probs)
print(f"Fair die: E[X] = {E:.4f} (expected 3.5)")
print(f"Fair die: Var(X) = {V:.4f} (expected {35/12:.4f})")
# Two dice sum problem: enumerate all 36 outcomes, compute exact E and Var

# Enumerate all outcomes
outcomes = []
for d1 in range(1, 7):
    for d2 in range(1, 7):
        outcomes.append(d1 + d2)

print(f"Total outcomes: {len(outcomes)}")

# Build the probability distribution of the sum
sum_values = np.arange(2, 13)  # possible sums: 2 through 12
sum_counts = np.array([outcomes.count(s) for s in sum_values])
sum_probs = sum_counts / 36.0

print("\nSum distribution:")
for val, cnt, prob in zip(sum_values, sum_counts, sum_probs):
    print(f"  Sum={val:2d}: count={cnt}, P={prob:.4f} {'|' + '#' * cnt}")

E_sum, Var_sum = discrete_expectation_variance(sum_values, sum_probs)
print(f"\nE[X1 + X2] = {E_sum:.4f} (expected 7.0)")
print(f"Var(X1 + X2) = {Var_sum:.4f} (expected {70/12:.4f})")
print(f"\nVerification: 2 * E[single die] = 2 * 3.5 = {2 * 3.5}")
print(f"Verification: 2 * Var(single die) = 2 * {35/12:.4f} = {2 * 35/12:.4f}")
# Verify E[aX+b] = a*E[X]+b and Var(aX+b) = a^2*Var(X) with simulation (100k samples)

np.random.seed(42)
N = 100_000
X = np.random.normal(loc=3.0, scale=2.0, size=N)

a, b = 2.5, -1.0
Y = a * X + b

E_X_sim = np.mean(X)
Var_X_sim = np.var(X)
E_Y_sim = np.mean(Y)
Var_Y_sim = np.var(Y)

print("--- Verifying E[aX + b] = a*E[X] + b ---")
print(f"  E[Y] (simulated)        = {E_Y_sim:.4f}")
print(f"  a*E[X] + b (computed)   = {a * E_X_sim + b:.4f}")
print(f"  Theoretical             = {a * 3.0 + b:.4f}")

print("\n--- Verifying Var(aX + b) = a^2 * Var(X) ---")
print(f"  Var(Y) (simulated)      = {Var_Y_sim:.4f}")
print(f"  a^2 * Var(X) (computed) = {a**2 * Var_X_sim:.4f}")
print(f"  Theoretical             = {a**2 * 4.0:.4f}")

print(f"\nRelative error in E: {abs(E_Y_sim - (a * E_X_sim + b)) / abs(E_Y_sim):.2e}")
print(f"Relative error in Var: {abs(Var_Y_sim - a**2 * Var_X_sim) / abs(Var_Y_sim):.2e}")

PART 2: Conditional Probability & Bayes’ Rule

Conditional probability: \(P(A|B) = \frac{P(A \cap B)}{P(B)}\)

Bayes’ Rule: \(P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)}\)

where \(P(B) = P(B|A)P(A) + P(B|\neg A)P(\neg A)\)

Bayes’ rule allows us to invert conditional probabilities: given how likely the evidence is under each hypothesis, we can compute how likely each hypothesis is given the evidence.

# Disease test problem
# 1% prevalence, 95% sensitivity (true positive rate), 10% false positive rate

p_disease = 0.01
sensitivity = 0.95      # P(+|disease)
false_positive_rate = 0.10  # P(+|no disease)

# P(+) via law of total probability
p_positive = sensitivity * p_disease + false_positive_rate * (1 - p_disease)

# Bayes' Rule: P(disease|+)
p_disease_given_positive = (sensitivity * p_disease) / p_positive

print("=== Disease Test (Bayes' Rule) ===")
print(f"  Prevalence P(disease)       = {p_disease:.2%}")
print(f"  Sensitivity P(+|disease)    = {sensitivity:.2%}")
print(f"  False positive P(+|healthy) = {false_positive_rate:.2%}")
print(f"")
print(f"  P(+) = {p_positive:.4f}")
print(f"  P(disease|+) = {p_disease_given_positive:.4f} = {p_disease_given_positive:.1%}")
print(f"")
print(f"  Despite 95% sensitivity, a positive test only means ~{p_disease_given_positive*100:.1f}%")
print(f"  chance of actually having the disease!")
print(f"  This is the base-rate fallacy in action.")
# Bayesian update: coin fair(p=0.5) vs biased(p=0.7)
# Prior: P(fair) = 0.6, observe HHTTH, update after each observation

observations = [1, 1, 0, 0, 1]  # H=1, T=0  -> HHTTH

p_fair_coin = 0.5
p_biased_coin = 0.7

p_fair = 0.6  # prior P(fair)
posteriors = [p_fair]

print("=== Bayesian Update: Fair vs Biased Coin ===")
print(f"Prior P(fair) = {p_fair:.4f}")
print()

for i, obs in enumerate(observations):
    # Likelihood of this observation under each hypothesis
    if obs == 1:  # heads
        lik_fair = p_fair_coin
        lik_biased = p_biased_coin
    else:  # tails
        lik_fair = 1 - p_fair_coin
        lik_biased = 1 - p_biased_coin
    
    # Evidence: P(obs) = P(obs|fair)*P(fair) + P(obs|biased)*P(biased)
    p_evidence = lik_fair * p_fair + lik_biased * (1 - p_fair)
    
    # Posterior: P(fair|obs) = P(obs|fair)*P(fair) / P(obs)
    p_fair = (lik_fair * p_fair) / p_evidence
    posteriors.append(p_fair)
    
    label = 'H' if obs == 1 else 'T'
    print(f"  After obs {i+1} ({label}): P(fair) = {p_fair:.4f}, P(biased) = {1 - p_fair:.4f}")

# Plot evolution of P(fair)
plt.figure(figsize=(8, 5))
plt.plot(range(len(posteriors)), posteriors, 'bo-', linewidth=2, markersize=8)
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Undecided (0.5)')
plt.xlabel('Number of observations')
plt.ylabel('P(fair)')
plt.title('Bayesian Update: P(coin is fair) after each observation (HHTTH)')
obs_str = ['prior'] + ['H' if o == 1 else 'T' for o in observations]
plt.xticks(range(len(posteriors)), obs_str)
plt.ylim(0, 1)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Monty Hall via Bayes
# 3 doors, car behind one. You pick door 1. Monty opens door 3 (goat).
# Compute P(car at door 2 | Monty opens door 3)

# Priors: P(car at door i) = 1/3 for i = 1, 2, 3
p_car = {1: 1/3, 2: 1/3, 3: 1/3}

# Likelihoods: P(Monty opens door 3 | car at door i)
# If car at door 1 (your pick): Monty can open door 2 or 3 equally -> P = 1/2
# If car at door 2: Monty must open door 3 (can't open door 1 or 2) -> P = 1
# If car at door 3: Monty can't open door 3 -> P = 0
p_monty_opens_3 = {1: 1/2, 2: 1, 3: 0}

# Evidence: P(Monty opens 3)
p_evidence = sum(p_monty_opens_3[d] * p_car[d] for d in [1, 2, 3])

# Posterior for each door
print("=== Monty Hall Problem via Bayes' Rule ===")
print(f"You chose door 1. Monty opens door 3 (goat).")
print(f"")
print(f"P(Monty opens 3) = {p_evidence:.4f}")
print()

for door in [1, 2, 3]:
    posterior = (p_monty_opens_3[door] * p_car[door]) / p_evidence
    print(f"  P(car at door {door} | Monty opens 3) = {posterior:.4f}")

p_car_door2 = (p_monty_opens_3[2] * p_car[2]) / p_evidence
print(f"\nConclusion: P(car at door 2 | Monty opens 3) = {p_car_door2:.4f} = 2/3")
print(f"You should SWITCH to door 2. Switching wins 2/3 of the time!")

PART 3: Maximum Likelihood Estimation (MLE)

Given i.i.d. data \(x_1, \ldots, x_n\) from a distribution with parameter \(\theta\):

Likelihood function: \(L(\theta) = \prod_{i=1}^{n} p(x_i | \theta)\)

MLE: \(\hat{\theta}_{\text{MLE}} = \arg\max_{\theta} L(\theta)\)

In practice, we usually maximize the log-likelihood instead: the log is monotonic, so the maximizer is unchanged, and it converts products into sums: \(\ell(\theta) = \sum_{i=1}^{n} \log p(x_i | \theta)\)

Set \(\frac{d\ell}{d\theta} = 0\) and solve for \(\theta\).
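Before relying on the closed forms, the calculus can be cross-checked numerically. A small sketch (using the same 7H/3T coin data that appears later in this lab) maximizes the Bernoulli log-likelihood with a generic optimizer and recovers \(\hat{p} = k/n\):

```python
import numpy as np
from scipy.optimize import minimize_scalar

data = np.array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1])  # 7 heads, 3 tails
k, n = int(data.sum()), len(data)

def nll(p):
    # Negative log-likelihood for Bernoulli(p); minimized at the MLE
    return -(k * np.log(p) + (n - k) * np.log(1 - p))

res = minimize_scalar(nll, bounds=(1e-6, 1 - 1e-6), method='bounded')
print(f"Numerical MLE: {res.x:.4f}  (closed form k/n = {k/n:.4f})")
```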

def mle_bernoulli(data):
    """
    MLE for p in Bernoulli(p).
    
    Log-likelihood: l(p) = k*log(p) + (n-k)*log(1-p)
    Setting dl/dp = k/p - (n-k)/(1-p) = 0 gives p_hat = k/n = sample mean.
    
    Args:
        data: array of 0s and 1s
    Returns:
        MLE estimate of p
    """
    data = np.asarray(data, dtype=float)
    return np.mean(data)


# Demo
coin_data = np.array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1])  # 7 heads, 3 tails
p_mle = mle_bernoulli(coin_data)
print(f"Coin data: {coin_data}")
print(f"MLE estimate of p: {p_mle:.2f} ({int(coin_data.sum())} heads out of {len(coin_data)} flips)")
def mle_gaussian(data):
    """
    MLE for mu and sigma^2 in N(mu, sigma^2).
    
    mu_hat = (1/n) * sum(x_i)           = sample mean
    sigma^2_hat = (1/n) * sum((x_i - mu_hat)^2)  = biased variance
    
    Note: MLE variance estimator is biased (divides by n, not n-1).
    
    Args:
        data: array of observations
    Returns:
        (mu_hat, sigma2_hat)
    """
    data = np.asarray(data, dtype=float)
    mu_hat = np.mean(data)
    sigma2_hat = np.mean((data - mu_hat)**2)  # biased: divides by n
    return mu_hat, sigma2_hat


# Demo
np.random.seed(42)
true_mu, true_sigma = 5.0, 2.0
gauss_data = np.random.normal(true_mu, true_sigma, size=100)

mu_hat, sigma2_hat = mle_gaussian(gauss_data)
print(f"True parameters: mu={true_mu}, sigma^2={true_sigma**2}")
print(f"MLE estimates:   mu_hat={mu_hat:.4f}, sigma^2_hat={sigma2_hat:.4f}")
print(f"Unbiased var (n-1):  {np.var(gauss_data, ddof=1):.4f}")
# Plot likelihood L(p) and log L(p) for coin data, mark the MLE

data = np.array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1])  # 7H, 3T
n_heads = data.sum()
n_tails = len(data) - n_heads
p_mle_val = n_heads / len(data)

p_range = np.linspace(0.01, 0.99, 500)

# Likelihood: L(p) = p^k * (1-p)^(n-k)
likelihood = p_range**n_heads * (1 - p_range)**n_tails

# Log-likelihood: l(p) = k*log(p) + (n-k)*log(1-p)
log_likelihood = n_heads * np.log(p_range) + n_tails * np.log(1 - p_range)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot Likelihood
axes[0].plot(p_range, likelihood, 'b-', linewidth=2)
axes[0].axvline(x=p_mle_val, color='r', linestyle='--', linewidth=2, label=f'MLE = {p_mle_val:.1f}')
axes[0].set_xlabel('p')
axes[0].set_ylabel('L(p)')
axes[0].set_title(f'Likelihood function (data: {int(n_heads)}H, {int(n_tails)}T)')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Plot Log-Likelihood
axes[1].plot(p_range, log_likelihood, 'b-', linewidth=2)
axes[1].axvline(x=p_mle_val, color='r', linestyle='--', linewidth=2, label=f'MLE = {p_mle_val:.1f}')
axes[1].set_xlabel('p')
axes[1].set_ylabel('log L(p)')
axes[1].set_title(f'Log-likelihood function (data: {int(n_heads)}H, {int(n_tails)}T)')
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"MLE: p_hat = {p_mle_val:.2f}")
print(f"Max likelihood: L({p_mle_val:.1f}) = {p_mle_val**n_heads * (1-p_mle_val)**n_tails:.6f}")
print(f"Max log-likelihood: l({p_mle_val:.1f}) = {n_heads*np.log(p_mle_val) + n_tails*np.log(1-p_mle_val):.4f}")

PART 4: Fisher Information

Fisher Information measures how much information a random variable \(X\) carries about an unknown parameter \(\theta\):

\[I(\theta) = E\left[-\frac{d^2}{d\theta^2} \log p(x|\theta)\right]\]

Equivalently: \(I(\theta) = \text{Var}\left(\frac{d}{d\theta} \log p(x|\theta)\right)\)

Cramer-Rao Lower Bound: for any unbiased estimator \(\hat{\theta}\) based on \(n\) i.i.d. samples, \(\text{Var}(\hat{\theta}) \geq \frac{1}{n \cdot I(\theta)}\)

Higher Fisher information means the parameter is easier to estimate (lower variance achievable).
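As a quick sanity check (a sketch, not part of the original lab), the expectation form of \(I(\theta)\) can be evaluated numerically for Bernoulli with a finite-difference second derivative and compared against the analytic \(1/(p(1-p))\):

```python
import numpy as np

def log_lik(x, p):
    # Bernoulli log-likelihood for a single observation x in {0, 1}
    return x * np.log(p) + (1 - x) * np.log(1 - p)

def fisher_info_numeric(p, h=1e-5):
    # I(p) = E[-d^2/dp^2 log p(x|p)], expectation over x in {0, 1}
    # with weights P(X=1) = p and P(X=0) = 1-p; central second difference
    info = 0.0
    for x, w in [(1, p), (0, 1 - p)]:
        d2 = (log_lik(x, p + h) - 2 * log_lik(x, p) + log_lik(x, p - h)) / h**2
        info += w * (-d2)
    return info

for p in [0.3, 0.5, 0.7]:
    print(f"p={p}: numeric I(p) = {fisher_info_numeric(p):.4f}, "
          f"analytic 1/(p(1-p)) = {1 / (p * (1 - p)):.4f}")
```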

def fisher_info_bernoulli(p):
    """
    Fisher information for Bernoulli(p).
    
    log p(x|p) = x*log(p) + (1-x)*log(1-p)
    d/dp log p = x/p - (1-x)/(1-p)
    d^2/dp^2 log p = -x/p^2 - (1-x)/(1-p)^2
    I(p) = E[-d^2/dp^2 log p] = 1/(p*(1-p))
    
    Args:
        p: success probability, 0 < p < 1
    Returns:
        Fisher information I(p)
    """
    return 1.0 / (p * (1 - p))


def fisher_info_gaussian_mu(sigma):
    """
    Fisher information for N(mu, sigma^2) with known sigma, w.r.t. mu.
    
    log p(x|mu) = -0.5*log(2*pi*sigma^2) - (x-mu)^2/(2*sigma^2)
    d^2/dmu^2 log p = -1/sigma^2
    I(mu) = 1/sigma^2
    
    Args:
        sigma: standard deviation (known)
    Returns:
        Fisher information I(mu)
    """
    return 1.0 / sigma**2


# Demo
print("=== Fisher Information Examples ===")
for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
    print(f"  Bernoulli(p={p}): I(p) = {fisher_info_bernoulli(p):.4f}")

print()
for sigma in [0.5, 1.0, 2.0, 5.0]:
    print(f"  Gaussian(sigma={sigma}): I(mu) = {fisher_info_gaussian_mu(sigma):.4f}")

print("\nNote: Bernoulli Fisher info is lowest at p=0.5 (hardest to distinguish small")
print("changes), and highest near 0 or 1. Gaussian Fisher info increases as sigma decreases.")
# Verify Cramer-Rao bound via simulation
# Bernoulli(p=0.3), N=50 samples, 10000 simulations

p_true = 0.3
N = 50
n_simulations = 10_000

np.random.seed(42)

# Run simulations: each time draw N Bernoulli samples and compute MLE
mle_estimates = np.zeros(n_simulations)
for i in range(n_simulations):
    samples = np.random.binomial(1, p_true, size=N)
    mle_estimates[i] = np.mean(samples)  # MLE for Bernoulli is sample mean

# Empirical variance of MLE
empirical_var = np.var(mle_estimates)

# Cramer-Rao lower bound: 1 / (N * I(p)) = p*(1-p)/N
cramer_rao_bound = 1.0 / (N * fisher_info_bernoulli(p_true))
# Equivalently: p*(1-p)/N
cr_bound_direct = p_true * (1 - p_true) / N

print("=== Cramer-Rao Bound Verification ===")
print(f"  True p = {p_true}, N = {N}, simulations = {n_simulations}")
print(f"")
print(f"  Empirical Var(p_hat)     = {empirical_var:.6f}")
print(f"  Cramer-Rao bound         = {cramer_rao_bound:.6f}")
print(f"  (= p*(1-p)/N             = {cr_bound_direct:.6f})")
print(f"")
print(f"  Var(p_hat) >= CR bound?  {empirical_var >= cramer_rao_bound * 0.95}")
print(f"  Ratio Var/CR_bound       = {empirical_var / cramer_rao_bound:.4f}")
print(f"  (Should be close to 1.0 since MLE achieves the bound for Bernoulli)")

# Histogram of MLE estimates
plt.figure(figsize=(8, 5))
plt.hist(mle_estimates, bins=50, density=True, alpha=0.7, color='steelblue', edgecolor='white')
plt.axvline(x=p_true, color='r', linestyle='--', linewidth=2, label=f'True p = {p_true}')
plt.axvline(x=np.mean(mle_estimates), color='green', linestyle='-', linewidth=2, 
            label=f'Mean of MLEs = {np.mean(mle_estimates):.4f}')

# Overlay the theoretical distribution: N(p, p(1-p)/N)
x_range = np.linspace(0, 0.6, 200)
theoretical_pdf = stats.norm.pdf(x_range, loc=p_true, scale=np.sqrt(cramer_rao_bound))
plt.plot(x_range, theoretical_pdf, 'k-', linewidth=2, label='Theoretical N(p, CR bound)')

plt.xlabel('MLE estimate of p')
plt.ylabel('Density')
plt.title(f'Distribution of MLE estimates (N={N}, {n_simulations} simulations)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

PART 5: Conjugate Priors

A conjugate prior is one where the posterior distribution has the same functional form as the prior.

Beta-Bernoulli Conjugacy

  • Prior: \(\theta \sim \text{Beta}(\alpha, \beta)\)

  • Likelihood: \(x_1, \ldots, x_n \sim \text{Bernoulli}(\theta)\)

  • Posterior: \(\theta | \text{data} \sim \text{Beta}(\alpha + k, \, \beta + n - k)\)

where \(k = \sum x_i\) (number of successes).

Gaussian-Gaussian Conjugacy

  • Prior: \(\mu \sim N(\mu_0, \sigma_0^2)\)

  • Likelihood: \(x_1, \ldots, x_n \sim N(\mu, \sigma^2)\) (known \(\sigma^2\))

  • Posterior: \(\mu | \text{data} \sim N(\mu_n, \sigma_n^2)\) with:

    • \(\sigma_n^2 = \left(\frac{1}{\sigma_0^2} + \frac{n}{\sigma^2}\right)^{-1}\)

    • \(\mu_n = \sigma_n^2 \left(\frac{\mu_0}{\sigma_0^2} + \frac{n \bar{x}}{\sigma^2}\right)\)

The prior “pseudo-counts” \(\alpha, \beta\) represent our prior belief strength.
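The code in this lab works through the Beta-Bernoulli case; as a complementary sketch (the function name `gaussian_conjugate_update` is our own), the Gaussian-Gaussian formulas above translate directly to code:

```python
import numpy as np

def gaussian_conjugate_update(mu0, sigma0_sq, sigma_sq, data):
    """Posterior N(mu_n, sigma_n^2) for mu, with known likelihood variance sigma_sq."""
    data = np.asarray(data, dtype=float)
    n = len(data)
    xbar = data.mean()
    # Precisions (inverse variances) add; data contributes n/sigma^2
    sigma_n_sq = 1.0 / (1.0 / sigma0_sq + n / sigma_sq)
    # Posterior mean is a precision-weighted average of prior mean and sample mean
    mu_n = sigma_n_sq * (mu0 / sigma0_sq + n * xbar / sigma_sq)
    return mu_n, sigma_n_sq

rng = np.random.default_rng(0)
data = rng.normal(5.0, 2.0, size=50)
mu_n, var_n = gaussian_conjugate_update(mu0=0.0, sigma0_sq=10.0, sigma_sq=4.0, data=data)
print(f"Sample mean = {data.mean():.4f}")
print(f"Posterior: N({mu_n:.4f}, {var_n:.4f})")
```

The posterior mean sits between the prior mean (0) and the sample mean, pulled toward the data as \(n\) grows.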

def beta_bernoulli_update(alpha_prior, beta_prior, data):
    """
    Compute and plot prior, likelihood, and posterior for Beta-Bernoulli model.
    
    Prior: Beta(alpha_prior, beta_prior)
    Data: sequence of 0s and 1s
    Posterior: Beta(alpha_prior + n_heads, beta_prior + n_tails)
    
    Returns:
        (alpha_post, beta_post)
    """
    data = np.asarray(data)
    n_heads = int(data.sum())
    n_tails = len(data) - n_heads
    
    alpha_post = alpha_prior + n_heads
    beta_post = beta_prior + n_tails
    
    x = np.linspace(0, 1, 500)
    
    # Prior
    prior_pdf = beta_dist.pdf(x, alpha_prior, beta_prior)
    
    # Likelihood (proportional to Beta(n_heads+1, n_tails+1) for visualization)
    likelihood_pdf = beta_dist.pdf(x, n_heads + 1, n_tails + 1)
    
    # Posterior
    posterior_pdf = beta_dist.pdf(x, alpha_post, beta_post)
    
    # Normalize for visual comparison
    prior_pdf_norm = prior_pdf / prior_pdf.max() if prior_pdf.max() > 0 else prior_pdf
    likelihood_pdf_norm = likelihood_pdf / likelihood_pdf.max() if likelihood_pdf.max() > 0 else likelihood_pdf
    posterior_pdf_norm = posterior_pdf / posterior_pdf.max() if posterior_pdf.max() > 0 else posterior_pdf
    
    plt.figure(figsize=(10, 6))
    plt.plot(x, prior_pdf_norm, 'b--', linewidth=2, 
             label=f'Prior: Beta({alpha_prior}, {beta_prior})')
    plt.plot(x, likelihood_pdf_norm, 'g:', linewidth=2, 
             label=f'Likelihood ({n_heads}H, {n_tails}T)')
    plt.plot(x, posterior_pdf_norm, 'r-', linewidth=2.5, 
             label=f'Posterior: Beta({alpha_post}, {beta_post})')
    
    # Mark posterior mean
    post_mean = alpha_post / (alpha_post + beta_post)
    plt.axvline(x=post_mean, color='red', linestyle='--', alpha=0.5, 
                label=f'Posterior mean = {post_mean:.3f}')
    
    plt.xlabel(r'$\theta$ (probability of heads)')
    plt.ylabel('Normalized density')
    plt.title('Beta-Bernoulli Conjugate Update')
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    print(f"Prior:     Beta({alpha_prior}, {beta_prior}), mean = {alpha_prior/(alpha_prior+beta_prior):.3f}")
    print(f"Data:      {n_heads} heads, {n_tails} tails (MLE = {n_heads/len(data):.3f})")
    print(f"Posterior: Beta({alpha_post}, {beta_post}), mean = {post_mean:.3f}")
    
    return alpha_post, beta_post


# Demo: weakly informative prior, observe 7H 3T
data = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
alpha_post, beta_post = beta_bernoulli_update(alpha_prior=2, beta_prior=2, data=data)
# Prior strength demo: weak, medium, and strong priors all observe 7H/3T

data = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])  # 7H, 3T
n_heads = int(data.sum())
n_tails = len(data) - n_heads

priors = [
    (1, 1, 'Weak: Beta(1,1) = Uniform'),
    (5, 5, 'Medium: Beta(5,5)'),
    (50, 50, 'Strong: Beta(50,50)'),
]

x = np.linspace(0, 1, 500)
colors = ['steelblue', 'darkorange', 'green']

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, (alpha, beta_val, label) in enumerate(priors):
    ax = axes[idx]
    alpha_post = alpha + n_heads
    beta_post = beta_val + n_tails
    
    prior_pdf = beta_dist.pdf(x, alpha, beta_val)
    posterior_pdf = beta_dist.pdf(x, alpha_post, beta_post)
    
    ax.plot(x, prior_pdf, '--', color=colors[idx], linewidth=2, label=f'Prior Beta({alpha},{beta_val})')
    ax.fill_between(x, posterior_pdf, alpha=0.3, color=colors[idx])
    ax.plot(x, posterior_pdf, '-', color=colors[idx], linewidth=2.5, 
            label=f'Post Beta({alpha_post},{beta_post})')
    
    post_mean = alpha_post / (alpha_post + beta_post)
    ax.axvline(x=post_mean, color='red', linestyle=':', alpha=0.7,
               label=f'Post mean={post_mean:.3f}')
    ax.axvline(x=0.7, color='gray', linestyle='--', alpha=0.5, label='MLE=0.7')
    
    ax.set_title(label)
    ax.set_xlabel(r'$\theta$')
    ax.set_ylabel('Density')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

plt.suptitle('Effect of Prior Strength (all see 7H/3T)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

print("Observation: With a weak prior, the posterior is dominated by the data (MLE=0.7).")
print("With a strong prior centered at 0.5, the posterior barely moves from 0.5.")
print("The prior 'pseudo-count' strength relative to data size determines the influence.")

PART 6: Posterior Predictive Distribution

The posterior predictive distribution integrates out the parameter uncertainty:

\[p(x_{\text{new}} | \text{data}) = \int p(x_{\text{new}} | \theta) \, p(\theta | \text{data}) \, d\theta\]

For the Beta-Bernoulli model: \(P(x_{\text{new}} = 1 | \text{data}) = E[\theta | \text{data}] = \frac{\alpha_{\text{post}}}{\alpha_{\text{post}} + \beta_{\text{post}}}\)

This is a weighted average of the prior mean and the MLE, where the weights depend on sample size relative to prior strength.
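That weighted-average claim can be verified in a few lines. A minimal sketch with a Beta(2, 2) prior and 7 heads in 10 flips (illustrative numbers, matching the data used elsewhere in this lab):

```python
alpha0, beta0 = 2, 2   # prior Beta(2, 2) pseudo-counts
k, n = 7, 10           # 7 heads in 10 flips

prior_mean = alpha0 / (alpha0 + beta0)      # 0.5
mle = k / n                                 # 0.7
w_data = n / (n + alpha0 + beta0)           # weight on the MLE

post_mean = (alpha0 + k) / (alpha0 + beta0 + n)
weighted = w_data * mle + (1 - w_data) * prior_mean

print(f"Posterior mean   = {post_mean:.4f}")
print(f"Weighted average = {weighted:.4f}  (weight on MLE = {w_data:.2f})")
```

The two quantities agree exactly; as \(n\) grows relative to \(\alpha_0 + \beta_0\), the weight on the MLE approaches 1.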

# Posterior predictive for Beta-Bernoulli

def posterior_predictive_bernoulli(alpha_post, beta_post):
    """
    Posterior predictive probability for the next observation being 1 (heads)
    under the Beta-Bernoulli model.
    
    P(x_new = 1 | data) = E[theta | data] = alpha / (alpha + beta)
    
    Args:
        alpha_post: posterior alpha parameter
        beta_post: posterior beta parameter
    Returns:
        P(x_new = 1 | data)
    """
    return alpha_post / (alpha_post + beta_post)


# Scenario: observe coin data incrementally and track posterior predictive
alpha_prior, beta_prior_val = 2, 2  # weakly informative prior
data_sequence = [1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1]  # 11H, 4T

alpha_running = alpha_prior
beta_running = beta_prior_val
predictives = [posterior_predictive_bernoulli(alpha_running, beta_running)]

print("=== Posterior Predictive After Each Observation ===")
print(f"Prior Beta({alpha_prior},{beta_prior_val}): P(next=H) = {predictives[0]:.4f}")

for i, obs in enumerate(data_sequence):
    if obs == 1:
        alpha_running += 1
    else:
        beta_running += 1
    pred = posterior_predictive_bernoulli(alpha_running, beta_running)
    predictives.append(pred)
    label = 'H' if obs == 1 else 'T'
    total = i + 1
    heads_so_far = alpha_running - alpha_prior
    print(f"  After obs {total:2d} ({label}): Beta({alpha_running},{beta_running}), "
          f"P(next=H) = {pred:.4f}, MLE = {heads_so_far/total:.4f}")

# Plot
mle_running = []
heads_count = 0
for i, obs in enumerate(data_sequence):
    heads_count += obs
    mle_running.append(heads_count / (i + 1))

plt.figure(figsize=(10, 5))
plt.plot(range(len(predictives)), predictives, 'ro-', linewidth=2, markersize=6, 
         label='Posterior predictive P(H|data)')
plt.plot(range(1, len(predictives)), mle_running, 'bs--', linewidth=1.5, markersize=5, 
         label='MLE (running proportion of H)', alpha=0.7)
plt.axhline(y=0.5, color='gray', linestyle=':', alpha=0.5)
plt.xlabel('Number of observations')
plt.ylabel('P(next flip = Heads)')
plt.title('Posterior Predictive vs MLE Over Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nFinal posterior: Beta({alpha_running}, {beta_running})")
print(f"Final posterior predictive P(next=H) = {predictives[-1]:.4f}")
print(f"Final MLE = {sum(data_sequence)/len(data_sequence):.4f}")
print("The posterior predictive is shrunk toward the prior mean (0.5) relative to the MLE.")

PART 7: Bayesian Deep Learning Basics

In standard deep learning, we find point estimates of weights via SGD. In Bayesian deep learning, we maintain a distribution over weights:

\[p(\mathbf{w} | \mathcal{D}) \propto p(\mathcal{D} | \mathbf{w}) \cdot p(\mathbf{w})\]

Types of Uncertainty

  • Epistemic uncertainty (model uncertainty): reducible with more data. Captured by the posterior distribution over weights.

  • Aleatoric uncertainty (data noise): irreducible, inherent in the data.

MC Dropout as Approximate Bayesian Inference

Gal & Ghahramani (2016) showed that dropout at test time approximates a Bayesian posterior:

  1. Train model with dropout

  2. At test time, run \(T\) forward passes with dropout still enabled

  3. The mean of predictions approximates \(E[y | x, \mathcal{D}]\)

  4. The variance of predictions captures epistemic uncertainty

Predictions far from training data will have higher variance (more uncertain).

# MC Dropout Demo
# Simple linear model y = 2x + 1 with dropout applied to weights

np.random.seed(42)

# True function: y = 2x + 1
x_train = np.linspace(0, 5, 20)
y_train = 2 * x_train + 1 + np.random.randn(20) * 0.5

# "Learned" weights (simulate having trained a model)
# w[0] = slope, w[1] = intercept
w = np.array([2.1, 0.95])

# Test points (including extrapolation)
x_test = np.array([3.0, 6.0, 10.0])

T = 100  # number of MC dropout forward passes
dropout_rate = 0.3
predictions = np.zeros((T, len(x_test)))

for t in range(T):
    # Apply dropout: randomly zero each weight with probability dropout_rate
    # Scale surviving weights by 1/(1-dropout_rate) to maintain expected output (inverted dropout)
    mask = (np.random.rand(len(w)) > dropout_rate).astype(float)
    w_dropped = w * mask / (1 - dropout_rate)
    
    # Forward pass: y = w[0]*x + w[1]
    predictions[t, :] = w_dropped[0] * x_test + w_dropped[1]

# Compute mean and std of predictions
pred_mean = predictions.mean(axis=0)
pred_std = predictions.std(axis=0)

print("=== MC Dropout Uncertainty Estimation ===")
print(f"Model: y = {w[0]}*x + {w[1]}")
print(f"Training range: x in [0, 5]")
print(f"Dropout rate: {dropout_rate}, T = {T} forward passes")
print()
print(f"{'x_test':>8} {'True y':>10} {'Mean pred':>12} {'Std pred':>10} {'95% CI':>20}")
print("-" * 65)
for i, x_val in enumerate(x_test):
    true_y = 2 * x_val + 1
    lo = pred_mean[i] - 1.96 * pred_std[i]
    hi = pred_mean[i] + 1.96 * pred_std[i]
    print(f"{x_val:8.1f} {true_y:10.2f} {pred_mean[i]:12.4f} {pred_std[i]:10.4f} [{lo:8.2f}, {hi:8.2f}]")

print("\nNote: Uncertainty (std) increases for x=10 (extrapolation beyond training range).")
# Plot MC dropout predictions with uncertainty bands

# Dense test grid for smooth plot
x_dense = np.linspace(-1, 12, 200)
T_dense = 100
preds_dense = np.zeros((T_dense, len(x_dense)))

np.random.seed(42)
for t in range(T_dense):
    mask = (np.random.rand(len(w)) > dropout_rate).astype(float)
    w_dropped = w * mask / (1 - dropout_rate)
    preds_dense[t, :] = w_dropped[0] * x_dense + w_dropped[1]

mean_dense = preds_dense.mean(axis=0)
std_dense = preds_dense.std(axis=0)

plt.figure(figsize=(10, 6))

# Plot individual MC samples (faint)
for t in range(min(20, T_dense)):
    plt.plot(x_dense, preds_dense[t], color='lightblue', alpha=0.3, linewidth=0.5)

# Uncertainty bands
plt.fill_between(x_dense, mean_dense - 2*std_dense, mean_dense + 2*std_dense,
                 alpha=0.2, color='steelblue', label=r'$\pm 2\sigma$ uncertainty')
plt.fill_between(x_dense, mean_dense - std_dense, mean_dense + std_dense,
                 alpha=0.3, color='steelblue', label=r'$\pm 1\sigma$ uncertainty')

# Mean prediction
plt.plot(x_dense, mean_dense, 'b-', linewidth=2.5, label='MC Dropout mean')

# True function
plt.plot(x_dense, 2*x_dense + 1, 'k--', linewidth=2, label='True: y = 2x + 1')

# Training data
plt.scatter(x_train, y_train, c='red', s=40, zorder=5, label='Training data', edgecolors='darkred')

# Mark test points
for i, x_val in enumerate(x_test):
    plt.plot(x_val, pred_mean[i], 'g^', markersize=12, zorder=6)

plt.axvspan(0, 5, alpha=0.05, color='green', label='Training range')
plt.xlabel('x')
plt.ylabel('y')
plt.title('MC Dropout: Predictions with Epistemic Uncertainty')
plt.legend(loc='upper left', fontsize=9)
plt.grid(True, alpha=0.3)
plt.xlim(-1, 12)
plt.tight_layout()
plt.show()

print("The uncertainty bands widen outside the training range [0, 5],")
print("reflecting greater epistemic uncertainty during extrapolation.")

Summary

This lab covered the core probability and Bayesian concepts underpinning modern deep learning:

  • Expectation & Variance: fundamental summary statistics; linearity of expectation holds universally.

  • Bayes’ Rule: inverts conditional probabilities; reveals the base-rate fallacy (disease test: P(disease|+) ≈ 8.8%).

  • Bayesian Updating: incorporates evidence sequentially; the posterior after one observation becomes the prior for the next.

  • MLE: the point estimate maximizing the data likelihood; the sample mean for Bernoulli, (sample mean, biased variance) for Gaussian.

  • Fisher Information: quantifies how estimable a parameter is; the Cramer-Rao bound gives the minimum achievable variance of an unbiased estimator.

  • Conjugate Priors: Beta-Bernoulli conjugacy makes posterior updates analytical; prior strength controls how much the data moves the posterior.

  • Posterior Predictive: integrates out parameter uncertainty; shrinks the MLE toward the prior mean.

  • Bayesian DL: distributions over weights capture epistemic uncertainty; MC Dropout provides a practical approximation.
