Lab 02: Information Theory & Entropy¶
Sources:
Deep Learning Interviews by Shlomo Kashani (PDF:
resources/2201.00650v2.pdf)Speech and Language Processing (Jurafsky & Martin), Ch 3.3, 3.7
Topics covered:
Shannon’s Entropy
Cross-Entropy
KL Divergence
Mutual Information
Perplexity and its relation to entropy
Information Gain for classification
Jensen’s Inequality
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
np.random.seed(42)
%matplotlib inline
PART 1: Shannon’s Entropy¶
Reference: DLI – “Shannon’s Entropy”, SLP Ch 3.7
Shannon entropy measures the average amount of “surprise” or “information” in a random variable \(X\):
\[H(X) = -\sum_{x} p(x) \log_2 p(x)\]
Properties:
\(H(X) \geq 0\) (entropy is non-negative)
\(H(X)\) is maximized when \(X\) is uniformly distributed over its support
\(H(X) = 0\) only when one outcome has probability 1 (no uncertainty)
Binary entropy formula (Bernoulli random variable with \(P(X=1) = p\)):
\[H(p) = -p \log_2 p - (1-p) \log_2 (1-p)\]
The binary entropy function reaches its maximum of 1 bit at \(p = 0.5\).
def entropy(probs):
    """
    Shannon entropy H(X) = -sum(p * log2(p)), in bits.

    Zero-probability entries are dropped before the logarithm, which
    implements the 0 * log(0) = 0 convention.
    """
    p = np.asarray(probs, dtype=float)
    support = p[p > 0]
    return -(support * np.log2(support)).sum()
# Quick tests
# Expected: fair coin -> 1 bit, certain outcome -> 0 bits, fair die -> log2(6) ~ 2.585 bits.
print("Entropy of fair coin [0.5, 0.5]:", entropy([0.5, 0.5]), "bits")
print("Entropy of certain outcome [1, 0]:", entropy([1.0, 0.0]), "bits")
print("Entropy of fair die [1/6]*6:", entropy([1/6]*6), "bits")
def binary_entropy(p):
    """Entropy (in bits) of a Bernoulli random variable with P(X=1) = p."""
    # Degenerate cases: a certain outcome carries no information.
    if p in (0, 1):
        return 0.0
    q = 1 - p
    return -p * np.log2(p) - q * np.log2(q)
# Quick tests
# Endpoints p=0 and p=1 are certain outcomes -> 0 bits; p=0.5 is the maximum (1 bit).
print("H(0.0) =", binary_entropy(0.0))
print("H(0.5) =", binary_entropy(0.5))
print("H(0.9) =", binary_entropy(0.9))
print("H(1.0) =", binary_entropy(1.0))
# Plot binary entropy curve and mark maximum at p=0.5
# Sample strictly inside (0, 1); binary_entropy handles the exact endpoints,
# but the open interval gives a smooth curve for plotting.
p_values = np.linspace(0.001, 0.999, 200)
h_values = np.array([binary_entropy(p) for p in p_values])
plt.figure(figsize=(8, 5))
plt.plot(p_values, h_values, 'b-', linewidth=2)
plt.scatter([0.5], [binary_entropy(0.5)], color='red', s=100, zorder=5,
            label=f'Max at p=0.5, H={binary_entropy(0.5):.2f} bit')
plt.xlabel('p', fontsize=12)
plt.ylabel('H(p) [bits]', fontsize=12)
plt.title('Binary Entropy Function', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Compare fair die vs loaded die entropy
fair_die = np.array([1/6] * 6)
loaded_die = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.5])
h_fair = entropy(fair_die)
h_loaded = entropy(loaded_die)
print(f"Fair die probabilities: {fair_die}")
print(f"Loaded die probabilities: {loaded_die}")
print(f"")
print(f"Entropy of fair die: {h_fair:.4f} bits")
print(f"Entropy of loaded die: {h_loaded:.4f} bits")
print(f"")
print(f"Fair die has HIGHER entropy ({h_fair:.4f} > {h_loaded:.4f}).")
print(f"This is because the fair die has more uncertainty (uniform distribution).")
print(f"The loaded die is more predictable (outcome 6 is heavily favored).")
# Entropy maximization demo:
# Generate 1000 random distributions over K=8 outcomes,
# compute entropy of each, show uniform has the highest.
K = 8
n_samples = 1000
# Generate random distributions using Dirichlet(1,...,1)
# (an all-ones concentration vector samples uniformly from the probability simplex).
random_dists = np.random.dirichlet(np.ones(K), size=n_samples)
entropies = np.array([entropy(d) for d in random_dists])
uniform_entropy = entropy(np.ones(K) / K)  # theoretical maximum: log2(8) = 3 bits
plt.figure(figsize=(8, 5))
plt.hist(entropies, bins=40, edgecolor='black', alpha=0.7, label='Random distributions')
plt.axvline(uniform_entropy, color='red', linewidth=2, linestyle='--',
            label=f'Uniform entropy = {uniform_entropy:.4f} bits')
plt.xlabel('Entropy (bits)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title(f'Entropy of {n_samples} random distributions over K={K} outcomes', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Uniform entropy (theoretical max): {uniform_entropy:.4f} bits")
print(f"Max entropy among random samples: {entropies.max():.4f} bits")
print(f"Mean entropy among random samples: {entropies.mean():.4f} bits")
# The 1e-10 slack tolerates floating-point round-off at the boundary.
print(f"All random samples have entropy <= uniform: {np.all(entropies <= uniform_entropy + 1e-10)}")
PART 2: Cross-Entropy¶
Reference: DLI – “The Logit Function and Entropy”, SLP Ch 3.7
Cross-entropy between a true distribution \(p\) and a model distribution \(q\):
\[H(p, q) = -\sum_{x} p(x) \log_2 q(x)\]
Properties:
\(H(p, q) \geq H(p)\) – cross-entropy is at least as large as entropy (equality iff \(p = q\))
Relation to KL divergence: \(H(p, q) = H(p) + D_{KL}(p \| q)\)
Cross-entropy is the most commonly used loss function for classification in deep learning
def cross_entropy(p, q):
    """
    Cross-entropy H(p, q) = -sum(p * log2(q)), in bits.

    The model distribution q is clipped to [1e-15, 1] so that log2
    never receives an exact zero.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)
    safe_q = np.clip(q_arr, 1e-15, 1.0)
    return -(p_arr * np.log2(safe_q)).sum()
# Quick test: cross-entropy of p with itself should equal entropy of p
# (the KL-divergence gap vanishes when the model matches the truth).
p_test = np.array([0.25, 0.25, 0.25, 0.25])
print(f"H(p) = {entropy(p_test):.4f} bits")
print(f"H(p, p) = {cross_entropy(p_test, p_test):.4f} bits (should match H(p))")
# Verify H(p, q) >= H(p) for sample distributions
p = np.array([0.25, 0.25, 0.25, 0.25])
q = np.array([0.1, 0.2, 0.3, 0.4])
h_p = entropy(p)
h_pq = cross_entropy(p, q)
print(f"p = {p}")
print(f"q = {q}")
print(f"")
print(f"H(p) = {h_p:.4f} bits")
print(f"H(p, q) = {h_pq:.4f} bits")
print(f"")
# The 1e-10 slack tolerates floating-point round-off in the comparison.
print(f"H(p, q) >= H(p)? {h_pq >= h_p - 1e-10} (difference = {h_pq - h_p:.4f} bits)")
print(f"")
print(f"The difference H(p,q) - H(p) = {h_pq - h_p:.4f} bits is the KL divergence D_KL(p||q).")
PART 3: KL Divergence¶
Reference: DLI – “Kullback-Leibler Divergence (KLD)”
KL divergence measures how different distribution \(q\) is from distribution \(p\):
\[D_{KL}(p \| q) = \sum_{x} p(x) \ln \frac{p(x)}{q(x)}\]
Properties:
\(D_{KL}(p \| q) \geq 0\) (Gibbs’ inequality), with equality iff \(p = q\)
Not symmetric: \(D_{KL}(p \| q) \neq D_{KL}(q \| p)\) in general
\(D_{KL}(p \| q)\) is not a true distance metric (no symmetry, no triangle inequality)
Uses natural log (nats) by convention in information theory / ML
def kl_divergence(p, q):
    """
    Kullback-Leibler divergence D_KL(p || q) = sum(p * ln(p / q)), in nats.

    q is clipped away from zero for numerical safety, and terms where
    p is zero are skipped (they contribute 0 by convention).
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.clip(np.asarray(q, dtype=float), 1e-15, 1.0)
    support = p_arr > 0
    log_ratio = np.log(p_arr[support] / q_arr[support])
    return np.sum(p_arr[support] * log_ratio)
# Quick tests
# Sanity check: D_KL(p || p) = 0 (the equality case of Gibbs' inequality).
p_test = np.array([0.5, 0.5])
print(f"D_KL(p || p) = {kl_divergence(p_test, p_test):.6f} nats (should be 0)")
print(f"D_KL([0.5,0.5] || [0.9,0.1]) = {kl_divergence([0.5,0.5], [0.9,0.1]):.6f} nats")
# Demonstrate asymmetry: D_KL(p||q) != D_KL(q||p)
p = np.array([0.4, 0.6])
q = np.array([0.8, 0.2])
kl_pq = kl_divergence(p, q)
kl_qp = kl_divergence(q, p)
print(f"p = {p}")
print(f"q = {q}")
print(f"")
print(f"D_KL(p || q) = {kl_pq:.6f} nats")
print(f"D_KL(q || p) = {kl_qp:.6f} nats")
print(f"")
print(f"Are they equal? {np.isclose(kl_pq, kl_qp)}")
print(f"Difference: |D_KL(p||q) - D_KL(q||p)| = {abs(kl_pq - kl_qp):.6f} nats")
print(f"")
print(f"KL divergence is NOT symmetric -- it is not a true distance metric.")
def kl_gaussians(mu1, sigma1, mu2, sigma2):
    """
    Closed-form KL divergence between two univariate Gaussians, in nats:

        D_KL(N(mu1, sigma1^2) || N(mu2, sigma2^2))
          = log(sigma2/sigma1) + (sigma1^2 + (mu1-mu2)^2) / (2*sigma2^2) - 1/2
    """
    scale_term = np.log(sigma2 / sigma1)
    quadratic_term = (sigma1 ** 2 + (mu1 - mu2) ** 2) / (2 * sigma2 ** 2)
    return scale_term + quadratic_term - 0.5
# Plot KL divergence as a function of mu2 (varying mean of q)
# With equal unit variances the closed form reduces to (mu1 - mu2)^2 / 2,
# a parabola whose minimum (KL = 0) sits at mu2 = mu1.
mu1, sigma1 = 0.0, 1.0
sigma2 = 1.0
mu2_values = np.linspace(-5, 5, 200)
kl_values = [kl_gaussians(mu1, sigma1, mu2, sigma2) for mu2 in mu2_values]
plt.figure(figsize=(8, 5))
plt.plot(mu2_values, kl_values, 'b-', linewidth=2)
plt.xlabel(r'$\mu_2$', fontsize=13)
plt.ylabel(r'$D_{KL}(\mathcal{N}(0,1) \| \mathcal{N}(\mu_2,1))$ [nats]', fontsize=12)
plt.title(r'KL divergence between $\mathcal{N}(0,1)$ and $\mathcal{N}(\mu_2,1)$', fontsize=13)
plt.axvline(0, color='red', linestyle='--', alpha=0.5, label=r'$\mu_2 = \mu_1 = 0$ (KL=0)')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"KL at mu2=0 (same as mu1): {kl_gaussians(mu1, sigma1, 0.0, sigma2):.6f} nats")
print(f"KL at mu2=2: {kl_gaussians(mu1, sigma1, 2.0, sigma2):.6f} nats")
print(f"KL at mu2=-3: {kl_gaussians(mu1, sigma1, -3.0, sigma2):.6f} nats")
# DLI-style problem:
# p = [0.36, 0.48, 0.16], q = [1/3, 1/3, 1/3]
# Compute both directions of KL divergence.
p = np.array([0.36, 0.48, 0.16])
q = np.array([1/3, 1/3, 1/3])
kl_pq = kl_divergence(p, q)
kl_qp = kl_divergence(q, p)
print("DLI-style problem")
print(f"p = {p}")
print(f"q = [{1/3:.4f}, {1/3:.4f}, {1/3:.4f}] (uniform)")
print(f"")
print(f"D_KL(p || q) = {kl_pq:.6f} nats")
print(f"D_KL(q || p) = {kl_qp:.6f} nats")
print(f"")
print("Interpretation:")
print(" D_KL(p || q) -- 'forward KL': measures info lost when q (uniform)")
print(" is used to approximate the true distribution p.")
print(" D_KL(q || p) -- 'reverse KL': measures info lost when p")
print(" is used to approximate q (uniform).")
print(f"")
print(f"In ML, D_KL(p || q) is typically used where p is the true data")
print(f"distribution and q is the model distribution.")
PART 4: Mutual Information¶
Reference: DLI – “Mutual Information”
Mutual information measures how much knowing \(X\) tells us about \(Y\) (and vice versa):
\[I(X; Y) = \sum_{x, y} p(x, y) \ln \frac{p(x, y)}{p(x)\, p(y)}\]
Properties:
\(I(X; Y) \geq 0\), with equality iff \(X\) and \(Y\) are independent
\(I(X; Y) = I(Y; X)\) (symmetric)
\(I(X; Y) \leq \min(H(X), H(Y))\)
def mutual_information(joint_prob):
    """
    Mutual information I(X;Y) from a joint probability table, in nats.

    joint_prob: 2D array of shape (|X|, |Y|) whose entries sum to 1.
    Only cells where the joint and both marginals are positive
    contribute, matching the 0 * log(0) = 0 convention.
    """
    joint = np.asarray(joint_prob, dtype=float)
    px = joint.sum(axis=1)  # marginal p(X): sum over Y
    py = joint.sum(axis=0)  # marginal p(Y): sum over X
    total = 0.0
    for i, row in enumerate(joint):
        for j, pxy in enumerate(row):
            if pxy > 0 and px[i] > 0 and py[j] > 0:
                total += pxy * np.log(pxy / (px[i] * py[j]))
    return total
# Quick test with a simple joint distribution
joint_test = np.array([[0.3, 0.1],
                       [0.1, 0.5]])
print(f"Joint distribution:\n{joint_test}")
print(f"I(X;Y) = {mutual_information(joint_test):.6f} nats")
# Verify I(X;Y) = 0 for independent variables
# Independent means joint = outer product of marginals
p_x = np.array([0.3, 0.7])
p_y = np.array([0.4, 0.6])
joint_independent = np.outer(p_x, p_y)
mi_indep = mutual_information(joint_independent)
print("Independent variables:")
print(f" p(X) = {p_x}")
print(f" p(Y) = {p_y}")
print(f" Joint = outer product of marginals:")
print(f" {joint_independent}")
print(f" I(X;Y) = {mi_indep:.10f} nats (should be ~0)")
print(f" Is essentially zero? {np.isclose(mi_indep, 0, atol=1e-10)}")
# Example with correlated variables: I(X;Y) > 0
# Joint distribution where X and Y are correlated
# (most probability mass lies on the diagonal, so X predicts Y well).
joint_correlated = np.array([[0.4, 0.05],
                             [0.05, 0.5]])
mi_corr = mutual_information(joint_correlated)
p_x_corr = joint_correlated.sum(axis=1)
p_y_corr = joint_correlated.sum(axis=0)
print("Correlated variables:")
print(f" Joint distribution:")
print(f" {joint_correlated}")
print(f" Marginals: p(X) = {p_x_corr}, p(Y) = {p_y_corr}")
print(f" I(X;Y) = {mi_corr:.6f} nats")
print(f" I(X;Y) > 0? {mi_corr > 0}")
print(f"")
print(f" When X=0, Y=0 is very likely (0.4/0.45 = {0.4/0.45:.2%}),")
print(f" and when X=1, Y=1 is very likely (0.5/0.55 = {0.5/0.55:.2%}).")
print(f" So knowing X tells us a lot about Y -> high mutual information.")
PART 5: Perplexity¶
Reference: SLP Ch 3.3 – Evaluating Language Models: Perplexity
Perplexity is the inverse probability of the test set, normalized by the number of words \(N\):
\[PP(W) = P(w_1 w_2 \ldots w_N)^{-1/N}\]
Relation to entropy:
\[PP = 2^{H}\]
where \(H\) is the cross-entropy of the model on the test data.
Interpretation: Lower perplexity = better model (less “surprised” by the data). A perplexity of \(k\) means the model is as uncertain as if it had to choose uniformly among \(k\) options at each step.
def perplexity_from_probs(probs):
    """
    Perplexity of a sequence from its per-word probabilities:
    PP = exp(-(1/N) * sum(ln(p_i))), the exponentiated average
    negative log-likelihood.
    """
    p = np.asarray(probs, dtype=float)
    total_log_prob = np.log(p).sum()
    return np.exp(-total_log_prob / len(p))
def entropy_to_perplexity(H):
    """Perplexity equivalent of an entropy of H bits: PP = 2^H."""
    return pow(2, H)
def perplexity_to_entropy(PP):
    """Entropy (bits) equivalent of a perplexity PP: H = log2(PP)."""
    bits = np.log2(PP)
    return bits
# Quick tests
print("--- Perplexity utilities ---")
print(f"Entropy 3.0 bits -> Perplexity {entropy_to_perplexity(3.0):.2f}")
print(f"Perplexity 8.0 -> Entropy {perplexity_to_entropy(8.0):.2f} bits")
# The two conversions are inverses, so the round trip recovers H.
print(f"Round-trip: H=5 -> PP={entropy_to_perplexity(5.0):.2f} -> H={perplexity_to_entropy(entropy_to_perplexity(5.0)):.2f}")
# Compare uniform model (V=1000) vs frequency-based model perplexity
V = 1000 # vocabulary size
test_sequence = ["the", "cat", "sat", "on", "the", "mat", "and", "the", "dog", "ran"]
# Model A: uniform probability 1/V for every word
probs_A = np.array([1/V] * len(test_sequence))
# Model B: frequency-based probabilities (mock)
freq_probs = {
    "the": 0.07, "cat": 0.01, "sat": 0.005, "on": 0.03,
    "mat": 0.002, "and": 0.04, "dog": 0.01, "ran": 0.005
}
probs_B = np.array([freq_probs[w] for w in test_sequence])
pp_A = perplexity_from_probs(probs_A)  # uniform model: perplexity equals V
pp_B = perplexity_from_probs(probs_B)
print(f"Test sequence: {test_sequence}")
print(f"")
print(f"Model A (uniform, V={V}):")
print(f" Probabilities: {probs_A[:3]}... (all {1/V})")
print(f" Perplexity: {pp_A:.2f}")
print(f"")
print(f"Model B (frequency-based):")
print(f" Probabilities: {probs_B}")
print(f" Perplexity: {pp_B:.2f}")
print(f"")
print(f"Model B has LOWER perplexity ({pp_B:.2f} < {pp_A:.2f}), so it is the better model.")
print(f"The frequency-based model is less 'surprised' by common words like 'the'.")
PART 6: Information Gain¶
Reference: DLI – “Classification and Information Gain”
Information Gain (IG) is the reduction in entropy achieved by splitting a dataset:
\[IG = H(\text{parent}) - \sum_{\text{child}} \frac{|\text{child}|}{|\text{parent}|} H(\text{child})\]
Used to select the best feature/threshold for splitting in decision trees (ID3, C4.5). Higher information gain = better split (more reduction in uncertainty).
def _label_entropy(labels):
    """Entropy (bits) of a label array, estimated from class frequencies."""
    labels = np.array(labels)
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / counts.sum()
    # Inline Shannon entropy; every prob here is > 0 by construction.
    return -np.sum(probs * np.log2(probs))
def information_gain(parent_labels, left_labels, right_labels):
    """
    Information gain (bits) from splitting parent into left and right.

    IG = H(parent) - (|left|/|parent|)*H(left) - (|right|/|parent|)*H(right)

    Returns 0.0 for an empty parent: there is no uncertainty to reduce,
    and the guard also prevents a ZeroDivisionError in the weight terms.
    """
    parent_labels = np.array(parent_labels)
    left_labels = np.array(left_labels)
    right_labels = np.array(right_labels)
    n_parent = len(parent_labels)
    # Guard the degenerate case before computing the child weights.
    if n_parent == 0:
        return 0.0
    n_left = len(left_labels)
    n_right = len(right_labels)
    h_parent = _label_entropy(parent_labels)
    h_left = _label_entropy(left_labels)
    h_right = _label_entropy(right_labels)
    # Weighted-average child entropy subtracted from the parent entropy.
    ig = h_parent - (n_left / n_parent) * h_left - (n_right / n_parent) * h_right
    return ig
# Quick test
parent = [0, 0, 0, 1, 1, 1]
left = [0, 0, 0] # pure
right = [1, 1, 1] # pure
# A perfect split removes all uncertainty: IG = H(parent) = 1 bit here.
print(f"Perfect split IG: {information_gain(parent, left, right):.4f} bits")
left2 = [0, 0, 1] # impure
right2 = [0, 1, 1] # impure
print(f"Impure split IG: {information_gain(parent, left2, right2):.4f} bits")
# Find best split threshold for a simple dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1, 1])
best_ig = -1  # any real split has IG >= 0, so -1 is a safe initial "worst"
best_threshold = None
# Try midpoints between consecutive feature values as thresholds
thresholds = (X[:-1] + X[1:]) / 2.0
print("Threshold | Left labels | Right labels | IG")
print("-" * 75)
for t in thresholds:
    left_mask = X <= t
    right_mask = X > t
    left_labels = y[left_mask]
    right_labels = y[right_mask]
    ig = information_gain(y, left_labels, right_labels)
    print(f" {t:5.1f} | {str(left_labels):20s} | {str(right_labels):20s} | {ig:.4f}")
    if ig > best_ig:
        best_ig = ig
        best_threshold = t
print(f"")
print(f"Best threshold: {best_threshold}")
print(f"Best information gain: {best_ig:.4f} bits")
PART 7: Jensen’s Inequality¶
Reference: DLI – “Jensen’s inequality”
For a concave function \(f\) (e.g., \(\log\)): \(f(E[X]) \geq E[f(X)]\)
For a convex function \(f\) (e.g., \(x^2\)): \(f(E[X]) \leq E[f(X)]\)
Jensen’s inequality is fundamental to proving that KL divergence is non-negative (Gibbs’ inequality), and it appears throughout information theory and variational inference.
# Verify Jensen's inequality numerically
x = np.array([1.0, 2.0, 5.0, 10.0, 0.5])
# --- Concave case: f = log ---
# Jensen: log(E[X]) >= E[log(X)]
mean_x = np.mean(x)
log_of_mean = np.log(mean_x)
mean_of_log = np.mean(np.log(x))
print("=== Concave function: f(x) = log(x) ===")
print(f"x = {x}")
print(f"E[X] = {mean_x:.4f}")
print(f"log(E[X]) = {log_of_mean:.6f}")
print(f"E[log(X)] = {mean_of_log:.6f}")
# The 1e-10 slack tolerates floating-point round-off in the comparison.
print(f"log(E[X]) >= E[log(X)]? {log_of_mean >= mean_of_log - 1e-10}")
print(f"Gap: {log_of_mean - mean_of_log:.6f}")
print()
# --- Convex case: f = x^2 ---
# Jensen: (E[X])^2 <= E[X^2]
mean_squared = mean_x ** 2
mean_of_squares = np.mean(x ** 2)
print("=== Convex function: f(x) = x^2 ===")
print(f"x = {x}")
print(f"E[X] = {mean_x:.4f}")
print(f"(E[X])^2 = {mean_squared:.6f}")
print(f"E[X^2] = {mean_of_squares:.6f}")
print(f"(E[X])^2 <= E[X^2]? {mean_squared <= mean_of_squares + 1e-10}")
print(f"Gap: {mean_of_squares - mean_squared:.6f}")
print()
# For f(x) = x^2 the Jensen gap E[X^2] - (E[X])^2 is the variance by definition.
print(f"Note: E[X^2] - (E[X])^2 = Var(X) = {np.var(x):.6f}")
print(f"This shows Jensen's gap for x^2 is exactly the variance!")
Summary¶
Concept |
Formula |
Key Property |
|---|---|---|
Shannon Entropy |
\(H(X) = -\sum p(x) \log_2 p(x)\) |
Maximized for uniform distribution |
Cross-Entropy |
\(H(p,q) = -\sum p(x) \log_2 q(x)\) |
\(H(p,q) \geq H(p)\) |
KL Divergence |
\(D_{KL}(p \| q) = \sum p(x) \ln \frac{p(x)}{q(x)}\) |
\(\geq 0\), not symmetric |
Mutual Information |
\(I(X;Y) = H(X) + H(Y) - H(X,Y)\) |
\(= 0\) iff independent |
Perplexity |
\(PP = 2^H\) |
Lower = better model |
Information Gain |
\(IG = H(\text{parent}) - \sum \frac{|\text{child}|}{|\text{parent}|} H(\text{child})\) |
Higher gain = better split |
Jensen’s Inequality |
concave \(f\): \(f(E[X]) \geq E[f(X)]\) |
Proves Gibbs’ inequality |
Connections:
Cross-entropy = Entropy + KL divergence: \(H(p,q) = H(p) + D_{KL}(p \| q)\)
Mutual information = KL divergence between joint and product of marginals
Perplexity is the exponential of cross-entropy (a more interpretable metric)
Jensen’s inequality underlies the proof that \(D_{KL} \geq 0\)