Lab 02: Information Theory & Entropy

Sources:

  • Deep Learning Interviews by Shlomo Kashani (PDF: resources/2201.00650v2.pdf)

  • Speech and Language Processing (Jurafsky & Martin), Ch 3.3, 3.7

Topics covered:

  1. Shannon’s Entropy

  2. Cross-Entropy

  3. KL Divergence

  4. Mutual Information

  5. Perplexity and its relation to entropy

  6. Information Gain for classification

  7. Jensen’s Inequality

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

np.random.seed(42)
%matplotlib inline

PART 1: Shannon’s Entropy

Reference: DLI – “Shannon’s Entropy”, SLP Ch 3.7

Shannon entropy measures the average amount of “surprise” or “information” in a random variable \(X\):

\[H(X) = -\sum_{x} p(x) \cdot \log_2 p(x)\]

Properties:

  • \(H(X) \geq 0\) (entropy is non-negative)

  • \(H(X)\) is maximized when \(X\) is uniformly distributed over its support

  • \(H(X) = 0\) only when one outcome has probability 1 (no uncertainty)

Binary entropy formula (Bernoulli random variable with \(P(X=1) = p\)):

\[H(p) = -p \log_2 p - (1-p) \log_2 (1-p)\]

The binary entropy function reaches its maximum of 1 bit at \(p = 0.5\).

def entropy(probs):
    """
    Compute Shannon entropy H(X) = -sum(p * log2(p)).
    Handle p=0 by treating 0*log(0) = 0.
    """
    probs = np.array(probs, dtype=float)
    # Filter out zero probabilities to avoid log(0)
    mask = probs > 0
    return -np.sum(probs[mask] * np.log2(probs[mask]))


# Quick tests
print("Entropy of fair coin [0.5, 0.5]:", entropy([0.5, 0.5]), "bits")
print("Entropy of certain outcome [1, 0]:", entropy([1.0, 0.0]), "bits")
print("Entropy of fair die [1/6]*6:", entropy([1/6]*6), "bits")
def binary_entropy(p):
    """Compute entropy of a Bernoulli random variable with P(X=1) = p."""
    if p == 0 or p == 1:
        return 0.0
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)


# Quick tests
print("H(0.0) =", binary_entropy(0.0))
print("H(0.5) =", binary_entropy(0.5))
print("H(0.9) =", binary_entropy(0.9))
print("H(1.0) =", binary_entropy(1.0))
# Plot binary entropy curve and mark maximum at p=0.5
p_values = np.linspace(0.001, 0.999, 200)
h_values = np.array([binary_entropy(p) for p in p_values])

plt.figure(figsize=(8, 5))
plt.plot(p_values, h_values, 'b-', linewidth=2)
plt.scatter([0.5], [binary_entropy(0.5)], color='red', s=100, zorder=5,
            label=f'Max at p=0.5, H={binary_entropy(0.5):.2f} bit')
plt.xlabel('p', fontsize=12)
plt.ylabel('H(p) [bits]', fontsize=12)
plt.title('Binary Entropy Function', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Compare fair die vs loaded die entropy
fair_die = np.array([1/6] * 6)
loaded_die = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.5])

h_fair = entropy(fair_die)
h_loaded = entropy(loaded_die)

print(f"Fair die probabilities:   {fair_die}")
print(f"Loaded die probabilities: {loaded_die}")
print(f"")
print(f"Entropy of fair die:   {h_fair:.4f} bits")
print(f"Entropy of loaded die: {h_loaded:.4f} bits")
print(f"")
print(f"Fair die has HIGHER entropy ({h_fair:.4f} > {h_loaded:.4f}).")
print(f"This is because the fair die has more uncertainty (uniform distribution).")
print(f"The loaded die is more predictable (outcome 6 is heavily favored).")
# Entropy maximization demo:
# Generate 1000 random distributions over K=8 outcomes,
# compute entropy of each, show uniform has the highest.

K = 8
n_samples = 1000

# Generate random distributions using Dirichlet(1,...,1)
random_dists = np.random.dirichlet(np.ones(K), size=n_samples)
entropies = np.array([entropy(d) for d in random_dists])

uniform_entropy = entropy(np.ones(K) / K)

plt.figure(figsize=(8, 5))
plt.hist(entropies, bins=40, edgecolor='black', alpha=0.7, label='Random distributions')
plt.axvline(uniform_entropy, color='red', linewidth=2, linestyle='--',
            label=f'Uniform entropy = {uniform_entropy:.4f} bits')
plt.xlabel('Entropy (bits)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title(f'Entropy of {n_samples} random distributions over K={K} outcomes', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Uniform entropy (theoretical max): {uniform_entropy:.4f} bits")
print(f"Max entropy among random samples:  {entropies.max():.4f} bits")
print(f"Mean entropy among random samples:  {entropies.mean():.4f} bits")
print(f"All random samples have entropy <= uniform: {np.all(entropies <= uniform_entropy + 1e-10)}")

PART 2: Cross-Entropy

Reference: DLI – “The Logit Function and Entropy”, SLP Ch 3.7

Cross-entropy between a true distribution \(p\) and a model distribution \(q\):

\[H(p, q) = -\sum_{x} p(x) \cdot \log_2 q(x)\]

Properties:

  • \(H(p, q) \geq H(p)\) – cross-entropy is at least as large as entropy (equality iff \(p = q\))

  • Relation to KL divergence: \(H(p, q) = H(p) + D_{KL}(p \| q)\)

  • Cross-entropy is the most commonly used loss function for classification in deep learning
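To see why the last bullet holds, note that when the true distribution \(p\) is one-hot (as in classification with hard labels), cross-entropy collapses to \(-\log q(\text{correct class})\), the standard classification loss. A quick sketch (the class probabilities below are made up for illustration):

```python
import numpy as np

# With a one-hot "true" distribution p, cross-entropy reduces to
# -log q(correct class). Probabilities here are arbitrary examples.
q_model = np.array([0.7, 0.2, 0.1])   # model's predicted class probabilities
p_onehot = np.array([1.0, 0.0, 0.0])  # true class is class 0

loss_full = -np.sum(p_onehot * np.log2(q_model))  # full cross-entropy sum
loss_short = -np.log2(q_model[0])                 # shortcut: -log2 of true class
print(f"H(p, q) = {loss_full:.4f} bits, -log2 q(true class) = {loss_short:.4f} bits")
```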

def cross_entropy(p, q):
    """
    Compute cross-entropy H(p, q) = -sum(p * log2(q)).
    Clip q to [1e-15, 1] to avoid log(0).
    """
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    q_clipped = np.clip(q, 1e-15, 1.0)
    return -np.sum(p * np.log2(q_clipped))


# Quick test: cross-entropy of p with itself should equal entropy of p
p_test = np.array([0.25, 0.25, 0.25, 0.25])
print(f"H(p)    = {entropy(p_test):.4f} bits")
print(f"H(p, p) = {cross_entropy(p_test, p_test):.4f} bits  (should match H(p))")
# Verify H(p, q) >= H(p) for sample distributions
p = np.array([0.25, 0.25, 0.25, 0.25])
q = np.array([0.1, 0.2, 0.3, 0.4])

h_p = entropy(p)
h_pq = cross_entropy(p, q)

print(f"p = {p}")
print(f"q = {q}")
print(f"")
print(f"H(p)    = {h_p:.4f} bits")
print(f"H(p, q) = {h_pq:.4f} bits")
print(f"")
print(f"H(p, q) >= H(p)? {h_pq >= h_p - 1e-10}  (difference = {h_pq - h_p:.4f} bits)")
print(f"")
print(f"The difference H(p,q) - H(p) = {h_pq - h_p:.4f} bits is the KL divergence D_KL(p||q).")

PART 3: KL Divergence

Reference: DLI – “Kullback-Leibler Divergence (KLD)”

KL divergence measures how different distribution \(q\) is from distribution \(p\):

\[D_{KL}(p \| q) = \sum_{x} p(x) \cdot \log\!\left(\frac{p(x)}{q(x)}\right) = H(p, q) - H(p)\]

Properties:

  • \(D_{KL}(p \| q) \geq 0\) (Gibbs’ inequality), with equality iff \(p = q\)

  • Not symmetric: \(D_{KL}(p \| q) \neq D_{KL}(q \| p)\) in general

  • \(D_{KL}(p \| q)\) is not a true distance metric (no symmetry, no triangle inequality)

  • This lab computes KL in natural log (nats), the usual convention in ML; using \(\log_2\) instead would give the result in bits

def kl_divergence(p, q):
    """
    Compute D_KL(p || q) = sum(p * ln(p / q)).
    Uses natural log (nats). Clip to avoid numerical issues.
    """
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    q = np.clip(q, 1e-15, 1.0)
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))


# Quick tests
p_test = np.array([0.5, 0.5])
print(f"D_KL(p || p) = {kl_divergence(p_test, p_test):.6f} nats  (should be 0)")
print(f"D_KL([0.5,0.5] || [0.9,0.1]) = {kl_divergence([0.5,0.5], [0.9,0.1]):.6f} nats")
# Demonstrate asymmetry: D_KL(p||q) != D_KL(q||p)
p = np.array([0.4, 0.6])
q = np.array([0.8, 0.2])

kl_pq = kl_divergence(p, q)
kl_qp = kl_divergence(q, p)

print(f"p = {p}")
print(f"q = {q}")
print(f"")
print(f"D_KL(p || q) = {kl_pq:.6f} nats")
print(f"D_KL(q || p) = {kl_qp:.6f} nats")
print(f"")
print(f"Are they equal? {np.isclose(kl_pq, kl_qp)}")
print(f"Difference: |D_KL(p||q) - D_KL(q||p)| = {abs(kl_pq - kl_qp):.6f} nats")
print(f"")
print(f"KL divergence is NOT symmetric -- it is not a true distance metric.")
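The bullets above also claim KL violates the triangle inequality. A quick numeric check with three hand-picked Bernoulli distributions (redefining a minimal KL helper so this cell runs standalone):

```python
import numpy as np

def _kl_nats(p, q):
    # Same formula as kl_divergence above (natural log, nats);
    # redefined locally so this cell is self-contained.
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    return np.sum(p * np.log(p / q))

p = np.array([0.99, 0.01])
q = np.array([0.50, 0.50])
r = np.array([0.01, 0.99])

d_pr = _kl_nats(p, r)
d_via_q = _kl_nats(p, q) + _kl_nats(q, r)

print(f"D_KL(p||r)              = {d_pr:.4f} nats")
print(f"D_KL(p||q) + D_KL(q||r) = {d_via_q:.4f} nats")
print(f"Triangle inequality holds? {d_pr <= d_via_q}")
```

For these particular distributions the "direct" divergence exceeds the sum through the midpoint, so the triangle inequality fails, confirming KL is not a metric.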
def kl_gaussians(mu1, sigma1, mu2, sigma2):
    """
    Compute KL divergence between two univariate Gaussians (closed form).
    D_KL(N(mu1, sigma1^2) || N(mu2, sigma2^2))
      = log(sigma2/sigma1) + (sigma1^2 + (mu1-mu2)^2) / (2*sigma2^2) - 0.5
    """
    return (np.log(sigma2 / sigma1)
            + (sigma1**2 + (mu1 - mu2)**2) / (2 * sigma2**2)
            - 0.5)


# Plot KL divergence as a function of mu2 (varying mean of q)
mu1, sigma1 = 0.0, 1.0
sigma2 = 1.0
mu2_values = np.linspace(-5, 5, 200)
kl_values = [kl_gaussians(mu1, sigma1, mu2, sigma2) for mu2 in mu2_values]

plt.figure(figsize=(8, 5))
plt.plot(mu2_values, kl_values, 'b-', linewidth=2)
plt.xlabel(r'$\mu_2$', fontsize=13)
plt.ylabel(r'$D_{KL}(\mathcal{N}(0,1) \| \mathcal{N}(\mu_2,1))$ [nats]', fontsize=12)
plt.title(r'KL divergence between $\mathcal{N}(0,1)$ and $\mathcal{N}(\mu_2,1)$', fontsize=13)
plt.axvline(0, color='red', linestyle='--', alpha=0.5, label=r'$\mu_2 = \mu_1 = 0$ (KL=0)')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"KL at mu2=0 (same as mu1): {kl_gaussians(mu1, sigma1, 0.0, sigma2):.6f} nats")
print(f"KL at mu2=2:               {kl_gaussians(mu1, sigma1, 2.0, sigma2):.6f} nats")
print(f"KL at mu2=-3:              {kl_gaussians(mu1, sigma1, -3.0, sigma2):.6f} nats")
# DLI-style problem:
# p = [0.36, 0.48, 0.16], q = [1/3, 1/3, 1/3]
# Compute both directions of KL divergence.

p = np.array([0.36, 0.48, 0.16])
q = np.array([1/3, 1/3, 1/3])

kl_pq = kl_divergence(p, q)
kl_qp = kl_divergence(q, p)

print("DLI-style problem")
print(f"p = {p}")
print(f"q = [{1/3:.4f}, {1/3:.4f}, {1/3:.4f}]  (uniform)")
print(f"")
print(f"D_KL(p || q) = {kl_pq:.6f} nats")
print(f"D_KL(q || p) = {kl_qp:.6f} nats")
print(f"")
print("Interpretation:")
print("  D_KL(p || q) -- 'forward KL': measures info lost when q (uniform)")
print("                  is used to approximate the true distribution p.")
print("  D_KL(q || p) -- 'reverse KL': measures info lost when p")
print("                  is used to approximate q (uniform).")
print(f"")
print(f"In ML, D_KL(p || q) is typically used where p is the true data")
print(f"distribution and q is the model distribution.")

PART 4: Mutual Information

Reference: DLI – “Mutual Information”

Mutual information measures how much knowing \(X\) tells us about \(Y\) (and vice versa):

\[I(X; Y) = H(X) + H(Y) - H(X, Y) = D_{KL}\big(p(x,y) \| p(x)\,p(y)\big)\]

Properties:

  • \(I(X; Y) \geq 0\), with equality iff \(X\) and \(Y\) are independent

  • \(I(X; Y) = I(Y; X)\) (symmetric)

  • \(I(X; Y) \leq \min(H(X), H(Y))\)

def mutual_information(joint_prob):
    """
    Compute I(X;Y) from a joint probability table.
    joint_prob: 2D array of shape (|X|, |Y|), entries sum to 1.
    Returns mutual information in nats.
    """
    joint_prob = np.array(joint_prob, dtype=float)
    # Marginals
    p_x = joint_prob.sum(axis=1)  # sum over Y
    p_y = joint_prob.sum(axis=0)  # sum over X
    
    mi = 0.0
    for i in range(joint_prob.shape[0]):
        for j in range(joint_prob.shape[1]):
            if joint_prob[i, j] > 0 and p_x[i] > 0 and p_y[j] > 0:
                mi += joint_prob[i, j] * np.log(joint_prob[i, j] / (p_x[i] * p_y[j]))
    return mi


# Quick test with a simple joint distribution
joint_test = np.array([[0.3, 0.1],
                       [0.1, 0.5]])
print(f"Joint distribution:\n{joint_test}")
print(f"I(X;Y) = {mutual_information(joint_test):.6f} nats")
# Verify I(X;Y) = 0 for independent variables
# Independent means joint = outer product of marginals
p_x = np.array([0.3, 0.7])
p_y = np.array([0.4, 0.6])
joint_independent = np.outer(p_x, p_y)

mi_indep = mutual_information(joint_independent)

print("Independent variables:")
print(f"  p(X) = {p_x}")
print(f"  p(Y) = {p_y}")
print(f"  Joint = outer product of marginals:")
print(f"  {joint_independent}")
print(f"  I(X;Y) = {mi_indep:.10f} nats  (should be ~0)")
print(f"  Is essentially zero? {np.isclose(mi_indep, 0, atol=1e-10)}")
# Example with correlated variables: I(X;Y) > 0
# Joint distribution where X and Y are correlated
joint_correlated = np.array([[0.4, 0.05],
                             [0.05, 0.5]])

mi_corr = mutual_information(joint_correlated)
p_x_corr = joint_correlated.sum(axis=1)
p_y_corr = joint_correlated.sum(axis=0)

print("Correlated variables:")
print(f"  Joint distribution:")
print(f"  {joint_correlated}")
print(f"  Marginals: p(X) = {p_x_corr}, p(Y) = {p_y_corr}")
print(f"  I(X;Y) = {mi_corr:.6f} nats")
print(f"  I(X;Y) > 0? {mi_corr > 0}")
print(f"")
print(f"  When X=0, Y=0 is very likely (0.4/0.45 = {0.4/0.45:.2%}),")
print(f"  and when X=1, Y=1 is very likely (0.5/0.55 = {0.5/0.55:.2%}).")
print(f"  So knowing X tells us a lot about Y -> high mutual information.")
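The upper bound \(I(X;Y) \leq \min(H(X), H(Y))\) from the property list is not checked above; here is a quick verification on the same correlated joint distribution (redefined locally, with marginal entropies in nats to match the nats used by mutual_information):

```python
import numpy as np

def _h_nats(p):
    # Shannon entropy in nats (natural log), to match mutual_information above.
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log(p))

joint = np.array([[0.4, 0.05],
                  [0.05, 0.5]])
p_x, p_y = joint.sum(axis=1), joint.sum(axis=0)

# I(X;Y) = sum over cells of p(x,y) * ln(p(x,y) / (p(x) p(y)))
mi = np.sum(joint * np.log(joint / np.outer(p_x, p_y)))
bound = min(_h_nats(p_x), _h_nats(p_y))

print(f"I(X;Y)          = {mi:.6f} nats")
print(f"min(H(X), H(Y)) = {bound:.6f} nats")
print(f"Bound holds?      {mi <= bound + 1e-12}")
```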

PART 5: Perplexity

Reference: SLP Ch 3.3 – Evaluating Language Models: Perplexity

Perplexity is the inverse probability of the test set, normalized by the number of words \(N\):

\[PP(W) = P(w_1 w_2 \dots w_N)^{-1/N}\]

Relation to entropy:

\[PP = 2^{H}\]

where \(H\) is the cross-entropy of the model on the test data.

Interpretation: Lower perplexity = better model (less “surprised” by the data). A perplexity of \(k\) means the model is as uncertain as if it had to choose uniformly among \(k\) options at each step.
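Note that the code below computes perplexity with natural logs while the formula above uses base 2; the two conventions agree, since \(e^{H_{\text{nats}}} = 2^{H_{\text{bits}}}\). A quick base-consistency check (the probabilities are arbitrary examples):

```python
import numpy as np

# exp of the mean negative log-likelihood in nats equals 2 raised to the
# same quantity in bits, so both conventions yield the same perplexity.
probs = np.array([0.1, 0.05, 0.2, 0.01])
h_nats = -np.mean(np.log(probs))
h_bits = -np.mean(np.log2(probs))

print(f"exp(H_nats) = {np.exp(h_nats):.6f}")
print(f"2**H_bits   = {2 ** h_bits:.6f}")
```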

def perplexity_from_probs(probs):
    """
    Compute perplexity given a sequence of word probabilities.
    PP = exp(-(1/N) * sum(ln(p_i)))
    """
    probs = np.array(probs, dtype=float)
    N = len(probs)
    log_sum = np.sum(np.log(probs))
    return np.exp(-log_sum / N)


def entropy_to_perplexity(H):
    """Convert entropy (bits) to perplexity: PP = 2^H."""
    return 2 ** H


def perplexity_to_entropy(PP):
    """Convert perplexity to entropy (bits): H = log2(PP)."""
    return np.log2(PP)


# Quick tests
print("--- Perplexity utilities ---")
print(f"Entropy 3.0 bits -> Perplexity {entropy_to_perplexity(3.0):.2f}")
print(f"Perplexity 8.0 -> Entropy {perplexity_to_entropy(8.0):.2f} bits")
print(f"Round-trip: H=5 -> PP={entropy_to_perplexity(5.0):.2f} -> H={perplexity_to_entropy(entropy_to_perplexity(5.0)):.2f}")
# Compare uniform model (V=1000) vs frequency-based model perplexity

V = 1000  # vocabulary size
test_sequence = ["the", "cat", "sat", "on", "the", "mat", "and", "the", "dog", "ran"]

# Model A: uniform probability 1/V for every word
probs_A = np.array([1/V] * len(test_sequence))

# Model B: frequency-based probabilities (mock)
freq_probs = {
    "the": 0.07, "cat": 0.01, "sat": 0.005, "on": 0.03,
    "mat": 0.002, "and": 0.04, "dog": 0.01, "ran": 0.005
}
probs_B = np.array([freq_probs[w] for w in test_sequence])

pp_A = perplexity_from_probs(probs_A)
pp_B = perplexity_from_probs(probs_B)

print(f"Test sequence: {test_sequence}")
print(f"")
print(f"Model A (uniform, V={V}):")
print(f"  Probabilities: {probs_A[:3]}... (all {1/V})")
print(f"  Perplexity: {pp_A:.2f}")
print(f"")
print(f"Model B (frequency-based):")
print(f"  Probabilities: {probs_B}")
print(f"  Perplexity: {pp_B:.2f}")
print(f"")
print(f"Model B has LOWER perplexity ({pp_B:.2f} < {pp_A:.2f}), so it is the better model.")
print(f"The frequency-based model is less 'surprised' by common words like 'the'.")

PART 6: Information Gain

Reference: DLI – “Classification and Information Gain”

Information Gain (IG) is the reduction in entropy achieved by splitting a dataset:

\[IG = H(\text{parent}) - \frac{|\text{left}|}{|\text{parent}|} H(\text{left}) - \frac{|\text{right}|}{|\text{parent}|} H(\text{right})\]

Used to select the best feature/threshold for splitting in decision trees (ID3, C4.5). Higher information gain = better split (more reduction in uncertainty).

def _label_entropy(labels):
    """Compute entropy of a label array using class frequencies."""
    labels = np.array(labels)
    if len(labels) == 0:
        return 0.0
    counts = np.array(list(Counter(labels).values()), dtype=float)
    probs = counts / counts.sum()
    return entropy(probs)


def information_gain(parent_labels, left_labels, right_labels):
    """
    Compute information gain from splitting parent into left and right.
    IG = H(parent) - (|left|/|parent|)*H(left) - (|right|/|parent|)*H(right)
    """
    parent_labels = np.array(parent_labels)
    left_labels = np.array(left_labels)
    right_labels = np.array(right_labels)
    
    n_parent = len(parent_labels)
    n_left = len(left_labels)
    n_right = len(right_labels)
    
    h_parent = _label_entropy(parent_labels)
    h_left = _label_entropy(left_labels)
    h_right = _label_entropy(right_labels)
    
    ig = h_parent - (n_left / n_parent) * h_left - (n_right / n_parent) * h_right
    return ig


# Quick test
parent = [0, 0, 0, 1, 1, 1]
left =   [0, 0, 0]       # pure
right =  [1, 1, 1]       # pure
print(f"Perfect split IG: {information_gain(parent, left, right):.4f} bits")

left2 =  [0, 0, 1]       # impure
right2 = [0, 1, 1]       # impure
print(f"Impure split IG:  {information_gain(parent, left2, right2):.4f} bits")
# Find best split threshold for a simple dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1, 1])

best_ig = -1
best_threshold = None

# Try midpoints between consecutive feature values as thresholds
thresholds = (X[:-1] + X[1:]) / 2.0

print("Threshold | Left labels          | Right labels         | IG")
print("-" * 75)

for t in thresholds:
    left_mask = X <= t
    right_mask = X > t
    left_labels = y[left_mask]
    right_labels = y[right_mask]
    ig = information_gain(y, left_labels, right_labels)
    
    print(f"  {t:5.1f}   | {str(left_labels):20s} | {str(right_labels):20s} | {ig:.4f}")
    
    if ig > best_ig:
        best_ig = ig
        best_threshold = t

print(f"")
print(f"Best threshold: {best_threshold}")
print(f"Best information gain: {best_ig:.4f} bits")

PART 7: Jensen’s Inequality

Reference: DLI – “Jensen’s inequality”

For a concave function \(f\) (e.g., \(\log\)):

\[f(\mathbb{E}[X]) \geq \mathbb{E}[f(X)]\]

For a convex function \(f\) (e.g., \(x^2\)):

\[f(\mathbb{E}[X]) \leq \mathbb{E}[f(X)]\]

Jensen’s inequality is fundamental to proving that KL divergence is non-negative (Gibbs’ inequality), and it appears throughout information theory and variational inference.

# Verify Jensen's inequality numerically
x = np.array([1.0, 2.0, 5.0, 10.0, 0.5])

# --- Concave case: f = log ---
# Jensen: log(E[X]) >= E[log(X)]
mean_x = np.mean(x)
log_of_mean = np.log(mean_x)
mean_of_log = np.mean(np.log(x))

print("=== Concave function: f(x) = log(x) ===")
print(f"x = {x}")
print(f"E[X] = {mean_x:.4f}")
print(f"log(E[X])  = {log_of_mean:.6f}")
print(f"E[log(X)]  = {mean_of_log:.6f}")
print(f"log(E[X]) >= E[log(X)]? {log_of_mean >= mean_of_log - 1e-10}")
print(f"Gap: {log_of_mean - mean_of_log:.6f}")
print()

# --- Convex case: f = x^2 ---
# Jensen: (E[X])^2 <= E[X^2]
mean_squared = mean_x ** 2
mean_of_squares = np.mean(x ** 2)

print("=== Convex function: f(x) = x^2 ===")
print(f"x = {x}")
print(f"E[X] = {mean_x:.4f}")
print(f"(E[X])^2  = {mean_squared:.6f}")
print(f"E[X^2]    = {mean_of_squares:.6f}")
print(f"(E[X])^2 <= E[X^2]? {mean_squared <= mean_of_squares + 1e-10}")
print(f"Gap: {mean_of_squares - mean_squared:.6f}")
print()
print(f"Note: E[X^2] - (E[X])^2 = Var(X) = {np.var(x):.6f}")
print(f"This shows Jensen's gap for x^2 is exactly the variance!")

Summary

  • Shannon Entropy: \(H(X) = -\sum_x p(x) \log_2 p(x)\). Key property: maximized for a uniform distribution.

  • Cross-Entropy: \(H(p,q) = -\sum_x p(x) \log_2 q(x)\). Key property: \(H(p,q) \geq H(p)\).

  • KL Divergence: \(D_{KL}(p \| q) = \sum_x p(x) \ln \frac{p(x)}{q(x)}\). Key property: \(\geq 0\), not symmetric.

  • Mutual Information: \(I(X;Y) = H(X) + H(Y) - H(X,Y)\). Key property: \(= 0\) iff \(X\) and \(Y\) are independent.

  • Perplexity: \(PP = 2^H\). Key property: lower = better model.

  • Information Gain: \(IG = H(\text{parent}) - \sum_{\text{child}} \frac{|\text{child}|}{|\text{parent}|} H(\text{child})\). Key property: higher = better split.

  • Jensen’s Inequality: concave \(f\): \(f(\mathbb{E}[X]) \geq \mathbb{E}[f(X)]\). Key property: proves Gibbs’ inequality.

Connections:

  • Cross-entropy = Entropy + KL divergence: \(H(p,q) = H(p) + D_{KL}(p \| q)\)

  • Mutual information = KL divergence between joint and product of marginals

  • Perplexity is the exponential of cross-entropy (a more interpretable metric)

  • Jensen’s inequality underlies the proof that \(D_{KL} \geq 0\)
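The first connection can be verified numerically in a few lines, with everything in bits so the three quantities are directly comparable (the distributions are arbitrary examples):

```python
import numpy as np

# Check H(p, q) = H(p) + D_KL(p || q), all in bits (log2).
p = np.array([0.25, 0.25, 0.25, 0.25])
q = np.array([0.1, 0.2, 0.3, 0.4])

h_p = -np.sum(p * np.log2(p))        # entropy of p
h_pq = -np.sum(p * np.log2(q))       # cross-entropy of p under q
kl_pq = np.sum(p * np.log2(p / q))   # KL divergence in bits

print(f"H(p) + D_KL(p||q) = {h_p + kl_pq:.6f} bits")
print(f"H(p, q)           = {h_pq:.6f} bits")
```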