Lab 03: Calculus, Gradients & Backpropagation
Sources:
Deep Learning Interviews (Kashani, 2022) — Chapters on Calculus, Algorithmic Differentiation
Speech & Language Processing (Jurafsky & Martin) — Ch 6.6: Training Neural Nets
Topics covered:
Numerical differentiation (central & forward difference)
The chain rule
Partial derivatives & gradients
Gradient descent algorithm
Backpropagation on computation graphs
Taylor series expansion
# Third-party numerics and plotting; factorial is used by the Taylor-series
# demos in Part 6.
import numpy as np
import matplotlib.pyplot as plt
from math import factorial
# Fix the RNG seed so every run of the lab produces the same data and plots.
np.random.seed(42)
print("Imports loaded.")
print(f"NumPy version: {np.__version__}")
PART 1: Numerical Differentiation
The derivative of a function \(f\) at a point \(x\) is defined as:
\[f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}\]
In practice, we approximate this numerically with a small but finite \(h\).
Forward difference (first-order accurate, \(O(h)\) error):
\[f'(x) \approx \frac{f(x+h) - f(x)}{h}\]
Central difference (second-order accurate, \(O(h^2)\) error):
\[f'(x) \approx \frac{f(x+h) - f(x-h)}{2h}\]
Central difference is more accurate because the \(O(h)\) error terms cancel out due to symmetry.
def numerical_derivative(f, x, h=1e-5):
    """Approximate f'(x) with the symmetric (central) difference quotient."""
    forward = f(x + h)
    backward = f(x - h)
    return (forward - backward) / (2 * h)
# Quick sanity check
# d/dx x^3 = 3x^2, so the exact answer at x = 2 is 12.
f_test = lambda x: x ** 3
print(f"d/dx(x^3) at x=2: numerical = {numerical_derivative(f_test, 2.0):.10f}")
print(f" analytical = {3 * 2.0**2:.10f}")
def numerical_gradient(f, x, h=1e-5):
    """
    Compute the gradient of a scalar-valued function f at point x.

    Uses the central difference along each coordinate.

    Args:
        f: callable mapping an array shaped like x to a scalar.
        x: array-like point at which to evaluate the gradient.
        h: finite-difference step size.

    Returns:
        Gradient as a numpy array of the same shape as x.
    """
    x = np.array(x, dtype=float)
    grad = np.zeros_like(x)
    # np.ndindex walks every individual coordinate, so this works for
    # arrays of any rank, not just 1-D vectors (the original
    # `range(len(x))` loop perturbed a whole row of a 2-D input at once).
    for idx in np.ndindex(x.shape):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[idx] += h
        x_minus[idx] -= h
        grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
    return grad
# Quick sanity check: gradient of f(x,y) = x^2 + y^2 at (3,4) should be [6, 8]
f_multi = lambda v: v[0]**2 + v[1]**2
print(f"Gradient of x^2+y^2 at (3,4): {numerical_gradient(f_multi, np.array([3.0, 4.0]))}")
print(f"Expected: [6. 8.]")
# Verify numerical derivatives against analytical derivatives of common functions
# Each entry is (label, f, analytic f'); errors should be ~1e-10 or smaller.
x = 2.0
functions = [
    ("x^2", lambda x: x**2, lambda x: 2*x),
    ("sin(x)", lambda x: np.sin(x), lambda x: np.cos(x)),
    ("e^x", lambda x: np.exp(x), lambda x: np.exp(x)),
    ("ln(x)", lambda x: np.log(x), lambda x: 1.0/x),
]
print(f"Verifying derivatives at x = {x}")
print(f"{'Function':<10} {'Numerical':>14} {'Analytical':>14} {'Abs Error':>14}")
print("-" * 55)
# Print one aligned table row per test function.
for name, f, f_prime in functions:
    num = numerical_derivative(f, x)
    ana = f_prime(x)
    err = abs(num - ana)
    print(f"{name:<10} {num:>14.10f} {ana:>14.10f} {err:>14.2e}")
# Plot |error| vs h for f(x)=x^2 at x=3
# Shows the sweet spot around h = 1e-5 to 1e-8
f = lambda x: x ** 2
true_deriv = 2 * 3.0  # f'(3) = 6
# 100 logarithmically spaced step sizes from 1e-15 up to 1.
h_values = np.logspace(-15, 0, 100)
errors_central = [abs(numerical_derivative(f, 3.0, h=h) - true_deriv) for h in h_values]
errors_forward = [abs((f(3.0 + h) - f(3.0)) / h - true_deriv) for h in h_values]
plt.figure(figsize=(9, 5))
plt.loglog(h_values, errors_central, 'b-', linewidth=2, label='Central difference')
plt.loglog(h_values, errors_forward, 'r--', linewidth=2, label='Forward difference')
plt.xlabel('Step size h', fontsize=12)
plt.ylabel('|Error|', fontsize=12)
plt.title(r"Numerical Differentiation Error for $f(x) = x^2$ at $x = 3$", fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, which='both', alpha=0.3)
# Mark the sweet spot
best_h_central = h_values[np.argmin(errors_central)]
best_h_forward = h_values[np.argmin(errors_forward)]
plt.axvline(best_h_central, color='blue', linestyle=':', alpha=0.5,
            label=f'Best h (central) ~ {best_h_central:.0e}')
plt.axvline(best_h_forward, color='red', linestyle=':', alpha=0.5,
            label=f'Best h (forward) ~ {best_h_forward:.0e}')
# NOTE(review): this second legend() call replaces the one above so the
# axvline labels are included in the final legend.
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()
print(f"Best h for central difference: {best_h_central:.2e} (error = {min(errors_central):.2e})")
print(f"Best h for forward difference: {best_h_forward:.2e} (error = {min(errors_forward):.2e})")
print("\nKey insight: Too large h -> truncation error. Too small h -> floating-point cancellation.")
print("Central difference achieves ~1e-10 accuracy; forward difference only ~1e-8.")
PART 2: The Chain Rule
If \(y = f(g(x))\), i.e., \(y = f(u)\) where \(u = g(x)\), then:
\[\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx}\]
For longer chains \(y = f(g(h(x)))\), with \(w = h(x)\) and \(v = g(w)\):
\[\frac{dy}{dx} = \frac{dy}{dv} \cdot \frac{dv}{dw} \cdot \frac{dw}{dx}\]
The chain rule is the foundation of backpropagation: we decompose a complex function into simple steps and multiply local derivatives.
# Verify chain rule for y = sin(x^2)
# g(x) = x^2, f(u) = sin(u)
# dy/dx = cos(x^2) * 2x
x = 1.5
f1 = lambda x: np.sin(x ** 2)
analytical_1 = np.cos(x ** 2) * 2 * x
numerical_1 = numerical_derivative(f1, x)
print("Chain Rule Verification: y = sin(x^2)")
print(f" x = {x}")
print(f" Analytical dy/dx = cos(x^2) * 2x = {analytical_1:.10f}")
print(f" Numerical dy/dx = {numerical_1:.10f}")
print(f" Absolute error = {abs(analytical_1 - numerical_1):.2e}")
print()
# Verify chain rule for y = (3x + 1)^4
# g(x) = 3x + 1, f(u) = u^4
# dy/dx = 4*(3x+1)^3 * 3 = 12*(3x+1)^3
x = 1.5
f2 = lambda x: (3 * x + 1) ** 4
analytical_2 = 12 * (3 * x + 1) ** 3
numerical_2 = numerical_derivative(f2, x)
print("Chain Rule Verification: y = (3x + 1)^4")
print(f" x = {x}")
print(f" Analytical dy/dx = 12*(3x+1)^3 = {analytical_2:.10f}")
print(f" Numerical dy/dx = {numerical_2:.10f}")
print(f" Absolute error = {abs(analytical_2 - numerical_2):.2e}")
# Multi-step chain rule: y = exp(sin(x^2 + 1))
# Let u = x^2 + 1, v = sin(u), y = exp(v)
# dy/dx = dy/dv * dv/du * du/dx
#       = exp(sin(x^2+1)) * cos(x^2+1) * 2x
x = 0.5
f3 = lambda x: np.exp(np.sin(x ** 2 + 1))
# Forward pass through the three-step decomposition.
u = x ** 2 + 1   # = 1.25
v = np.sin(u)    # = sin(1.25)
y = np.exp(v)    # = exp(sin(1.25))
# Local derivatives of each step; their product is dy/dx.
du_dx = 2 * x         # = 1.0
dv_du = np.cos(u)     # = cos(1.25)
dy_dv = np.exp(v)     # = exp(sin(1.25))
analytical_3 = dy_dv * dv_du * du_dx
numerical_3 = numerical_derivative(f3, x)
print("Multi-step Chain Rule: y = exp(sin(x^2 + 1))")
print(f" x = {x}")
print(f" Step by step:")
print(f" u = x^2 + 1 = {u:.6f}")
print(f" v = sin(u) = {v:.6f}")
print(f" y = exp(v) = {y:.6f}")
print(f" du/dx = 2x = {du_dx:.6f}")
print(f" dv/du = cos(u) = {dv_du:.6f}")
print(f" dy/dv = exp(v) = {dy_dv:.6f}")
print(f" Analytical dy/dx = {dy_dv:.4f} * {dv_du:.4f} * {du_dx:.4f} = {analytical_3:.10f}")
print(f" Numerical dy/dx = {numerical_3:.10f}")
print(f" Absolute error = {abs(analytical_3 - numerical_3):.2e}")
PART 3: Partial Derivatives & Gradients
For a scalar function of multiple variables \(f(x_1, x_2, \ldots, x_n)\), the gradient is the vector of all partial derivatives:
\[\nabla f = \left[\frac{\partial f}{\partial x_1}, \frac{\partial f}{\partial x_2}, \ldots, \frac{\partial f}{\partial x_n}\right]\]
Key properties:
\(\nabla f\) points in the direction of steepest ascent.
\(-\nabla f\) points in the direction of steepest descent (used in gradient descent).
\(|\nabla f|\) gives the rate of maximum increase.
At a local minimum, \(\nabla f = \mathbf{0}\).
# Gradient of f(x,y) = x^2 + 2xy + y^2
# df/dx = 2x + 2y
# df/dy = 2x + 2y
# At (1, 2): gradient = [2*1 + 2*2, 2*1 + 2*2] = [6, 6]
f = lambda v: v[0]**2 + 2*v[0]*v[1] + v[1]**2
point = np.array([1.0, 2.0])
analytical_grad = np.array([2*point[0] + 2*point[1], 2*point[0] + 2*point[1]])
numerical_grad = numerical_gradient(f, point)
print("Gradient of f(x,y) = x^2 + 2xy + y^2")
print(f" Point: (x, y) = ({point[0]}, {point[1]})")
print(f" Analytical gradient: {analytical_grad}")
print(f" Numerical gradient: {numerical_grad}")
print(f" Max absolute error: {np.max(np.abs(analytical_grad - numerical_grad)):.2e}")
print()
print(f" Note: f(x,y) = (x+y)^2, so the gradient [2(x+y), 2(x+y)] = [6, 6] at (1,2).")
# Gradient of MSE loss: L(w) = (1/N) * sum( (y_i - w*x_i)^2 )
# dL/dw = -(2/N) * sum( x_i * (y_i - w*x_i) )
x_data = np.array([1.0, 2.0, 3.0, 4.0])
y_data = np.array([2.0, 4.0, 5.0, 8.0])
w = 1.5
N = len(x_data)
# Analytical gradient
residuals = y_data - w * x_data
analytical_dLdw = -(2.0 / N) * np.sum(x_data * residuals)
# Numerical gradient (treat the loss as a 1-D function of the scalar w)
def mse_loss(w_scalar):
    """Mean squared error of the no-intercept model y_hat = w * x."""
    return np.mean((y_data - w_scalar * x_data) ** 2)
numerical_dLdw = numerical_derivative(mse_loss, w)
print("MSE Loss Gradient")
print(f" Data: x = {x_data}, y = {y_data}")
print(f" Weight: w = {w}")
print(f" Predictions: w*x = {w * x_data}")
print(f" Residuals: y - w*x = {residuals}")
print(f" Loss L(w) = {mse_loss(w):.6f}")
print(f" Analytical dL/dw = {analytical_dLdw:.10f}")
print(f" Numerical dL/dw = {numerical_dLdw:.10f}")
print(f" Absolute error = {abs(analytical_dLdw - numerical_dLdw):.2e}")
print(f"\n Since dL/dw = {analytical_dLdw:.4f} < 0, increasing w will decrease the loss.")
PART 4: Gradient Descent
Gradient descent is an iterative optimization algorithm:
\[w \leftarrow w - \alpha \nabla L(w)\]
where \(\alpha\) is the learning rate (step size).
Learning rate effects:
Too small: converges very slowly, may get stuck.
Too large: overshoots the minimum, may diverge.
Just right: steady convergence to a local minimum.
For a convex loss surface (like MSE with linear regression), gradient descent converges to the global minimum.
def gradient_descent_1d(f, f_grad, x0, lr=0.1, n_steps=50):
"""
Minimize f(x) using gradient descent.
Args:
f: function to minimize
f_grad: gradient (derivative) of f
x0: initial point
lr: learning rate (alpha)
n_steps: number of iterations
Returns:
x_final: the final x after optimization
history: list of (x, f(x)) at each step
"""
x = x0
history = [(x, f(x))]
for _ in range(n_steps):
grad = f_grad(x)
x = x - lr * grad
history.append((x, f(x)))
return x, history
print("gradient_descent_1d defined.")
# Minimize f(x) = (x - 3)^2 + 1
# True minimum: x = 3, f(3) = 1
# Gradient: f'(x) = 2(x - 3)
f = lambda x: (x - 3) ** 2 + 1
f_grad = lambda x: 2 * (x - 3)
x_final, history = gradient_descent_1d(f, f_grad, x0=0.0, lr=0.1, n_steps=50)
# Unzip the (x, f(x)) history for plotting.
xs = [h[0] for h in history]
fs = [h[1] for h in history]
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
# Left: function curve with GD path
x_range = np.linspace(-1, 6, 200)
axes[0].plot(x_range, f(x_range), 'b-', linewidth=2, label='$f(x) = (x-3)^2 + 1$')
axes[0].plot(xs, fs, 'ro-', markersize=4, linewidth=1, alpha=0.7, label='GD steps')
axes[0].plot(xs[0], fs[0], 'gs', markersize=10, label=f'Start (x={xs[0]:.1f})')
axes[0].plot(xs[-1], fs[-1], 'r*', markersize=15, label=f'End (x={xs[-1]:.4f})')
axes[0].set_xlabel('x', fontsize=12)
axes[0].set_ylabel('f(x)', fontsize=12)
axes[0].set_title('Gradient Descent Path', fontsize=13)
axes[0].legend(fontsize=10)
axes[0].grid(True, alpha=0.3)
# Right: convergence curve
axes[1].plot(range(len(fs)), fs, 'b-', linewidth=2)
axes[1].set_xlabel('Step', fontsize=12)
axes[1].set_ylabel('f(x)', fontsize=12)
axes[1].set_title('Convergence Curve', fontsize=13)
axes[1].axhline(y=1.0, color='r', linestyle='--', alpha=0.5, label='True minimum f=1')
axes[1].legend(fontsize=10)
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Starting point: x = 0.0, f(x) = {f(0.0):.4f}")
print(f"After 50 steps: x = {x_final:.6f}, f(x) = {f(x_final):.6f}")
print(f"True minimum: x = 3.0, f(x) = 1.0")
# Gradient descent for linear regression on y = 2x + 1 + noise
np.random.seed(42)
X = np.random.randn(100)
y = 2 * X + 1 + 0.1 * np.random.randn(100)
# Initialize parameters (both start at zero)
w, b = 0.0, 0.0
lr = 0.05
n_steps = 200
losses = []
N = len(X)
for step in range(n_steps):
    # Forward: predictions
    y_pred = w * X + b
    # Loss: MSE
    loss = np.mean((y - y_pred) ** 2)
    losses.append(loss)
    # Gradients of the MSE w.r.t. w and b (analytical, not numerical)
    dL_dw = -(2.0 / N) * np.sum(X * (y - y_pred))
    dL_db = -(2.0 / N) * np.sum(y - y_pred)
    # Update: step against the gradient
    w = w - lr * dL_dw
    b = b - lr * dL_db
# Plot loss curve
plt.figure(figsize=(9, 5))
plt.plot(losses, 'b-', linewidth=2)
plt.xlabel('Step', fontsize=12)
plt.ylabel('MSE Loss', fontsize=12)
plt.title('Linear Regression via Gradient Descent', fontsize=13)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Final parameters after {n_steps} steps:")
print(f" w = {w:.6f} (true = 2.0)")
print(f" b = {b:.6f} (true = 1.0)")
print(f" Final loss = {losses[-1]:.6f}")
# Learning rate experiment: GD on f(x) = x^4 - 3x^3 + 2
# f'(x) = 4x^3 - 9x^2
f = lambda x: x**4 - 3*x**3 + 2
f_grad = lambda x: 4*x**3 - 9*x**2
learning_rates = [0.001, 0.01, 0.1]
colors = ['blue', 'green', 'red']
# Start where the gradient is large (f'(4) = 112) to expose divergence.
x0 = 4.0
n_steps = 100
plt.figure(figsize=(10, 5))
for lr, color in zip(learning_rates, colors):
    x = x0
    f_history = [f(x)]
    diverged = False
    for step in range(n_steps):
        grad = f_grad(x)
        x = x - lr * grad
        val = f(x)
        # Clip for plotting if diverging (stop recording once |f| explodes)
        if abs(val) > 1e10:
            diverged = True
            break
        f_history.append(val)
    label = f'lr = {lr}'
    if diverged:
        label += ' (DIVERGED)'
    plt.plot(f_history, color=color, linewidth=2, label=label)
plt.xlabel('Step', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title(r'Learning Rate Comparison on $f(x) = x^4 - 3x^3 + 2$', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.ylim(-15, 60)
plt.tight_layout()
plt.show()
print("Observations:")
print(" lr = 0.001: Slow but steady convergence")
print(" lr = 0.01: Good convergence rate")
print(" lr = 0.1: May overshoot or diverge due to large gradients")
PART 5: Backpropagation
Backpropagation applies the chain rule to a computation graph.
For each node in the graph:
Forward pass: compute the output from inputs.
Backward pass: multiply the upstream gradient (from the loss) by the local gradient (derivative of this node's operation).
This recursive application propagates gradients from the loss backward through every parameter in the network.
# Manual backprop on a simple computation graph
#
# Forward:
#   x = 2, y = 3
#   a = x * y = 6
#   b = a + 1 = 7
#   L = b^2 = 49
#
# Backward:
#   dL/db = 2b = 14
#   db/da = 1
#   dL/da = dL/db * db/da = 14 * 1 = 14
#   da/dx = y = 3
#   da/dy = x = 2
#   dL/dx = dL/da * da/dx = 14 * 3 = 42
#   dL/dy = dL/da * da/dy = 14 * 2 = 28
# --- Forward pass ---
x_val, y_val = 2.0, 3.0
a = x_val * y_val  # 6
b = a + 1          # 7
L = b ** 2         # 49
print("=== Forward Pass ===")
print(f" x = {x_val}, y = {y_val}")
print(f" a = x * y = {a}")
print(f" b = a + 1 = {b}")
print(f" L = b^2 = {L}")
# --- Backward pass: chain rule applied node by node, loss first ---
dL_db = 2 * b            # 14
db_da = 1.0
dL_da = dL_db * db_da    # 14
da_dx = y_val            # 3
da_dy = x_val            # 2
dL_dx = dL_da * da_dx    # 42
dL_dy = dL_da * da_dy    # 28
print("\n=== Backward Pass ===")
print(f" dL/db = 2b = {dL_db}")
print(f" dL/da = dL/db * 1 = {dL_da}")
print(f" dL/dx = dL/da * y = {dL_da} * {da_dx} = {dL_dx}")
print(f" dL/dy = dL/da * x = {dL_da} * {da_dy} = {dL_dy}")
# --- Numerical verification: re-run the forward pass as a function of one input ---
def L_of_x(x):
    """Loss as a function of x alone (y held fixed at y_val)."""
    a = x * y_val
    b = a + 1
    return b ** 2
def L_of_y(y):
    """Loss as a function of y alone (x held fixed at x_val)."""
    a = x_val * y
    b = a + 1
    return b ** 2
num_dL_dx = numerical_derivative(L_of_x, x_val)
num_dL_dy = numerical_derivative(L_of_y, y_val)
print("\n=== Numerical Verification ===")
print(f" dL/dx: backprop = {dL_dx:.4f}, numerical = {num_dL_dx:.4f}, match = {np.isclose(dL_dx, num_dL_dx)}")
print(f" dL/dy: backprop = {dL_dy:.4f}, numerical = {num_dL_dy:.4f}, match = {np.isclose(dL_dy, num_dL_dy)}")
# Backprop through a 1-layer neural network with ReLU
#
# Forward: z = w*x + b, a = ReLU(z), L = (a - y)^2
#
# Backward:
#   dL/da = 2(a - y)
#   da/dz = 1 if z > 0 else 0 (ReLU derivative)
#   dz/dw = x
#   dz/db = 1
#   dL/dw = dL/da * da/dz * dz/dw
#   dL/db = dL/da * da/dz * dz/db
x_in, w, b_param, y_true = 1.0, 0.5, 0.1, 1.0
# --- Forward ---
z = w * x_in + b_param    # 0.6
a = max(0, z)             # 0.6 (ReLU, z > 0)
L = (a - y_true) ** 2     # 0.16
print("=== Forward Pass ===")
print(f" x = {x_in}, w = {w}, b = {b_param}, y_true = {y_true}")
print(f" z = w*x + b = {z}")
print(f" a = ReLU(z) = {a}")
print(f" L = (a - y)^2 = {L}")
# --- Backward ---
dL_da = 2 * (a - y_true)       # 2*(0.6 - 1.0) = -0.8
# ReLU gradient is a hard gate: 1 where the unit was active, 0 otherwise.
da_dz = 1.0 if z > 0 else 0.0  # 1.0 (since z = 0.6 > 0)
dL_dz = dL_da * da_dz          # -0.8
dL_dw = dL_dz * x_in           # -0.8 * 1.0 = -0.8
dL_db = dL_dz * 1.0            # -0.8
print("\n=== Backward Pass ===")
print(f" dL/da = 2(a - y) = {dL_da}")
print(f" da/dz = (z > 0) = {da_dz}")
print(f" dL/dz = dL/da * da/dz = {dL_dz}")
print(f" dL/dw = dL/dz * x = {dL_dw}")
print(f" dL/db = dL/dz * 1 = {dL_db}")
# --- Numerical verification ---
def L_of_w(w_val):
    """Loss as a function of w alone (x, b, y held fixed)."""
    z = w_val * x_in + b_param
    a = max(0, z)
    return (a - y_true) ** 2
def L_of_b(b_val):
    """Loss as a function of b alone (x, w, y held fixed)."""
    z = w * x_in + b_val
    a = max(0, z)
    return (a - y_true) ** 2
num_dL_dw = numerical_derivative(L_of_w, w)
num_dL_db = numerical_derivative(L_of_b, b_param)
print("\n=== Numerical Verification ===")
print(f" dL/dw: backprop = {dL_dw:.6f}, numerical = {num_dL_dw:.6f}, match = {np.isclose(dL_dw, num_dL_dw)}")
print(f" dL/db: backprop = {dL_db:.6f}, numerical = {num_dL_db:.6f}, match = {np.isclose(dL_db, num_dL_db)}")
# Backprop through sigmoid with binary cross-entropy loss
#
# Forward: z = w*x + b, a = sigmoid(z), L = BCE(y, a) = -(y*log(a) + (1-y)*log(1-a))
#
# Backward (the elegant result):
#   dL/da = -y/a + (1-y)/(1-a)
#   da/dz = a * (1 - a)   (sigmoid derivative)
#   dL/dz = dL/da * da/dz
#         = [-y/a + (1-y)/(1-a)] * a*(1-a)
#         = -y*(1-a) + (1-y)*a
#         = a - y           <<< the elegant result!
#   dL/dw = (a - y) * x
#   dL/db = (a - y)
def sigmoid(z):
    """
    Numerically stable logistic function: sigma(z) = 1 / (1 + e^{-z}).

    The naive form computes np.exp(-z), which overflows (with a runtime
    warning) for large negative z. Evaluating e^{-|z|}, which is always
    in (0, 1], on both branches gives identical values with no overflow.

    Args:
        z: scalar or numpy array.

    Returns:
        sigma(z) with the same shape as z (a Python float for scalar input).
    """
    z_arr = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z_arr))  # safe: exponent is always <= 0
    # z >= 0: 1 / (1 + e^{-z});  z < 0: e^{z} / (1 + e^{z})
    out = np.where(z_arr >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Preserve scalar-in / scalar-out behavior for the script below.
    return out.item() if out.ndim == 0 else out
x_in, w, b_param, y_true = 2.0, 0.3, -0.1, 1.0
# --- Forward ---
z = w * x_in + b_param    # 0.5
a = sigmoid(z)            # sigmoid(0.5)
L = -(y_true * np.log(a) + (1 - y_true) * np.log(1 - a))  # BCE
print("=== Forward Pass ===")
print(f" x = {x_in}, w = {w}, b = {b_param}, y = {y_true}")
print(f" z = w*x + b = {z}")
print(f" a = sigmoid(z) = {a:.6f}")
print(f" L = BCE(y, a) = {L:.6f}")
# --- Backward: step by step ---
dL_da = -y_true / a + (1 - y_true) / (1 - a)
da_dz = a * (1 - a)
dL_dz_long = dL_da * da_dz
# The elegant shortcut: the a*(1-a) factors cancel, leaving a - y.
dL_dz_elegant = a - y_true
dL_dw = dL_dz_elegant * x_in
dL_db = dL_dz_elegant * 1.0
print("\n=== Backward Pass ===")
print(f" dL/da = -y/a + (1-y)/(1-a) = {dL_da:.6f}")
print(f" da/dz = a*(1-a) = {da_dz:.6f}")
print(f" dL/dz (step by step) = {dL_dz_long:.6f}")
print(f" dL/dz (elegant: a - y) = {dL_dz_elegant:.6f}")
print(f" They match: {np.isclose(dL_dz_long, dL_dz_elegant)}")
print(f"\n dL/dw = (a - y) * x = {dL_dw:.6f}")
print(f" dL/db = (a - y) = {dL_db:.6f}")
# --- Numerical verification ---
def L_of_w_sig(w_val):
    """BCE loss as a function of w alone (x, b, y held fixed)."""
    z = w_val * x_in + b_param
    a = sigmoid(z)
    return -(y_true * np.log(a) + (1 - y_true) * np.log(1 - a))
def L_of_b_sig(b_val):
    """BCE loss as a function of b alone (x, w, y held fixed)."""
    z = w * x_in + b_val
    a = sigmoid(z)
    return -(y_true * np.log(a) + (1 - y_true) * np.log(1 - a))
num_dL_dw = numerical_derivative(L_of_w_sig, w)
num_dL_db = numerical_derivative(L_of_b_sig, b_param)
print("\n=== Numerical Verification ===")
print(f" dL/dw: backprop = {dL_dw:.6f}, numerical = {num_dL_dw:.6f}, match = {np.isclose(dL_dw, num_dL_dw)}")
print(f" dL/db: backprop = {dL_db:.6f}, numerical = {num_dL_db:.6f}, match = {np.isclose(dL_db, num_dL_db)}")
print("\nKey insight: For sigmoid + BCE, dL/dz = a - y. This is why sigmoid+BCE")
print("is such a natural pairing -- the gradient update is simply (prediction - truth).")
PART 6: Taylor Series
The Taylor series expansion of \(f(x)\) around point \(a\) is:
\[f(x) = \sum_{n=0}^{\infty} \frac{f^{(n)}(a)}{n!}(x-a)^n\]
Special cases around \(a = 0\) (Maclaurin series):
\[e^x = \sum_{n=0}^{\infty} \frac{x^n}{n!}, \qquad \sin(x) = \sum_{n=0}^{\infty} \frac{(-1)^n x^{2n+1}}{(2n+1)!}\]
Taylor series connect to gradient descent: the first-order Taylor approximation \(f(x) \approx f(a) + f'(a)(x-a)\) is exactly the linear model that gradient descent follows locally.
def taylor_exp(x, n_terms=5):
    """
    Maclaurin-series approximation of e^x (expansion around a = 0).

    e^x ~= sum_{n=0}^{n_terms-1} x^n / n!
    """
    # Accumulate terms lowest order first; start=0.0 keeps the result a float.
    return sum((x ** k / factorial(k) for k in range(n_terms)), 0.0)
# Verify: error should shrink rapidly as terms are added (factorial decay).
test_x = 1.0
print(f"Taylor approximations of e^{test_x} = {np.exp(test_x):.10f}")
for n in [1, 2, 3, 5, 10, 20]:
    approx = taylor_exp(test_x, n_terms=n)
    err = abs(approx - np.exp(test_x))
    print(f" {n:2d} terms: {approx:.10f} (error = {err:.2e})")
def taylor_sin(x, n_terms=5):
    """
    Maclaurin-series approximation of sin(x) (expansion around a = 0).

    sin(x) ~= sum_{k=0}^{n_terms-1} (-1)^k * x^(2k+1) / (2k+1)!
    """
    total = 0.0
    sign = 1
    for k in range(n_terms):
        power = 2 * k + 1          # only odd powers appear
        total += sign * x ** power / factorial(power)
        sign = -sign               # terms alternate in sign
    return total
# Verify against the known value sin(pi/4) = sqrt(2)/2.
test_x = np.pi / 4  # sin(pi/4) = sqrt(2)/2
print(f"Taylor approximations of sin({test_x:.4f}) = {np.sin(test_x):.10f}")
for n in [1, 2, 3, 5, 10]:
    approx = taylor_sin(test_x, n_terms=n)
    err = abs(approx - np.sin(test_x))
    print(f" {n:2d} terms: {approx:.10f} (error = {err:.2e})")
# Plot e^x and its Taylor approximations of order 1, 2, 3, 5, 10 on [-3, 3]
x_range = np.linspace(-3, 3, 300)
orders = [1, 2, 3, 5, 10]
colors = ['red', 'orange', 'green', 'purple', 'brown']
plt.figure(figsize=(10, 6))
# True function (drawn on top via zorder)
plt.plot(x_range, np.exp(x_range), 'b-', linewidth=3, label='$e^x$ (exact)', zorder=10)
# Taylor approximations, one dashed curve per truncation order
for order, color in zip(orders, colors):
    y_taylor = np.array([taylor_exp(xi, n_terms=order) for xi in x_range])
    plt.plot(x_range, y_taylor, '--', color=color, linewidth=1.5,
             label=f'Order {order} ({order} terms)')
plt.ylim(-5, 25)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title(r'Taylor Series Approximations of $e^x$ around $a = 0$', fontsize=13)
plt.legend(fontsize=10, loc='upper left')
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linewidth=0.5)
plt.axvline(x=0, color='black', linewidth=0.5)
plt.tight_layout()
plt.show()
print("Observations:")
print(" - Higher-order approximations are accurate over a wider range.")
print(" - All approximations are exact at x = 0 (the expansion point).")
print(" - By order 10, the approximation is nearly indistinguishable from e^x on [-3, 3].")
Summary
In this lab we covered the calculus foundations that power deep learning:
| Concept | Key Formula | Why It Matters |
|---|---|---|
| Numerical differentiation | \(f'(x) \approx \frac{f(x+h) - f(x-h)}{2h}\) | Gradient checking / debugging |
| Chain rule | \(\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx}\) | Foundation of backpropagation |
| Gradient | \(\nabla f = [\partial f / \partial x_i]\) | Direction of steepest ascent |
| Gradient descent | \(w \leftarrow w - \alpha \nabla L\) | How neural networks learn |
| Backpropagation | upstream \(\times\) local gradient | Efficient gradient computation |
| Taylor series | \(f(x) \approx \sum \frac{f^{(n)}(a)}{n!}(x-a)^n\) | Local polynomial approximation |
Key takeaways:
Central difference is \(O(h^2)\) accurate, best around \(h \approx 10^{-5}\) to \(10^{-8}\).
Backpropagation is just the chain rule applied systematically on a computation graph.
For sigmoid + BCE, the gradient simplifies to the elegant \(\frac{\partial L}{\partial z} = a - y\).
Learning rate is the most critical hyperparameter in gradient descent.
Taylor series connect derivatives to function approximation — gradient descent follows the first-order approximation.