Lab 03: Calculus, Gradients & Backpropagation

Sources:

  • Deep Learning Interviews (Kashani, 2022) – Chapters on Calculus, Algorithmic Differentiation

  • Speech & Language Processing (Jurafsky & Martin) – Ch 6.6: Training Neural Nets

Topics covered:

  1. Numerical differentiation (central & forward difference)

  2. The chain rule

  3. Partial derivatives & gradients

  4. Gradient descent algorithm

  5. Backpropagation on computation graphs

  6. Taylor series expansion

import numpy as np
import matplotlib.pyplot as plt
from math import factorial

np.random.seed(42)
print("Imports loaded.")
print(f"NumPy version: {np.__version__}")

PART 1: Numerical Differentiation

The derivative of a function \(f\) at a point \(x\) is defined as:

\[f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}\]

In practice, we approximate this numerically with a small but finite \(h\).

Forward difference (first-order accurate, \(O(h)\) error):

\[f'(x) \approx \frac{f(x+h) - f(x)}{h}\]

Central difference (second-order accurate, \(O(h^2)\) error):

\[f'(x) \approx \frac{f(x+h) - f(x-h)}{2h}\]

Central difference is more accurate because the \(O(h)\) error terms cancel out due to symmetry.
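
A short Taylor-expansion argument makes the cancellation precise:

\[f(x \pm h) = f(x) \pm h f'(x) + \frac{h^2}{2} f''(x) \pm \frac{h^3}{6} f'''(x) + O(h^4)\]

Subtracting the two expansions kills the even-order terms:

\[\frac{f(x+h) - f(x-h)}{2h} = f'(x) + \frac{h^2}{6} f'''(x) + O(h^4)\]

whereas the forward difference keeps the \(\frac{h}{2} f''(x)\) term, hence its \(O(h)\) error.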

def numerical_derivative(f, x, h=1e-5):
    """Compute f'(x) using central difference."""
    return (f(x + h) - f(x - h)) / (2 * h)


# Quick sanity check
f_test = lambda x: x ** 3
print(f"d/dx(x^3) at x=2: numerical = {numerical_derivative(f_test, 2.0):.10f}")
print(f"                   analytical = {3 * 2.0**2:.10f}")
def numerical_gradient(f, x, h=1e-5):
    """
    Compute gradient of f at point x (a numpy array).
    Uses central difference along each coordinate axis.
    Returns: gradient vector of same shape as x.
    """
    x = np.array(x, dtype=float)
    grad = np.zeros_like(x)
    for i in range(len(x)):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += h
        x_minus[i] -= h
        grad[i] = (f(x_plus) - f(x_minus)) / (2 * h)
    return grad


# Quick sanity check: gradient of f(x,y) = x^2 + y^2 at (3,4) should be [6, 8]
f_multi = lambda v: v[0]**2 + v[1]**2
print(f"Gradient of x^2+y^2 at (3,4): {numerical_gradient(f_multi, np.array([3.0, 4.0]))}")
print(f"Expected: [6. 8.]")
# Verify numerical derivatives against analytical derivatives of common functions

x = 2.0

functions = [
    ("x^2",    lambda x: x**2,      lambda x: 2*x),
    ("sin(x)", lambda x: np.sin(x), lambda x: np.cos(x)),
    ("e^x",    lambda x: np.exp(x), lambda x: np.exp(x)),
    ("ln(x)",  lambda x: np.log(x), lambda x: 1.0/x),
]

print(f"Verifying derivatives at x = {x}")
print(f"{'Function':<10} {'Numerical':>14} {'Analytical':>14} {'Abs Error':>14}")
print("-" * 55)
for name, f, f_prime in functions:
    num = numerical_derivative(f, x)
    ana = f_prime(x)
    err = abs(num - ana)
    print(f"{name:<10} {num:>14.10f} {ana:>14.10f} {err:>14.2e}")
# Plot |error| vs h for f(x)=x^2 at x=3
# Shows the sweet spot around h = 1e-5 to 1e-8

f = lambda x: x ** 2
true_deriv = 2 * 3.0  # f'(3) = 6

h_values = np.logspace(-15, 0, 100)
errors_central = [abs(numerical_derivative(f, 3.0, h=h) - true_deriv) for h in h_values]
errors_forward = [abs((f(3.0 + h) - f(3.0)) / h - true_deriv) for h in h_values]

plt.figure(figsize=(9, 5))
plt.loglog(h_values, errors_central, 'b-', linewidth=2, label='Central difference')
plt.loglog(h_values, errors_forward, 'r--', linewidth=2, label='Forward difference')
plt.xlabel('Step size h', fontsize=12)
plt.ylabel('|Error|', fontsize=12)
plt.title(r"Numerical Differentiation Error for $f(x) = x^2$ at $x = 3$", fontsize=13)
plt.grid(True, which='both', alpha=0.3)

# Mark the sweet spot
best_h_central = h_values[np.argmin(errors_central)]
best_h_forward = h_values[np.argmin(errors_forward)]
plt.axvline(best_h_central, color='blue', linestyle=':', alpha=0.5,
            label=f'Best h (central) ~ {best_h_central:.0e}')
plt.axvline(best_h_forward, color='red', linestyle=':', alpha=0.5,
            label=f'Best h (forward) ~ {best_h_forward:.0e}')
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

print(f"Best h for central difference: {best_h_central:.2e} (error = {min(errors_central):.2e})")
print(f"Best h for forward difference:  {best_h_forward:.2e} (error = {min(errors_forward):.2e})")
print("\nKey insight: Too large h -> truncation error. Too small h -> floating-point cancellation.")
print("Central difference achieves ~1e-10 accuracy; forward difference only ~1e-8.")

PART 2: The Chain Rule

If \(y = f(g(x))\), i.e., \(y = f(u)\) where \(u = g(x)\), then:

\[\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx} = f'(g(x)) \cdot g'(x)\]

For longer chains \(y = f(g(h(x)))\):

\[\frac{dy}{dx} = f'(g(h(x))) \cdot g'(h(x)) \cdot h'(x)\]

The chain rule is the foundation of backpropagation: we decompose a complex function into simple steps and multiply local derivatives.

# Verify chain rule for y = sin(x^2)
# g(x) = x^2, f(u) = sin(u)
# dy/dx = cos(x^2) * 2x

x = 1.5
f1 = lambda x: np.sin(x ** 2)
analytical_1 = np.cos(x ** 2) * 2 * x
numerical_1 = numerical_derivative(f1, x)

print("Chain Rule Verification: y = sin(x^2)")
print(f"  x = {x}")
print(f"  Analytical dy/dx = cos(x^2) * 2x = {analytical_1:.10f}")
print(f"  Numerical  dy/dx                  = {numerical_1:.10f}")
print(f"  Absolute error                    = {abs(analytical_1 - numerical_1):.2e}")
print()
# Verify chain rule for y = (3x + 1)^4
# g(x) = 3x + 1, f(u) = u^4
# dy/dx = 4*(3x+1)^3 * 3 = 12*(3x+1)^3

x = 1.5
f2 = lambda x: (3 * x + 1) ** 4
analytical_2 = 12 * (3 * x + 1) ** 3
numerical_2 = numerical_derivative(f2, x)

print("Chain Rule Verification: y = (3x + 1)^4")
print(f"  x = {x}")
print(f"  Analytical dy/dx = 12*(3x+1)^3 = {analytical_2:.10f}")
print(f"  Numerical  dy/dx                = {numerical_2:.10f}")
print(f"  Absolute error                  = {abs(analytical_2 - numerical_2):.2e}")
# Multi-step chain rule: y = exp(sin(x^2 + 1))
# Let u = x^2 + 1,  v = sin(u),  y = exp(v)
# dy/dx = dy/dv * dv/du * du/dx
#       = exp(sin(x^2+1)) * cos(x^2+1) * 2x

x = 0.5
f3 = lambda x: np.exp(np.sin(x ** 2 + 1))

u = x ** 2 + 1          # = 1.25
v = np.sin(u)            # = sin(1.25)
y = np.exp(v)            # = exp(sin(1.25))

du_dx = 2 * x            # = 1.0
dv_du = np.cos(u)        # = cos(1.25)
dy_dv = np.exp(v)        # = exp(sin(1.25))

analytical_3 = dy_dv * dv_du * du_dx
numerical_3 = numerical_derivative(f3, x)

print("Multi-step Chain Rule: y = exp(sin(x^2 + 1))")
print(f"  x = {x}")
print(f"  Step by step:")
print(f"    u = x^2 + 1       = {u:.6f}")
print(f"    v = sin(u)        = {v:.6f}")
print(f"    y = exp(v)        = {y:.6f}")
print(f"    du/dx = 2x        = {du_dx:.6f}")
print(f"    dv/du = cos(u)    = {dv_du:.6f}")
print(f"    dy/dv = exp(v)    = {dy_dv:.6f}")
print(f"  Analytical dy/dx = {dy_dv:.4f} * {dv_du:.4f} * {du_dx:.4f} = {analytical_3:.10f}")
print(f"  Numerical  dy/dx                                   = {numerical_3:.10f}")
print(f"  Absolute error                                     = {abs(analytical_3 - numerical_3):.2e}")

PART 3: Partial Derivatives & Gradients

For a scalar function of multiple variables \(f(x_1, x_2, \ldots, x_n)\), the gradient is the vector of all partial derivatives:

\[\nabla f = \left[ \frac{\partial f}{\partial x_1}, \frac{\partial f}{\partial x_2}, \ldots, \frac{\partial f}{\partial x_n} \right]\]

Key properties:

  • \(\nabla f\) points in the direction of steepest ascent.

  • \(-\nabla f\) points in the direction of steepest descent (used in gradient descent).

  • \(|\nabla f|\) gives the rate of maximum increase.

  • At a local minimum, \(\nabla f = \mathbf{0}\).
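
The steepest-ascent bullet can be checked empirically: among many random unit directions, none should increase \(f\) faster than the normalized gradient. A small self-contained sketch (it re-defines `numerical_gradient` so the cell runs on its own; the test function \(f(x,y) = x^2 + 3y^2\) is an arbitrary choice):

```python
import numpy as np

def numerical_gradient(f, x, h=1e-5):
    # Central difference along each coordinate axis.
    x = np.array(x, dtype=float)
    grad = np.zeros_like(x)
    for i in range(len(x)):
        xp, xm = x.copy(), x.copy()
        xp[i] += h
        xm[i] -= h
        grad[i] = (f(xp) - f(xm)) / (2 * h)
    return grad

f = lambda v: v[0]**2 + 3 * v[1]**2
p = np.array([1.0, 1.0])

g = numerical_gradient(f, p)          # analytically [2, 6] at (1, 1)
g_unit = g / np.linalg.norm(g)

eps = 1e-3
gain_grad = f(p + eps * g_unit) - f(p)  # increase along the gradient

# Compare against many random unit directions.
rng = np.random.default_rng(0)
best_random = -np.inf
for _ in range(1000):
    d = rng.standard_normal(2)
    d /= np.linalg.norm(d)
    best_random = max(best_random, f(p + eps * d) - f(p))

print(f"increase along gradient direction:        {gain_grad:.3e}")
print(f"best increase over 1000 random directions: {best_random:.3e}")
```

The gradient direction should match or beat every sampled direction, up to second-order (curvature) effects of size \(O(\varepsilon^2)\).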

# Gradient of f(x,y) = x^2 + 2xy + y^2
# df/dx = 2x + 2y
# df/dy = 2x + 2y
# At (1, 2): gradient = [2*1 + 2*2, 2*1 + 2*2] = [6, 6]

f = lambda v: v[0]**2 + 2*v[0]*v[1] + v[1]**2
point = np.array([1.0, 2.0])

analytical_grad = np.array([2*point[0] + 2*point[1], 2*point[0] + 2*point[1]])
numerical_grad = numerical_gradient(f, point)

print("Gradient of f(x,y) = x^2 + 2xy + y^2")
print(f"  Point: (x, y) = ({point[0]}, {point[1]})")
print(f"  Analytical gradient: {analytical_grad}")
print(f"  Numerical  gradient: {numerical_grad}")
print(f"  Max absolute error:  {np.max(np.abs(analytical_grad - numerical_grad)):.2e}")
print()
print(f"  Note: f(x,y) = (x+y)^2, so the gradient [2(x+y), 2(x+y)] = [6, 6] at (1,2).")
# Gradient of MSE loss: L(w) = (1/N) * sum( (y_i - w*x_i)^2 )
# dL/dw = -(2/N) * sum( x_i * (y_i - w*x_i) )

x_data = np.array([1.0, 2.0, 3.0, 4.0])
y_data = np.array([2.0, 4.0, 5.0, 8.0])
w = 1.5
N = len(x_data)

# Analytical gradient
residuals = y_data - w * x_data
analytical_dLdw = -(2.0 / N) * np.sum(x_data * residuals)

# Numerical gradient
def mse_loss(w_scalar):
    return np.mean((y_data - w_scalar * x_data) ** 2)

numerical_dLdw = numerical_derivative(mse_loss, w)

print("MSE Loss Gradient")
print(f"  Data: x = {x_data}, y = {y_data}")
print(f"  Weight: w = {w}")
print(f"  Predictions: w*x = {w * x_data}")
print(f"  Residuals:   y - w*x = {residuals}")
print(f"  Loss L(w) = {mse_loss(w):.6f}")
print(f"  Analytical dL/dw = {analytical_dLdw:.10f}")
print(f"  Numerical  dL/dw = {numerical_dLdw:.10f}")
print(f"  Absolute error   = {abs(analytical_dLdw - numerical_dLdw):.2e}")
print(f"\n  Since dL/dw = {analytical_dLdw:.4f} < 0, increasing w will decrease the loss.")

PART 4: Gradient Descent

Gradient descent is an iterative optimization algorithm:

\[w \leftarrow w - \alpha \, \nabla L(w)\]

where \(\alpha\) is the learning rate (step size).

Learning rate effects:

  • Too small: converges very slowly and may stall on flat regions of the loss.

  • Too large: overshoots the minimum, may diverge.

  • Just right: steady convergence to a local minimum.

For a convex loss surface (like MSE with linear regression), gradient descent converges to the global minimum.
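
For a concrete convex case this claim can be sharpened: with \(f(x) = (x - c)^2\), one update gives \(x_{t+1} - c = (1 - 2\alpha)(x_t - c)\), so each step shrinks the distance to the minimum by the constant factor \(|1 - 2\alpha|\), and GD converges exactly when \(0 < \alpha < 1\). A quick numerical check of that contraction factor (the learning rates tried here are arbitrary):

```python
c = 3.0                               # location of the minimum
f_grad = lambda x: 2 * (x - c)        # gradient of (x - c)^2

def contraction(lr, x0=0.0, n_steps=5):
    """Ratio of successive distances to the minimum under GD updates."""
    x = x0
    ratios = []
    for _ in range(n_steps):
        x_new = x - lr * f_grad(x)
        ratios.append(abs(x_new - c) / abs(x - c))
        x = x_new
    return ratios

for lr in [0.1, 0.45, 0.6, 1.1]:
    r = contraction(lr)
    print(f"lr = {lr:4}: predicted |1 - 2*lr| = {abs(1 - 2*lr):.2f}, "
          f"observed ratios = {[round(v, 2) for v in r]}")
```

The observed per-step ratios match \(|1 - 2\alpha|\): below 1 for \(\alpha < 1\) (including the oscillating case \(\alpha = 0.6\)), above 1 for \(\alpha = 1.1\), which diverges.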

def gradient_descent_1d(f, f_grad, x0, lr=0.1, n_steps=50):
    """
    Minimize f(x) using gradient descent.

    Args:
        f: function to minimize
        f_grad: gradient (derivative) of f
        x0: initial point
        lr: learning rate (alpha)
        n_steps: number of iterations

    Returns:
        x_final: the final x after optimization
        history: list of (x, f(x)) at each step
    """
    x = x0
    history = [(x, f(x))]
    for _ in range(n_steps):
        grad = f_grad(x)
        x = x - lr * grad
        history.append((x, f(x)))
    return x, history


print("gradient_descent_1d defined.")
# Minimize f(x) = (x - 3)^2 + 1
# True minimum: x = 3, f(3) = 1
# Gradient: f'(x) = 2(x - 3)

f = lambda x: (x - 3) ** 2 + 1
f_grad = lambda x: 2 * (x - 3)

x_final, history = gradient_descent_1d(f, f_grad, x0=0.0, lr=0.1, n_steps=50)

xs = [h[0] for h in history]
fs = [h[1] for h in history]

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# Left: function curve with GD path
x_range = np.linspace(-1, 6, 200)
axes[0].plot(x_range, f(x_range), 'b-', linewidth=2, label='$f(x) = (x-3)^2 + 1$')
axes[0].plot(xs, fs, 'ro-', markersize=4, linewidth=1, alpha=0.7, label='GD steps')
axes[0].plot(xs[0], fs[0], 'gs', markersize=10, label=f'Start (x={xs[0]:.1f})')
axes[0].plot(xs[-1], fs[-1], 'r*', markersize=15, label=f'End (x={xs[-1]:.4f})')
axes[0].set_xlabel('x', fontsize=12)
axes[0].set_ylabel('f(x)', fontsize=12)
axes[0].set_title('Gradient Descent Path', fontsize=13)
axes[0].legend(fontsize=10)
axes[0].grid(True, alpha=0.3)

# Right: convergence curve
axes[1].plot(range(len(fs)), fs, 'b-', linewidth=2)
axes[1].set_xlabel('Step', fontsize=12)
axes[1].set_ylabel('f(x)', fontsize=12)
axes[1].set_title('Convergence Curve', fontsize=13)
axes[1].axhline(y=1.0, color='r', linestyle='--', alpha=0.5, label='True minimum f=1')
axes[1].legend(fontsize=10)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Starting point: x = 0.0, f(x) = {f(0.0):.4f}")
print(f"After 50 steps: x = {x_final:.6f}, f(x) = {f(x_final):.6f}")
print(f"True minimum:   x = 3.0,     f(x) = 1.0")
# Gradient descent for linear regression on y = 2x + 1 + noise

np.random.seed(42)
X = np.random.randn(100)
y = 2 * X + 1 + 0.1 * np.random.randn(100)

# Initialize parameters
w, b = 0.0, 0.0
lr = 0.05
n_steps = 200
losses = []
N = len(X)

for step in range(n_steps):
    # Forward: predictions
    y_pred = w * X + b

    # Loss: MSE
    loss = np.mean((y - y_pred) ** 2)
    losses.append(loss)

    # Gradients
    dL_dw = -(2.0 / N) * np.sum(X * (y - y_pred))
    dL_db = -(2.0 / N) * np.sum(y - y_pred)

    # Update
    w = w - lr * dL_dw
    b = b - lr * dL_db

# Plot loss curve
plt.figure(figsize=(9, 5))
plt.plot(losses, 'b-', linewidth=2)
plt.xlabel('Step', fontsize=12)
plt.ylabel('MSE Loss', fontsize=12)
plt.title('Linear Regression via Gradient Descent', fontsize=13)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Final parameters after {n_steps} steps:")
print(f"  w = {w:.6f}  (true = 2.0)")
print(f"  b = {b:.6f}  (true = 1.0)")
print(f"  Final loss = {losses[-1]:.6f}")
# Learning rate experiment: GD on f(x) = x^4 - 3x^3 + 2
# f'(x) = 4x^3 - 9x^2

f = lambda x: x**4 - 3*x**3 + 2
f_grad = lambda x: 4*x**3 - 9*x**2

learning_rates = [0.001, 0.01, 0.1]
colors = ['blue', 'green', 'red']
x0 = 4.0
n_steps = 100

plt.figure(figsize=(10, 5))

for lr, color in zip(learning_rates, colors):
    x = x0
    f_history = [f(x)]
    diverged = False
    for step in range(n_steps):
        grad = f_grad(x)
        x = x - lr * grad
        val = f(x)
        # Clip for plotting if diverging
        if abs(val) > 1e10:
            diverged = True
            break
        f_history.append(val)

    label = f'lr = {lr}'
    if diverged:
        label += ' (DIVERGED)'
    plt.plot(f_history, color=color, linewidth=2, label=label)

plt.xlabel('Step', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.title(r'Learning Rate Comparison on $f(x) = x^4 - 3x^3 + 2$', fontsize=13)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.ylim(-15, 60)
plt.tight_layout()
plt.show()

print("Observations:")
print("  lr = 0.001: Slow but steady convergence")
print("  lr = 0.01:  Good convergence rate")
print("  lr = 0.1:   May overshoot or diverge due to large gradients")

PART 5: Backpropagation

Backpropagation applies the chain rule to a computation graph.

For each node in the graph:

  • Forward pass: compute the output from inputs.

  • Backward pass: multiply the upstream gradient (from the loss) by the local gradient (derivative of this node’s operation).

\[\frac{\partial L}{\partial \text{input}} = \frac{\partial L}{\partial \text{output}} \cdot \frac{\partial \text{output}}{\partial \text{input}}\]

This recursive application propagates gradients from the loss backward through every parameter in the network.
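
This bookkeeping can be automated. Below is a toy reverse-mode autodiff sketch (a hypothetical `Value` class, not any real library's API): each operation records its local gradient rule during the forward pass, and `backward()` replays the rules in reverse topological order, accumulating upstream-times-local products.

```python
class Value:
    """Toy scalar autodiff node: stores data, grad, and a backward rule."""
    def __init__(self, data, parents=()):
        self.data = data
        self.grad = 0.0
        self._parents = parents
        self._backward_rule = lambda: None

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other))
        def rule():
            # Local gradients of addition are both 1.
            self.grad += out.grad
            other.grad += out.grad
        out._backward_rule = rule
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other))
        def rule():
            # d(a*b)/da = b, d(a*b)/db = a.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward_rule = rule
        return out

    def backward(self):
        # Topological order of the graph, then chain rule from the output back.
        order, seen = [], set()
        def visit(v):
            if v not in seen:
                seen.add(v)
                for p in v._parents:
                    visit(p)
                order.append(v)
        visit(self)
        self.grad = 1.0
        for v in reversed(order):
            v._backward_rule()

# Same graph as the manual example: L = (x*y + 1)^2 at x = 2, y = 3
x, y = Value(2.0), Value(3.0)
b = x * y + 1.0
L = b * b
L.backward()
print(f"L = {L.data}, dL/dx = {x.grad}, dL/dy = {y.grad}")
# prints: L = 49.0, dL/dx = 42.0, dL/dy = 28.0
```

The printed gradients match the hand-computed values for this graph (dL/dx = 42, dL/dy = 28).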

# Manual backprop on a simple computation graph
#
# Forward:
#   x = 2, y = 3
#   a = x * y     = 6
#   b = a + 1     = 7
#   L = b^2       = 49
#
# Backward:
#   dL/db = 2b = 14
#   db/da = 1
#   dL/da = dL/db * db/da = 14 * 1 = 14
#   da/dx = y = 3
#   da/dy = x = 2
#   dL/dx = dL/da * da/dx = 14 * 3 = 42
#   dL/dy = dL/da * da/dy = 14 * 2 = 28

# --- Forward pass ---
x_val, y_val = 2.0, 3.0
a = x_val * y_val      # 6
b = a + 1               # 7
L = b ** 2              # 49

print("=== Forward Pass ===")
print(f"  x = {x_val}, y = {y_val}")
print(f"  a = x * y = {a}")
print(f"  b = a + 1 = {b}")
print(f"  L = b^2   = {L}")

# --- Backward pass ---
dL_db = 2 * b           # 14
db_da = 1.0
dL_da = dL_db * db_da   # 14
da_dx = y_val           # 3
da_dy = x_val           # 2
dL_dx = dL_da * da_dx   # 42
dL_dy = dL_da * da_dy   # 28

print("\n=== Backward Pass ===")
print(f"  dL/db = 2b         = {dL_db}")
print(f"  dL/da = dL/db * 1  = {dL_da}")
print(f"  dL/dx = dL/da * y  = {dL_da} * {da_dx} = {dL_dx}")
print(f"  dL/dy = dL/da * x  = {dL_da} * {da_dy} = {dL_dy}")

# --- Numerical verification ---
def L_of_x(x):
    a = x * y_val
    b = a + 1
    return b ** 2

def L_of_y(y):
    a = x_val * y
    b = a + 1
    return b ** 2

num_dL_dx = numerical_derivative(L_of_x, x_val)
num_dL_dy = numerical_derivative(L_of_y, y_val)

print("\n=== Numerical Verification ===")
print(f"  dL/dx: backprop = {dL_dx:.4f}, numerical = {num_dL_dx:.4f}, match = {np.isclose(dL_dx, num_dL_dx)}")
print(f"  dL/dy: backprop = {dL_dy:.4f}, numerical = {num_dL_dy:.4f}, match = {np.isclose(dL_dy, num_dL_dy)}")
# Backprop through a 1-layer neural network with ReLU
#
# Forward: z = w*x + b,  a = ReLU(z),  L = (a - y)^2
#
# Backward:
#   dL/da = 2(a - y)
#   da/dz = 1 if z > 0 else 0     (ReLU derivative)
#   dz/dw = x
#   dz/db = 1
#   dL/dw = dL/da * da/dz * dz/dw
#   dL/db = dL/da * da/dz * dz/db

x_in, w, b_param, y_true = 1.0, 0.5, 0.1, 1.0

# --- Forward ---
z = w * x_in + b_param          # 0.6
a = max(0, z)                   # 0.6 (ReLU, z > 0)
L = (a - y_true) ** 2           # 0.16

print("=== Forward Pass ===")
print(f"  x = {x_in}, w = {w}, b = {b_param}, y_true = {y_true}")
print(f"  z = w*x + b = {z}")
print(f"  a = ReLU(z) = {a}")
print(f"  L = (a - y)^2 = {L}")

# --- Backward ---
dL_da = 2 * (a - y_true)       # 2*(0.6 - 1.0) = -0.8
da_dz = 1.0 if z > 0 else 0.0  # 1.0 (since z = 0.6 > 0)
dL_dz = dL_da * da_dz          # -0.8
dL_dw = dL_dz * x_in           # -0.8 * 1.0 = -0.8
dL_db = dL_dz * 1.0            # -0.8

print("\n=== Backward Pass ===")
print(f"  dL/da = 2(a - y) = {dL_da}")
print(f"  da/dz = (z > 0)  = {da_dz}")
print(f"  dL/dz = dL/da * da/dz = {dL_dz}")
print(f"  dL/dw = dL/dz * x     = {dL_dw}")
print(f"  dL/db = dL/dz * 1     = {dL_db}")

# --- Numerical verification ---
def L_of_w(w_val):
    z = w_val * x_in + b_param
    a = max(0, z)
    return (a - y_true) ** 2

def L_of_b(b_val):
    z = w * x_in + b_val
    a = max(0, z)
    return (a - y_true) ** 2

num_dL_dw = numerical_derivative(L_of_w, w)
num_dL_db = numerical_derivative(L_of_b, b_param)

print("\n=== Numerical Verification ===")
print(f"  dL/dw: backprop = {dL_dw:.6f}, numerical = {num_dL_dw:.6f}, match = {np.isclose(dL_dw, num_dL_dw)}")
print(f"  dL/db: backprop = {dL_db:.6f}, numerical = {num_dL_db:.6f}, match = {np.isclose(dL_db, num_dL_db)}")
# Backprop through sigmoid with binary cross-entropy loss
#
# Forward: z = w*x + b,  a = sigmoid(z),  L = BCE(y, a) = -(y*log(a) + (1-y)*log(1-a))
#
# Backward (the elegant result):
#   dL/da = -y/a + (1-y)/(1-a)
#   da/dz = a * (1 - a)           (sigmoid derivative)
#   dL/dz = dL/da * da/dz
#         = [-y/a + (1-y)/(1-a)] * a*(1-a)
#         = -y*(1-a) + (1-y)*a
#         = a - y                 <<< the elegant result!
#   dL/dw = (a - y) * x
#   dL/db = (a - y)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

x_in, w, b_param, y_true = 2.0, 0.3, -0.1, 1.0

# --- Forward ---
z = w * x_in + b_param                                        # 0.5
a = sigmoid(z)                                                 # sigmoid(0.5)
L = -(y_true * np.log(a) + (1 - y_true) * np.log(1 - a))     # BCE

print("=== Forward Pass ===")
print(f"  x = {x_in}, w = {w}, b = {b_param}, y = {y_true}")
print(f"  z = w*x + b     = {z}")
print(f"  a = sigmoid(z)  = {a:.6f}")
print(f"  L = BCE(y, a)   = {L:.6f}")

# --- Backward: step by step ---
dL_da = -y_true / a + (1 - y_true) / (1 - a)
da_dz = a * (1 - a)
dL_dz_long = dL_da * da_dz

# The elegant shortcut
dL_dz_elegant = a - y_true

dL_dw = dL_dz_elegant * x_in
dL_db = dL_dz_elegant * 1.0

print("\n=== Backward Pass ===")
print(f"  dL/da = -y/a + (1-y)/(1-a) = {dL_da:.6f}")
print(f"  da/dz = a*(1-a)             = {da_dz:.6f}")
print(f"  dL/dz (step by step)        = {dL_dz_long:.6f}")
print(f"  dL/dz (elegant: a - y)      = {dL_dz_elegant:.6f}")
print(f"  They match: {np.isclose(dL_dz_long, dL_dz_elegant)}")
print(f"\n  dL/dw = (a - y) * x = {dL_dw:.6f}")
print(f"  dL/db = (a - y)     = {dL_db:.6f}")

# --- Numerical verification ---
def L_of_w_sig(w_val):
    z = w_val * x_in + b_param
    a = sigmoid(z)
    return -(y_true * np.log(a) + (1 - y_true) * np.log(1 - a))

def L_of_b_sig(b_val):
    z = w * x_in + b_val
    a = sigmoid(z)
    return -(y_true * np.log(a) + (1 - y_true) * np.log(1 - a))

num_dL_dw = numerical_derivative(L_of_w_sig, w)
num_dL_db = numerical_derivative(L_of_b_sig, b_param)

print("\n=== Numerical Verification ===")
print(f"  dL/dw: backprop = {dL_dw:.6f}, numerical = {num_dL_dw:.6f}, match = {np.isclose(dL_dw, num_dL_dw)}")
print(f"  dL/db: backprop = {dL_db:.6f}, numerical = {num_dL_db:.6f}, match = {np.isclose(dL_db, num_dL_db)}")
print("\nKey insight: For sigmoid + BCE, dL/dz = a - y. This is why sigmoid+BCE")
print("is such a natural pairing -- the gradient update is simply (prediction - truth).")

PART 6: Taylor Series

The Taylor series expansion of \(f(x)\) around point \(a\) is:

\[f(x) \approx f(a) + f'(a)(x-a) + \frac{f''(a)}{2!}(x-a)^2 + \frac{f'''(a)}{3!}(x-a)^3 + \cdots = \sum_{n=0}^{\infty} \frac{f^{(n)}(a)}{n!}(x-a)^n\]

Special cases around \(a = 0\) (Maclaurin series):

\[e^x = 1 + x + \frac{x^2}{2!} + \frac{x^3}{3!} + \cdots = \sum_{n=0}^{\infty} \frac{x^n}{n!}\]
\[\sin(x) = x - \frac{x^3}{3!} + \frac{x^5}{5!} - \cdots = \sum_{n=0}^{\infty} \frac{(-1)^n x^{2n+1}}{(2n+1)!}\]

Taylor series connect to gradient descent: the first-order Taylor approximation \(f(x) \approx f(a) + f'(a)(x-a)\) is exactly the linear model that gradient descent follows locally.

def taylor_exp(x, n_terms=5):
    """
    Taylor approximation of e^x around a = 0.
    e^x = sum_{n=0}^{N-1} x^n / n!
    """
    result = 0.0
    for n in range(n_terms):
        result += x ** n / factorial(n)
    return result


# Verify
test_x = 1.0
print(f"Taylor approximations of e^{test_x} = {np.exp(test_x):.10f}")
for n in [1, 2, 3, 5, 10, 20]:
    approx = taylor_exp(test_x, n_terms=n)
    err = abs(approx - np.exp(test_x))
    print(f"  {n:2d} terms: {approx:.10f}  (error = {err:.2e})")
def taylor_sin(x, n_terms=5):
    """
    Taylor approximation of sin(x) around a = 0.
    sin(x) = sum_{n=0}^{N-1} (-1)^n * x^(2n+1) / (2n+1)!
    """
    result = 0.0
    for n in range(n_terms):
        result += ((-1) ** n) * (x ** (2 * n + 1)) / factorial(2 * n + 1)
    return result


# Verify
test_x = np.pi / 4  # sin(pi/4) = sqrt(2)/2
print(f"Taylor approximations of sin({test_x:.4f}) = {np.sin(test_x):.10f}")
for n in [1, 2, 3, 5, 10]:
    approx = taylor_sin(test_x, n_terms=n)
    err = abs(approx - np.sin(test_x))
    print(f"  {n:2d} terms: {approx:.10f}  (error = {err:.2e})")
# Plot e^x and its Taylor approximations of order 1, 2, 3, 5, 10 on [-3, 3]

x_range = np.linspace(-3, 3, 300)
orders = [1, 2, 3, 5, 10]
colors = ['red', 'orange', 'green', 'purple', 'brown']

plt.figure(figsize=(10, 6))

# True function
plt.plot(x_range, np.exp(x_range), 'b-', linewidth=3, label='$e^x$ (exact)', zorder=10)

# Taylor approximations
for order, color in zip(orders, colors):
    y_taylor = np.array([taylor_exp(xi, n_terms=order) for xi in x_range])
    plt.plot(x_range, y_taylor, '--', color=color, linewidth=1.5,
             label=f'Order {order} ({order} terms)')

plt.ylim(-5, 25)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title(r'Taylor Series Approximations of $e^x$ around $a = 0$', fontsize=13)
plt.legend(fontsize=10, loc='upper left')
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linewidth=0.5)
plt.axvline(x=0, color='black', linewidth=0.5)
plt.tight_layout()
plt.show()

print("Observations:")
print("  - Higher-order approximations are accurate over a wider range.")
print("  - All approximations are exact at x = 0 (the expansion point).")
print("  - By order 10, the approximation is nearly indistinguishable from e^x on [-3, 3].")
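
The connection to gradient descent noted earlier can also be checked numerically: the first-order Taylor approximation \(f(a) + f'(a)(x - a)\) is the local linear model a GD step trusts, and for a quadratic the gap between prediction and reality is exactly the second-order term. A minimal sketch on \(f(x) = (x - 3)^2 + 1\) (the step size 0.01 is an arbitrary choice):

```python
f = lambda x: (x - 3) ** 2 + 1        # same loss as the gradient-descent example
f_prime = lambda x: 2 * (x - 3)

a = 0.0                                # current parameter value
lr = 0.01
step = -lr * f_prime(a)                # one gradient-descent step

actual = f(a + step)                   # true loss after the step
first_order = f(a) + f_prime(a) * step # first-order Taylor prediction

print(f"actual loss after step:      {actual:.6f}")
print(f"first-order Taylor estimate: {first_order:.6f}")
print(f"gap (second-order term):     {actual - first_order:.6f}")
```

For this quadratic, the gap equals \((\text{step})^2\); for small steps the linear model is an excellent guide, which is why small learning rates reliably decrease the loss.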

Summary

In this lab we covered the calculus foundations that power deep learning:

| Concept                   | Key Formula                                           | Why It Matters                 |
|---------------------------|-------------------------------------------------------|--------------------------------|
| Numerical differentiation | \(f'(x) \approx \frac{f(x+h) - f(x-h)}{2h}\)          | Gradient checking / debugging  |
| Chain rule                | \(\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx}\) | Foundation of backpropagation  |
| Gradient                  | \(\nabla f = [\partial f / \partial x_i]\)            | Direction of steepest ascent   |
| Gradient descent          | \(w \leftarrow w - \alpha \nabla L\)                  | How neural networks learn      |
| Backpropagation           | upstream \(\times\) local gradient                    | Efficient gradient computation |
| Taylor series             | \(f(x) \approx \sum \frac{f^{(n)}(a)}{n!}(x-a)^n\)    | Local polynomial approximation |

Key takeaways:

  1. Central difference is \(O(h^2)\) accurate; its optimal step size is around \(h \approx 10^{-5}\), while forward difference needs \(h \approx 10^{-8}\).

  2. Backpropagation is just the chain rule applied systematically on a computation graph.

  3. For sigmoid + BCE, the gradient simplifies to the elegant \(\frac{\partial L}{\partial z} = a - y\).

  4. Learning rate is the most critical hyperparameter in gradient descent.

  5. Taylor series connect derivatives to function approximation – gradient descent follows the first-order approximation.