Lab 04: Neural Networks from Scratch

Source: Speech & Language Processing (Jurafsky/Martin) - Chapter 6: Neural Networks
PDF: resources/ed3book_jan26.pdf

Topics Covered

  • Neural network units (neurons) and activation functions

  • Logic gates as neurons (AND, OR, NOT)

  • The XOR problem and why it needs hidden layers

  • Feedforward neural networks

  • Backpropagation and gradient descent

  • SGD with momentum and Adam optimizer

  • Text classification with neural networks

import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple

np.random.seed(42)

Part 1: Neural Network Units

Reference: SLP Ch 6.1

A single neuron computes:

\[y = f(\mathbf{w} \cdot \mathbf{x} + b)\]

where \(f\) is an activation function (sigmoid, tanh, ReLU, etc.).

Common activation functions:

  • Sigmoid: \(\sigma(z) = \frac{1}{1 + e^{-z}}\) β€” output in (0, 1)

  • Tanh: \(\tanh(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}}\) β€” output in (-1, 1)

  • ReLU: \(\text{ReLU}(z) = \max(0, z)\) β€” most widely used in deep networks

# Exercise 1.1: Implement a single neuron

class Neuron:
    """A single neural-network unit computing y = f(w . x + b)."""

    def __init__(self, n_inputs: int, activation: str = "sigmoid"):
        # Small random weights and a zero bias to start.
        self.weights = np.random.randn(n_inputs) * 0.1
        self.bias = 0.0
        self.activation = activation

    def _activate(self, z):
        """Apply the configured nonlinearity; unknown names act as linear."""
        name = self.activation
        if name == "sigmoid":
            # Clip so exp() cannot overflow for very negative/positive z.
            return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
        if name == "tanh":
            return np.tanh(z)
        if name == "relu":
            return np.maximum(0, z)
        return z

    def forward(self, x: np.ndarray) -> float:
        """Weighted sum plus bias, pushed through the activation."""
        return self._activate(np.dot(self.weights, x) + self.bias)


# Test the neuron: one forward pass through a random 3-input sigmoid unit
neuron = Neuron(3, activation="sigmoid")
x = np.array([1.0, 0.5, -0.3])
output = neuron.forward(x)
print("Neuron with 3 inputs:")
print(f"  Weights: {neuron.weights.round(4)}")
print(f"  Bias: {neuron.bias}")
print(f"  Input: {x}")
print(f"  Output: {output:.4f}")
# Exercise 1.2: Visualize activation functions

z = np.linspace(-5, 5, 200)

sigmoid = 1.0 / (1.0 + np.exp(-z))
tanh = np.tanh(z)
relu = np.maximum(0, z)

# (title, curve, line format, horizontal reference line) for each panel.
# Sigmoid's reference is its midpoint 0.5; the others use 0.
panels = [
    ('Sigmoid', sigmoid, 'b-', 0.5),
    ('Tanh', tanh, 'r-', 0.0),
    ('ReLU', relu, 'g-', 0.0),
]

fig, axes = plt.subplots(1, 3, figsize=(14, 4))
for ax, (title, curve, fmt, href) in zip(axes, panels):
    ax.plot(z, curve, fmt, linewidth=2)
    ax.set_title(title)
    ax.axhline(y=href, color='gray', linestyle='--', alpha=0.5)
    ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('z')
axes[0].set_ylabel('f(z)')

plt.tight_layout()
plt.show()
# Exercise 1.3: Implement logic gates as neurons
# A single sigmoid neuron can compute AND, OR, NOT

def logic_gate_neurons():
    """Implement logic gates using single neurons with sigmoid activation."""
    inputs_2 = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    inputs_1 = np.array([[0], [1]])

    def make_gate(weights, bias):
        # Build a sigmoid neuron with hand-picked parameters; the large
        # weight magnitudes drive the sigmoid toward saturated 0/1 outputs.
        gate = Neuron(len(weights), "sigmoid")
        gate.weights = np.array(weights)
        gate.bias = bias
        return gate

    and_neuron = make_gate([10.0, 10.0], -15.0)  # fires only when both inputs are high
    or_neuron = make_gate([10.0, 10.0], -5.0)    # fires when either input is high
    not_neuron = make_gate([-10.0], 5.0)         # inverts its single input

    print("AND gate:")
    for x in inputs_2:
        y = and_neuron.forward(x)
        print(f"  {x[0]} AND {x[1]} = {y:.4f} (expected: {int(x[0]) & int(x[1])})")

    print("\nOR gate:")
    for x in inputs_2:
        y = or_neuron.forward(x)
        print(f"  {x[0]} OR  {x[1]} = {y:.4f} (expected: {int(x[0]) | int(x[1])})")

    print("\nNOT gate:")
    for x in inputs_1:
        y = not_neuron.forward(x)
        print(f"  NOT {x[0]} = {y:.4f} (expected: {1 - int(x[0])})")

logic_gate_neurons()

Part 2: The XOR Problem

Reference: SLP Ch 6.2

XOR (exclusive OR) returns 1 when exactly one input is 1:

| x1 | x2 | XOR |
|----|----|-----|
| 0  | 0  | 0   |
| 0  | 1  | 1   |
| 1  | 0  | 1   |
| 1  | 1  | 0   |

Key insight: XOR is not linearly separable – no single line can separate the 0s from the 1s. This is why we need hidden layers.

Solution: \(\text{XOR}(x_1, x_2) = \text{AND}(\text{OR}(x_1, x_2), \text{NAND}(x_1, x_2))\)

# Exercise 2.1: Show that XOR is not linearly separable

X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])

# Random search over single-neuron parameters: no draw ever hits 100%,
# because one sigmoid unit defines a single linear decision boundary.
best_acc = 0
for trial in range(10000):
    w = np.random.randn(2) * 5
    b = np.random.randn() * 5
    logits = X_xor @ w + b
    preds = (1.0 / (1.0 + np.exp(-logits)) > 0.5).astype(int)
    best_acc = max(best_acc, np.mean(preds == y_xor))

print(f"Best accuracy from 10,000 random single neurons: {best_acc:.2%}")
print("A single neuron CANNOT solve XOR (max 75% = gets 3 of 4 right)")

# Scatter the four XOR points (red = class 0, blue = class 1)
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
point_colors = ['red' if label == 0 else 'blue' for label in y_xor]
ax.scatter(X_xor[:, 0], X_xor[:, 1], c=point_colors, s=200, edgecolors='black', zorder=5)
ax.set_title('XOR: Not Linearly Separable')
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_xlim(-0.5, 1.5)
ax.set_ylim(-0.5, 1.5)
ax.grid(True, alpha=0.3)
ax.legend(['Red=0, Blue=1'], loc='upper right')
plt.show()
# Exercise 2.2: Solve XOR with a 2-layer network
# XOR(x1, x2) = AND(OR(x1, x2), NAND(x1, x2))

sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

# Hidden layer: two hand-wired gates over the raw inputs
h1 = sigmoid(10 * X[:, 0] + 10 * X[:, 1] - 5)     # OR gate
h2 = sigmoid(-10 * X[:, 0] - 10 * X[:, 1] + 15)   # NAND gate
H = np.column_stack([h1, h2])

# Output layer: AND of the two hidden units gives XOR
y_pred = sigmoid(10 * H[:, 0] + 10 * H[:, 1] - 15)

print("XOR with 2-layer network:")
header = f"{'x1':>3} {'x2':>3} | {'h1(OR)':>8} {'h2(NAND)':>10} | {'y_pred':>8} {'expected':>10}"
print(header)
print("-" * 55)
for i in range(4):
    expected = int(X[i, 0]) ^ int(X[i, 1])
    print(f"{X[i,0]:3.0f} {X[i,1]:3.0f} | {h1[i]:8.4f} {h2[i]:10.4f} | {y_pred[i]:8.4f} {expected:10d}")
# Exercise 2.3: Learn XOR via gradient descent
# A tiny 2-layer sigmoid network trained by hand-written backprop.

np.random.seed(42)

# The four XOR input patterns and their targets, as float matrices.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y = np.array([[0], [1], [1], [0]], dtype=float)

# 2-layer network: 2 inputs -> 4 hidden -> 1 output
# Small random weights; biases start at zero.
W1 = np.random.randn(2, 4) * 0.5
b1 = np.zeros((1, 4))
W2 = np.random.randn(4, 1) * 0.5
b2 = np.zeros((1, 1))

lr = 1.0
losses = []

# Full-batch gradient descent: all 4 examples are used every epoch.
for epoch in range(5000):
    # Forward
    z1 = X @ W1 + b1
    h = 1.0 / (1.0 + np.exp(-z1))  # sigmoid
    z2 = h @ W2 + b2
    y_hat = 1.0 / (1.0 + np.exp(-z2))  # sigmoid

    # Loss: binary cross-entropy (eps guards log(0))
    eps = 1e-8
    loss = -np.mean(y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps))
    losses.append(loss)

    # Backward
    # Sigmoid output + BCE loss combine to the simple gradient y_hat - y.
    dz2 = y_hat - y                           # (4, 1)
    dW2 = h.T @ dz2 / 4                       # (4, 1) -- averaged over the 4 examples
    db2 = np.mean(dz2, axis=0, keepdims=True)  # (1, 1)
    dh = dz2 @ W2.T                           # (4, 4)
    dz1 = dh * h * (1 - h)                    # sigmoid derivative
    dW1 = X.T @ dz1 / 4                       # (2, 4)
    db1 = np.mean(dz1, axis=0, keepdims=True)  # (1, 4)

    # Update
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1

# Final predictions: one more forward pass with the trained weights
z1 = X @ W1 + b1
h = 1.0 / (1.0 + np.exp(-z1))
z2 = h @ W2 + b2
y_hat = 1.0 / (1.0 + np.exp(-z2))

print("Learned XOR via gradient descent:")
for i in range(4):
    print(f"  XOR({X[i,0]:.0f}, {X[i,1]:.0f}) = {y_hat[i,0]:.4f} (expected: {y[i,0]:.0f})")

plt.figure(figsize=(8, 4))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Binary Cross-Entropy Loss')
plt.title('XOR Learning Curve')
plt.grid(True, alpha=0.3)
plt.show()

Part 3: Feedforward Neural Network

Reference: SLP Ch 6.3

A feedforward (or multi-layer perceptron) network computes:

\[\mathbf{h}_l = f(\mathbf{W}_l \mathbf{h}_{l-1} + \mathbf{b}_l)\]
\[\hat{\mathbf{y}} = \text{softmax}(\mathbf{W}_{out} \mathbf{h}_L + \mathbf{b}_{out})\]

Key components:

  • He initialization: scale weights by \(\sqrt{2/n_{in}}\) to keep variance stable (the ReLU-friendly variant of Xavier/Glorot initialization, which uses \(\sqrt{1/n_{in}}\))

  • Softmax output: for multi-class classification

  • Cross-entropy loss: \(L = -\sum_c y_c \log \hat{y}_c\)

# Exercise 3.1: Feedforward Neural Network with full backpropagation

class FeedforwardNN:
    """Multi-layer feedforward network with a softmax output.

    Hidden layers use the configured activation; the output layer applies
    softmax, and training minimizes cross-entropy with mini-batch SGD.
    """

    def __init__(self, layer_sizes: List[int], activation: str = "relu"):
        """
        Args:
            layer_sizes: [input_dim, hidden1, hidden2, ..., output_dim]
            activation: hidden-layer nonlinearity ("relu", "sigmoid",
                "tanh"; anything else falls back to linear).
        """
        self.weights = []
        self.biases = []
        self.activation = activation
        self.layer_sizes = layer_sizes

        # He initialization: scale by sqrt(2 / n_in) to keep activation
        # variance stable for ReLU-style layers (classic Xavier would use
        # sqrt(1 / n_in)).
        for i in range(len(layer_sizes) - 1):
            scale = np.sqrt(2.0 / layer_sizes[i])
            W = np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * scale
            b = np.zeros(layer_sizes[i + 1])
            self.weights.append(W)
            self.biases.append(b)

    def _activate(self, z):
        """Apply the hidden-layer activation elementwise."""
        if self.activation == "relu":
            return np.maximum(0, z)
        elif self.activation == "sigmoid":
            # Clip pre-activations so exp() cannot overflow.
            return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
        elif self.activation == "tanh":
            return np.tanh(z)
        return z  # linear fallback

    def _activate_derivative(self, z):
        """Derivative of the activation w.r.t. its pre-activation z."""
        if self.activation == "relu":
            return (z > 0).astype(float)
        elif self.activation == "sigmoid":
            s = self._activate(z)
            return s * (1 - s)
        elif self.activation == "tanh":
            return 1 - np.tanh(z) ** 2
        return np.ones_like(z)

    def _softmax(self, z):
        """Numerically stable softmax along the last axis."""
        # Subtracting the row max leaves the result unchanged but keeps
        # exp() from overflowing.
        z_shifted = z - np.max(z, axis=-1, keepdims=True)
        exp_z = np.exp(z_shifted)
        return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

    def forward(self, X: np.ndarray):
        """Forward pass.

        Returns:
            (output, activations, pre_activations): activations[0] is X,
            activations[-1] is the softmax output, and pre_activations
            holds the matching z values needed by backward().
        """
        activations = [X]
        pre_activations = []
        h = X

        # Hidden layers
        for i in range(len(self.weights) - 1):
            z = h @ self.weights[i] + self.biases[i]
            pre_activations.append(z)
            h = self._activate(z)
            activations.append(h)

        # Output layer (softmax)
        z_out = h @ self.weights[-1] + self.biases[-1]
        pre_activations.append(z_out)
        output = self._softmax(z_out)
        activations.append(output)

        return output, activations, pre_activations

    def backward(self, y_true_onehot, activations, pre_activations):
        """Backpropagation. Returns (weight_grads, bias_grads).

        Gradients are averaged over the batch.
        """
        N = y_true_onehot.shape[0]
        n_layers = len(self.weights)
        weight_grads = [None] * n_layers
        bias_grads = [None] * n_layers

        # Softmax + cross-entropy combine to the simple dL/dz = y_hat - y.
        delta = activations[-1] - y_true_onehot

        # Work backwards, chaining delta through each weight matrix and
        # activation derivative.
        for i in range(n_layers - 1, -1, -1):
            weight_grads[i] = activations[i].T @ delta / N
            bias_grads[i] = np.mean(delta, axis=0)
            if i > 0:
                delta = (delta @ self.weights[i].T) * self._activate_derivative(pre_activations[i - 1])

        return weight_grads, bias_grads

    def fit(self, X, y, lr=0.01, n_epochs=100, batch_size=32, verbose=True):
        """Train using mini-batch gradient descent.

        Args:
            X: (N, input_dim) feature matrix.
            y: (N,) integer class labels in [0, output_dim).
            lr: SGD learning rate.
        Returns:
            List of average cross-entropy losses, one per epoch.
        """
        # One-hot width comes from the network's output layer so that a
        # class absent from y cannot shift the encoding (np.unique would
        # under-count classes in that case).
        n_classes = self.layer_sizes[-1]
        y_onehot = np.eye(n_classes)[y]
        losses = []
        # ~5 progress reports; max(1, ...) avoids ZeroDivisionError when
        # n_epochs < 5.
        report_every = max(1, n_epochs // 5)

        for epoch in range(n_epochs):
            # Fresh shuffle each epoch
            indices = np.random.permutation(len(X))
            X_shuffled = X[indices]
            y_shuffled = y_onehot[indices]
            epoch_loss = 0.0
            n_batches = 0

            for start in range(0, len(X), batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                # Forward
                output, activations, pre_activations = self.forward(X_batch)

                # Cross-entropy loss; eps guards log(0).
                eps = 1e-8
                loss = -np.mean(np.sum(y_batch * np.log(output + eps), axis=1))
                epoch_loss += loss
                n_batches += 1

                # Backward + plain SGD update
                w_grads, b_grads = self.backward(y_batch, activations, pre_activations)
                for j in range(len(self.weights)):
                    self.weights[j] -= lr * w_grads[j]
                    self.biases[j] -= lr * b_grads[j]

            avg_loss = epoch_loss / n_batches
            losses.append(avg_loss)

            if verbose and (epoch + 1) % report_every == 0:
                acc = np.mean(self.predict(X) == y)
                print(f"  Epoch {epoch+1:4d}: loss={avg_loss:.4f}, accuracy={acc:.4f}")

        return losses

    def predict(self, X):
        """Predict class labels (argmax of the softmax output)."""
        output, _, _ = self.forward(X)
        return np.argmax(output, axis=1)

print("FeedforwardNN class defined.")
# Exercise 3.2: Train on a circular decision boundary (non-linear)

np.random.seed(42)

# Points inside radius sqrt(1.5) are class 1, outside are class 0 --
# a boundary no linear classifier can represent.
N = 300
X_train = np.random.randn(N, 2)
y_train = ((X_train[:, 0] ** 2 + X_train[:, 1] ** 2) < 1.5).astype(int)

net = FeedforwardNN([2, 16, 8, 2], activation="relu")
print("Training feedforward network on circular data:")
losses = net.fit(X_train, y_train, lr=0.1, n_epochs=200, batch_size=32)

preds = net.predict(X_train)
print(f"\nFinal train accuracy: {np.mean(preds == y_train):.4f}")

# Left panel: loss curve. Right panel: decision regions over a dense grid.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(losses)
axes[0].set_title('Training Loss')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Cross-Entropy Loss')
axes[0].grid(True, alpha=0.3)

gx, gy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
mesh_points = np.column_stack([gx.ravel(), gy.ravel()])
region = net.predict(mesh_points).reshape(gx.shape)
axes[1].contourf(gx, gy, region, alpha=0.3, cmap='RdBu')
axes[1].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='RdBu', edgecolors='black', s=20)
axes[1].set_title('Learned Decision Boundary')
axes[1].set_xlabel('x1')
axes[1].set_ylabel('x2')

plt.tight_layout()
plt.show()

Part 4: Backpropagation in Detail

Reference: SLP Ch 6.5

Backpropagation computes gradients using the chain rule, working backwards from the loss.

For a layer with \(\mathbf{z} = \mathbf{W}\mathbf{h} + \mathbf{b}\) and \(\mathbf{h}' = f(\mathbf{z})\):

\[\frac{\partial L}{\partial \mathbf{W}} = \frac{\partial L}{\partial \mathbf{z}} \cdot \mathbf{h}^T\]
\[\frac{\partial L}{\partial \mathbf{h}} = \mathbf{W}^T \cdot \frac{\partial L}{\partial \mathbf{z}}\]

The computation graph makes this systematic: each node knows its local gradient, and we chain them together.

# Exercise 4.1: Verify backprop with numerical gradients

def numerical_gradient(net, X, y_onehot, layer_idx, param_type='weight', eps=1e-5):
    """Central-difference gradient of the cross-entropy loss with respect
    to one parameter array of *net*, for checking analytic backprop."""
    param = net.weights[layer_idx] if param_type == 'weight' else net.biases[layer_idx]

    def current_loss():
        # Cross-entropy of net's softmax output against y_onehot.
        out, _, _ = net.forward(X)
        return -np.mean(np.sum(y_onehot * np.log(out + 1e-8), axis=1))

    grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        saved = param[idx]

        # Perturb one entry up, then down, measuring the loss each time.
        param[idx] = saved + eps
        loss_plus = current_loss()
        param[idx] = saved - eps
        loss_minus = current_loss()

        # Central difference, then restore the original value in place.
        grad[idx] = (loss_plus - loss_minus) / (2 * eps)
        param[idx] = saved
        it.iternext()

    return grad


# Small network for gradient checking
np.random.seed(42)
net_check = FeedforwardNN([2, 3, 2], activation="sigmoid")
X_check = np.random.randn(4, 2)
y_check = np.array([0, 1, 1, 0])
y_onehot_check = np.eye(2)[y_check]

# Backprop (analytic) gradients
output, activations, pre_activations = net_check.forward(X_check)
w_grads, b_grads = net_check.backward(y_onehot_check, activations, pre_activations)

# Compare against central-difference gradients, layer by layer
for layer in range(len(net_check.weights)):
    num_grad = numerical_gradient(net_check, X_check, y_onehot_check, layer, 'weight')
    ana_grad = w_grads[layer]
    diff = np.max(np.abs(num_grad - ana_grad))
    denom = np.linalg.norm(num_grad) + np.linalg.norm(ana_grad) + 1e-8
    rel_diff = np.linalg.norm(num_grad - ana_grad) / denom
    status = 'OK' if rel_diff < 1e-5 else 'FAIL'
    print(f"Layer {layer} weights: max_diff={diff:.2e}, relative_diff={rel_diff:.2e} {status}")
# Exercise 4.2: Computation graph example
# f(x, y, z) = (x + y) * z
# Show forward and backward passes step by step

x, y, z = 2.0, 3.0, -4.0

# Forward pass: two nodes -- an add, then a multiply
q = x + y        # intermediate sum: 5
f = q * z        # final value: -20

print("Computation graph: f(x, y, z) = (x + y) * z")
print(f"\nForward pass:")
print(f"  x={x}, y={y}, z={z}")
print(f"  q = x + y = {q}")
print(f"  f = q * z = {f}")

# Backward pass: chain rule, one node at a time
df_df = 1.0          # gradient of the output w.r.t. itself
df_dq = z            # multiply node routes z to its q input
df_dz = q            # ... and q to its z input
df_dx = df_dq * 1.0  # add node passes gradients through unchanged
df_dy = df_dq * 1.0

print(f"\nBackward pass:")
print(f"  df/df = {df_df}")
print(f"  df/dq = z = {df_dq}")
print(f"  df/dz = q = {df_dz}")
print(f"  df/dx = df/dq * dq/dx = {df_dq} * 1 = {df_dx}")
print(f"  df/dy = df/dq * dq/dy = {df_dq} * 1 = {df_dy}")

# Sanity-check each analytic gradient with central differences
eps = 1e-5
num_dx = ((x + eps + y) * z - (x - eps + y) * z) / (2 * eps)
num_dy = ((x + y + eps) * z - (x + y - eps) * z) / (2 * eps)
num_dz = ((x + y) * (z + eps) - (x + y) * (z - eps)) / (2 * eps)
print(f"\nNumerical verification:")
print(f"  df/dx = {num_dx:.4f} (analytic: {df_dx})")
print(f"  df/dy = {num_dy:.4f} (analytic: {df_dy})")
print(f"  df/dz = {num_dz:.4f} (analytic: {df_dz})")

Part 5: Optimizers

Reference: SLP Ch 6.6

SGD with Momentum

\[\mathbf{v}_t = \mu \mathbf{v}_{t-1} - \eta \nabla L\]
\[\boldsymbol{\theta}_t = \boldsymbol{\theta}_{t-1} + \mathbf{v}_t\]

Momentum helps accelerate SGD in the relevant direction and dampens oscillations.

Adam (Adaptive Moment Estimation)

\[\mathbf{m}_t = \beta_1 \mathbf{m}_{t-1} + (1 - \beta_1) \nabla L\]
\[\mathbf{v}_t = \beta_2 \mathbf{v}_{t-1} + (1 - \beta_2) (\nabla L)^2\]
\[\hat{\mathbf{m}}_t = \frac{\mathbf{m}_t}{1 - \beta_1^t}, \quad \hat{\mathbf{v}}_t = \frac{\mathbf{v}_t}{1 - \beta_2^t}\]
\[\boldsymbol{\theta}_t = \boldsymbol{\theta}_{t-1} - \eta \frac{\hat{\mathbf{m}}_t}{\sqrt{\hat{\mathbf{v}}_t} + \epsilon}\]

Adam adapts the learning rate per-parameter using running averages of gradients and squared gradients.

# Exercise 5.1: SGD with Momentum

def sgd_with_momentum(params, grads, velocities, lr=0.01, momentum=0.9):
    """SGD with momentum update.

    For each parameter: v <- momentum * v - lr * g, then p <- p + v.
    Returns (updated params, updated velocities) as new lists; the
    inputs are not mutated.
    """
    updated_params, updated_velocities = [], []
    for param, grad, vel in zip(params, grads, velocities):
        vel = momentum * vel - lr * grad
        updated_velocities.append(vel)
        updated_params.append(param + vel)
    return updated_params, updated_velocities


# Demo: minimize Rosenbrock-like function f(x,y) = (1-x)^2 + 100*(y-x^2)^2
def rosenbrock_grad(xy):
    """Analytic gradient of the Rosenbrock function at xy = (x, y)."""
    x, y = xy
    # d/dx [(1-x)^2 + 100(y-x^2)^2] and d/dy of the same.
    dx = -2 * (1 - x) - 400 * x * (y - x ** 2)
    dy = 200 * (y - x ** 2)
    return np.array([dx, dy])

# SGD without momentum
np.random.seed(42)
pos_sgd = np.array([-1.0, 1.0])
path_sgd = [pos_sgd.copy()]
for _ in range(500):
    pos_sgd = pos_sgd - 0.001 * rosenbrock_grad(pos_sgd)
    path_sgd.append(pos_sgd.copy())
path_sgd = np.array(path_sgd)

# SGD with momentum, same start point and step size
pos_mom = np.array([-1.0, 1.0])
vel = np.zeros(2)
path_mom = [pos_mom.copy()]
for _ in range(500):
    g = rosenbrock_grad(pos_mom)
    [pos_mom], [vel] = sgd_with_momentum([pos_mom], [g], [vel], lr=0.001, momentum=0.9)
    path_mom.append(pos_mom.copy())
path_mom = np.array(path_mom)

print(f"SGD final position:      ({path_sgd[-1, 0]:.4f}, {path_sgd[-1, 1]:.4f})")
print(f"Momentum final position: ({path_mom[-1, 0]:.4f}, {path_mom[-1, 1]:.4f})")
print(f"Optimal: (1.0, 1.0)")
# Exercise 5.2: Adam Optimizer

class Adam:
    """Adam optimizer: per-parameter adaptive steps from exponential
    moving averages of the gradient and its elementwise square."""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None   # EMA of gradients (first moment)
        self.v = None   # EMA of squared gradients (second moment)
        self.t = 0      # number of steps taken, used for bias correction

    def step(self, params: List[np.ndarray], grads: List[np.ndarray]) -> List[np.ndarray]:
        """One Adam update step."""
        # Lazily allocate moment buffers matching the parameter shapes.
        if self.m is None:
            self.m = [np.zeros_like(p) for p in params]
            self.v = [np.zeros_like(p) for p in params]

        self.t += 1
        # The moments start at zero, so early estimates are biased low;
        # these factors rescale them.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        updated = []
        for i, (p, g) in enumerate(zip(params, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * g
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * g ** 2
            m_hat = self.m[i] / correction1
            v_hat = self.v[i] / correction2
            updated.append(p - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon))

        return updated


# Demo: Adam on the same function
optimizer = Adam(lr=0.01)
pos_adam = np.array([-1.0, 1.0])
path_adam = [pos_adam.copy()]
for _ in range(500):
    [pos_adam] = optimizer.step([pos_adam], [rosenbrock_grad(pos_adam)])
    path_adam.append(pos_adam.copy())
path_adam = np.array(path_adam)

print(f"Adam final position: ({path_adam[-1, 0]:.4f}, {path_adam[-1, 1]:.4f})")
print(f"Optimal: (1.0, 1.0)")
# Exercise 5.3: Compare optimizers visually

fig, ax = plt.subplots(1, 1, figsize=(10, 8))

# Log-scaled contours of the Rosenbrock surface as the backdrop
xx, yy = np.meshgrid(np.linspace(-2, 2, 200), np.linspace(-1, 3, 200))
zz = (1 - xx)**2 + 100 * (yy - xx**2)**2
ax.contour(xx, yy, np.log10(zz + 1), levels=30, cmap='viridis', alpha=0.5)

# One trajectory per optimizer
for path, fmt, label in [
    (path_sgd, 'r.-', 'SGD'),
    (path_mom, 'b.-', 'SGD + Momentum'),
    (path_adam, 'g.-', 'Adam'),
]:
    ax.plot(path[:, 0], path[:, 1], fmt, alpha=0.5, markersize=2, label=label)

# Mark start and optimum
ax.plot(-1, 1, 'ko', markersize=10, label='Start')
ax.plot(1, 1, 'r*', markersize=15, label='Optimum')

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Optimizer Comparison on Rosenbrock Function')
ax.legend()
ax.set_xlim(-2, 2)
ax.set_ylim(-1, 3)
plt.tight_layout()
plt.show()
# Exercise 5.4: Train FeedforwardNN with Adam

class FeedforwardNNAdam(FeedforwardNN):
    """FeedforwardNN trained with the Adam optimizer instead of plain SGD."""

    def fit_adam(self, X, y, lr=0.001, n_epochs=100, batch_size=32, verbose=True):
        """Train using mini-batch gradient descent with Adam updates.

        Same data contract as FeedforwardNN.fit: X is (N, input_dim),
        y is (N,) integer labels. Returns per-epoch average losses.
        """
        # One-hot against the output layer width so absent labels cannot
        # shift the encoding.
        n_classes = self.layer_sizes[-1]
        y_onehot = np.eye(n_classes)[y]
        losses = []

        # One Adam instance shared by every weight and bias array; its
        # moment buffers are allocated lazily on the first step.
        adam = Adam(lr=lr)
        # ~5 progress reports; guard against n_epochs < 5.
        report_every = max(1, n_epochs // 5)

        for epoch in range(n_epochs):
            indices = np.random.permutation(len(X))
            X_shuffled = X[indices]
            y_shuffled = y_onehot[indices]
            epoch_loss = 0.0
            n_batches = 0

            for start in range(0, len(X), batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                output, activations, pre_activations = self.forward(X_batch)
                loss = -np.mean(np.sum(y_batch * np.log(output + 1e-8), axis=1))
                epoch_loss += loss
                n_batches += 1

                w_grads, b_grads = self.backward(y_batch, activations, pre_activations)
                # Flatten params/grads into parallel lists for the optimizer,
                # then split the updated list back into weights and biases.
                updated = adam.step(self.weights + self.biases, w_grads + b_grads)
                n_w = len(self.weights)
                self.weights = updated[:n_w]
                self.biases = updated[n_w:]

            avg_loss = epoch_loss / n_batches
            losses.append(avg_loss)

            if verbose and (epoch + 1) % report_every == 0:
                acc = np.mean(self.predict(X) == y)
                print(f"  Epoch {epoch+1:4d}: loss={avg_loss:.4f}, accuracy={acc:.4f}")

        return losses


# Compare SGD vs Adam on circular data
np.random.seed(42)
N = 300
X_data = np.random.randn(N, 2)
y_data = ((X_data[:, 0] ** 2 + X_data[:, 1] ** 2) < 1.5).astype(int)

# SGD -- reseed before construction so both networks start from identical
# weights. (Previously the net was built once, then re-initialized by
# calling __init__ directly after reseeding; constructing it once here is
# equivalent and avoids the dead first initialization.)
np.random.seed(42)
net_sgd = FeedforwardNN([2, 16, 8, 2], activation="relu")
print("SGD training:")
losses_sgd = net_sgd.fit(X_data, y_data, lr=0.1, n_epochs=100, batch_size=32)

# Adam, from the same seed and architecture
np.random.seed(42)
net_adam = FeedforwardNNAdam([2, 16, 8, 2], activation="relu")
print("\nAdam training:")
losses_adam = net_adam.fit_adam(X_data, y_data, lr=0.01, n_epochs=100, batch_size=32)

plt.figure(figsize=(8, 4))
plt.plot(losses_sgd, label='SGD (lr=0.1)')
plt.plot(losses_adam, label='Adam (lr=0.01)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('SGD vs Adam Training Curves')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

Part 6: Neural Networks for NLP Classification

Reference: SLP Ch 6.4

For text classification with neural networks:

  1. Represent text as a vector (e.g., bag-of-words, TF-IDF)

  2. Feed through hidden layers with non-linear activations

  3. Output layer with softmax for class probabilities

This is a simple but effective approach – BERT and other transformers follow the same principle at a much larger scale.

# Exercise 6.1: Bag-of-words text classification

def build_vocab(texts):
    """Build vocabulary from a list of texts.

    Words are lowercased and whitespace-split; each new word receives the
    next integer id, so ids reflect first-appearance order.
    """
    vocab = {}
    for text in texts:
        for token in text.lower().split():
            # setdefault only inserts when the token is unseen.
            vocab.setdefault(token, len(vocab))
    return vocab

def text_to_bow(text, vocab):
    """Convert text to a bag-of-words count vector over *vocab*.

    Tokens missing from the vocabulary are silently ignored.
    """
    counts = np.zeros(len(vocab))
    for token in text.lower().split():
        idx = vocab.get(token)
        if idx is not None:
            counts[idx] += 1
    return counts


# Training data: simple sentiment classification (1 = positive, 0 = negative)
train_data = [
    ("I love this movie", 1),
    ("This film is great", 1),
    ("Wonderful acting and story", 1),
    ("Best movie ever", 1),
    ("Amazing performance", 1),
    ("Really enjoyed this film", 1),
    ("I hate this movie", 0),
    ("This film is terrible", 0),
    ("Awful acting and story", 0),
    ("Worst movie ever", 0),
    ("Terrible performance", 0),
    ("Really hated this film", 0),
]

test_data = [
    ("I love this great film", 1),
    ("This is the worst movie", 0),
    ("Amazing movie and great acting", 1),
    ("Awful and terrible", 0),
]

# Vocabulary comes from the training texts only
texts = [t for t, _ in train_data]
vocab = build_vocab(texts)
print(f"Vocabulary size: {len(vocab)}")
print(f"Vocab: {vocab}\n")

# Bag-of-words features and integer labels
X_train_text = np.array([text_to_bow(t, vocab) for t, _ in train_data])
y_train_text = np.array([label for _, label in train_data])

X_test_text = np.array([text_to_bow(t, vocab) for t, _ in test_data])
y_test_text = np.array([label for _, label in test_data])

# Train a small feedforward classifier on the BoW vectors
np.random.seed(42)
text_net = FeedforwardNN([len(vocab), 8, 2], activation="relu")
print("Training text classifier:")
losses = text_net.fit(X_train_text, y_train_text, lr=0.1, n_epochs=200, batch_size=4)

# Evaluate on train and held-out test sentences
train_preds = text_net.predict(X_train_text)
test_preds = text_net.predict(X_test_text)
print(f"\nTrain accuracy: {np.mean(train_preds == y_train_text):.2%}")
print(f"Test accuracy:  {np.mean(test_preds == y_test_text):.2%}")

print("\nTest predictions:")
for (text, label), pred in zip(test_data, test_preds):
    sentiment = "Positive" if pred == 1 else "Negative"
    verdict = "correct" if pred == label else "WRONG"
    print(f"  '{text}' -> {sentiment} ({verdict})")
# Exercise 6.2: Multi-class classification with spiral data

np.random.seed(42)

# Generate 3-class spiral dataset: one noisy spiral arm per class.
N_per_class = 100
n_classes = 3
X_spiral = np.zeros((N_per_class * n_classes, 2))
y_spiral = np.zeros(N_per_class * n_classes, dtype=int)

for k in range(n_classes):
    # Row indices owned by class k.
    ix = range(N_per_class * k, N_per_class * (k + 1))
    # Radius grows linearly along the arm; the angle window [4k, 4(k+1)]
    # rotates each class's arm, with Gaussian noise on the angle.
    r = np.linspace(0.0, 1, N_per_class)
    t = np.linspace(k * 4, (k + 1) * 4, N_per_class) + np.random.randn(N_per_class) * 0.2
    X_spiral[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
    y_spiral[ix] = k

# Train
net_spiral = FeedforwardNN([2, 64, 32, 3], activation="relu")
print("Training on 3-class spiral data:")
losses_spiral = net_spiral.fit(X_spiral, y_spiral, lr=0.5, n_epochs=500, batch_size=32)

acc = np.mean(net_spiral.predict(X_spiral) == y_spiral)
print(f"\nFinal accuracy: {acc:.2%}")

# Visualize: loss curve (left) and learned decision regions (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(losses_spiral)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Spiral Training Loss')
axes[0].grid(True, alpha=0.3)

# Dense grid over the data range, colored by the predicted class.
xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 200), np.linspace(-1.5, 1.5, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
Z = net_spiral.predict(grid).reshape(xx.shape)
axes[1].contourf(xx, yy, Z, alpha=0.3, cmap='Set1')
scatter = axes[1].scatter(X_spiral[:, 0], X_spiral[:, 1], c=y_spiral, cmap='Set1',
                          edgecolors='black', s=15)
axes[1].set_title(f'3-Class Spiral (accuracy: {acc:.1%})')
axes[1].set_xlabel('x1')
axes[1].set_ylabel('x2')

plt.tight_layout()
plt.show()

Part 7: Regularization and Dropout

Reference: SLP Ch 6.7

Neural networks can easily overfit. Common regularization techniques:

  • L2 Regularization (Weight Decay): Add \(\lambda \|\mathbf{w}\|^2\) to the loss

  • Dropout (inverted, as implemented below): During training, randomly zero out neurons with probability \(p\) and scale the survivors by \(1/(1-p)\) so the expected activation is unchanged. At test time, activations pass through unmodified.

  • Early Stopping: Stop training when validation loss stops improving

# Exercise 7.1: Implement dropout

def dropout(h, p=0.5, training=True):
    """Inverted dropout.

    While training, each unit is zeroed with probability p and the
    survivors are scaled by 1/(1-p), so the expected activation matches
    the input. At test time (or when p == 0) the input passes through
    unchanged.
    """
    if not training or p == 0:
        return h
    keep_mask = (np.random.rand(*h.shape) > p).astype(float)
    return h * keep_mask / (1 - p)


# Demonstrate dropout: the 1/(1-p) rescaling keeps the mean stable
np.random.seed(42)
h = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0])

print(f"Original activations: {h}")
print(f"Mean: {h.mean():.2f}\n")

for p in [0.1, 0.3, 0.5, 0.7]:
    # Average the post-dropout mean over many independent masks
    means = [dropout(h, p=p, training=True).mean() for _ in range(1000)]
    print(f"Dropout p={p}: avg mean = {np.mean(means):.2f} (expected ~{h.mean():.2f} due to scaling)")

print(f"\nTest time (no dropout): mean = {dropout(h, p=0.5, training=False).mean():.2f}")
# Exercise 7.2: Demonstrate overfitting and regularization

np.random.seed(42)

# Small dataset that's easy to overfit
N_small = 30
X_small = np.random.randn(N_small, 2)
y_small = ((X_small[:, 0] ** 2 + X_small[:, 1] ** 2) < 1.0).astype(int)

# Large test set drawn from the same distribution
X_test_large = np.random.randn(500, 2)
y_test_large = ((X_test_large[:, 0] ** 2 + X_test_large[:, 1] ** 2) < 1.0).astype(int)

def _accuracy(model, X, y):
    # Fraction of points classified correctly.
    return np.mean(model.predict(X) == y)

# Train a big network (easy to overfit)
np.random.seed(42)
net_overfit = FeedforwardNN([2, 32, 32, 2], activation="relu")
print("Training large network on small dataset (no regularization):")
losses_overfit = net_overfit.fit(X_small, y_small, lr=0.1, n_epochs=500, batch_size=8, verbose=True)

train_acc = _accuracy(net_overfit, X_small, y_small)
test_acc = _accuracy(net_overfit, X_test_large, y_test_large)
print(f"\nTrain accuracy: {train_acc:.2%}")
print(f"Test accuracy:  {test_acc:.2%}")
print(f"Gap: {train_acc - test_acc:.2%} (overfitting!)")

# Smaller network (implicit regularization)
np.random.seed(42)
net_small = FeedforwardNN([2, 4, 2], activation="relu")
print("\nTraining small network (implicit regularization):")
losses_small = net_small.fit(X_small, y_small, lr=0.1, n_epochs=500, batch_size=8, verbose=True)

train_acc_s = _accuracy(net_small, X_small, y_small)
test_acc_s = _accuracy(net_small, X_test_large, y_test_large)
print(f"\nTrain accuracy: {train_acc_s:.2%}")
print(f"Test accuracy:  {test_acc_s:.2%}")
print(f"Gap: {train_acc_s - test_acc_s:.2%}")

Summary

In this lab we built neural networks from scratch:

  1. Single neurons can compute linear functions like AND, OR, NOT

  2. XOR requires hidden layers – it’s not linearly separable

  3. Feedforward networks stack layers with non-linear activations to learn complex decision boundaries

  4. Backpropagation computes gradients efficiently using the chain rule

  5. Optimizers (SGD, Momentum, Adam) determine how we update weights

  6. Neural networks for NLP use text representations (BoW) as input features

  7. Regularization (dropout, smaller networks, early stopping) prevents overfitting

These foundations underpin all modern deep learning, from CNNs to Transformers.