Lab 04: Neural Networks from Scratch
Source: Speech & Language Processing (Jurafsky/Martin) - Chapter 6: Neural Networks
PDF: resources/ed3book_jan26.pdf
Topics Covered
Neural network units (neurons) and activation functions
Logic gates as neurons (AND, OR, NOT)
The XOR problem and why it needs hidden layers
Feedforward neural networks
Backpropagation and gradient descent
SGD with momentum and Adam optimizer
Text classification with neural networks
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple
np.random.seed(42)
Part 1: Neural Network Units
Reference: SLP Ch 6.1
A single neuron computes:
\[ y = f(\mathbf{w} \cdot \mathbf{x} + b) \]
where \(f\) is an activation function (sigmoid, tanh, ReLU, etc.).
Common activation functions:
Sigmoid: \(\sigma(z) = \frac{1}{1 + e^{-z}}\) — output in (0, 1)
Tanh: \(\tanh(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}}\) — output in (-1, 1)
ReLU: \(\text{ReLU}(z) = \max(0, z)\) — most widely used in deep networks
# Exercise 1.1: Implement a single neuron
class Neuron:
    """A single neural-network unit computing y = f(w . x + b).

    Weights start as small random normals; the bias starts at zero.
    Supported activations: "sigmoid", "tanh", "relu"; any other name
    falls back to the identity (linear unit).
    """

    def __init__(self, n_inputs: int, activation: str = "sigmoid"):
        self.weights = np.random.randn(n_inputs) * 0.1
        self.bias = 0.0
        self.activation = activation

    def _activate(self, z):
        """Apply the configured nonlinearity to the pre-activation z."""
        if self.activation == "sigmoid":
            # Clip so exp() cannot overflow for extreme |z|.
            return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
        if self.activation == "tanh":
            return np.tanh(z)
        if self.activation == "relu":
            return np.maximum(0, z)
        return z  # linear / identity

    def forward(self, x: np.ndarray) -> float:
        """Compute output: f(w . x + b)"""
        return self._activate(np.dot(self.weights, x) + self.bias)
# Test the neuron
# Smoke test: evaluate a freshly initialized 3-input sigmoid neuron on a
# fixed input and print its randomly drawn parameters and output.
neuron = Neuron(3, activation="sigmoid")
x = np.array([1.0, 0.5, -0.3])
output = neuron.forward(x)
print(f"Neuron with 3 inputs:")
print(f" Weights: {neuron.weights.round(4)}")
print(f" Bias: {neuron.bias}")
print(f" Input: {x}")
print(f" Output: {output:.4f}")
# Exercise 1.2: Visualize activation functions
# Evaluate sigmoid / tanh / ReLU on a shared grid and plot them side by side.
z = np.linspace(-5, 5, 200)
sigmoid = 1.0 / (1.0 + np.exp(-z))
tanh = np.tanh(z)
relu = np.maximum(0, z)
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
# Sigmoid: saturates at 0 and 1; dashed guides mark the midpoint (0, 0.5).
axes[0].plot(z, sigmoid, 'b-', linewidth=2)
axes[0].set_title('Sigmoid')
axes[0].axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
axes[0].axvline(x=0, color='gray', linestyle='--', alpha=0.5)
axes[0].set_ylabel('f(z)')
axes[0].grid(True, alpha=0.3)
# Tanh: zero-centered, saturates at -1 and 1.
axes[1].plot(z, tanh, 'r-', linewidth=2)
axes[1].set_title('Tanh')
axes[1].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
axes[1].axvline(x=0, color='gray', linestyle='--', alpha=0.5)
axes[1].grid(True, alpha=0.3)
# ReLU: identity for positive z, zero otherwise.
axes[2].plot(z, relu, 'g-', linewidth=2)
axes[2].set_title('ReLU')
axes[2].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
axes[2].axvline(x=0, color='gray', linestyle='--', alpha=0.5)
axes[2].grid(True, alpha=0.3)
for ax in axes:
    ax.set_xlabel('z')
plt.tight_layout()
plt.show()
# Exercise 1.3: Implement logic gates as neurons
# A single sigmoid neuron can compute AND, OR, NOT
def logic_gate_neurons():
    """Show AND, OR, and NOT computed by single sigmoid neurons with hand-set weights."""
    pairs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    singles = np.array([[0], [1]])

    def make_gate(n_inputs, weights, bias):
        # Helper: a sigmoid neuron with fixed (non-learned) parameters.
        gate = Neuron(n_inputs, "sigmoid")
        gate.weights = np.array(weights)
        gate.bias = bias
        return gate

    # Large weight magnitudes push the sigmoid deep into 0/1 saturation.
    and_neuron = make_gate(2, [10.0, 10.0], -15.0)  # fires only when both inputs are high
    or_neuron = make_gate(2, [10.0, 10.0], -5.0)    # fires when either input is high
    not_neuron = make_gate(1, [-10.0], 5.0)         # inverts its single input

    print("AND gate:")
    for x in pairs:
        y = and_neuron.forward(x)
        print(f" {x[0]} AND {x[1]} = {y:.4f} (expected: {int(x[0]) & int(x[1])})")
    print("\nOR gate:")
    for x in pairs:
        y = or_neuron.forward(x)
        print(f" {x[0]} OR {x[1]} = {y:.4f} (expected: {int(x[0]) | int(x[1])})")
    print("\nNOT gate:")
    for x in singles:
        y = not_neuron.forward(x)
        print(f" NOT {x[0]} = {y:.4f} (expected: {1 - int(x[0])})")

logic_gate_neurons()
Part 2: The XOR Problem
Reference: SLP Ch 6.2
XOR (exclusive OR) returns 1 when exactly one input is 1:
| x1 | x2 | XOR |
|----|----|-----|
| 0  | 0  | 0   |
| 0  | 1  | 1   |
| 1  | 0  | 1   |
| 1  | 1  | 0   |
Key insight: XOR is not linearly separable — no single line can separate the 0s from the 1s. This is why we need hidden layers.
Solution: \(\text{XOR}(x_1, x_2) = \text{AND}(\text{OR}(x_1, x_2), \text{NAND}(x_1, x_2))\)
# Exercise 2.1: Show that XOR is not linearly separable
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])
# Try many random single neurons -- none can solve XOR
# A single sigmoid unit thresholds a linear function of the inputs, so it is a
# linear classifier: no random draw of (w, b) can exceed 75% (3 of 4 points).
best_acc = 0
for trial in range(10000):
    w = np.random.randn(2) * 5
    b = np.random.randn() * 5
    preds = (1.0 / (1.0 + np.exp(-(X_xor @ w + b))) > 0.5).astype(int)
    acc = np.mean(preds == y_xor)
    best_acc = max(best_acc, acc)
print(f"Best accuracy from 10,000 random single neurons: {best_acc:.2%}")
print("A single neuron CANNOT solve XOR (max 75% = gets 3 of 4 right)")
# Visualize
# Scatter the four XOR points; no straight line separates red (0) from blue (1).
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
colors = ['red' if y == 0 else 'blue' for y in y_xor]
ax.scatter(X_xor[:, 0], X_xor[:, 1], c=colors, s=200, edgecolors='black', zorder=5)
ax.set_title('XOR: Not Linearly Separable')
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_xlim(-0.5, 1.5)
ax.set_ylim(-0.5, 1.5)
ax.grid(True, alpha=0.3)
ax.legend(['Red=0, Blue=1'], loc='upper right')
plt.show()
# Exercise 2.2: Solve XOR with a 2-layer network
# XOR(x1, x2) = AND(OR(x1, x2), NAND(x1, x2))
def sigmoid(z):
    """Logistic function, used by every hand-wired gate below."""
    return 1.0 / (1.0 + np.exp(-z))

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
# Hidden layer: two saturated sigmoid gates computed over the raw inputs.
h1 = sigmoid(X @ np.array([10.0, 10.0]) - 5)     # OR gate
h2 = sigmoid(X @ np.array([-10.0, -10.0]) + 15)  # NAND gate
H = np.column_stack([h1, h2])
# Output layer: AND of the two hidden gates yields XOR.
y_pred = sigmoid(H @ np.array([10.0, 10.0]) - 15)  # AND gate
print("XOR with 2-layer network:")
print(f"{'x1':>3} {'x2':>3} | {'h1(OR)':>8} {'h2(NAND)':>10} | {'y_pred':>8} {'expected':>10}")
print("-" * 55)
for i in range(4):
    expected = int(X[i, 0]) ^ int(X[i, 1])
    print(f"{X[i,0]:3.0f} {X[i,1]:3.0f} | {h1[i]:8.4f} {h2[i]:10.4f} | {y_pred[i]:8.4f} {expected:10d}")
# Exercise 2.3: Learn XOR via gradient descent
np.random.seed(42)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y = np.array([[0], [1], [1], [0]], dtype=float)
# 2-layer network: 2 inputs -> 4 hidden -> 1 output
W1 = np.random.randn(2, 4) * 0.5
b1 = np.zeros((1, 4))
W2 = np.random.randn(4, 1) * 0.5
b2 = np.zeros((1, 1))
lr = 1.0
losses = []
# Full-batch gradient descent over all four XOR examples.
for epoch in range(5000):
    # Forward
    z1 = X @ W1 + b1
    h = 1.0 / (1.0 + np.exp(-z1))  # sigmoid
    z2 = h @ W2 + b2
    y_hat = 1.0 / (1.0 + np.exp(-z2))  # sigmoid
    # Loss: binary cross-entropy
    eps = 1e-8  # keeps log() finite when y_hat saturates at 0 or 1
    loss = -np.mean(y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps))
    losses.append(loss)
    # Backward
    # Sigmoid output + BCE: the output delta simplifies to (y_hat - y).
    dz2 = y_hat - y  # (4, 1)
    dW2 = h.T @ dz2 / 4  # (4, 1), averaged over the 4 examples
    db2 = np.mean(dz2, axis=0, keepdims=True)  # (1, 1)
    dh = dz2 @ W2.T  # (4, 4)
    dz1 = dh * h * (1 - h)  # sigmoid derivative
    dW1 = X.T @ dz1 / 4  # (2, 4)
    db1 = np.mean(dz1, axis=0, keepdims=True)  # (1, 4)
    # Update
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1
# Final predictions
z1 = X @ W1 + b1
h = 1.0 / (1.0 + np.exp(-z1))
z2 = h @ W2 + b2
y_hat = 1.0 / (1.0 + np.exp(-z2))
print("Learned XOR via gradient descent:")
for i in range(4):
    print(f" XOR({X[i,0]:.0f}, {X[i,1]:.0f}) = {y_hat[i,0]:.4f} (expected: {y[i,0]:.0f})")
plt.figure(figsize=(8, 4))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Binary Cross-Entropy Loss')
plt.title('XOR Learning Curve')
plt.grid(True, alpha=0.3)
plt.show()
Part 3: Feedforward Neural Network
Reference: SLP Ch 6.3
A feedforward (or multi-layer perceptron) network computes:
\[ \mathbf{h} = f(\mathbf{W}^{(1)}\mathbf{x} + \mathbf{b}^{(1)}), \qquad \hat{\mathbf{y}} = \text{softmax}(\mathbf{W}^{(2)}\mathbf{h} + \mathbf{b}^{(2)}) \]
Key components:
He initialization: scale weights by \(\sqrt{2/n_{in}}\) to keep variance stable (the ReLU-friendly variant used in the code; classic Xavier/Glorot uses \(\sqrt{1/n_{in}}\))
Softmax output: for multi-class classification
Cross-entropy loss: \(L = -\sum_c y_c \log \hat{y}_c\)
# Exercise 3.1: Feedforward Neural Network with full backpropagation
class FeedforwardNN:
    """Multi-layer feedforward network with a softmax output layer.

    Hidden layers use the configured activation; the output layer applies
    softmax, and training minimizes cross-entropy with mini-batch SGD.
    """

    def __init__(self, layer_sizes: List[int], activation: str = "relu"):
        """
        Args:
            layer_sizes: [input_dim, hidden1, hidden2, ..., output_dim]
            activation: hidden-layer nonlinearity ("relu", "sigmoid", "tanh").
        """
        self.weights = []
        self.biases = []
        self.activation = activation
        self.layer_sizes = layer_sizes
        # He initialization: std = sqrt(2 / fan_in), well suited to ReLU.
        # (The original comment called this "Xavier"; Xavier uses 1/fan_in.)
        for i in range(len(layer_sizes) - 1):
            scale = np.sqrt(2.0 / layer_sizes[i])
            W = np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * scale
            b = np.zeros(layer_sizes[i + 1])
            self.weights.append(W)
            self.biases.append(b)

    def _activate(self, z):
        """Hidden-layer nonlinearity; unknown names fall back to identity."""
        if self.activation == "relu":
            return np.maximum(0, z)
        elif self.activation == "sigmoid":
            # Clip so exp() cannot overflow for extreme |z|.
            return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
        elif self.activation == "tanh":
            return np.tanh(z)
        return z

    def _activate_derivative(self, z):
        """Elementwise derivative of the activation at pre-activation z."""
        if self.activation == "relu":
            return (z > 0).astype(float)
        elif self.activation == "sigmoid":
            s = self._activate(z)
            return s * (1 - s)
        elif self.activation == "tanh":
            return 1 - np.tanh(z) ** 2
        return np.ones_like(z)

    def _softmax(self, z):
        """Row-wise softmax, shifted by the row max for numerical stability."""
        z_shifted = z - np.max(z, axis=-1, keepdims=True)
        exp_z = np.exp(z_shifted)
        return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

    def forward(self, X: np.ndarray):
        """Forward pass. Returns (output, activations, pre_activations)."""
        activations = [X]
        pre_activations = []
        h = X
        # Hidden layers
        for i in range(len(self.weights) - 1):
            z = h @ self.weights[i] + self.biases[i]
            pre_activations.append(z)
            h = self._activate(z)
            activations.append(h)
        # Output layer (softmax, not the hidden activation)
        z_out = h @ self.weights[-1] + self.biases[-1]
        pre_activations.append(z_out)
        output = self._softmax(z_out)
        activations.append(output)
        return output, activations, pre_activations

    def backward(self, y_true_onehot, activations, pre_activations):
        """Backpropagation. Returns (weight_grads, bias_grads), batch-averaged."""
        N = y_true_onehot.shape[0]
        n_layers = len(self.weights)
        weight_grads = [None] * n_layers
        bias_grads = [None] * n_layers
        # Softmax + cross-entropy gives the simple output delta: y_hat - y.
        delta = activations[-1] - y_true_onehot
        # Work backwards through layers
        for i in range(n_layers - 1, -1, -1):
            weight_grads[i] = activations[i].T @ delta / N
            bias_grads[i] = np.mean(delta, axis=0)
            if i > 0:
                # Chain rule: push delta through W_i, then through the activation.
                delta = (delta @ self.weights[i].T) * self._activate_derivative(pre_activations[i - 1])
        return weight_grads, bias_grads

    def fit(self, X, y, lr=0.01, n_epochs=100, batch_size=32, verbose=True):
        """Train with mini-batch gradient descent; returns per-epoch mean losses.

        Args:
            X: (N, input_dim) feature matrix.
            y: (N,) integer class labels in [0, n_classes).
        """
        n_classes = len(np.unique(y))
        y_onehot = np.eye(n_classes)[y]
        losses = []
        # Fix: the original used `(epoch + 1) % (n_epochs // 5)`, which raises
        # ZeroDivisionError whenever n_epochs < 5.
        log_every = max(1, n_epochs // 5)
        for epoch in range(n_epochs):
            indices = np.random.permutation(len(X))
            X_shuffled = X[indices]
            y_shuffled = y_onehot[indices]
            epoch_loss = 0.0
            n_batches = 0
            for start in range(0, len(X), batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]
                # Forward
                output, activations, pre_activations = self.forward(X_batch)
                # Loss: cross-entropy; eps keeps log() finite when saturated.
                eps = 1e-8
                loss = -np.mean(np.sum(y_batch * np.log(output + eps), axis=1))
                epoch_loss += loss
                n_batches += 1
                # Backward
                w_grads, b_grads = self.backward(y_batch, activations, pre_activations)
                # Update
                for j in range(len(self.weights)):
                    self.weights[j] -= lr * w_grads[j]
                    self.biases[j] -= lr * b_grads[j]
            avg_loss = epoch_loss / n_batches
            losses.append(avg_loss)
            if verbose and (epoch + 1) % log_every == 0:
                acc = np.mean(self.predict(X) == y)
                print(f" Epoch {epoch+1:4d}: loss={avg_loss:.4f}, accuracy={acc:.4f}")
        return losses

    def predict(self, X):
        """Predict class labels (argmax of the softmax output)."""
        output, _, _ = self.forward(X)
        return np.argmax(output, axis=1)

print("FeedforwardNN class defined.")
# Exercise 3.2: Train on a circular decision boundary (non-linear)
np.random.seed(42)
# Generate circular data
# Label is 1 inside the circle of squared radius 1.5 -- not linearly separable.
N = 300
X_train = np.random.randn(N, 2)
y_train = ((X_train[:, 0] ** 2 + X_train[:, 1] ** 2) < 1.5).astype(int)
# Train
net = FeedforwardNN([2, 16, 8, 2], activation="relu")
print("Training feedforward network on circular data:")
losses = net.fit(X_train, y_train, lr=0.1, n_epochs=200, batch_size=32)
# Final accuracy
preds = net.predict(X_train)
print(f"\nFinal train accuracy: {np.mean(preds == y_train):.4f}")
# Plot loss curve and decision boundary
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(losses)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Cross-Entropy Loss')
axes[0].set_title('Training Loss')
axes[0].grid(True, alpha=0.3)
# Decision boundary
# Classify a dense grid and shade the plane by predicted class.
xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
grid = np.column_stack([xx.ravel(), yy.ravel()])
Z = net.predict(grid).reshape(xx.shape)
axes[1].contourf(xx, yy, Z, alpha=0.3, cmap='RdBu')
axes[1].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='RdBu', edgecolors='black', s=20)
axes[1].set_title('Learned Decision Boundary')
axes[1].set_xlabel('x1')
axes[1].set_ylabel('x2')
plt.tight_layout()
plt.show()
Part 4: Backpropagation in Detail
Reference: SLP Ch 6.5
Backpropagation computes gradients using the chain rule, working backwards from the loss.
For a layer with \(\mathbf{z} = \mathbf{W}\mathbf{h} + \mathbf{b}\) and \(\mathbf{h}' = f(\mathbf{z})\):
\[ \frac{\partial L}{\partial \mathbf{W}} = \frac{\partial L}{\partial \mathbf{z}}\,\mathbf{h}^\top, \qquad \frac{\partial L}{\partial \mathbf{h}} = \mathbf{W}^\top \frac{\partial L}{\partial \mathbf{z}} \]
The computation graph makes this systematic: each node knows its local gradient, and we chain them together.
# Exercise 4.1: Verify backprop with numerical gradients
def numerical_gradient(net, X, y_onehot, layer_idx, param_type='weight', eps=1e-5):
    """Central-difference gradient of the cross-entropy loss w.r.t. one parameter tensor.

    Perturbs each entry of the selected weight/bias array by +/- eps in place,
    re-runs the forward pass, and restores the original value.
    """
    param = net.weights[layer_idx] if param_type == 'weight' else net.biases[layer_idx]

    def ce_loss():
        # Cross-entropy of the network output under the current parameters.
        out, _, _ = net.forward(X)
        return -np.mean(np.sum(y_onehot * np.log(out + 1e-8), axis=1))

    grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        saved = param[idx]
        param[idx] = saved + eps       # f(theta + eps)
        loss_plus = ce_loss()
        param[idx] = saved - eps       # f(theta - eps)
        loss_minus = ce_loss()
        grad[idx] = (loss_plus - loss_minus) / (2 * eps)
        param[idx] = saved             # restore the original entry
        it.iternext()
    return grad
# Small network for gradient checking
np.random.seed(42)
net_check = FeedforwardNN([2, 3, 2], activation="sigmoid")
X_check = np.random.randn(4, 2)
y_check = np.array([0, 1, 1, 0])
y_onehot_check = np.eye(2)[y_check]
# Analytic gradients
output, activations, pre_activations = net_check.forward(X_check)
w_grads, b_grads = net_check.backward(y_onehot_check, activations, pre_activations)
# Numerical gradients
# Compare analytic backprop against central differences, layer by layer.
for layer in range(len(net_check.weights)):
    num_grad = numerical_gradient(net_check, X_check, y_onehot_check, layer, 'weight')
    ana_grad = w_grads[layer]
    diff = np.max(np.abs(num_grad - ana_grad))
    # Relative norm difference is scale-invariant; values well below 1e-5
    # indicate the analytic gradients are correct.
    rel_diff = np.linalg.norm(num_grad - ana_grad) / (np.linalg.norm(num_grad) + np.linalg.norm(ana_grad) + 1e-8)
    print(f"Layer {layer} weights: max_diff={diff:.2e}, relative_diff={rel_diff:.2e} {'OK' if rel_diff < 1e-5 else 'FAIL'}")
# Exercise 4.2: Computation graph example
# f(x, y, z) = (x + y) * z
# Show forward and backward passes step by step
x, y, z = 2.0, 3.0, -4.0

# Forward pass: evaluate each node, keeping the intermediate q.
q = x + y  # q = 5
f = q * z  # f = -20
print("Computation graph: f(x, y, z) = (x + y) * z")
print(f"\nForward pass:")
print(f" x={x}, y={y}, z={z}")
print(f" q = x + y = {q}")
print(f" f = q * z = {f}")

# Backward pass: chain local gradients from the output toward the inputs.
df_df = 1.0          # seed gradient at the output
df_dq = z            # d(q*z)/dq = z
df_dz = q            # d(q*z)/dz = q
df_dx = df_dq * 1.0  # addition routes the upstream gradient through unchanged
df_dy = df_dq * 1.0
print(f"\nBackward pass:")
print(f" df/df = {df_df}")
print(f" df/dq = z = {df_dq}")
print(f" df/dz = q = {df_dz}")
print(f" df/dx = df/dq * dq/dx = {df_dq} * 1 = {df_dx}")
print(f" df/dy = df/dq * dq/dy = {df_dq} * 1 = {df_dy}")

# Numerical check via central differences on the full expression.
eps = 1e-5
g = lambda a, b, c: (a + b) * c
num_dx = (g(x + eps, y, z) - g(x - eps, y, z)) / (2 * eps)
num_dy = (g(x, y + eps, z) - g(x, y - eps, z)) / (2 * eps)
num_dz = (g(x, y, z + eps) - g(x, y, z - eps)) / (2 * eps)
print(f"\nNumerical verification:")
print(f" df/dx = {num_dx:.4f} (analytic: {df_dx})")
print(f" df/dy = {num_dy:.4f} (analytic: {df_dy})")
print(f" df/dz = {num_dz:.4f} (analytic: {df_dz})")
Part 5: Optimizers
Reference: SLP Ch 6.6
SGD with Momentum
Momentum helps accelerate SGD in the relevant direction and dampens oscillations: \(v \leftarrow \mu v - \eta \nabla L\), then \(\theta \leftarrow \theta + v\).
Adam (Adaptive Moment Estimation)
Adam adapts the learning rate per-parameter using running averages of gradients and squared gradients.
# Exercise 5.1: SGD with Momentum
def sgd_with_momentum(params, grads, velocities, lr=0.01, momentum=0.9):
    """One momentum step over matched lists of parameters.

    v <- momentum * v - lr * g, then p <- p + v.
    Returns (new_params, new_velocities); inputs are not mutated.
    """
    new_velocities = [momentum * v - lr * g for g, v in zip(grads, velocities)]
    new_params = [p + v for p, v in zip(params, new_velocities)]
    return new_params, new_velocities
# Demo: minimize Rosenbrock-like function f(x,y) = (1-x)^2 + 100*(y-x^2)^2
def rosenbrock_grad(xy):
    """Analytic gradient of the Rosenbrock function at point xy = (x, y)."""
    x, y = xy
    residual = y - x ** 2
    return np.array([
        -2.0 * (1.0 - x) - 400.0 * x * residual,  # d/dx
        200.0 * residual,                         # d/dy
    ])
# SGD without momentum
# Plain gradient descent on the Rosenbrock function from (-1, 1), recording the path.
np.random.seed(42)
pos_sgd = np.array([-1.0, 1.0])
path_sgd = [pos_sgd.copy()]
for _ in range(500):
    g = rosenbrock_grad(pos_sgd)
    pos_sgd = pos_sgd - 0.001 * g
    path_sgd.append(pos_sgd.copy())
path_sgd = np.array(path_sgd)
# SGD with momentum
# Same start and step count, but the velocity accumulates across steps.
pos_mom = np.array([-1.0, 1.0])
vel = np.zeros(2)
path_mom = [pos_mom.copy()]
for _ in range(500):
    g = rosenbrock_grad(pos_mom)
    # Single-parameter case: wrap/unwrap the one-element lists the API expects.
    [pos_mom], [vel] = sgd_with_momentum([pos_mom], [g], [vel], lr=0.001, momentum=0.9)
    path_mom.append(pos_mom.copy())
path_mom = np.array(path_mom)
print(f"SGD final position: ({path_sgd[-1, 0]:.4f}, {path_sgd[-1, 1]:.4f})")
print(f"Momentum final position: ({path_mom[-1, 0]:.4f}, {path_mom[-1, 1]:.4f})")
print(f"Optimal: (1.0, 1.0)")
# Exercise 5.2: Adam Optimizer
class Adam:
    """Adam optimizer: per-parameter adaptive steps from bias-corrected moments."""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # running average of gradients (first moment)
        self.v = None  # running average of squared gradients (second moment)
        self.t = 0     # step counter, used for bias correction

    def step(self, params: List[np.ndarray], grads: List[np.ndarray]) -> List[np.ndarray]:
        """Return the parameter list after one Adam update."""
        if self.m is None:
            # Lazily allocate moment buffers matching the parameter shapes.
            self.m = [np.zeros_like(p) for p in params]
            self.v = [np.zeros_like(p) for p in params]
        self.t += 1
        # Bias-correction denominators are shared by every parameter this step.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t
        updated = []
        for i, (p, g) in enumerate(zip(params, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * g
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * np.square(g)
            m_hat = self.m[i] / correction1
            v_hat = self.v[i] / correction2
            updated.append(p - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon))
        return updated
# Demo: Adam on the same function
# Descend the Rosenbrock function from the same start, now with Adam's
# per-coordinate adaptive step sizes.
optimizer = Adam(lr=0.01)
pos_adam = np.array([-1.0, 1.0])
path_adam = [pos_adam.copy()]
for _ in range(500):
    g = rosenbrock_grad(pos_adam)
    [pos_adam] = optimizer.step([pos_adam], [g])
    path_adam.append(pos_adam.copy())
path_adam = np.array(path_adam)
print(f"Adam final position: ({path_adam[-1, 0]:.4f}, {path_adam[-1, 1]:.4f})")
print(f"Optimal: (1.0, 1.0)")
# Exercise 5.3: Compare optimizers visually
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Contour plot of the function
# Log-scale the levels: Rosenbrock values span many orders of magnitude.
xx, yy = np.meshgrid(np.linspace(-2, 2, 200), np.linspace(-1, 3, 200))
zz = (1 - xx)**2 + 100 * (yy - xx**2)**2
ax.contour(xx, yy, np.log10(zz + 1), levels=30, cmap='viridis', alpha=0.5)
# Plot paths
# Overlay the three optimization trajectories recorded above.
ax.plot(path_sgd[:, 0], path_sgd[:, 1], 'r.-', alpha=0.5, markersize=2, label='SGD')
ax.plot(path_mom[:, 0], path_mom[:, 1], 'b.-', alpha=0.5, markersize=2, label='SGD + Momentum')
ax.plot(path_adam[:, 0], path_adam[:, 1], 'g.-', alpha=0.5, markersize=2, label='Adam')
# Mark start and optimum
ax.plot(-1, 1, 'ko', markersize=10, label='Start')
ax.plot(1, 1, 'r*', markersize=15, label='Optimum')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Optimizer Comparison on Rosenbrock Function')
ax.legend()
ax.set_xlim(-2, 2)
ax.set_ylim(-1, 3)
plt.tight_layout()
plt.show()
# Exercise 5.4: Train FeedforwardNN with Adam
class FeedforwardNNAdam(FeedforwardNN):
    """FeedforwardNN variant whose training loop delegates updates to Adam."""

    def fit_adam(self, X, y, lr=0.001, n_epochs=100, batch_size=32, verbose=True):
        """Train with Adam; mirrors FeedforwardNN.fit's loop structure.

        Args:
            X: (N, input_dim) feature matrix.
            y: (N,) integer class labels.
        Returns:
            List of per-epoch mean cross-entropy losses.
        """
        n_classes = len(np.unique(y))
        y_onehot = np.eye(n_classes)[y]
        losses = []
        # One Adam instance holds moment buffers for every weight AND bias,
        # always in the fixed order weights-then-biases used below.
        adam = Adam(lr=lr)
        n_w = len(self.weights)
        # Fix: guard `n_epochs // 5` against 0 (ZeroDivisionError for n_epochs < 5).
        # Also removed the original's dead pre-loop `all_params = ...` assignment,
        # which was recomputed inside the batch loop and never read here.
        log_every = max(1, n_epochs // 5)
        for epoch in range(n_epochs):
            indices = np.random.permutation(len(X))
            X_shuffled = X[indices]
            y_shuffled = y_onehot[indices]
            epoch_loss = 0.0
            n_batches = 0
            for start in range(0, len(X), batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]
                output, activations, pre_activations = self.forward(X_batch)
                loss = -np.mean(np.sum(y_batch * np.log(output + 1e-8), axis=1))
                epoch_loss += loss
                n_batches += 1
                w_grads, b_grads = self.backward(y_batch, activations, pre_activations)
                # Adam updates all parameters at once; split the result back out.
                updated = adam.step(self.weights + self.biases, w_grads + b_grads)
                self.weights = updated[:n_w]
                self.biases = updated[n_w:]
            avg_loss = epoch_loss / n_batches
            losses.append(avg_loss)
            if verbose and (epoch + 1) % log_every == 0:
                acc = np.mean(self.predict(X) == y)
                print(f" Epoch {epoch+1:4d}: loss={avg_loss:.4f}, accuracy={acc:.4f}")
        return losses
# Compare SGD vs Adam on circular data
np.random.seed(42)
N = 300
X_data = np.random.randn(N, 2)
y_data = ((X_data[:, 0] ** 2 + X_data[:, 1] ** 2) < 1.5).astype(int)
# SGD
net_sgd = FeedforwardNN([2, 16, 8, 2], activation="relu")
np.random.seed(42)
# NOTE(review): __init__ is called again after re-seeding so the weight draws
# are reproducible; the first construction's weights are simply discarded.
net_sgd.__init__([2, 16, 8, 2], activation="relu")
print("SGD training:")
losses_sgd = net_sgd.fit(X_data, y_data, lr=0.1, n_epochs=100, batch_size=32)
# Adam
# Re-seed so the Adam network starts from the same initial weights.
np.random.seed(42)
net_adam = FeedforwardNNAdam([2, 16, 8, 2], activation="relu")
print("\nAdam training:")
losses_adam = net_adam.fit_adam(X_data, y_data, lr=0.01, n_epochs=100, batch_size=32)
plt.figure(figsize=(8, 4))
plt.plot(losses_sgd, label='SGD (lr=0.1)')
plt.plot(losses_adam, label='Adam (lr=0.01)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('SGD vs Adam Training Curves')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Part 6: Neural Networks for NLP Classification
Reference: SLP Ch 6.4
For text classification with neural networks:
Represent text as a vector (e.g., bag-of-words, TF-IDF)
Feed through hidden layers with non-linear activations
Output layer with softmax for class probabilities
This is a simple but effective approach — BERT and other transformers follow the same principle at a much larger scale.
# Exercise 6.1: Bag-of-words text classification
def build_vocab(texts):
    """Map each distinct lowercased whitespace token to a unique index.

    Indices are assigned in first-appearance order across the given texts.
    """
    vocab = {}
    for text in texts:
        for word in text.lower().split():
            # setdefault inserts only when the word is new, so existing
            # indices are never overwritten.
            vocab.setdefault(word, len(vocab))
    return vocab
def text_to_bow(text, vocab):
    """Count-of-words vector over vocab; out-of-vocabulary tokens are ignored."""
    vec = np.zeros(len(vocab))
    for word in text.lower().split():
        idx = vocab.get(word)
        if idx is not None:
            vec[idx] += 1
    return vec
# Training data: simple sentiment classification
# (text, label) pairs; label 1 = positive, 0 = negative.
train_data = [
    ("I love this movie", 1),
    ("This film is great", 1),
    ("Wonderful acting and story", 1),
    ("Best movie ever", 1),
    ("Amazing performance", 1),
    ("Really enjoyed this film", 1),
    ("I hate this movie", 0),
    ("This film is terrible", 0),
    ("Awful acting and story", 0),
    ("Worst movie ever", 0),
    ("Terrible performance", 0),
    ("Really hated this film", 0),
]
# Held-out examples; words never seen in training are dropped by text_to_bow.
test_data = [
    ("I love this great film", 1),
    ("This is the worst movie", 0),
    ("Amazing movie and great acting", 1),
    ("Awful and terrible", 0),
]
# Build vocab and features
# The vocabulary comes from the training texts only.
texts = [t for t, _ in train_data]
vocab = build_vocab(texts)
print(f"Vocabulary size: {len(vocab)}")
print(f"Vocab: {vocab}\n")
X_train_text = np.array([text_to_bow(t, vocab) for t, _ in train_data])
y_train_text = np.array([l for _, l in train_data])
X_test_text = np.array([text_to_bow(t, vocab) for t, _ in test_data])
y_test_text = np.array([l for _, l in test_data])
# Train neural network
# One hidden layer of 8 units over the |vocab|-dimensional BoW input.
np.random.seed(42)
text_net = FeedforwardNN([len(vocab), 8, 2], activation="relu")
print("Training text classifier:")
losses = text_net.fit(X_train_text, y_train_text, lr=0.1, n_epochs=200, batch_size=4)
# Evaluate
train_preds = text_net.predict(X_train_text)
test_preds = text_net.predict(X_test_text)
print(f"\nTrain accuracy: {np.mean(train_preds == y_train_text):.2%}")
print(f"Test accuracy: {np.mean(test_preds == y_test_text):.2%}")
print("\nTest predictions:")
for (text, label), pred in zip(test_data, test_preds):
    sentiment = "Positive" if pred == 1 else "Negative"
    correct = "correct" if pred == label else "WRONG"
    print(f" '{text}' -> {sentiment} ({correct})")
# Exercise 6.2: Multi-class classification with spiral data
np.random.seed(42)
# Generate 3-class spiral dataset
# Each class is one spiral arm: radius grows linearly from 0 to 1 while the
# angle sweeps a class-specific range (plus noise), so the arms interleave.
N_per_class = 100
n_classes = 3
X_spiral = np.zeros((N_per_class * n_classes, 2))
y_spiral = np.zeros(N_per_class * n_classes, dtype=int)
for k in range(n_classes):
    ix = range(N_per_class * k, N_per_class * (k + 1))
    r = np.linspace(0.0, 1, N_per_class)
    t = np.linspace(k * 4, (k + 1) * 4, N_per_class) + np.random.randn(N_per_class) * 0.2
    X_spiral[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
    y_spiral[ix] = k
# Train
net_spiral = FeedforwardNN([2, 64, 32, 3], activation="relu")
print("Training on 3-class spiral data:")
losses_spiral = net_spiral.fit(X_spiral, y_spiral, lr=0.5, n_epochs=500, batch_size=32)
acc = np.mean(net_spiral.predict(X_spiral) == y_spiral)
print(f"\nFinal accuracy: {acc:.2%}")
# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(losses_spiral)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Spiral Training Loss')
axes[0].grid(True, alpha=0.3)
# Shade the plane by predicted class to show the learned decision regions.
xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 200), np.linspace(-1.5, 1.5, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
Z = net_spiral.predict(grid).reshape(xx.shape)
axes[1].contourf(xx, yy, Z, alpha=0.3, cmap='Set1')
scatter = axes[1].scatter(X_spiral[:, 0], X_spiral[:, 1], c=y_spiral, cmap='Set1',
                          edgecolors='black', s=15)
axes[1].set_title(f'3-Class Spiral (accuracy: {acc:.1%})')
axes[1].set_xlabel('x1')
axes[1].set_ylabel('x2')
plt.tight_layout()
plt.show()
Part 7: Regularization and Dropout
Reference: SLP Ch 6.7
Neural networks can easily overfit. Common regularization techniques:
L2 Regularization (Weight Decay): Add \(\lambda \|\mathbf{w}\|^2\) to the loss
Dropout: During training, randomly zero out neurons with probability \(p\). With inverted dropout (used in the code below), the surviving activations are scaled by \(1/(1-p)\) during training, so no scaling is needed at test time.
Early Stopping: Stop training when validation loss stops improving
# Exercise 7.1: Implement dropout
def dropout(h, p=0.5, training=True):
    """
    Inverted dropout.
    During training: zero out with probability p, scale by 1/(1-p)
    During testing: return h unchanged
    """
    if not training or p == 0:
        return h
    # Sample a keep-mask; survivors are up-scaled so E[output] == h.
    keep = np.random.rand(*h.shape) > p
    return np.where(keep, h / (1 - p), 0.0)
# Demonstrate dropout
np.random.seed(42)
h = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0])
print(f"Original activations: {h}")
print(f"Mean: {h.mean():.2f}\n")
# Inverted dropout rescales survivors by 1/(1-p), so the expected mean of
# the dropped vector matches the original regardless of p.
for p in [0.1, 0.3, 0.5, 0.7]:
    # Average over many samples
    means = []
    for _ in range(1000):
        h_dropped = dropout(h, p=p, training=True)
        means.append(h_dropped.mean())
    print(f"Dropout p={p}: avg mean = {np.mean(means):.2f} (expected ~{h.mean():.2f} due to scaling)")
print(f"\nTest time (no dropout): mean = {dropout(h, p=0.5, training=False).mean():.2f}")
# Exercise 7.2: Demonstrate overfitting and regularization
np.random.seed(42)
# Small dataset that's easy to overfit
N_small = 30
X_small = np.random.randn(N_small, 2)
y_small = ((X_small[:, 0] ** 2 + X_small[:, 1] ** 2) < 1.0).astype(int)
# Large test set
# Drawn from the same distribution, so the train/test gap isolates overfitting.
X_test_large = np.random.randn(500, 2)
y_test_large = ((X_test_large[:, 0] ** 2 + X_test_large[:, 1] ** 2) < 1.0).astype(int)
# Train a big network (easy to overfit)
np.random.seed(42)
net_overfit = FeedforwardNN([2, 32, 32, 2], activation="relu")
print("Training large network on small dataset (no regularization):")
losses_overfit = net_overfit.fit(X_small, y_small, lr=0.1, n_epochs=500, batch_size=8, verbose=True)
train_acc = np.mean(net_overfit.predict(X_small) == y_small)
test_acc = np.mean(net_overfit.predict(X_test_large) == y_test_large)
print(f"\nTrain accuracy: {train_acc:.2%}")
print(f"Test accuracy: {test_acc:.2%}")
print(f"Gap: {train_acc - test_acc:.2%} (overfitting!)")
# Smaller network (implicit regularization)
# Fewer parameters limit capacity, which should shrink the train/test gap.
np.random.seed(42)
net_small = FeedforwardNN([2, 4, 2], activation="relu")
print("\nTraining small network (implicit regularization):")
losses_small = net_small.fit(X_small, y_small, lr=0.1, n_epochs=500, batch_size=8, verbose=True)
train_acc_s = np.mean(net_small.predict(X_small) == y_small)
test_acc_s = np.mean(net_small.predict(X_test_large) == y_test_large)
print(f"\nTrain accuracy: {train_acc_s:.2%}")
print(f"Test accuracy: {test_acc_s:.2%}")
print(f"Gap: {train_acc_s - test_acc_s:.2%}")
Summary
In this lab we built neural networks from scratch:
Single neurons can compute linearly separable functions like AND, OR, NOT
XOR requires hidden layers — it's not linearly separable
Feedforward networks stack layers with non-linear activations to learn complex decision boundaries
Backpropagation computes gradients efficiently using the chain rule
Optimizers (SGD, Momentum, Adam) determine how we update weights
Neural networks for NLP use text representations (BoW) as input features
Regularization (dropout, smaller networks, early stopping) prevents overfitting
These foundations underpin all modern deep learning, from CNNs to Transformers.