Lab 01: Words, Tokens & Text Processing

Source: Speech and Language Processing (3rd ed.) by Dan Jurafsky & James H. Martin – Chapter 2: Regular Expressions, Text Normalization, Edit Distance

Topics covered:

  1. Words, Types, and Instances (vocabulary size, Heaps’ Law, Zipf’s Law)

  2. Regular Expressions for NLP text processing

  3. Byte-Pair Encoding (BPE) subword tokenization

  4. Minimum Edit Distance (Levenshtein distance, alignment, WER, spelling correction)

import re
import numpy as np
from collections import Counter
from typing import List, Dict, Tuple, Optional

import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

%matplotlib inline
plt.rcParams["figure.figsize"] = (8, 4)
plt.rcParams["font.size"] = 11

PART 1: Words, Types, and Instances

Key definitions (SLP Ch 2.1):

Term

Meaning

Word type

A unique word form in the vocabulary (e.g., β€œthe” counted once)

Word instance (token)

Each individual occurrence of a word in running text

|V|

Vocabulary size = number of distinct word types

N

Corpus size = total number of word instances

Heaps’ Law (also called Herdan’s Law): the vocabulary size grows sub-linearly with corpus size:

\[|V| = k \cdot N^{\beta}\]

where \(k \approx 10\)–\(100\) and \(\beta \approx 0.4\)–\(0.6\) for English text.

Zipf’s Law: the frequency of the \(r\)-th most frequent word is inversely proportional to its rank:

\[f(r) \propto \frac{1}{r^{\alpha}}, \quad \alpha \approx 1\]

On a log-log plot, word frequencies form an approximately straight line with slope \(-\alpha\).

def count_types_and_instances(text: str) -> Tuple[int, int]:
    """
    Count word types (unique forms) and word instances (running total).

    The text is lowercased and split on whitespace before counting.

    Returns:
        (num_types, num_instances)
    """
    tokens = text.lower().split()
    return len(set(tokens)), len(tokens)


# --- Demo ---
sample = "They picnicked by the pool then lay back on the grass and looked at the stars"

# |V| vs N on a small sentence: "the" repeats, so types < instances.
types, instances = count_types_and_instances(sample)
print(f"Text: \"{sample}\"")
print(f"  Word types  |V| = {types}")
print(f"  Word instances N = {instances}")
print(f"  'the' appears {sample.lower().split().count('the')} times (3 instances, 1 type)")
def type_token_ratio(text: str) -> float:
    """
    Type-Token Ratio (TTR) = |V| / N.

    Higher TTR indicates greater lexical diversity: 1.0 means every word
    is unique, values near 0 mean heavy repetition. Empty text yields 0.0.
    """
    tokens = text.lower().split()
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


# --- Demo ---
texts = {
    "Diverse":    "The quick brown fox jumps over the lazy dog near a tall fence",
    "Repetitive": "the the the the cat sat on the the mat the the the",
    "All unique": "alpha bravo charlie delta echo foxtrot golf hotel india",
}

# Report TTR alongside |V| and N; samples longer than 50 chars are previewed.
for label, txt in texts.items():
    ttr = type_token_ratio(txt)
    v, n = count_types_and_instances(txt)
    preview = f"{txt[:50]}..." if len(txt) > 50 else txt
    print(f"  [{label:12s}] TTR = {ttr:.3f}  (|V|={v}, N={n})  \"{preview}\"")
def verify_heaps_law(text: str) -> Tuple[float, float]:
    """
    Empirically verify Heaps' Law: |V| = k * N^beta.

    Tracks vocabulary size after each token, fits the Heaps' Law curve with
    non-linear least squares, plots the observed growth against the fit,
    and prints the fitted equation.

    Args:
        text: whitespace-separated corpus; lowercased before counting.

    Returns:
        (k, beta): the fitted Heaps' Law parameters.

    Raises:
        ValueError: if `text` contains no tokens (nothing to fit).
    """
    words = text.lower().split()
    if not words:
        raise ValueError("verify_heaps_law requires at least one token")

    # |V| after each of the N tokens, in order.
    seen = set()
    vocab_growth = []
    for word in words:
        seen.add(word)
        vocab_growth.append(len(seen))

    token_counts = np.arange(1, len(words) + 1, dtype=float)
    vocab_sizes = np.array(vocab_growth, dtype=float)

    # Fit Heaps' Law:  |V| = k * N^beta
    def heaps(n, k, beta):
        return k * np.power(n, beta)

    popt, _ = curve_fit(heaps, token_counts, vocab_sizes, p0=[10, 0.5])
    k_fit, beta_fit = popt

    # Plot observed growth vs. fitted curve.
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(token_counts, vocab_sizes, label="Observed |V|", alpha=0.8)
    ax.plot(token_counts, heaps(token_counts, k_fit, beta_fit),
            "--", color="red", label=f"Heaps fit: k={k_fit:.2f}, beta={beta_fit:.3f}")
    ax.set_xlabel("Tokens processed (N)")
    ax.set_ylabel("Vocabulary size |V|")
    ax.set_title("Heaps' Law: Vocabulary Growth")
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    print(f"Heaps' Law fit:  |V| = {k_fit:.2f} * N^{beta_fit:.3f}")

    # Returning the fit is additive: existing callers that ignore the
    # (previously None) return value are unaffected.
    return float(k_fit), float(beta_fit)


# --- Demo on a longer text ---
# We generate a semi-realistic corpus by repeating and shuffling sentences.
long_text = """
Natural language processing is a subfield of linguistics computer science and
artificial intelligence concerned with the interactions between computers and
human language in particular how to program computers to process and analyze
large amounts of natural language data the result is a computer capable of
understanding the contents of documents including the contextual nuances of
the language within them the technology can then accurately extract information
and insights contained in the documents as well as categorize and organize the
documents themselves challenges in natural language processing frequently involve
speech recognition natural language understanding and natural language generation
natural language processing has a wide range of applications including machine
translation text summarization sentiment analysis named entity recognition
information retrieval question answering and dialogue systems the field draws
from many disciplines including computer science and linguistics as well as
statistics and mathematics
""".strip()
# Repeat to make it bigger (simulating a larger corpus)
# Fixed seed keeps the shuffles (and therefore the fitted curve) reproducible.
rng = np.random.default_rng(42)
words_pool = long_text.split()
extended = []
for _ in range(30):
    # Shuffle a fresh copy each round so word order varies but counts do not.
    chunk = words_pool.copy()
    rng.shuffle(chunk)
    extended.extend(chunk)

verify_heaps_law(" ".join(extended))
def zipfs_law(text: str):
    """
    Demonstrate Zipf's Law: f(r) ~ 1/r^alpha.

    Counts word frequencies, ranks them in descending order, fits a power
    law in log-log space, and plots frequency vs. rank. An approximately
    straight line of slope ~ -1 on the log-log plot confirms Zipf's Law.
    """
    freq = Counter(text.lower().split())

    # Rank 1 = most frequent word; frequencies sorted descending.
    freqs = np.array(sorted(freq.values(), reverse=True), dtype=float)
    ranks = np.arange(1, freqs.size + 1, dtype=float)

    # Linear fit in log-log space:  log(f) = -alpha * log(r) + log(C)
    slope, intercept = np.polyfit(np.log10(ranks), np.log10(freqs), 1)
    alpha = -slope
    C = 10 ** intercept

    # Plot
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.scatter(ranks, freqs, s=10, alpha=0.6, label="Observed frequencies")
    ax.plot(ranks, C / ranks ** alpha, color="red", linewidth=2,
            label=f"Zipf fit: f = {C:.1f} / r^{alpha:.2f}")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Rank (log scale)")
    ax.set_ylabel("Frequency (log scale)")
    ax.set_title("Zipf's Law: Word Frequency vs. Rank")
    ax.legend()
    ax.grid(True, alpha=0.3, which="both")
    plt.tight_layout()
    plt.show()

    print(f"Zipf's Law fit: alpha = {alpha:.3f}  (ideal ~ 1.0)")
    print(f"Top 10 words by frequency:")
    for word, count in freq.most_common(10):
        print(f"  {word:15s}  freq={count}")


# --- Demo ---
# Reuses the shuffled corpus built for the Heaps' Law demo above.
zipfs_law(" ".join(extended))

PART 2: Regular Expressions

Reference: SLP Ch 2.1 (Regular Expressions)

Regular expressions are the Swiss-army knife of NLP text processing. Key patterns:

Pattern

Meaning

.

Any character (except newline)

\d

Digit [0-9]

\w

Word character [a-zA-Z0-9_]

\s

Whitespace

*, +, ?

0+, 1+, 0 or 1 repetitions

[abc]

Character class

(...)

Capturing group

(?:...)

Non-capturing group

(?<=...) / (?=...)

Lookbehind / Lookahead

Common NLP tasks: tokenization, normalization, pattern extraction (emails, dates, etc.).

# --- Extract structured information from text using regex ---

# Sample text containing phone numbers, emails, prices, URLs, and dates.
text = """
Call me at 555-123-4567 or (555) 987-6543.
My email is [email protected] or [email protected].
The price is $45.99 or $1,200.00 but not EUR 123.50.
Visit https://www.example.com/page?id=42 or http://nlp.stanford.edu for details.
The meeting is on 2024-03-15 at 3:30 PM. Another date: 2023-12-01.
"""

# Phone numbers: 555-123-4567 or (555) 987-6543
# Optional parentheses around the area code; -, ., or space as separators.
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

# Email addresses
# local-part@domain with at least one dot in the domain.
email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"

# Dollar amounts: $45.99, $1,200.00
# Digits with optional thousands commas and an optional decimal part.
dollar_pattern = r"\$[\d,]+\.?\d*"

# URLs: http:// or https://
# Match up to the next whitespace or comma.
url_pattern = r"https?://[^\s,]+"

# Dates in YYYY-MM-DD format
date_pattern = r"\d{4}-\d{2}-\d{2}"

patterns = {
    "Phone numbers": phone_pattern,
    "Emails":        email_pattern,
    "Dollar amounts": dollar_pattern,
    "URLs":          url_pattern,
    "Dates (YYYY-MM-DD)": date_pattern,
}

# findall returns whole matches here because no pattern uses capturing groups.
for name, pattern in patterns.items():
    matches = re.findall(pattern, text)
    print(f"{name:22s} -> {matches}")
def normalize_text(text: str) -> str:
    """
    Normalize text for NLP.

    Steps: lowercase, strip every character that is not a-z / 0-9 /
    whitespace, collapse whitespace runs to single spaces, and trim the
    leading/trailing whitespace.
    """
    lowered = text.lower()
    no_punct = re.sub(r"[^a-z0-9\s]", "", lowered)
    return re.sub(r"\s+", " ", no_punct).strip()


# --- Demo ---
# Messy inputs: punctuation, mixed casing, extra whitespace, embedded quotes.
raw_texts = [
    "Hello, World!!! How's it going??",
    "   The U.S.A. spent $3.5 billion on NLP research...   ",
    'She said: "NLP is AMAZING!" -- and left.',
]

for raw in raw_texts:
    normed = normalize_text(raw)
    print(f"  Raw:        \"{raw}\"")
    print(f"  Normalized: \"{normed}\"")
    print()
def sentence_tokenize(text: str) -> List[str]:
    """
    Split text into sentences.

    Splits on sentence-ending punctuation (. ! ?) followed by whitespace
    and a capital letter, while avoiding false boundaries after common
    abbreviations (Mr., Dr., U.S., e.g., ...). Decimal numbers like "3.5"
    are safe because the character after the period is not whitespace.

    Bug fixed: the previous single lookbehind `(?<!(?:Mr|Mrs|...))` mixed
    alternatives of different lengths -- Python's `re` only supports
    fixed-width lookbehind, so re.split raised `re.error` at runtime (and
    the pattern also omitted the trailing period, so it could never have
    matched at the split position anyway). We now chain one fixed-width
    negative lookbehind per abbreviation, each including its period.

    Known limitation: a sentence that genuinely ends in one of the listed
    abbreviations will not be split.

    Args:
        text: raw text to segment.

    Returns:
        List of sentences with surrounding whitespace stripped.
    """
    # Common abbreviations that end with a period (without the period).
    abbreviations = [
        "Mr", "Mrs", "Ms", "Dr", "Prof", "St", "Jr", "Sr",
        "Inc", "Ltd", "vs", "etc", "e.g", "i.e", "U.S", "U.K",
    ]
    # One fixed-width lookbehind per abbreviation, e.g. (?<!Mr\.)
    guards = "".join(rf"(?<!{re.escape(abbr)}\.)" for abbr in abbreviations)

    # Boundary: sentence punctuation, then whitespace, then an uppercase
    # letter -- unless one of the abbreviation guards vetoes the position.
    pattern = guards + r"(?<=[.!?])\s+(?=[A-Z])"

    sentences = re.split(pattern, text.strip())
    # Drop empty fragments and clean up whitespace in each sentence.
    return [s.strip() for s in sentences if s.strip()]


# --- Demo ---
# Tricky input: abbreviations (Mr., Dr., U.S.) and a decimal ($3.5) that
# must not be treated as sentence boundaries.
test_text = (
    "Mr. Smith went to Washington. He met Dr. Jones there! "
    "Was it about the U.S. budget? Yes, it was. "
    "They discussed $3.5 billion in funding. Amazing!"
)

sentences = sentence_tokenize(test_text)
print(f"Input: \"{test_text}\"\n")
print(f"Sentences found ({len(sentences)}):")
for i, sent in enumerate(sentences, 1):
    print(f"  {i}. {sent}")

PART 3: Byte-Pair Encoding (BPE)

Reference: SLP Ch 2.4 – Subword Tokenization

BPE addresses the open-vocabulary problem: we cannot have every possible word in our vocabulary, so we tokenize into subword units that can be composed to form any word.

Algorithm:

  1. Initialize the vocabulary with individual characters (plus an end-of-word marker </w>)

  2. Count every adjacent pair of symbols across the corpus

  3. Merge the most frequent pair into a single new symbol

  4. Repeat for k merges

The resulting merge table defines a deterministic tokenizer: given a new word, apply the learned merges in order to segment it into subwords.

def _count_pairs(vocab: Dict[str, int]) -> Dict[Tuple[str, str], int]:
    """Count frequency of every adjacent symbol pair across the vocabulary."""
    pairs: Dict[Tuple[str, str], int] = {}
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pair = (symbols[i], symbols[i + 1])
            pairs[pair] = pairs.get(pair, 0) + freq
    return pairs


def _merge_pair(vocab: Dict[str, int], pair: Tuple[str, str]) -> Dict[str, int]:
    """Merge every occurrence of `pair` in each vocabulary entry."""
    new_vocab: Dict[str, int] = {}
    bigram = " ".join(pair)          # e.g. "e s"
    replacement = "".join(pair)      # e.g. "es"
    for word, freq in vocab.items():
        new_word = word.replace(bigram, replacement)
        new_vocab[new_word] = freq
    return new_vocab


def train_bpe(corpus: List[str], num_merges: int = 10) -> List[Tuple[str, str]]:
    """
    Train BPE on a corpus of words (repetitions encode frequency).

    Each word is turned into a space-separated character sequence ending in
    the </w> marker; the most frequent adjacent pair is then merged
    repeatedly, up to `num_merges` times (or until no pairs remain).

    Returns the ordered list of merge operations.
    """
    # Character-level starting vocabulary with end-of-word markers.
    vocab: Dict[str, int] = {}
    for token in corpus:
        entry = " ".join(token) + " </w>"
        vocab[entry] = vocab.get(entry, 0) + 1

    merges: List[Tuple[str, str]] = []
    for _ in range(num_merges):
        pair_counts = _count_pairs(vocab)
        if not pair_counts:
            break  # nothing left to merge
        top_pair = max(pair_counts, key=pair_counts.get)
        merges.append(top_pair)
        # Apply the winning merge everywhere before recounting.
        vocab = _merge_pair(vocab, top_pair)

    return merges


print("train_bpe defined -- see full demo below.")
def bpe_tokenize(word: str, merges: List[Tuple[str, str]]) -> List[str]:
    """
    Tokenize a single word using learned BPE merges.

    The word starts as a list of characters plus </w>; each learned merge
    is then applied greedily, left to right, in training order.
    """
    symbols = list(word) + ["</w>"]

    for left, right in merges:
        idx = 0
        while idx < len(symbols) - 1:
            if (symbols[idx], symbols[idx + 1]) == (left, right):
                # Collapse the pair in place; stay at idx to re-check.
                symbols[idx : idx + 2] = [left + right]
            else:
                idx += 1

    return symbols


print("bpe_tokenize defined -- see full demo below.")
# --- Full BPE Demo ---
# Corpus from SLP textbook example (with frequencies as repetitions)
corpus = (
    ["low"] * 5
    + ["lower"] * 2
    + ["newest"] * 6
    + ["widest"] * 3
    + ["new"] * 2
)

print("Training corpus (word : count):")
for word, count in Counter(corpus).most_common():
    print(f"  {word:10s} x {count}")
print()

# Train BPE with 10 merges
merges = train_bpe(corpus, num_merges=10)

print("Learned BPE merge operations:")
for i, (a, b) in enumerate(merges, 1):
    print(f"  {i:2d}. '{a}' + '{b}'  ->  '{a}{b}'")

# Tokenize a word not in the training corpus
# 'lowest' is unseen; the merge table still segments it into subwords.
test_word = "lowest"
tokens = bpe_tokenize(test_word, merges)
print(f"\nTokenization of '{test_word}': {tokens}")
print(f"  (The model segments it into known subwords even though 'lowest' was never seen in training.)")

# A few more test words
for w in ["newer", "widest", "low", "lowing"]:
    toks = bpe_tokenize(w, merges)
    print(f"  '{w}' -> {toks}")

PART 4: Minimum Edit Distance

Reference: SLP Ch 2.5 – Minimum Edit Distance

The minimum edit distance (Levenshtein distance) between two strings is the minimum number of editing operations needed to transform one into the other.

Operations and costs (standard Levenshtein):

Operation

Cost

Insert a character

1

Delete a character

1

Substitute one character for another

1 (0 if same)

Dynamic programming formulation:

Let dp[i][j] = min edit distance between source[0..i-1] and target[0..j-1].

Base cases: dp[i][0] = i (delete all), dp[0][j] = j (insert all)

Recurrence:

\[\begin{split}dp[i][j] = \min \begin{cases} dp[i-1][j] + 1 & \text{(delete)} \\ dp[i][j-1] + 1 & \text{(insert)} \\ dp[i-1][j-1] + \text{cost} & \text{(substitute: cost=0 if match, 1 otherwise)} \end{cases}\end{split}\]

Applications: spelling correction, machine translation evaluation, computational biology (sequence alignment), ASR evaluation (Word Error Rate).

def min_edit_distance(source: str, target: str) -> int:
    """
    Compute the Levenshtein distance between source and target strings.

    Costs: insert = 1, delete = 1, substitute = 1 (0 on a match).
    Implemented with a rolling pair of DP rows: after processing the i-th
    source character, row[j] holds the distance between source[:i] and
    target[:j]; the final cell is the full distance.
    """
    # Row for the empty source prefix: turn "" into target[:j] by j inserts.
    prev = list(range(len(target) + 1))

    for i, s_ch in enumerate(source, 1):
        # First column: turn source[:i] into "" by i deletes.
        curr = [i]
        for j, t_ch in enumerate(target, 1):
            substitute = prev[j - 1] + (s_ch != t_ch)
            delete = prev[j] + 1
            insert = curr[j - 1] + 1
            curr.append(min(delete, insert, substitute))
        prev = curr

    return prev[-1]


# --- Demo ---
# Classic textbook pairs (substitution cost 1, so kitten->sitting is 3).
pairs = [("kitten", "sitting"), ("intention", "execution"), ("sunday", "saturday")]
for s, t in pairs:
    d = min_edit_distance(s, t)
    print(f"  d('{s}', '{t}') = {d}")
def min_edit_distance_with_alignment(source: str, target: str) -> Tuple[int, List[str]]:
    """
    Compute minimum edit distance AND recover the alignment via backtrace.
    
    Returns:
        (distance, operations) where operations is a list of strings like:
        'MATCH s/s', 'SUB s->t', 'DEL s', 'INS t'
    """
    n = len(source)
    m = len(target)

    # DP table
    dp = np.zeros((n + 1, m + 1), dtype=int)
    # Backpointer table: stores which operation led to dp[i][j]
    # 'M' = match/sub from diagonal, 'D' = delete from above, 'I' = insert from left
    bp = [['' for _ in range(m + 1)] for _ in range(n + 1)]

    # Base cases: first column = all deletions, first row = all insertions.
    for i in range(1, n + 1):
        dp[i][0] = i
        bp[i][0] = 'D'
    for j in range(1, m + 1):
        dp[0][j] = j
        bp[0][j] = 'I'

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if source[i - 1] == target[j - 1] else 1

            delete_cost = dp[i - 1][j] + 1
            insert_cost = dp[i][j - 1] + 1
            sub_cost    = dp[i - 1][j - 1] + cost

            dp[i][j] = min(delete_cost, insert_cost, sub_cost)

            # Tie-break order is deliberate: prefer match/substitute over
            # delete over insert, which shapes the recovered alignment.
            if dp[i][j] == sub_cost:
                bp[i][j] = 'M' if cost == 0 else 'S'
            elif dp[i][j] == delete_cost:
                bp[i][j] = 'D'
            else:
                bp[i][j] = 'I'

    # Backtrace to recover operations
    # Walk backpointers from (n, m) toward (0, 0); ops come out reversed.
    ops = []
    i, j = n, m
    while i > 0 or j > 0:
        if bp[i][j] == 'M':
            ops.append(f"MATCH {source[i-1]}/{target[j-1]}")
            i -= 1; j -= 1
        elif bp[i][j] == 'S':
            ops.append(f"SUB   {source[i-1]}->{target[j-1]}")
            i -= 1; j -= 1
        elif bp[i][j] == 'D':
            ops.append(f"DEL   {source[i-1]}")
            i -= 1
        elif bp[i][j] == 'I':
            ops.append(f"INS   {target[j-1]}")
            j -= 1

    ops.reverse()
    return int(dp[n][m]), ops


# --- Demo ---
# Show the full edit script recovered by the backtrace.
for s, t in [("kitten", "sitting"), ("intention", "execution")]:
    dist, ops = min_edit_distance_with_alignment(s, t)
    print(f"'{s}' -> '{t}'  (distance = {dist})")
    print(f"  Alignment ({len(ops)} steps):")
    for op in ops:
        print(f"    {op}")
    print()
def word_error_rate(reference: str, hypothesis: str) -> float:
    """
    Compute Word Error Rate (WER) for ASR evaluation.

    WER = edit_distance(ref_words, hyp_words) / len(ref_words), where the
    Levenshtein distance is computed over whole words: each "symbol" of
    the edit distance is one word, not one character. An empty reference
    yields a WER of 0.0.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    n, m = len(ref_words), len(hyp_words)

    # Word-level DP table; same recurrence as character edit distance.
    dp = np.zeros((n + 1, m + 1), dtype=int)
    dp[:, 0] = np.arange(n + 1)  # delete every reference word
    dp[0, :] = np.arange(m + 1)  # insert every hypothesis word

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            substitution = dp[i - 1, j - 1] + (ref_words[i - 1] != hyp_words[j - 1])
            dp[i, j] = min(dp[i - 1, j] + 1, dp[i, j - 1] + 1, substitution)

    return dp[n, m] / n if n > 0 else 0.0


# --- Demo: ASR evaluation ---
# (reference, hypothesis) pairs illustrating each error type.
examples = [
    ("the cat sat on the mat",       "the cat sat on the mat"),       # perfect
    ("the cat sat on the mat",       "the cat on the mat"),           # deletion
    ("the cat sat on the mat",       "the cat sat sat on the mat"),   # insertion
    ("the cat sat on the mat",       "a cat sit in the mat"),         # substitutions
    ("she had your dark suit in greasy wash water all year",
     "she had our dark suit and greasy wash water all year"),
]

print("Word Error Rate (WER) examples:")
for ref, hyp in examples:
    wer = word_error_rate(ref, hyp)
    print(f"  REF: \"{ref}\"")
    print(f"  HYP: \"{hyp}\"")
    print(f"  WER: {wer:.2%}")
    print()
def suggest_corrections(word: str, dictionary: List[str], max_dist: int = 2) -> List[Tuple[str, int]]:
    """
    Spelling correction: collect dictionary words within `max_dist` edits
    of the input word.

    Returns:
        (candidate, distance) tuples ordered by distance, with
        alphabetical order breaking ties.
    """
    scored = ((entry, min_edit_distance(word, entry)) for entry in dictionary)
    within = [(entry, dist) for entry, dist in scored if dist <= max_dist]
    return sorted(within, key=lambda pair: (pair[1], pair[0]))


# --- Demo ---
# Small hand-built lexicon for the spelling-correction demo.
dictionary = [
    "the", "their", "there", "then", "them", "these", "those",
    "cat", "car", "cart", "care", "case", "cast", "cave",
    "bat", "hat", "mat", "sat", "fat", "rat", "pat",
    "sit", "set", "sat", "suit", "skit", "slit", "spit",
    "kitten", "mitten", "bitten", "written", "sitting", "fitting",
    "intention", "invention", "execution", "extension", "attention",
    "apple", "apply", "ample", "maple",
]
# Remove duplicates and sort
dictionary = sorted(set(dictionary))

# Misspellings to correct.
test_words = ["kiten", "catt", "ther", "exection"]

print("Spelling correction suggestions:")
for word in test_words:
    suggestions = suggest_corrections(word, dictionary, max_dist=2)
    top = suggestions[:5]  # show top 5
    print(f"  '{word}' -> {[(w, d) for w, d in top]}")
# --- Comprehensive demo: kitten -> sitting, intention -> execution ---
print("=" * 60)
print("COMPREHENSIVE EDIT DISTANCE DEMO")
print("=" * 60)

for source, target in [("kitten", "sitting"), ("intention", "execution")]:
    print(f"\n--- '{source}' -> '{target}' ---")
    
    # Basic distance
    d = min_edit_distance(source, target)
    print(f"  Levenshtein distance: {d}")
    
    # With alignment
    d2, ops = min_edit_distance_with_alignment(source, target)
    print(f"  Alignment trace:")
    # Re-parse the op strings (e.g. "SUB   s->t") into three aligned rows:
    # a source row, a marker row, and a target row.
    src_aligned = []
    tgt_aligned = []
    op_labels = []
    for op in ops:
        parts = op.split()
        if parts[0] == "MATCH":
            c = parts[1].split("/")
            src_aligned.append(c[0])
            tgt_aligned.append(c[1])
            op_labels.append("|")
        elif parts[0] == "SUB":
            pair = parts[1].split("->")
            src_aligned.append(pair[0])
            tgt_aligned.append(pair[1])
            op_labels.append("*")
        elif parts[0] == "DEL":
            src_aligned.append(parts[1])
            tgt_aligned.append("-")  # gap in the target row
            op_labels.append("D")
        elif parts[0] == "INS":
            src_aligned.append("-")  # gap in the source row
            tgt_aligned.append(parts[1])
            op_labels.append("I")
    
    print(f"    Source:  {' '.join(src_aligned)}")
    print(f"            {' '.join(op_labels)}")
    print(f"    Target: {' '.join(tgt_aligned)}")
    print(f"    Legend: | = match, * = substitute, D = delete, I = insert")

Summary

This lab covered the foundational text-processing concepts from Chapter 2 of Speech and Language Processing:

Part

Topic

Key Takeaway

1

Words, Types, Instances

Vocabulary grows sub-linearly (Heaps’ Law); word frequencies follow a power law (Zipf’s Law)

2

Regular Expressions

Regex is essential for pattern extraction, normalization, and tokenization in NLP

3

Byte-Pair Encoding

BPE builds a subword vocabulary by iteratively merging frequent character pairs – handles unseen words gracefully

4

Minimum Edit Distance

Dynamic programming gives the optimal alignment between strings; powers spelling correction and WER evaluation

Next: Lab 02 will build on these foundations with N-gram Language Models (SLP Chapter 3).