N-Gram

Posted by Dustin Boston.


The n-gram data structure is a simple container. It is a probabilistic model used in Natural Language Processing (NLP) to predict sequences of elements such as words or characters. It represents a sequence of \(n\) items from a given dataset and is often applied to tasks like language modeling, auto-completion, and text prediction.

The n-gram works by breaking down text data into chunks of \(n\) contiguous elements (e.g., bigram for \({n=2}\), trigram for \({n=3}\)) and counting their occurrences to compute probabilities or frequencies for specific sequences.

Source Code Listing

code.ts

/* eslint-disable max-depth */

/*
DESC: YAKE!
CITE: Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C.,
      & Jatowt, A. (2020). YAKE! Keyword extraction from single documents
      using multiple local features. _Information Sciences_, 509, 257-289.
NOTE: Instead of "term" and "frequency" we use "word" and "count". This
      wording makes NLP easier to understand (at the expense of precision).
*/

import {isStopword} from "../_words/stopwords.ts";
import {
  mean,
  median,
  splitChunkIntoTokens,
  splitSentenceIntoChunks,
  splitTextIntoSentences,
  stdev,
  tag,
  TagTypes,
} from "./util.ts";

// Hierarchical result of pre-processing: a text is a list of sentences,
// each sentence a list of chunks, and each chunk a list of tokens.
export type AnnotatedText = {sentences: AnnotatedSentence[]};
export type AnnotatedSentence = {chunks: AnnotatedChunk[]};
export type AnnotatedChunk = {tokens: AnnotatedToken[]};

// One token plus the metadata computed during pre-processing (Stage 1).
export type AnnotatedToken = {
  isStopword: boolean;
  lowercase: string;
  tag: TagTypes;
};

// Terms keyed by their lowercase form.
export type Words = Record<string, Term>;
// Symmetric co-occurrence counts: cooccur[a][b] is how often `a` and `b`
// appear within the window of each other.
// NOTE(review): "Coocurrances" is a misspelling of "Cooccurrences", but the
// type is exported, so renaming it would break callers.
export type Coocurrances = Record<string, Record<string, number>>;

/**
 * Statistics gathered for one unique term (a lowercased token). The raw
 * counters are filled in by `computeTermStatistics`; the feature scores
 * are filled in later by `computeFeatures`.
 */
export class Term {
  termType: TagTypes; // Tag
  lowercase: string;
  isStopword: boolean;
  offsetsSentences: number[] = []; // Position in Sentence

  // Features
  // INFO: Explanation in the `computeFeatures` function.
  termFrequency = 0;
  normalizedTermFrequency = 0;
  acronymTermFrequency = 0; // Tf_a
  uppercaseTermFrequency = 0; // TFCase

  termCase = 0;
  termPosition = 0;
  termRelatedness = 0; // TRel
  termSentences = 0; // TSent

  /** Seeds a Term from the first annotated occurrence of the token. */
  constructor(source: AnnotatedToken) {
    this.isStopword = source.isStopword;
    this.termType = source.tag;
    this.lowercase = source.lowercase;
  }
}

/**
 * Stage 1: Pre-processing
 * @see Campos et al., 2020
 * @param text The text to pre-process. It can contain multiple sentences.
 * @returns An array of sentences, each containing an array of chunks. Finally
 * each chunk includes an array of tokens.
 */
export function preprocessText(text: string): AnnotatedSentence[] {
  const annotatedSentences: AnnotatedSentence[] = [];

  const sentences = splitTextIntoSentences(text);
  for (const sentence of sentences) {
    const annotatedSentence: AnnotatedSentence = {chunks: []};
    annotatedSentences.push(annotatedSentence);

    let tokenCountInSentence = 0;

    const chunks = splitSentenceIntoChunks(sentence);
    for (const chunk of chunks) {
      const annotatedChunk: AnnotatedChunk = {tokens: []};
      annotatedSentence.chunks.push(annotatedChunk);

      const tokens = splitChunkIntoTokens(chunk);
      for (const token of tokens) {
        const isFirstTokenInSentence = tokenCountInSentence === 0;
        annotatedChunk.tokens.push({
          isStopword: isStopword(token),
          lowercase: token.toLowerCase(),
          tag: tag(token, isFirstTokenInSentence),
        });

        tokenCountInSentence++;
      }
    }
  }

  return annotatedSentences;
}

/**
 * Stage 2: Feature Extraction, Score Calculation
 * @see Campos et al., 2020
 * @param annotatedText The pre-processed text.
 */
export function computeTermStatistics(
  annotatedText: AnnotatedText,
  windowSize: number,
): {terms: Words; cooccur: Coocurrances} {
  const terms: Words = {};
  const cooccur: Coocurrances = {};
  let sentenceIndex = 0;
  for (const sentence of annotatedText.sentences) {
    for (const chunk of sentence.chunks) {
      for (const [index, token] of chunk.tokens.entries()) {
        const termId = token.lowercase;

        terms[termId] ||= new Term(token);
        terms[termId].termFrequency++;
        terms[termId].offsetsSentences.push(sentenceIndex);

        if (token.tag === TagTypes.Acronym) {
          terms[termId].acronymTermFrequency++;
        }

        if (token.tag === TagTypes.Uppercase) {
          terms[termId].uppercaseTermFrequency++;
        }

        // This implementation differs from the original YAKE implementation.
        // Rather than create an asymmetrical cooccurance matrix we make it
        // symmetrical but it might be redundant since the top and bottom half
        // of a cooccurance matrix are usually the same.
        //
        // The YAKE Implementation:
        //
        // ```
        // if (index - distance >= 0) {
        //   const leftTermId = chunk.tokens[index - distance].lowercase;
        //   cooccur[leftTermId] ||= {};
        //   cooccur[leftTermId][termId] ||= 0;
        //   cooccur[leftTermId][termId]++;
        // }
        //
        // if (index + distance < chunk.tokens.length) {
        //   const rightTermId = chunk.tokens[index + distance].lowercase;
        //   cooccur[termId] ||= {};
        //   cooccur[termId][rightTermId] ||= 0;
        //   cooccur[termId][rightTermId]++;
        // }
        // ```

        for (let distance = 1; distance <= windowSize; distance++) {
          const leftIndex = index - distance;
          if (leftIndex < 0) break;

          const leftToken = chunk.tokens[leftIndex];
          const leftTermId = leftToken.lowercase;

          // Make sure both sides exist
          cooccur[leftTermId] ||= {};
          cooccur[termId] ||= {};

          // Increment leftTermId.termId
          cooccur[leftTermId][termId] ||= 0;
          cooccur[leftTermId][termId]++;

          // Also increment termId.leftTermId (to keep it symmetric)
          cooccur[termId][leftTermId] ||= 0;
          cooccur[termId][leftTermId]++;
        }
      }
    }

    sentenceIndex++;
  }

  return {terms, cooccur};
}

// Features are the most important part of an NLP program. A feature is anything
// important that you want to count. For example, how many words start with Q?
// This program looks at the words around a capitalized word.
//
// First, we count how often each word appears. Next, we count up the important
// words. Words like "the" and "to" aren't important. Finally, we count the
// average number of times all words appear.
//
// FIX: `termSentences` (TSent in Campos et al., 2020) is the fraction of
// *distinct* sentences a term appears in. `offsetsSentences` holds one entry
// per occurrence, so duplicates must be removed before dividing; previously a
// term occurring twice in one sentence was counted twice.
//
// @param terms The terms to collect information about.
// @param cooccur Symmetric co-occurrence counts between terms.
// @param numberOfSentences Total number of sentences in the document.
// @returns The same Term objects with their feature fields filled in.
function computeFeatures(
  terms: Term[],
  cooccur: Coocurrances,
  numberOfSentences: number,
): Term[] {
  const termFrequencies: number[] = []; // All TFs
  const validTermFrequencies: number[] = []; // Valid (non-stopword) TFs

  for (const term of terms) {
    termFrequencies.push(term.termFrequency);
    if (!term.isStopword) {
      validTermFrequencies.push(term.termFrequency);
    }
  }

  // Normalization constants: mean + one standard deviation of the valid
  // term frequencies, and the single highest frequency overall.
  const averageTermFrequency = mean(validTermFrequencies);
  const standardDeviation = stdev(validTermFrequencies);
  const normalizedTermFrequency = averageTermFrequency + standardDeviation;
  const maximumTermFrequency = Math.max(...termFrequencies);

  for (const term of terms) {
    // TCase: acronym/uppercase usage, dampened by overall frequency.
    term.termCase =
      Math.max(term.acronymTermFrequency, term.uppercaseTermFrequency) /
      (1 + Math.log(term.termFrequency));
    // TPos: median sentence offset; later sentences give larger values.
    term.termPosition = Math.log(3 + median(term.offsetsSentences));
    // TFNorm: frequency relative to mean + stdev of valid terms.
    term.normalizedTermFrequency = term.termFrequency / normalizedTermFrequency;

    // Left dispersion: distinct left neighbors over total left co-occurrences.
    let uniqueLeftNeighbors = 0;
    let sumOfUniqueLeftNeighbors = 0;
    for (const leftTerm of Object.keys(cooccur)) {
      const uniqueNeighborCount = cooccur[leftTerm][term.lowercase] || 0;
      sumOfUniqueLeftNeighbors += uniqueNeighborCount;
      if (uniqueNeighborCount > 0) {
        uniqueLeftNeighbors++;
      }
    }

    const leftDispersion =
      sumOfUniqueLeftNeighbors > 0
        ? uniqueLeftNeighbors / sumOfUniqueLeftNeighbors
        : 0;

    // Right dispersion: distinct right neighbors over total right co-occurrences.
    let uniqueRightNeighbors = 0;
    let sumOfUniqueRightNeighbors = 0;
    if (cooccur[term.lowercase]) {
      for (const rightTerm of Object.keys(cooccur[term.lowercase])) {
        const uniqueNeighborCount = cooccur[term.lowercase][rightTerm] || 0;
        sumOfUniqueRightNeighbors += uniqueNeighborCount;
        if (uniqueNeighborCount > 0) {
          uniqueRightNeighbors++;
        }
      }
    }

    const rightDispersion =
      sumOfUniqueRightNeighbors > 0
        ? uniqueRightNeighbors / sumOfUniqueRightNeighbors
        : 0;

    // TRel: terms that co-occur with many different words (relative to how
    // often they appear) are more likely to be generic.
    term.termRelatedness =
      1 +
      (leftDispersion + rightDispersion) *
        (term.termFrequency / maximumTermFrequency);

    // TSent: fraction of distinct sentences containing the term.
    term.termSentences =
      new Set(term.offsetsSentences).size / numberOfSentences;
  }

  return terms;
}

// Extract n-grams from a set of sentences.
// (Sentences are arrays of terms)
// export function getNgrams(
//   sentences: Term[][],
//   minTerms = 2,
//   maxTerms = 5,
// ): Ngram[] {
//   const ngrams: Ngram[] = [];

//   for (const sentence of sentences) {
//     for (let start = 0; start < sentence.length; start++) {
//       for (let length = minTerms; length <= maxTerms; length++) {
//         const slice = sentence.slice(start, start + length);
//         if (slice.length < minTerms || slice.length > maxTerms) {
//           continue;
//         }

//         // Create an Ngram only if it passes all checks
//         const ngram = new Ngram(slice);
//         if (ngram.valueOf().length > 0) {
//           ngrams.push(ngram);
//         }
//       }
//     }
//   }

//   return ngrams;
// }

test.ts

import {test, expect, describe} from "bun:test";
import {
  computeTermStatistics,
  preprocessText,
  Term,
  type AnnotatedText,
  type AnnotatedToken,
} from "./code.ts";
import {
  splitAndTrim,
  splitSentenceIntoChunks,
  splitTextIntoSentences,
  tag,
  TagTypes,
} from "./util.ts";

// Fluent builder subclass: exposes setters for the counters so test
// fixtures can be assembled with chained calls.
class TestTerm extends Term {
  setTermFrequency(frequency: number) {
    this.termFrequency = frequency;
    return this;
  }

  setAcronymTermFrequency(frequency: number) {
    this.acronymTermFrequency = frequency;
    return this;
  }

  setUppercaseTermFrequency(frequency: number) {
    this.uppercaseTermFrequency = frequency;
    return this;
  }
}

describe("Splitting", () => {
  test("Split", () => {
    // Inner whitespace is preserved; only the outer edges are trimmed.
    const parts = splitAndTrim("  foo  bar , baz  quux ", /,/);
    expect(parts.length).toBe(2);
    expect(parts[0]).toBe("foo  bar");
    expect(parts[1]).toBe("baz  quux");
  });

  test("Sentence", () => {
    // Terminal punctuation is consumed by the split.
    const sentences = splitTextIntoSentences("Lorem ipsum. Foo bar?");
    expect(sentences.length).toBe(2);
    expect(sentences[0]).toBe("Lorem ipsum");
    expect(sentences[1]).toBe("Foo bar");
  });

  test("Chunk", () => {
    // Semicolons and parentheses both delimit chunks.
    const chunks = splitSentenceIntoChunks("Lorem McRib; Dolor sit (amet)");
    expect(chunks.length).toBe(3);
    expect(chunks[0]).toBe("Lorem McRib");
    expect(chunks[1]).toBe("Dolor sit");
    expect(chunks[2]).toBe("amet");
  });
});

describe("Pre-process", () => {
  test("Parse Sentences", () => {
    const sentences = preprocessText("Foo Bar. Foo Faz.");

    expect(sentences.length).toBe(2); // Two sentences
    expect(sentences[0].chunks.length).toBe(1); // One chunk in the first
    expect(sentences[0].chunks[0].tokens.length).toBe(2); // Two tokens in it
  });

  test("Yake example", () => {
    // Example sentence used by the YAKE authors.
    const text =
      "One of the most iconic 3 software’s on this matter is the Practical Automatic Keyphrase Extraction (KEA), which was ≥ first known on 1999–06–07";
    const sentences = preprocessText(text);

    expect(sentences.length).toBe(1);
    expect(sentences[0].chunks.length).toBe(3);
    expect(sentences[0].chunks[0].tokens.length).toBe(17);
    expect(sentences[0].chunks[1].tokens.length).toBe(1);
    expect(sentences[0].chunks[2].tokens.length).toBe(7);
  });
});

// Tags assigned to numeric-looking and formatted tokens.
describe("Tag Terms", () => {
  test("Tag Numbers", () => {
    const result = tag("123");
    expect(result).toBe(TagTypes.Number);
  });

  test("Tag Spelled Number", () => {
    const result = tag("one", false);
    expect(result).toBe(TagTypes.Number);
  });

  test("Tag Money", () => {
    const result = tag("$100");
    expect(result).toBe(TagTypes.Money);
  });

  test("Remove Fraction", () => {
    // FIX: removed a leftover `console.log(result)` debug statement.
    const result = tag("1/2");
    expect(result).toBe(TagTypes.Fraction);
  });

  test("Remove Percent", () => {
    const result = tag("50%");
    expect(result).toBe(TagTypes.Percent);
  });

  test("Remove PhoneNumber", () => {
    const result = tag("123-456-7890");
    expect(result).toBe(TagTypes.Phone);
  });

  test("Remove Date", () => {
    const result = tag("2021-10-10");
    expect(result).toBe(TagTypes.Date);
  });
});

// Tag parts of speech
// -----------------------------------------------------------------------------

// Placeholders for part-of-speech handling of stopword categories; the
// `tag` helper does not recognize these yet.
describe("N-gram Stopwords", () => {
  test.todo("Tag Determiner", () => {
    // e.g. tag("the cat")
  });

  test.todo("Tag Preposition", () => {
    // e.g. tag("bad at golf")
  });

  test.todo("Tag Conjunction", () => {
    // e.g. tag("apple and orange")
  });

  test.todo("Tag Pronoun", () => {
    // e.g. tag("he she")
  });

  test.todo("Tag Copula", () => {
    // e.g. tag("is are")
  });
});

// Remove web markup
// -----------------------------------------------------------------------------

describe("N-Gram Web Markup", () => {
  // Table-driven: each token should be tagged as web markup (or a
  // tag/handle) so it can be filtered out downstream.
  const cases: Array<{name: string; token: string; expected: TagTypes}> = [
    {name: "Remove Url", token: "https://example.com", expected: TagTypes.Web},
    {
      name: "Remove unsecured Url",
      token: "http://example.com",
      expected: TagTypes.Web,
    },
    {
      name: "Remove Url without protocol",
      token: "example.com",
      expected: TagTypes.Web,
    },
    {
      name: "Remove Url with www",
      token: "www.example.com",
      expected: TagTypes.Web,
    },
    {
      name: "Remove Url with weird tld",
      token: "http://my.blog",
      expected: TagTypes.Web,
    },
    {name: "Remove Emoticon", token: ":-)", expected: TagTypes.Web},
    {
      name: "Remove Email",
      token: "email@example.com",
      expected: TagTypes.Web,
    },
    {name: "Remove HashTag", token: "#hashtag", expected: TagTypes.TagHandle},
    {name: "Remove AtMention", token: "@user", expected: TagTypes.TagHandle},
  ];

  for (const {name, token, expected} of cases) {
    test(name, () => {
      expect(tag(token)).toBe(expected);
    });
  }
});

describe("Compute Term Statistics", () => {
  test("Single token, chunk, and sentence", () => {
    // Minimal document: one sentence -> one chunk -> one token.
    const annotatedText: AnnotatedText = {
      sentences: [
        {
          chunks: [
            {
              tokens: [
                {
                  lowercase: "hello",
                  tag: TagTypes.Uppercase,
                  isStopword: false,
                },
              ],
            },
          ],
        },
      ],
    };

    const {terms, cooccur} = computeTermStatistics(annotatedText, 2);

    // One unique term and nothing for it to co-occur with.
    expect(Object.keys(terms)).toHaveLength(1);
    expect(Object.keys(cooccur)).toHaveLength(0);
  });

  test("Two tokens, single chunk, single sentence", () => {
    const annotatedText: AnnotatedText = {
      sentences: [
        {
          chunks: [
            {
              tokens: [
                {
                  lowercase: "hello",
                  tag: TagTypes.Uppercase,
                  isStopword: false,
                },
                {
                  lowercase: "world",
                  tag: TagTypes.Uppercase,
                  isStopword: false,
                },
              ],
            },
          ],
        },
      ],
    };

    const {terms, cooccur} = computeTermStatistics(annotatedText, 1);

    // Check the terms
    expect(Object.keys(terms).sort()).toEqual(["hello", "world"].sort());
    expect(terms.hello.termFrequency).toBe(1);
    expect(terms.hello.uppercaseTermFrequency).toBe(1);

    expect(terms.world.termFrequency).toBe(1);
    expect(terms.world.uppercaseTermFrequency).toBe(1);

    // Check cooccurrences
    expect(cooccur.hello).toBeDefined();
    expect(cooccur.hello.world).toBe(1);
  });

  test("Acronym and uppercase tokens, single chunk, window=2", () => {
    // "NLP is cool": an acronym, a capitalized stopword, a capitalized word.
    const tokens: AnnotatedToken[] = [
      {lowercase: "nlp", tag: TagTypes.Acronym, isStopword: false},
      {lowercase: "is", tag: TagTypes.Uppercase, isStopword: true},
      {lowercase: "cool", tag: TagTypes.Uppercase, isStopword: false},
    ];
    const annotatedText: AnnotatedText = {
      sentences: [{chunks: [{tokens}]}],
    };

    const {terms, cooccur} = computeTermStatistics(annotatedText, 2);

    // Frequencies and per-kind counters.
    expect(terms.nlp.termFrequency).toBe(1);
    expect(terms.nlp.acronymTermFrequency).toBe(1);

    expect(terms.is.termFrequency).toBe(1);
    expect(terms.is.uppercaseTermFrequency).toBe(1);

    expect(terms.cool.termFrequency).toBe(1);
    expect(terms.cool.uppercaseTermFrequency).toBe(1);

    // With a window of 2 every pair co-occurs, and the matrix is symmetric.
    expect(cooccur.nlp.is).toBe(1);
    expect(cooccur.nlp.cool).toBe(1);
    expect(cooccur.is.nlp).toBe(1);
    expect(cooccur.is.cool).toBe(1);
    expect(cooccur.cool.is).toBe(1);
    expect(cooccur.cool.nlp).toBe(1);
  });

  test("Multiple sentences, checking offsetsSentences", () => {
    // One token per sentence; the terms never share a chunk.
    const annotatedText: AnnotatedText = {
      sentences: [
        {
          chunks: [
            {
              tokens: [
                {
                  lowercase: "hello",
                  tag: TagTypes.Uppercase,
                  isStopword: false,
                },
              ],
            },
          ],
        },
        {
          chunks: [
            {
              tokens: [
                {
                  lowercase: "world",
                  tag: TagTypes.Uppercase,
                  isStopword: false,
                },
              ],
            },
          ],
        },
      ],
    };

    const {terms, cooccur} = computeTermStatistics(annotatedText, 1);

    // Each term is seen exactly once, in its own sentence.
    expect(terms.hello).toBeDefined();
    expect(terms.hello.termFrequency).toBe(1);
    expect(terms.hello.uppercaseTermFrequency).toBe(1);

    expect(terms.world).toBeDefined();
    expect(terms.world.termFrequency).toBe(1);
    expect(terms.world.uppercaseTermFrequency).toBe(1);

    // Tokens in different sentences never co-occur.
    expect(Object.keys(cooccur)).toHaveLength(0);
  });

  // ADD THESE TESTS

//   describe("computeFeatures tests", () => {
//     test("Single term, single sentence", () => {
//       // 1) Setup
//       const terms: Term[] = [
//         {
//           lowercase: "hello",
//           numberOfWords: 1, // TF=1
//           numberOfAcronyms: 0,
//           numberOfCapitalWords: 1,
//           isStopword: false,
//           offsetsSentences: [0],
//         },
//       ];
//       const cooccur: Coocurrances = {
//         // For one term, there's no real co-occurrence
//         hello: {},
//       };
//       const numberOfSentences = 1;

//       // 2) Execute
//       const result = computeFeatures(terms, cooccur, numberOfSentences);

//       // 3) Assert
//       expect(result).toHaveLength(1);
//       const [term] = result;
//       // Check some basics
//       expect(term.termCase).toBeDefined();
//       expect(term.termPosition).toBeDefined();
//       expect(term.normalizedTermFrequency).toBeDefined();
//       expect(term.wordRelatedness).toBeDefined();
//       expect(term.wordSentences).toBeDefined();

//       // Because numberOfWords=1
//       // termCase = max(0,1)/(1 + ln(1)) => 1/1 => 1
//       expect(term.termCase).toBeCloseTo(1, 5);

//       // TermPosition = ln(3 + median([0])) => ln(3 + 0) => ln(3) ~ 1.0986
//       expect(term.termPosition).toBeCloseTo(Math.log(3), 5);

//       // NormalizedTermFrequency => 1/(average(1)+std(1)) => 1/(1+0) => 1
//       expect(term.normalizedTermFrequency).toBe(1);

//       // WordRelatedness => 1 + (0+0)*(1/1) => 1
//       // because no left or right neighbors
//       expect(term.wordRelatedness).toBe(1);

//       // WordSentences => 1/1 => 1
//       expect(term.wordSentences).toBe(1);
//     });

//     test("Two terms, single sentence, simple cooccurrence", () => {
//       // 1) Setup
//       const terms: Term[] = [
//         {
//           lowercase: "hello",
//           numberOfWords: 2,
//           numberOfAcronyms: 0,
//           numberOfCapitalWords: 2,
//           isStopword: false,
//           offsetsSentences: [0],
//         },
//         {
//           lowercase: "world",
//           numberOfWords: 1,
//           numberOfAcronyms: 0,
//           numberOfCapitalWords: 1,
//           isStopword: false,
//           offsetsSentences: [0],
//         },
//       ];
//       const cooccur: Coocurrances = {
//         // 'hello' co-occurs once with 'world'
//         hello: {world: 1},
//         // 'world' co-occurs once with 'hello'
//         world: {hello: 1},
//       };
//       const numberOfSentences = 1;

//       // 2) Execute
//       const result = computeFeatures(terms, cooccur, numberOfSentences);

//       // 3) Assert
//       // Because we have 2 terms: [2, 1] => average=1.5, stdev ~ 0.707..., sum=2.207...
//       // highestWordCount=2
//       // Check 'hello' stats (TF=2)
//       const hello = result.find((t) => t.lowercase === "hello");
//       expect(hello).toBeDefined();
//       if (hello) {
//         // TermCase => max(0,2)/(1+ln(2)) => 2/(1+0.6931) ~ 2/1.6931 => ~1.18
//         expect(hello.termCase).toBeGreaterThan(1);

//         // TermPosition => ln(3 + median([0])) => ln(3+0) => ln(3) ~1.0986
//         expect(hello.termPosition).toBeCloseTo(Math.log(3), 5);

//         // NormalizedTermFrequency => 2 / (1.5 + 0.707...) => 2/2.207... ~ 0.905
//         expect(hello.normalizedTermFrequency).toBeCloseTo(0.905, 2);

//         // For cooccurrence:
//         //   leftDispersion => from cooccur[*][hello] => 'world' -> cooccur['world']['hello']=1
//         //                    sumOfUniqueLeftNeighbors=1, uniqueLeftNeighbors=1 => 1/1=1
//         //   rightDispersion => from cooccur[hello][*] => 'world' -> 1
//         //                      sumOfUniqueRightNeighbors=1, uniqueRightNeighbors=1 =>1
//         // => wordRelatedness => 1 + (1+1)*(2/2) => 1 + (2*1) => 3
//         expect(hello.wordRelatedness).toBeCloseTo(3, 5);

//         // WordSentences => 1 / 1 => 1
//         expect(hello.wordSentences).toBeCloseTo(1, 5);
//       }

//       // Check 'world' stats (TF=1)
//       const world = result.find((t) => t.lowercase === "world");
//       expect(world).toBeDefined();
//       if (world) {
//         // TermCase => max(0,1)/(1+ln(1)) => 1/1 => 1
//         expect(world.termCase).toBe(1);

//         // TermPosition => same ln(3+0) => ln(3)
//         expect(world.termPosition).toBeCloseTo(Math.log(3), 5);

//         // NormalizedTermFrequency => 1 / 2.207 => ~0.452
//         expect(world.normalizedTermFrequency).toBeCloseTo(0.45, 2);

//         // Cooccur => leftDispersion => cooccur[*][world], i.e. 'hello' -> 1 => sum=1, distinct=1 =>1
//         //           rightDispersion => cooccur[world][*], i.e. 'hello' ->1 => sum=1, distinct=1 =>1
//         // => wordRelatedness => 1 + (1+1)*(1/2) => 1 +2*(0.5) => 2
//         expect(world.wordRelatedness).toBeCloseTo(2, 5);

//         // WordSentences => 1/1 =>1
//         expect(world.wordSentences).toBe(1);
//       }
//     });

//     test("Handle empty validTermFrequencies gracefully", () => {
//       // Suppose all terms are stopwords => validTermFrequencies = []
//       const terms: Term[] = [
//         {
//           lowercase: "the",
//           numberOfWords: 5,
//           numberOfAcronyms: 0,
//           numberOfCapitalWords: 0,
//           isStopword: true, // All are stopwords
//           offsetsSentences: [0],
//         },
//         {
//           lowercase: "and",
//           numberOfWords: 3,
//           numberOfAcronyms: 0,
//           numberOfCapitalWords: 0,
//           isStopword: true,
//           offsetsSentences: [0],
//         },
//       ];
//       const cooccur: Coocurrances = {the: {}, and: {}};

//       // This will lead to average=0, stdev=0 => normalizedWordCount=0 => possible division by zero
//       // Let's see how the function handles it
//       const result = computeFeatures(terms, cooccur, 1);

//       // We expect no crashes
//       expect(result).toHaveLength(2);

//       // Because averageTermFrequency=0, standardDeviation=0 => normalizedWordCount=0
//       // => term.normalizedTermFrequency = numberOfWords / 0 => Infinity or some fallback
//       // If your code doesn't guard for that, you might see Infinity or NaN
//       // Check if we do see Infinity
//       expect(result[0].normalizedTermFrequency).toBe(Infinity);
//       expect(result[1].normalizedTermFrequency).toBe(Infinity);
//     });
//   });
});

// Describe("N-Gram Basics", () => {
//   test("Gets Expected", () => {
//     // Original sentence: Education never ends Watson.
//     const expected = [
//       "education",
//       "education does",
//       "education does not",
//       "does",
//       "does not",
//       "does not never",
//       "not",
//       "not never",
//       "not never end",
//       "never",
//       "never end",
//       "never end watson",
//       "end",
//       "end watson",
//       "watson",
//     ];

//     // Const sentences = [["education", "does", "not", "never", "end", "watson"]];
//     const terms = preprocessText("Education never ends Watson.");
//     const ngrams = getNgrams(terms, 1, 3);

//     expect(ngrams.length).toEqual(expected.length);
//   });

//   test("Sentence Boundaries", () => {
//     // Original sentence: Education never ends, Watson.
//     const expected = [
//       "education",
//       "education does",
//       "education does not",
//       "does",
//       "does not",
//       "does not never",
//       "not",
//       "not never",
//       "not never end",
//       "never",
//       "never end",
//       "end",
//       "watson",
//     ];

//     // Const sentences = [
//     //   ["education", "does", "not", "never", "end"],
//     //   ["watson"],
//     // ];

//     const terms = preprocessText("Education never ends, Watson.");
//     const ngrams = getNgrams(terms, 1, 3);

//     // Const terms = explodeTerms(sentences);
//     // const result = getNgrams(terms, 1, 3);

//     expect(ngrams.length).toEqual(expected.length);
//   });

//   test("Extraneous POS", () => {
//     // Original sentence: Henry is bad at golf #blabla.
//     const expected = [
//       "henry",
//       "henry bad",
//       "henry bad golf",
//       "bad",
//       "bad golf",
//       "golf",
//     ];

//     // Const sentences = [["henry", "bad", "golf"]];
//     // const terms = explodeTerms(sentences);
//     // const result = getNgrams(terms, 1, 3);

//     const terms = preprocessText("Henry is bad at golf #blabla.");
//     const ngrams = getNgrams(terms, 1, 3);

//     expect(ngrams.length).toEqual(expected.length);
//   });
// });

util.ts

import {isStopword} from "../_words/stopwords.ts";

/**
 * Splits `text` on `pattern`, trims each piece, and drops empty pieces.
 */
export function splitAndTrim(text: string, pattern: RegExp) {
  return text
    .split(pattern)
    .map((part) => part.trim())
    .filter((part) => part !== "");
}

/** Splits text into sentences on ., ?, or ! plus any trailing whitespace. */
export function splitTextIntoSentences(text: string): string[] {
  const sentences: string[] = [];
  for (const piece of text.split(/[.?!]\s*/)) {
    const trimmed = piece.trim();
    if (trimmed !== "") sentences.push(trimmed);
  }

  return sentences;
}

/** Splits a sentence into chunks on punctuation such as ; , ( ) [ ] < >. */
export function splitSentenceIntoChunks(text: string): string[] {
  const chunks: string[] = [];
  for (const piece of text.split(/[:;,$[\]{}()<>]\s*/)) {
    const trimmed = piece.trim();
    if (trimmed !== "") chunks.push(trimmed);
  }

  return chunks;
}

/** Splits a chunk into tokens on spaces and (straight or curly) apostrophes. */
export function splitChunkIntoTokens(text: string): string[] {
  const tokens: string[] = [];
  for (const piece of text.split(/['’ ]/)) {
    const trimmed = piece.trim();
    if (trimmed !== "") tokens.push(trimmed);
  }

  return tokens;
}

// US phone number with an optional "+1 " prefix.
// Accepts "123-456-7890" or "(123) 456-7890".
const usPhoneNumberPattern =
  /^(?:\+1\s)?(?:\d{3}-\d{3}-\d{4}|\(\d{3}\)\s\d{3}-\d{4})$/;
function isUsPhoneNumber(candidate: string): boolean {
  return usPhoneNumberPattern.test(candidate);
}

// A percentage: digits, an optional decimal part, and a trailing "%".
const percentPattern = /^\d+(\.\d+)?%$/;
function isPercent(candidate: string): boolean {
  return percentPattern.test(candidate);
}

// A dollar amount with comma-grouped digits and an optional two-digit
// cents part, e.g. "$1,234,567.89".
const moneyPattern = /^\$\d{1,3}(,\d{3})*(\.\d{2})?$/;
function isMoney(candidate: string): boolean {
  return moneyPattern.test(candidate);
}

// Matches YYYY-MM-DD where each separator may be an ASCII hyphen, an
// en dash (U+2013), or an em dash (U+2014). Month is limited to 01-12 and
// day to 01-31.
const isoDatePattern =
  /^(\d{4})[-\u2013\u2014](0[1-9]|1[0-2])[-\u2013\u2014](0[1-9]|[12]\d|3[01])$/;
export function isIsoDate(dateString: string): boolean {
  return isoDatePattern.test(dateString);
}

// Matches DD/MM/YYYY where the separator may be an ASCII slash or a
// fullwidth slash (U+FF0F). Day is limited to 01-31 and month to 01-12.
const ddMmYyyyPattern =
  /^(0[1-9]|[12]\d|3[01])[/\uFF0F](0[1-9]|1[0-2])[/\uFF0F](\d{4})$/;
function isEuDate(dateString: string): boolean {
  return ddMmYyyyPattern.test(dateString);
}

// Matches DD-MM-YYYY where the separator may be an ASCII hyphen, an
// en dash (U+2013), or an em dash (U+2014); e.g. "05-01-2025" or
// "05–01–2025".
const ddMmYyyyDashPattern =
  /^(0[1-9]|[12]\d|3[01])[-\u2013\u2014](0[1-9]|1[0-2])[-\u2013\u2014](\d{4})$/;

function isEuDateDash(dateString: string): boolean {
  return ddMmYyyyDashPattern.test(dateString);
}

// A simple fraction with an optional leading sign, e.g. "1/2" or "-3/4".
const fractionPattern = /^[+-]?\d+\/\d+$/;
function isFraction(candidate: string): boolean {
  return fractionPattern.test(candidate);
}

// A number: either comma-grouped ("1,234,567.89", optionally negative) or a
// plain digit run with an optional decimal part.
// FIX: both alternatives are wrapped in a single non-capturing group so that
// each is anchored at BOTH ^ and $. The previous /^a|b$/ form anchored only
// the first alternative at the start and the second at the end, so strings
// like "123abc" and "abc123" were wrongly accepted.
const numberPattern = /^(?:-?\d{1,3}(,\d{3})*(\.\d+)?|\d+(\.\d+)?)$/;
const spelledNumberPattern =
  /^(one|two|three|four|five|six|seven|eight|nine|ten)$/i;
export function isNumber(token: string): boolean {
  return numberPattern.test(token) || spelledNumberPattern.test(token);
}

// Heuristic for web-like tokens (www.google.com, email@account.com, <head>):
// either the token contains two or more punctuation/symbol characters, or it
// contains a known top-level domain such as ".com".
const webAddressPattern = /^(?:.*[^\w\s]){2,}.*$/;
const tldPattern =
  /\.(com|org|net|edu|gov|mil|int|info|biz|name|us|uk|ca|de|au|jp|cn|in|fr|it|xyz|online|site|tech|store|blog|app|dev|io|co|tv|me|ai|pro|health)\b/;
export function isWebAddress(token: string): boolean {
  if (webAddressPattern.test(token)) return true;
  return tldPattern.test(token);
}

// True when the token contains at least one character that is neither a
// Unicode letter nor a digit — punctuation and symbols like ‰ or ¥ qualify.
const nonAlphaNumberPattern = /[^\p{L}\p{N}]/u;
export function isNotAlphaNumeric(token: string): boolean {
  return nonAlphaNumberPattern.test(token);
}

// True only when the token mixes letters and digits with nothing else:
// at least one Unicode letter, at least one digit, and no punctuation or
// symbols anywhere.
const onlyAlphaNumericPattern = /^(?=.*\p{L})(?=.*\p{N})[\p{L}\p{N}]+$/u;
export function isAlphaNumeric(token: string): boolean {
  return onlyAlphaNumericPattern.test(token);
}

// Two or more uppercase letters and/or periods, e.g. "MIT" or "U.S.A.".
const twoPlusUppercasePattern = /^[A-Z.]{2,}$/;
export function isAcronym(token: string): boolean {
  return twoPlusUppercasePattern.test(token);
}

// Matches an initial uppercase character.
const firstIsUppercasePattern = /^[A-Z]/;
// Returns true only when the token both starts with an uppercase ASCII
// letter AND is the first token of its sentence.
// NOTE(review): YAKE's TFCase feature counts capitalized occurrences that
// are NOT at the beginning of a sentence (Campos et al., 2020). This
// condition keeps only sentence-initial capitals, which looks inverted —
// confirm whether `isFirstTokenInSentence` should be negated.
export function isUppercase(
  token: string,
  isFirstTokenInSentence: boolean,
): boolean {
  return isFirstTokenInSentence && firstIsUppercasePattern.test(token);
}

// A hashtag or @-mention: a leading "#" or "@", one word character, then at
// least two lowercase letters or digits. Deliberately unanchored at the end.
const tagOrHandlePattern = /^[@#]\w[a-z\d]{2,}/;
export function isTagOrHandle(token: string): boolean {
  return tagOrHandlePattern.test(token);
}

// Token categories assigned by `tag`. Everything except Parsable gets
// ignored downstream at the moment.
export enum TagTypes {
  Acronym = "Acronym",
  AlphaNumeric = "AlphaNumeric",
  Date = "Date",
  Fraction = "Fraction",
  Money = "Money",
  NotAlphaNumeric = "NotAlphaNumeric",
  Number = "Number",
  Parsable = "Parsable",
  Percent = "Percent",
  Phone = "Phone",
  TagHandle = "TagHandle",
  Stopword = "Stopword",
  Unparsable = "Unparsable",
  Uppercase = "Uppercase",
  Web = "Web",
}

/**
 * Assigns a TagTypes category to a token. The checks run in priority
 * order and the first match wins. Everything except Parsable gets ignored
 * at the moment.
 */
export function tag(token: string, isFirstTokenInSentence = false): TagTypes {
  const checks: Array<[() => boolean, TagTypes]> = [
    [() => isUsPhoneNumber(token), TagTypes.Phone],
    [() => isPercent(token), TagTypes.Percent],
    [() => isFraction(token), TagTypes.Fraction],
    [() => isIsoDate(token), TagTypes.Date],
    [() => isEuDate(token), TagTypes.Date],
    [() => isEuDateDash(token), TagTypes.Date],
    [() => isMoney(token), TagTypes.Money],
    [() => isNumber(token), TagTypes.Number],
    [() => isWebAddress(token), TagTypes.Web],
    [() => isTagOrHandle(token), TagTypes.TagHandle],
    [() => isNotAlphaNumeric(token), TagTypes.NotAlphaNumeric],
    [() => isAlphaNumeric(token), TagTypes.AlphaNumeric],
    [() => isAcronym(token), TagTypes.Acronym],
    [() => isStopword(token), TagTypes.Stopword],
    [() => isUppercase(token, isFirstTokenInSentence), TagTypes.Uppercase],
  ];

  for (const [matches, tagType] of checks) {
    if (matches()) return tagType;
  }

  return TagTypes.Parsable;
}

/**
 * Median of a numeric array. An empty array yields 0 by policy.
 * The input is copied before sorting, so it is never mutated.
 */
export function median(values: number[]): number {
  const count = values.length;
  if (count === 0) return 0;
  const sorted = values.slice().sort((a, b) => a - b);
  const middle = Math.floor(count / 2);
  if (count % 2 !== 0) return sorted[middle];
  return (sorted[middle - 1] + sorted[middle]) / 2;
}

/**
 * Arithmetic mean of a numeric array. An empty array yields 0 by policy.
 */
export function mean(values: number[]): number {
  if (values.length === 0) return 0;
  let total = 0;
  for (const value of values) total += value;
  return total / values.length;
}

/**
 * Sample standard deviation (n - 1 denominator). Fewer than two values
 * yields 0 by policy.
 */
export function stdev(values: number[]): number {
  const count = values.length;
  if (count < 2) return 0;
  let total = 0;
  for (const value of values) total += value;
  const average = total / count;
  let sumOfSquares = 0;
  for (const value of values) sumOfSquares += (value - average) ** 2;
  return Math.sqrt(sumOfSquares / (count - 1));
}