N-Gram
Posted by Dustin Boston in Data Structures.
The n-gram data structure is a simple container: a sequence of \(n\) contiguous items drawn from a dataset. It is the building block of probabilistic models used in Natural Language Processing (NLP) to predict sequences of elements such as words or characters, and it appears in tasks like language modeling, auto-completion, and text prediction.
An n-gram model works by breaking text into runs of \(n\) contiguous elements (a bigram for \({n=2}\), a trigram for \({n=3}\)) and counting how often each run occurs; those counts yield frequencies or probabilities for specific sequences.
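Before diving into the listing, here is a minimal sketch of that counting step (a hypothetical helper, not part of the YAKE code below, assuming simple whitespace tokenization):

```ts
// A minimal n-gram counter: slide a window of size n over the tokens and
// tally how often each contiguous sequence appears.
function countNgrams(text: string, n: number): Map<string, number> {
  const tokens = text.toLowerCase().split(/\s+/).filter(Boolean);
  const counts = new Map<string, number>();
  for (let start = 0; start + n <= tokens.length; start++) {
    const gram = tokens.slice(start, start + n).join(" ");
    counts.set(gram, (counts.get(gram) ?? 0) + 1);
  }
  return counts;
}

// countNgrams("the cat sat on the mat", 2)
// => Map { "the cat" => 1, "cat sat" => 1, "sat on" => 1, "on the" => 1, "the mat" => 1 }
```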
Source Code Listing
code.ts
/* eslint-disable max-depth */
/*
DESC: YAKE!
CITE: Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C.,
& Jatowt, A. (2020). YAKE! Keyword extraction from single documents
using multiple local features. _Information Sciences_, 509, 257-289.
NOTE: Instead of "term" and "frequency" we use "word" and "count". This
wording makes NLP easier to understand (at the expense of precision).
*/
import {isStopword} from "../_words/stopwords.ts";
import {
mean,
median,
splitChunkIntoTokens,
splitSentenceIntoChunks,
splitTextIntoSentences,
stdev,
tag,
TagTypes,
} from "./util.ts";
export type AnnotatedText = {sentences: AnnotatedSentence[]};
export type AnnotatedSentence = {chunks: AnnotatedChunk[]};
export type AnnotatedChunk = {tokens: AnnotatedToken[]};
export type AnnotatedToken = {
isStopword: boolean;
lowercase: string;
tag: TagTypes;
};
export type Words = Record<string, Term>;
export type Coocurrances = Record<string, Record<string, number>>;
export class Term {
termType: TagTypes; // Tag
lowercase: string;
isStopword: boolean;
offsetsSentences: number[] = []; // Position in Sentence
// Features
// INFO: Explanation in the `computeFeatures` function.
termFrequency = 0;
normalizedTermFrequency = 0;
acronymTermFrequency = 0; // Tf_a
uppercaseTermFrequency = 0; // TFCase
termCase = 0;
termPosition = 0;
termRelatedness = 0; // TRel
termSentences = 0; // TSent
constructor(token: AnnotatedToken) {
this.lowercase = token.lowercase;
this.termType = token.tag;
this.isStopword = token.isStopword;
}
}
/**
* Stage 1: Pre-processing
* @see Campos et al., 2020
* @param text The text to pre-process. It can contain multiple sentences.
* @returns An array of sentences, each containing an array of chunks; each
* chunk in turn contains an array of tokens.
*/
export function preprocessText(text: string): AnnotatedSentence[] {
const annotatedSentences: AnnotatedSentence[] = [];
const sentences = splitTextIntoSentences(text);
for (const sentence of sentences) {
const annotatedSentence: AnnotatedSentence = {chunks: []};
annotatedSentences.push(annotatedSentence);
let tokenCountInSentence = 0;
const chunks = splitSentenceIntoChunks(sentence);
for (const chunk of chunks) {
const annotatedChunk: AnnotatedChunk = {tokens: []};
annotatedSentence.chunks.push(annotatedChunk);
const tokens = splitChunkIntoTokens(chunk);
for (const token of tokens) {
const isFirstTokenInSentence = tokenCountInSentence === 0;
annotatedChunk.tokens.push({
isStopword: isStopword(token),
lowercase: token.toLowerCase(),
tag: tag(token, isFirstTokenInSentence),
});
tokenCountInSentence++;
}
}
}
return annotatedSentences;
}
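// Illustrative output shape (exact tags depend on the `tag` helper):
//
//   preprocessText("Foo bar, baz.");
//   // => [{chunks: [
//   //        {tokens: [{lowercase: "foo", ...}, {lowercase: "bar", ...}]},
//   //        {tokens: [{lowercase: "baz", ...}]},
//   //      ]}]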
/**
* Stage 2: Feature Extraction, Score Calculation
* @see Campos et al., 2020
* @param annotatedText The pre-processed text.
* @param windowSize The maximum token distance within a chunk at which two
* terms are counted as co-occurring.
* @returns Per-term statistics and the (symmetric) co-occurrence counts.
*/
export function computeTermStatistics(
annotatedText: AnnotatedText,
windowSize: number,
): {terms: Words; cooccur: Coocurrances} {
const terms: Words = {};
const cooccur: Coocurrances = {};
let sentenceIndex = 0;
for (const sentence of annotatedText.sentences) {
for (const chunk of sentence.chunks) {
for (const [index, token] of chunk.tokens.entries()) {
const termId = token.lowercase;
terms[termId] ||= new Term(token);
terms[termId].termFrequency++;
terms[termId].offsetsSentences.push(sentenceIndex);
if (token.tag === TagTypes.Acronym) {
terms[termId].acronymTermFrequency++;
}
if (token.tag === TagTypes.Uppercase) {
terms[termId].uppercaseTermFrequency++;
}
// This implementation differs from the original YAKE implementation.
// Rather than build an asymmetrical co-occurrence matrix, we keep it
// symmetric. This may be redundant, since the upper and lower halves of a
// symmetric co-occurrence matrix carry the same information.
//
// The YAKE Implementation:
//
// ```
// if (index - distance >= 0) {
// const leftTermId = chunk.tokens[index - distance].lowercase;
// cooccur[leftTermId] ||= {};
// cooccur[leftTermId][termId] ||= 0;
// cooccur[leftTermId][termId]++;
// }
//
// if (index + distance < chunk.tokens.length) {
// const rightTermId = chunk.tokens[index + distance].lowercase;
// cooccur[termId] ||= {};
// cooccur[termId][rightTermId] ||= 0;
// cooccur[termId][rightTermId]++;
// }
// ```
for (let distance = 1; distance <= windowSize; distance++) {
const leftIndex = index - distance;
if (leftIndex < 0) break;
const leftToken = chunk.tokens[leftIndex];
const leftTermId = leftToken.lowercase;
// Make sure both sides exist
cooccur[leftTermId] ||= {};
cooccur[termId] ||= {};
// Increment leftTermId.termId
cooccur[leftTermId][termId] ||= 0;
cooccur[leftTermId][termId]++;
// Also increment termId.leftTermId (to keep it symmetric)
cooccur[termId][leftTermId] ||= 0;
cooccur[termId][leftTermId]++;
}
}
}
sentenceIndex++;
}
return {terms, cooccur};
}
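// Illustrative: for a single chunk ["natural", "language", "processing"] and
// windowSize = 1, the symmetric co-occurrence map looks roughly like:
//
//   {
//     natural: {language: 1},
//     language: {natural: 1, processing: 1},
//     processing: {language: 1},
//   }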
/**
* Features are the most important part of an NLP program. A feature is
* anything important that you want to count. For example, how many words
* start with Q? This implementation looks at things like a term's casing,
* where it appears in the text, and which terms appear around it.
*
* First, we count how often each word appears. Next, we count up the
* important words; words like "the" and "to" aren't important. Finally, we
* compute the average number of times all words appear.
*
* @param terms The terms to collect information about.
* @param cooccur The symmetric co-occurrence counts from `computeTermStatistics`.
* @param numberOfSentences The total number of sentences in the text.
* @returns The same terms, with their feature values filled in.
*/
function computeFeatures(
terms: Term[],
cooccur: Coocurrances,
numberOfSentences: number,
): Term[] {
const termFrequencies: number[] = []; // All TFs
const validTermFrequencies: number[] = []; // Valid TFs
for (const term of terms) {
termFrequencies.push(term.termFrequency);
if (!term.isStopword) {
validTermFrequencies.push(term.termFrequency);
}
}
const averageTermFrequency = mean(validTermFrequencies);
const standardDeviation = stdev(validTermFrequencies);
const normalizedTermFrequency = averageTermFrequency + standardDeviation;
const maximumTermFrequency = Math.max(...termFrequencies);
for (const term of terms) {
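// Term case: how often the term appears as an acronym or capitalized,
// damped by the log of its total frequency.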
term.termCase =
Math.max(term.acronymTermFrequency, term.uppercaseTermFrequency) /
(1 + Math.log(term.termFrequency));
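// Term position: the log of (3 + the median sentence offset); terms that
// tend to appear earlier in the text get smaller values.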
term.termPosition = Math.log(3 + median(term.offsetsSentences));
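// Normalized term frequency: the term's count divided by the mean plus one
// standard deviation of the non-stopword counts.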
term.normalizedTermFrequency = term.termFrequency / normalizedTermFrequency;
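// Left dispersion: distinct left co-occurring terms divided by the total
// number of left co-occurrences.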
let uniqueLeftNeighbors = 0;
let sumOfUniqueLeftNeighbors = 0;
for (const leftTerm of Object.keys(cooccur)) {
const uniqueNeighborCount = cooccur[leftTerm][term.lowercase] || 0;
sumOfUniqueLeftNeighbors += uniqueNeighborCount;
if (uniqueNeighborCount > 0) {
uniqueLeftNeighbors++;
}
}
const leftDispersion =
sumOfUniqueLeftNeighbors > 0
? uniqueLeftNeighbors / sumOfUniqueLeftNeighbors
: 0;
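// Right dispersion: distinct right co-occurring terms divided by the total
// number of right co-occurrences.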
let uniqueRightNeighbors = 0;
let sumOfUniqueRightNeighbors = 0;
if (cooccur[term.lowercase]) {
for (const rightTerm of Object.keys(cooccur[term.lowercase])) {
const uniqueNeighborCount = cooccur[term.lowercase][rightTerm] || 0;
sumOfUniqueRightNeighbors += uniqueNeighborCount;
if (uniqueNeighborCount > 0) {
uniqueRightNeighbors++;
}
}
}
const rightDispersion =
sumOfUniqueRightNeighbors > 0
? uniqueRightNeighbors / sumOfUniqueRightNeighbors
: 0;
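// Term relatedness: 1 plus the combined dispersion, scaled by the term's
// frequency relative to the most frequent term.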
term.termRelatedness =
1 +
(leftDispersion + rightDispersion) *
(term.termFrequency / maximumTermFrequency);
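// Term sentences: how many times the term occurs relative to the total
// number of sentences.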
term.termSentences = term.offsetsSentences.length / numberOfSentences;
}
return terms;
}
// Extract n-grams from a set of sentences.
// (Sentences are arrays of terms)
// export function getNgrams(
// sentences: Term[][],
// minTerms = 2,
// maxTerms = 5,
// ): Ngram[] {
// const ngrams: Ngram[] = [];
// for (const sentence of sentences) {
// for (let start = 0; start < sentence.length; start++) {
// for (let length = minTerms; length <= maxTerms; length++) {
// const slice = sentence.slice(start, start + length);
// if (slice.length < minTerms || slice.length > maxTerms) {
// continue;
// }
// // Create an Ngram only if it passes all checks
// const ngram = new Ngram(slice);
// if (ngram.valueOf().length > 0) {
// ngrams.push(ngram);
// }
// }
// }
// }
// return ngrams;
// }
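// NOTE: getNgrams above is commented out because it depends on an `Ngram`
// class that isn't part of this listing. As a rough, hypothetical sketch of
// the same sliding-window idea over plain string tokens (not the YAKE
// implementation, and unused elsewhere in this listing):
function getTokenNgrams(
  sentences: string[][],
  minTerms = 1,
  maxTerms = 3,
): string[] {
  const ngrams: string[] = [];
  for (const sentence of sentences) {
    for (let start = 0; start < sentence.length; start++) {
      for (let length = minTerms; length <= maxTerms; length++) {
        const slice = sentence.slice(start, start + length);
        // Stop once the window runs past the end of the sentence.
        if (slice.length < length) break;
        ngrams.push(slice.join(" "));
      }
    }
  }
  return ngrams;
}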
test.ts
import {test, expect, describe} from "bun:test";
import {
computeTermStatistics,
preprocessText,
Term,
type AnnotatedText,
type AnnotatedToken,
} from "./code.ts";
import {
splitAndTrim,
splitSentenceIntoChunks,
splitTextIntoSentences,
tag,
TagTypes,
} from "./util.ts";
class TestTerm extends Term {
setTermFrequency(value: number) {
this.termFrequency = value;
return this;
}
setAcronymTermFrequency(value: number) {
this.acronymTermFrequency = value;
return this;
}
setUppercaseTermFrequency(value: number) {
this.uppercaseTermFrequency = value;
return this;
}
}
describe("Splitting", () => {
test("Split", () => {
const result = splitAndTrim(" foo bar , baz quux ", /,/);
expect(result.length).toBe(2);
expect(result[0]).toBe("foo bar");
expect(result[1]).toBe("baz quux");
});
test("Sentence", () => {
const result = splitTextIntoSentences("Lorem ipsum. Foo bar?");
expect(result.length).toBe(2);
expect(result[0]).toBe("Lorem ipsum");
expect(result[1]).toBe("Foo bar");
});
test("Chunk", () => {
const result = splitSentenceIntoChunks("Lorem McRib; Dolor sit (amet)");
expect(result.length).toBe(3);
const [lorem, dolor, amet] = result;
expect(lorem).toBe("Lorem McRib");
expect(dolor).toBe("Dolor sit");
expect(amet).toBe("amet");
});
});
describe("Pre-process", () => {
test("Parse Sentences", () => {
const result = preprocessText("Foo Bar. Foo Faz.");
expect(result.length).toBe(2); // Container
expect(result[0].chunks.length).toBe(1); // Sentence
expect(result[0].chunks[0].tokens.length).toBe(2); // Chunk
});
test("Yake example", () => {
const text =
"One of the most iconic 3 software’s on this matter is the Practical Automatic Keyphrase Extraction (KEA), which was ≥ first known on 1999–06–07";
const result = preprocessText(text);
expect(result.length).toBe(1); // Container
expect(result[0].chunks.length).toBe(3); // Sentence
expect(result[0].chunks[0].tokens.length).toBe(17); // Chunk
expect(result[0].chunks[1].tokens.length).toBe(1); // Chunk
expect(result[0].chunks[2].tokens.length).toBe(7); // Chunk
});
});
describe("Tag Terms", () => {
test("Tag Numbers", () => {
const result = tag("123");
expect(result).toBe(TagTypes.Number);
});
test("Tag Spelled Number", () => {
const result = tag("one", false);
expect(result).toBe(TagTypes.Number);
});
test("Tag Money", () => {
const result = tag("$100");
expect(result).toBe(TagTypes.Money);
});
test("Remove Fraction", () => {
const result = tag("1/2");
expect(result).toBe(TagTypes.Fraction);
});
test("Remove Percent", () => {
const result = tag("50%");
expect(result).toBe(TagTypes.Percent);
});
test("Remove PhoneNumber", () => {
const result = tag("123-456-7890");
expect(result).toBe(TagTypes.Phone);
});
test("Remove Date", () => {
const result = tag("2021-10-10");
expect(result).toBe(TagTypes.Date);
});
});
// Tag parts of speech
// -----------------------------------------------------------------------------
describe("N-gram Stopwords", () => {
test.todo("Tag Determiner", () => {
// Const result = tag("the cat");
});
test.todo("Tag Preposition", () => {
// Const result = tag("bad at golf");
});
test.todo("Tag Conjunction", () => {
// Const result = tag("apple and orange");
});
test.todo("Tag Pronoun", () => {
// Const result = tag("he she");
});
test.todo("Tag Copula", () => {
// Const result = tag("is are");
});
});
// Remove web markup
// -----------------------------------------------------------------------------
describe("N-Gram Web Markup", () => {
test("Remove Url", () => {
const result = tag("https://example.com");
expect(result).toBe(TagTypes.Web);
});
test("Remove unsecured Url", () => {
const result = tag("http://example.com");
expect(result).toBe(TagTypes.Web);
});
test("Remove Url without protocol", () => {
const result = tag("example.com");
expect(result).toBe(TagTypes.Web);
});
test("Remove Url with www", () => {
const result = tag("www.example.com");
expect(result).toBe(TagTypes.Web);
});
test("Remove Url with weird tld", () => {
const result = tag("http://my.blog");
expect(result).toBe(TagTypes.Web);
});
test("Remove Emoticon", () => {
const result = tag(":-)");
expect(result).toBe(TagTypes.Web);
});
test("Remove Email", () => {
const result = tag("email@example.com");
expect(result).toBe(TagTypes.Web);
});
test("Remove HashTag", () => {
const result = tag("#hashtag");
expect(result).toBe(TagTypes.TagHandle);
});
test("Remove AtMention", () => {
const result = tag("@user");
expect(result).toBe(TagTypes.TagHandle);
});
});
describe("Compute Term Statistics", () => {
test("Single token, chunk, and sentence", () => {
const annotatedToken: AnnotatedToken = {
lowercase: "hello",
tag: TagTypes.Uppercase,
isStopword: false,
};
const annotatedText: AnnotatedText = {
sentences: [
{
chunks: [
{
tokens: [annotatedToken],
},
],
},
],
};
const {terms, cooccur} = computeTermStatistics(annotatedText, 2);
expect(Object.keys(terms)).toHaveLength(1);
expect(Object.keys(cooccur)).toHaveLength(0);
});
test("Two tokens, single chunk, single sentence", () => {
const annotatedText: AnnotatedText = {
sentences: [
{
chunks: [
{
tokens: [
{
lowercase: "hello",
tag: TagTypes.Uppercase,
isStopword: false,
},
{
lowercase: "world",
tag: TagTypes.Uppercase,
isStopword: false,
},
],
},
],
},
],
};
const {terms, cooccur} = computeTermStatistics(annotatedText, 1);
// Check the terms
expect(Object.keys(terms).sort()).toEqual(["hello", "world"].sort());
expect(terms.hello.termFrequency).toBe(1);
expect(terms.hello.uppercaseTermFrequency).toBe(1);
expect(terms.world.termFrequency).toBe(1);
expect(terms.world.uppercaseTermFrequency).toBe(1);
// Check cooccurrences
expect(cooccur.hello).toBeDefined();
expect(cooccur.hello.world).toBe(1);
});
test("Acronym and uppercase tokens, single chunk, window=2", () => {
const annotatedToken1: AnnotatedToken = {
lowercase: "nlp",
tag: TagTypes.Acronym,
isStopword: false,
};
const annotatedToken2: AnnotatedToken = {
lowercase: "is",
tag: TagTypes.Uppercase,
isStopword: true,
};
const annotatedToken3: AnnotatedToken = {
lowercase: "cool",
tag: TagTypes.Uppercase,
isStopword: false,
};
const annotatedText: AnnotatedText = {
sentences: [
{
chunks: [
{
tokens: [annotatedToken1, annotatedToken2, annotatedToken3],
},
],
},
],
};
const {terms, cooccur} = computeTermStatistics(annotatedText, 2);
// Check terms
expect(terms.nlp.termFrequency).toBe(1);
expect(terms.nlp.acronymTermFrequency).toBe(1);
expect(terms.is.termFrequency).toBe(1);
expect(terms.is.uppercaseTermFrequency).toBe(1);
expect(terms.cool.termFrequency).toBe(1);
expect(terms.cool.uppercaseTermFrequency).toBe(1);
// Check cooccurrences
// 'nlp' co-occurs with 'is' and 'cool'
expect(cooccur.nlp.is).toBe(1);
expect(cooccur.nlp.cool).toBe(1);
// 'is' co-occurs with 'nlp' and 'cool'
expect(cooccur.is.nlp).toBe(1);
expect(cooccur.is.cool).toBe(1);
// 'cool' co-occurs with 'is' and 'nlp'
expect(cooccur.cool.is).toBe(1);
expect(cooccur.cool.nlp).toBe(1);
});
test("Multiple sentences, checking offsetsSentences", () => {
const annotatedText: AnnotatedText = {
sentences: [
{
chunks: [
{
tokens: [
{
lowercase: "hello",
tag: TagTypes.Uppercase,
isStopword: false,
},
],
},
],
},
{
chunks: [
{
tokens: [
{
lowercase: "world",
tag: TagTypes.Uppercase,
isStopword: false,
},
],
},
],
},
],
};
const {terms, cooccur} = computeTermStatistics(annotatedText, 1);
// Terms should have 'hello' and 'world'.
expect(terms.hello).toBeDefined();
expect(terms.hello.termFrequency).toBe(1);
expect(terms.hello.uppercaseTermFrequency).toBe(1);
expect(terms.world).toBeDefined();
expect(terms.world.termFrequency).toBe(1);
expect(terms.world.uppercaseTermFrequency).toBe(1);
// No cooccurrences, because each is in a separate sentence, no adjacent tokens.
expect(Object.keys(cooccur)).toHaveLength(0);
});
// ADD THESE TESTS
// describe("computeFeatures tests", () => {
// test("Single term, single sentence", () => {
// // 1) Setup
// const terms: Term[] = [
// {
// lowercase: "hello",
// numberOfWords: 1, // TF=1
// numberOfAcronyms: 0,
// numberOfCapitalWords: 1,
// isStopword: false,
// offsetsSentences: [0],
// },
// ];
// const cooccur: Coocurrances = {
// // For one term, there's no real co-occurrence
// hello: {},
// };
// const numberOfSentences = 1;
// // 2) Execute
// const result = computeFeatures(terms, cooccur, numberOfSentences);
// // 3) Assert
// expect(result).toHaveLength(1);
// const [term] = result;
// // Check some basics
// expect(term.termCase).toBeDefined();
// expect(term.termPosition).toBeDefined();
// expect(term.normalizedTermFrequency).toBeDefined();
// expect(term.wordRelatedness).toBeDefined();
// expect(term.wordSentences).toBeDefined();
// // Because numberOfWords=1
// // termCase = max(0,1)/(1 + ln(1)) => 1/1 => 1
// expect(term.termCase).toBeCloseTo(1, 5);
// // TermPosition = ln(3 + median([0])) => ln(3 + 0) => ln(3) ~ 1.0986
// expect(term.termPosition).toBeCloseTo(Math.log(3), 5);
// // NormalizedTermFrequency => 1/(average(1)+std(1)) => 1/(1+0) => 1
// expect(term.normalizedTermFrequency).toBe(1);
// // WordRelatedness => 1 + (0+0)*(1/1) => 1
// // because no left or right neighbors
// expect(term.wordRelatedness).toBe(1);
// // WordSentences => 1/1 => 1
// expect(term.wordSentences).toBe(1);
// });
// test("Two terms, single sentence, simple cooccurrence", () => {
// // 1) Setup
// const terms: Term[] = [
// {
// lowercase: "hello",
// numberOfWords: 2,
// numberOfAcronyms: 0,
// numberOfCapitalWords: 2,
// isStopword: false,
// offsetsSentences: [0],
// },
// {
// lowercase: "world",
// numberOfWords: 1,
// numberOfAcronyms: 0,
// numberOfCapitalWords: 1,
// isStopword: false,
// offsetsSentences: [0],
// },
// ];
// const cooccur: Coocurrances = {
// // 'hello' co-occurs once with 'world'
// hello: {world: 1},
// // 'world' co-occurs once with 'hello'
// world: {hello: 1},
// };
// const numberOfSentences = 1;
// // 2) Execute
// const result = computeFeatures(terms, cooccur, numberOfSentences);
// // 3) Assert
// // Because we have 2 terms: [2, 1] => average=1.5, stdev ~ 0.707..., sum=2.207...
// // highestWordCount=2
// // Check 'hello' stats (TF=2)
// const hello = result.find((t) => t.lowercase === "hello");
// expect(hello).toBeDefined();
// if (hello) {
// // TermCase => max(0,2)/(1+ln(2)) => 2/(1+0.6931) ~ 2/1.6931 => ~1.18
// expect(hello.termCase).toBeGreaterThan(1);
// // TermPosition => ln(3 + median([0])) => ln(3+0) => ln(3) ~1.0986
// expect(hello.termPosition).toBeCloseTo(Math.log(3), 5);
// // NormalizedTermFrequency => 2 / (1.5 + 0.707...) => 2/2.207... ~ 0.905
// expect(hello.normalizedTermFrequency).toBeCloseTo(0.905, 2);
// // For cooccurrence:
// // leftDispersion => from cooccur[*][hello] => 'world' -> cooccur['world']['hello']=1
// // sumOfUniqueLeftNeighbors=1, uniqueLeftNeighbors=1 => 1/1=1
// // rightDispersion => from cooccur[hello][*] => 'world' -> 1
// // sumOfUniqueRightNeighbors=1, uniqueRightNeighbors=1 =>1
// // => wordRelatedness => 1 + (1+1)*(2/2) => 1 + (2*1) => 3
// expect(hello.wordRelatedness).toBeCloseTo(3, 5);
// // WordSentences => 1 / 1 => 1
// expect(hello.wordSentences).toBeCloseTo(1, 5);
// }
// // Check 'world' stats (TF=1)
// const world = result.find((t) => t.lowercase === "world");
// expect(world).toBeDefined();
// if (world) {
// // TermCase => max(0,1)/(1+ln(1)) => 1/1 => 1
// expect(world.termCase).toBe(1);
// // TermPosition => same ln(3+0) => ln(3)
// expect(world.termPosition).toBeCloseTo(Math.log(3), 5);
// // NormalizedTermFrequency => 1 / 2.207 => ~0.452
// expect(world.normalizedTermFrequency).toBeCloseTo(0.45, 2);
// // Cooccur => leftDispersion => cooccur[*][world], i.e. 'hello' -> 1 => sum=1, distinct=1 =>1
// // rightDispersion => cooccur[world][*], i.e. 'hello' ->1 => sum=1, distinct=1 =>1
// // => wordRelatedness => 1 + (1+1)*(1/2) => 1 +2*(0.5) => 2
// expect(world.wordRelatedness).toBeCloseTo(2, 5);
// // WordSentences => 1/1 =>1
// expect(world.wordSentences).toBe(1);
// }
// });
// test("Handle empty validTermFrequencies gracefully", () => {
// // Suppose all terms are stopwords => validTermFrequencies = []
// const terms: Term[] = [
// {
// lowercase: "the",
// numberOfWords: 5,
// numberOfAcronyms: 0,
// numberOfCapitalWords: 0,
// isStopword: true, // All are stopwords
// offsetsSentences: [0],
// },
// {
// lowercase: "and",
// numberOfWords: 3,
// numberOfAcronyms: 0,
// numberOfCapitalWords: 0,
// isStopword: true,
// offsetsSentences: [0],
// },
// ];
// const cooccur: Coocurrances = {the: {}, and: {}};
// // This will lead to average=0, stdev=0 => normalizedWordCount=0 => possible division by zero
// // Let's see how the function handles it
// const result = computeFeatures(terms, cooccur, 1);
// // We expect no crashes
// expect(result).toHaveLength(2);
// // Because averageTermFrequency=0, standardDeviation=0 => normalizedWordCount=0
// // => term.normalizedTermFrequency = numberOfWords / 0 => Infinity or some fallback
// // If your code doesn't guard for that, you might see Infinity or NaN
// // Check if we do see Infinity
// expect(result[0].normalizedTermFrequency).toBe(Infinity);
// expect(result[1].normalizedTermFrequency).toBe(Infinity);
// });
// });
});
// Describe("N-Gram Basics", () => {
// test("Gets Expected", () => {
// // Original sentence: Education never ends Watson.
// const expected = [
// "education",
// "education does",
// "education does not",
// "does",
// "does not",
// "does not never",
// "not",
// "not never",
// "not never end",
// "never",
// "never end",
// "never end watson",
// "end",
// "end watson",
// "watson",
// ];
// // Const sentences = [["education", "does", "not", "never", "end", "watson"]];
// const terms = preprocessText("Education never ends Watson.");
// const ngrams = getNgrams(terms, 1, 3);
// expect(ngrams.length).toEqual(expected.length);
// });
// test("Sentence Boundaries", () => {
// // Original sentence: Education never ends, Watson.
// const expected = [
// "education",
// "education does",
// "education does not",
// "does",
// "does not",
// "does not never",
// "not",
// "not never",
// "not never end",
// "never",
// "never end",
// "end",
// "watson",
// ];
// // Const sentences = [
// // ["education", "does", "not", "never", "end"],
// // ["watson"],
// // ];
// const terms = preprocessText("Education never ends, Watson.");
// const ngrams = getNgrams(terms, 1, 3);
// // Const terms = explodeTerms(sentences);
// // const result = getNgrams(terms, 1, 3);
// expect(ngrams.length).toEqual(expected.length);
// });
// test("Extraneous POS", () => {
// // Original sentence: Henry is bad at golf #blabla.
// const expected = [
// "henry",
// "henry bad",
// "henry bad golf",
// "bad",
// "bad golf",
// "golf",
// ];
// // Const sentences = [["henry", "bad", "golf"]];
// // const terms = explodeTerms(sentences);
// // const result = getNgrams(terms, 1, 3);
// const terms = preprocessText("Henry is bad at golf #blabla.");
// const ngrams = getNgrams(terms, 1, 3);
// expect(ngrams.length).toEqual(expected.length);
// });
// });
util.ts
import {isStopword} from "../_words/stopwords.ts";
export function splitAndTrim(text: string, pattern: RegExp) {
const result: string[] = [];
for (const part of text.split(pattern)) {
const value = part.trim();
if (value !== "") result.push(value);
}
return result;
}
export function splitTextIntoSentences(text: string): string[] {
return splitAndTrim(text, /[.?!]\s*/);
}
export function splitSentenceIntoChunks(text: string): string[] {
return splitAndTrim(text, /[:;,$[\]{}()<>]\s*/);
}
export function splitChunkIntoTokens(text: string): string[] {
return splitAndTrim(text, /['’ ]/);
}
// US phone number with optional "+1 " prefix
// Matches either "123-456-7890" or "(123) 456-7890"
const usPhoneNumberPattern =
/^(?:\+1\s)?(?:\d{3}-\d{3}-\d{4}|\(\d{3}\)\s\d{3}-\d{4})$/;
function isUsPhoneNumber(string_: string): boolean {
return usPhoneNumberPattern.test(string_);
}
const percentPattern = /^\d+(\.\d+)?%$/;
function isPercent(string_: string): boolean {
return percentPattern.test(string_);
}
// Matches: $1,234,567.89
const moneyPattern = /^\$\d{1,3}(,\d{3})*(\.\d{2})?$/;
function isMoney(string_: string): boolean {
return moneyPattern.test(string_);
}
// Matches: YYYY-MM-DD with hyphen variants.
const isoDatePattern = new RegExp(
"^" + // 1) Start-of-string anchor
"(\\d{4})" + // 2) Capture a 4-digit year
"[\\-\\u2013\\u2014]" + // 3) Match an ASCII hyphen, en dash, or em dash
"(0[1-9]|1[0-2])" + // 4) Capture the month, allowing 01-12
"[\\-\\u2013\\u2014]" + // 5) Again, match ASCII hyphen, en dash, or em dash
"(0[1-9]|[12]\\d|3[01])" + // 6) Capture the day, allowing 01-31
"$", // 7) End-of-string anchor
);
export function isIsoDate(dateString: string): boolean {
return isoDatePattern.test(dateString);
}
// Matches: Flexible DD/MM/YYYY allowing slash or fullwidth slash
// e.g., "05/01/2025" or "05／01／2025"
const ddMmYyyyPattern = new RegExp(
"^" +
"(0[1-9]|[12]\\d|3[01])" + // Day 01-31
"[/\\uFF0F]" + // Accept / (ASCII) or / (fullwidth slash)
"(0[1-9]|1[0-2])" + // Month 01-12
"[/\\uFF0F]" + // Again, slash or fullwidth slash
"(\\d{4})" + // 4-digit year
"$",
);
function isEuDate(dateString: string): boolean {
return ddMmYyyyPattern.test(dateString);
}
// Matches: Flexible DD-MM-YYYY allowing ASCII hyphen, en dash, or em dash
// e.g., "05-01-2025" or "05–01–2025" or "05—01—2025"
const ddMmYyyyDashPattern = new RegExp(
"^" +
"(0[1-9]|[12]\\d|3[01])" + // Day 01-31
"[\\-\\u2013\\u2014]" + // - (ASCII), – (en), — (em)
"(0[1-9]|1[0-2])" + // Month 01-12
"[\\-\\u2013\\u2014]" + // - (ASCII), – (en), — (em)
"(\\d{4})" + // 4-digit year
"$",
);
function isEuDateDash(dateString: string): boolean {
return ddMmYyyyDashPattern.test(dateString);
}
// Matches fractions like "1/2", "+3/4", or "-5/8"
const fractionPattern = /^[+-]?\d+\/\d+$/;
function isFraction(string_: string): boolean {
return fractionPattern.test(string_);
}
const numberPattern = /^(-?\d{1,3}(,\d{3})*(\.\d+)?|\d+(\.\d+)?)$/;
const spelledNumberPattern =
/^(one|two|three|four|five|six|seven|eight|nine|ten)$/i;
export function isNumber(token: string): boolean {
return numberPattern.test(token) || spelledNumberPattern.test(token);
}
// Matches: www.google.com, email@account.com, <head>
// And other items with two or more punctuation symbols.
const webAddressPattern = /^(?:.*[^\w\s]){2,}.*$/;
const tldPattern =
/\.(com|org|net|edu|gov|mil|int|info|biz|name|us|uk|ca|de|au|jp|cn|in|fr|it|xyz|online|site|tech|store|blog|app|dev|io|co|tv|me|ai|pro|health)\b/;
export function isWebAddress(token: string): boolean {
return webAddressPattern.test(token) || tldPattern.test(token);
}
// For "not alpha-numeric", we just need to detect any non-letter-or-digit character.
// The presence of symbols like ‰ or ¥ will trigger this as true.
const nonAlphaNumberPattern = /[^\p{L}\p{N}]/u;
export function isNotAlphaNumeric(token: string): boolean {
// Returns true if there's at least one character that is NOT a letter or digit
return nonAlphaNumberPattern.test(token);
}
// For alpha-numeric, we want at least one letter and at least one digit.
// Everything in the string must be letters or digits (no punctuation, no symbols).
const onlyAlphaNumericPattern = /^(?=.*\p{L})(?=.*\p{N})[\p{L}\p{N}]+$/u;
export function isAlphaNumeric(token: string): boolean {
return onlyAlphaNumericPattern.test(token);
}
const twoPlusUppercasePattern = /^[A-Z.]{2,}$/; // U.S.A. or MIT
export function isAcronym(token: string): boolean {
return twoPlusUppercasePattern.test(token);
}
// Matches an initial uppercase character; `isUppercase` only counts it when
// the token is the first token in its sentence.
const firstIsUppercasePattern = /^[A-Z]/;
export function isUppercase(
token: string,
isFirstTokenInSentence: boolean,
): boolean {
return isFirstTokenInSentence && firstIsUppercasePattern.test(token);
}
const tagOrHandlePattern = /^[@#]\w[a-z\d]{2,}/;
export function isTagOrHandle(token: string): boolean {
return tagOrHandlePattern.test(token);
}
export enum TagTypes {
Acronym = "Acronym",
AlphaNumeric = "AlphaNumeric",
Date = "Date",
Fraction = "Fraction",
Money = "Money",
NotAlphaNumeric = "NotAlphaNumeric",
Number = "Number",
Parsable = "Parsable",
Percent = "Percent",
Phone = "Phone",
TagHandle = "TagHandle",
Stopword = "Stopword",
Unparsable = "Unparsable",
Uppercase = "Uppercase",
Web = "Web",
}
// Everything except Parsable gets ignored at the moment.
export function tag(token: string, isFirstTokenInSentence = false): TagTypes {
if (isUsPhoneNumber(token)) return TagTypes.Phone;
if (isPercent(token)) return TagTypes.Percent;
if (isFraction(token)) return TagTypes.Fraction;
if (isIsoDate(token)) return TagTypes.Date;
if (isEuDate(token)) return TagTypes.Date;
if (isEuDateDash(token)) return TagTypes.Date;
if (isMoney(token)) return TagTypes.Money;
if (isNumber(token)) return TagTypes.Number;
if (isWebAddress(token)) return TagTypes.Web;
if (isTagOrHandle(token)) return TagTypes.TagHandle;
if (isNotAlphaNumeric(token)) return TagTypes.NotAlphaNumeric;
if (isAlphaNumeric(token)) return TagTypes.AlphaNumeric;
if (isAcronym(token)) return TagTypes.Acronym;
if (isStopword(token)) return TagTypes.Stopword;
if (isUppercase(token, isFirstTokenInSentence)) return TagTypes.Uppercase;
return TagTypes.Parsable;
}
/**
* Helper to compute the median of a numeric array.
* If the array is empty, returns 0 (you can choose another policy).
*/
export function median(values: number[]): number {
if (values.length === 0) return 0;
const sorted = [...values].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 === 0
? (sorted[mid - 1] + sorted[mid]) / 2
: sorted[mid];
}
/**
* Helper to compute the mean (average) of a numeric array.
* If empty, returns 0.
*/
export function mean(values: number[]): number {
if (values.length === 0) return 0;
const sum = values.reduce((accumulator, value) => accumulator + value, 0);
return sum / values.length;
}
/**
* Helper to compute the sample standard deviation.
* If array length < 2, returns 0.
*/
export function stdev(values: number[]): number {
if (values.length < 2) return 0;
const avg = mean(values);
const variance =
values.reduce((accumulator, value) => accumulator + (value - avg) ** 2, 0) /
(values.length - 1);
return Math.sqrt(variance);
}