TF-IDF
Posted by Dustin Boston in Algorithm.
Source Code Listing
tf_idf.ts
/**
* @file This file implements the Term Frequency-Inverse Document
* Frequency (TF-IDF) weighting scheme, a statistical measure used to evaluate
* how important a word is to a document in a collection or corpus. It also
* includes functions for calculating term frequency and inverse document
* frequency which are the building blocks for computing TF-IDF scores.
*/
import {type Corpus} from "../../../../data/algorithms/nlp/types";
/** A map of terms to their respective vector columns */
export type TermColumn = Map<string, number>;
/** An array of term frequencies mapped to term columns */
export type TfVector = number[];
/** An array of document frequencies mapped to term columns */
export type DfVector = number[];
/** An array of inverse document frequencies mapped to term columns */
export type IdfVector = number[];
/** An array of TF-IDF scores mapped to term columns */
export type TfIdfVector = number[];
/**
* Create a TF-IDF matrix for an entire corpus.
*
* Calculates the term frequency-inverse document frequency (TF-IDF) for each
* term in the documents of the given corpus. This helps in understanding the
* importance or relevance of a term to a document in the context of a corpus.
*
* @param corpus The corpus of documents to calculate TF-IDF scores for.
* @returns An array of maps where each map represents the TF-IDF scores of
* terms in the corresponding document of the corpus.
*
* - Each row in the matrix represents a document from the corpus.
* - Each column corresponds to a term (word or n-gram) from the entire corpus.
* - Each cell at row i, col j contains the TF-IDF score of term j in doc i.
*/
export function computeTfIdfMatrix(corpus: Corpus): TfIdfVector[] {
const terms: TermColumn = getTerms(corpus);
const tfs = computeTF(corpus, terms);
const dfs = computeDF(tfs, terms.size);
const idfs = computeIDF(dfs, corpus.length);
return computeTfIdf(tfs, idfs, corpus.length, terms.size);
}
/**
* The final step in computing a TF-IDF score is multiplying the TF and the
* IDF. If you want more control over the TF-IDF process, for example, to
* create your own custom tfVectors, use this function. Otherwise use the
* computeTfIdfMatrix function.
*
* @see computeTfIdfMatrix
* @param tfVectors A map of terms to term frequency in a document.
* @param idfVector An array of IDF values, one per term.
* @param documentCount the number of documents in the corpus.
* @param termCount the number of unique terms in the corpus.
* @returns a matrix of TF-IDF scores.
*/
export function computeTfIdf(
tfVectors: TfVector[],
idfVector: IdfVector,
documentCount: number,
termCount: number,
): TfIdfVector[] {
// Initialize the tfIdfVectors
const tfIdfs: TfIdfVector[] = Array.from({length: termCount}, () =>
Array.from({length: documentCount}, () => 0),
);
for (const [i, tfVector] of tfVectors.entries()) {
for (const [index, tf] of tfVector.entries()) {
const idf = idfVector[i];
tfIdfs[i][index] = tf * idf;
}
}
return tfIdfs;
}
/**
* Calculates the term frequency (TF) for each term in the document. TF is the
* number of times a term appears in the document.
*
* @param document An array of terms representing the document to calculate
* term frequencies for.
* @returns A map of terms to their corresponding term frequency in the doc.
*/
export function computeTF(corpus: Corpus, terms: TermColumn): TfVector[] {
return corpus.map((document) => {
const tfVector: number[] = Array.from({length: terms.size}, () => 0);
for (const term of document) {
const columnIndex = terms.get(term);
if (columnIndex !== undefined) {
tfVector[columnIndex] = (tfVector[columnIndex] ?? 0) + 1;
}
}
return tfVector;
});
}
/**
* Calculates the inverse document frequency (IDF) for each term across all
* documents. IDF measures the importance of a term; rarer terms have higher
* IDF values.
*
* The +1 outside the logarithm function is a technique that ensures that
* terms with a document frequency of 1 (which occur in only one document)
* have an IDF of zero. This offset allows the TF-IDF to prioritize terms
* that are unique to a document but still gives a small amount of weight to
* terms that are more common.
*
* @param dfs A DF vector, see computeDF for details.
* @param documentCount is the number of documents in the corpus.
* @returns a vector of IDF values, one per term.
*/
export function computeIDF(dfs: DfVector, documentCount: number): IdfVector {
const n = documentCount;
const terms = dfs.length;
return Array.from({length: terms}, (_, i) => Math.log(n / dfs[i]) + 1);
}
/**
* Computes the Document Frequency (DF) vector for a given set of terms and
* their term frequency vectors. The DF for a term is the number of documents
* in which the term appears.
*
* @param terms - A map or object where keys are terms and values are indices.
* @param tfVectors - An array of TF vectors, each representing a document.
* @returns - A vector where each element represents the DF of a term.
*/
export function computeDF(tfVectors: TfVector[], termCount: number): DfVector {
// Initialize the DF vector with zeros
const dfVector: DfVector = Array.from({length: termCount}, () => 0);
// Count how many documents each term appears in
for (let t = 0; t < termCount; t++) {
for (let f = 0; f < tfVectors[t].length; f++) {
if (tfVectors[t][f] > 0) {
dfVector[t] += 1;
}
}
}
return dfVector;
}
/**
* Extracts a set of unique terms from a corpus of documents and maps each
* term to a unique index. This function creates a mapping that can be used to
* convert terms into vector indices for further text analysis operations
* such as TF-IDF.
*
* @param corpus - An array of documents; each document is an array of strings.
* @returns A Map where keys are terms and values are unique indices.
*/
export function getTerms(corpus: Corpus) {
return corpus.reduce<TermColumn>((termColumn, document) => {
for (const term of document) {
if (!termColumn.has(term)) {
termColumn.set(term, termColumn.size);
}
}
return termColumn;
}, new Map());
}
/**
* Transposes a two-dimensional array (matrix), converting rows to columns and
* vice versa. This function is useful in scenarios where you need to change
* the orientation of data in a matrix. For instance, in text analysis,
* converting a term-document matrix to a document-term matrix can facilitate
* certain types of calculations such as computing cosine similarity between
* documents.
*
* How it works:
* - The function iterates over the first row of the input matrix to determine
* the number of columns.
* - For each column index, it constructs a new row by gathering the elements
* from each original row at that column index.
* - The map function on the first row creates a new array where each element
* is the result of the inner map, which assembles a new row for the
* transposed matrix.
*
* @param matrix - The matrix to be transposed, represented as an array of arrays.
* @returns The transposed matrix.
*/
export function transpose(matrix: number[][]): number[][] {
return matrix[0].map((_, colIndex) => matrix.map((row) => row[colIndex]));
}
tf_idf_test.ts
import {
computeIDF,
computeTF,
computeTfIdfMatrix,
TermColumn,
type TfVector,
} from "../src/tf_idf.ts";
import {type Corpus} from "../src/types.d.ts";
import {assertEquals} from "./deps.ts";
/*
Some useful test data:
corpus
[
['ngram', 'one'],
['ngram', 'two'],
['ngram', 'three'],
['ngram', 'eight'],
['two', 'one'],
['ngram'],
['ngram', 'two'],
['eight']
];
terms Map(5) {
"ngram" => 0,
"one" => 1,
"two" => 2,
"three" => 3,
"eight" => 4
}
tfs
[
[ 1, 1, 0, 0, 0 ],
[ 1, 0, 1, 0, 0 ],
[ 1, 0, 0, 1, 0 ],
[ 1, 0, 0, 0, 1 ],
[ 0, 1, 1, 0, 0 ],
[ 1, 0, 0, 0, 0 ],
[ 1, 0, 1, 0, 0 ],
[ 0, 0, 0, 0, 1 ]
]
transposed tfs
[
[ 1, 1, 1, 1, 0, 1, 1, 0 ],
[ 1, 0, 0, 0, 1, 0, 0, 0 ],
[ 0, 1, 0, 0, 1, 0, 1, 0 ],
[ 0, 0, 1, 0, 0, 0, 0, 0 ],
[ 0, 0, 0, 1, 0, 0, 0, 1 ]
]
df scores
[ 6, 2, 3, 1, 2 ]
idf scores:
1.2876820724517808,
2.386294361119891,
1.9808292530117262,
3.0794415416798357,
2.386294361119891
similarity matrix:
0 1 2 3 4 5 6 7
0 1.00 0.26 0.18 0.23 0.68 0.47 0.26 0.00
1 0.26 1.00 0.21 0.26 0.54 0.55 1.00 0.00
2 0.18 0.21 1.00 0.18 0.00 0.39 0.21 0.00
3 0.23 0.26 0.18 1.00 0.00 0.47 0.26 0.88
4 0.68 0.54 0.00 0.00 1.00 0.00 0.54 0.00
5 0.47 0.55 0.39 0.47 0.00 1.00 0.55 0.00
6 0.26 1.00 0.21 0.26 0.54 0.55 1.00 0.00
7 0.00 0.00 0.00 0.88 0.00 0.00 0.00 1.00
breakdown:
0,1 ngram one -> ngram two: 0.2588281488091952
0,2 ngram one -> ngram three: 0.18320413303646593
0,3 ngram one -> ngram eight: 0.2255177530553173
0,4 ngram one -> two one: 0.6771508628309691
0,5 ngram one -> ngram: 0.4748870950608337
0,6 ngram one -> ngram two: 0.2588281488091952
0,7 ngram one -> eight: 0
1,2 ngram two -> ngram three: 0.2102645400000537
1,3 ngram two -> ngram eight: 0.2588281488091952
1,4 ngram two -> two one: 0.5355034322447702
1,5 ngram two -> ngram: 0.5450309168246379
1,6 ngram two -> ngram two: 1
1,7 ngram two -> eight: 0
2,3 ngram three -> ngram eight: 0.18320413303646593
2,4 ngram three -> two one: 0
2,5 ngram three -> ngram: 0.385784610577799
2,6 ngram three -> ngram two: 0.2102645400000537
2,7 ngram three -> eight: 0
3,4 ngram eight -> two one: 0
3,5 ngram eight -> ngram: 0.4748870950608337
3,6 ngram eight -> ngram two: 0.2588281488091952
3,7 ngram eight -> eight: 0.8800467299778363
4,5 two one -> ngram: 0
4,6 two one -> ngram two: 0.5355034322447702
4,7 two one -> eight: 0
5,6 ngram -> ngram two: 0.5450309168246379
5,7 ngram -> eight: 0
6,7 ngram two -> eight: 0
top terms:
[
[ [ 0, 4 ], [ "ngram one", "two one" ], 0.6771508628309691 ],
[ [ 1, 5 ], [ "ngram two", "ngram" ], 0.5450309168246379 ],
[ [ 2, 5 ], [ "ngram three", "ngram" ], 0.385784610577799 ],
[ [ 3, 7 ], [ "ngram eight", "eight" ], 0.8800467299778363 ],
[ [ 4, 6 ], [ "two one", "ngram two" ], 0.5355034322447702 ],
[ [ 5, 6 ], [ "ngram", "ngram two" ], 0.5450309168246379 ],
[ [ 6, 7 ], [ "ngram two", "eight" ], 0 ]
]
*/
test("calculateTf should return TF vectors", () => {
const corpus = [["term1", "term2", "term3"]];
const terms = new Map([
["term1", 0],
["term2", 1],
["term3", 2],
]);
const tfVectors = computeTF(corpus, terms);
const expectedOutput = [[1, 1, 1]];
assertEquals(tfVectors, expectedOutput);
});
test("calculateTf should handle empty document", () => {
const corpus: Corpus = [];
const terms = new Map();
const tfVectors = computeTF(corpus, terms);
const expectedOutput: TfVector[] = [];
assertEquals(tfVectors, expectedOutput);
});
test("calculateTf should handle repeated single term", () => {
const corpus = [["term1", "term1", "term1"]];
const terms = new Map([["term1", 0]]);
const tfVectors = computeTF(corpus, terms);
const expectedOutput = [[3]];
assertEquals(tfVectors, expectedOutput);
});
test("calculateTf should be case sensitive", () => {
const corpus = [["term1", "Term1", "TERM1"]];
const terms = new Map([
["term1", 0],
["Term1", 1],
["TERM1", 2],
]);
const tfVectors = computeTF(corpus, terms);
const expectedOutput = [[1, 1, 1]];
assertEquals(tfVectors, expectedOutput);
});
// TODO: Renable and fix
// test('calculateIdf should return correct inverse document frequencies', () => {
// const termColumns: TermColumn = new Map([['term1', 0], ['term2', 1]]);
// // document 1: term2 term1, document 2: term1
// const tfVectors = [[1, 2], [1, 0]];
// const idfs = computeIDF(termColumns, tfVectors);
// const expected = [
// [0.6931471805599453, 0.6931471805599453],
// [0.6931471805599453, 0],
// ];
// assertEquals(idfs[0][0], expected[0][0]);
// assertEquals(idfs[0][1], expected[0][1]);
// assertEquals(idfs[1][0], expected[1][0]);
// assertEquals(idfs[1][1], expected[1][1]);
// });
//
// test('calculateTfIdf should handle empty corpus', () => {
// const corpus: Corpus = [];
// const tfIdfs = computeTfIdfMatrix(corpus);
// const expected: TfIdfResult = {
// tfIdfVectors: [],
// terms: new Map(),
// };
// assertEquals(tfIdfs, expected);
// });
//
// test('calculateTfIdf with varying document lengths', () => {
// const corpus = [['term1', 'term2', 'term3'], ['term1', 'term2']];
// const { tfIdfVectors, terms } = computeTfIdfMatrix(corpus);
//
// assertEquals(terms.size, 3);
// assertEquals(tfIdfVectors.length, 2);
// assertEquals(tfIdfVectors[0].length, 3);
//
// const expected = [
// [0.6931471805599453, 0.6931471805599453, 0.6931471805599453],
// [0.6931471805599453, 0.6931471805599453, 0],
// ];
// assertEquals(tfIdfVectors[0][0], expected[0][0]);
// });