TF-IDF

Posted by Dustin Boston in .


Source Code Listing

tf_idf.ts

/**
 * @file This file implements the Term Frequency-Inverse Document
 * Frequency (TF-IDF) weighting scheme, a statistical measure used to evaluate
 * how important a word is to a document in a collection or corpus. It also
 * includes functions for calculating term frequency and inverse document
 * frequency which are the building blocks for computing TF-IDF scores.
 */

import {type Corpus} from "../../../../data/algorithms/nlp/types";

/** A map of terms to their respective vector columns */
export type TermColumn = Map<string, number>;

/** An array of term frequencies mapped to term columns */
export type TfVector = number[];

/** An array of document frequencies mapped to term columns */
export type DfVector = number[];

/** An array of inverse document frequencies mapped to term columns */
export type IdfVector = number[];

/** An array of TF-IDF scores mapped to term columns */
export type TfIdfVector = number[];

/**
 * Create a TF-IDF matrix for an entire corpus.
 *
 * Calculates the term frequency-inverse document frequency (TF-IDF) for each
 * term in the documents of the given corpus. This helps in understanding the
 * importance or relevance of a term to a document in the context of a corpus.
 *
 * @param corpus The corpus of documents to calculate TF-IDF scores for.
 * @returns An array of maps where each map represents the TF-IDF scores of
 * terms in the corresponding document of the corpus.
 *
 * - Each row in the matrix represents a document from the corpus.
 * - Each column corresponds to a term (word or n-gram) from the entire corpus.
 * - Each cell at row i, col j contains the TF-IDF score of term j in doc i.
 */
export function computeTfIdfMatrix(corpus: Corpus): TfIdfVector[] {
  const terms: TermColumn = getTerms(corpus);
  const tfs = computeTF(corpus, terms);
  const dfs = computeDF(tfs, terms.size);
  const idfs = computeIDF(dfs, corpus.length);
  return computeTfIdf(tfs, idfs, corpus.length, terms.size);
}

/**
 * The final step in computing a TF-IDF score is multiplying the TF and the
 * IDF. If you want more control over the TF-IDF process, for example, to
 * create your own custom tfVectors, use this function. Otherwise use the
 * computeTfIdfMatrix function.
 *
 * @see computeTfIdfMatrix
 * @param tfVectors A map of terms to term frequency in a document.
 * @param idfVector An array of IDF values, one per term.
 * @param documentCount the number of documents in the corpus.
 * @param termCount the number of unique terms in the corpus.
 * @returns a matrix of TF-IDF scores.
 */
export function computeTfIdf(
  tfVectors: TfVector[],
  idfVector: IdfVector,
  documentCount: number,
  termCount: number,
): TfIdfVector[] {
  // Initialize the tfIdfVectors
  const tfIdfs: TfIdfVector[] = Array.from({length: termCount}, () =>
    Array.from({length: documentCount}, () => 0),
  );

  for (const [i, tfVector] of tfVectors.entries()) {
    for (const [index, tf] of tfVector.entries()) {
      const idf = idfVector[i];
      tfIdfs[i][index] = tf * idf;
    }
  }

  return tfIdfs;
}

/**
 * Calculates the term frequency (TF) for each term in the document. TF is the
 * number of times a term appears in the document.
 *
 * @param document An array of terms representing the document to calculate
 * term frequencies for.
 * @returns A map of terms to their corresponding term frequency in the doc.
 */
export function computeTF(corpus: Corpus, terms: TermColumn): TfVector[] {
  return corpus.map((document) => {
    const tfVector: number[] = Array.from({length: terms.size}, () => 0);
    for (const term of document) {
      const columnIndex = terms.get(term);
      if (columnIndex !== undefined) {
        tfVector[columnIndex] = (tfVector[columnIndex] ?? 0) + 1;
      }
    }

    return tfVector;
  });
}

/**
 * Calculates the inverse document frequency (IDF) for each term across all
 * documents. IDF measures the importance of a term; rarer terms have higher
 * IDF values.
 *
 * The +1 outside the logarithm function is a technique that ensures that
 * terms with a document frequency of 1 (which occur in only one document)
 * have an IDF of zero. This offset allows the TF-IDF to prioritize terms
 * that are unique to a document but still gives a small amount of weight to
 * terms that are more common.
 *
 * @param dfs A DF vector, see computeDF for details.
 * @param documentCount is the number of documents in the corpus.
 * @returns a vector of IDF values, one per term.
 */
export function computeIDF(dfs: DfVector, documentCount: number): IdfVector {
  const n = documentCount;
  const terms = dfs.length;
  return Array.from({length: terms}, (_, i) => Math.log(n / dfs[i]) + 1);
}

/**
 * Computes the Document Frequency (DF) vector for a given set of terms and
 * their term frequency vectors. The DF for a term is the number of documents
 * in which the term appears.
 *
 * @param terms - A map or object where keys are terms and values are indices.
 * @param tfVectors - An array of TF vectors, each representing a document.
 * @returns - A vector where each element represents the DF of a term.
 */
export function computeDF(tfVectors: TfVector[], termCount: number): DfVector {
  // Initialize the DF vector with zeros
  const dfVector: DfVector = Array.from({length: termCount}, () => 0);

  // Count how many documents each term appears in
  for (let t = 0; t < termCount; t++) {
    for (let f = 0; f < tfVectors[t].length; f++) {
      if (tfVectors[t][f] > 0) {
        dfVector[t] += 1;
      }
    }
  }

  return dfVector;
}

/**
 * Extracts a set of unique terms from a corpus of documents and maps each
 * term to a unique index. This function creates a mapping that can be used to
 * convert terms into vector indices for further text analysis operations
 * such as TF-IDF.
 *
 * @param corpus - An array of documents; each document is an array of strings.
 * @returns A Map where keys are terms and values are unique indices.
 */
export function getTerms(corpus: Corpus) {
  return corpus.reduce<TermColumn>((termColumn, document) => {
    for (const term of document) {
      if (!termColumn.has(term)) {
        termColumn.set(term, termColumn.size);
      }
    }

    return termColumn;
  }, new Map());
}

/**
 * Transposes a two-dimensional array (matrix), converting rows to columns and
 * vice versa. This function is useful in scenarios where you need to change
 * the orientation of data in a matrix. For instance, in text analysis,
 * converting a term-document matrix to a document-term matrix can facilitate
 * certain types of calculations such as computing cosine similarity between
 * documents.
 *
 * How it works:
 * - The function iterates over the first row of the input matrix to determine
 *   the number of columns.
 * - For each column index, it constructs a new row by gathering the elements
 *   from each original row at that column index.
 * - The map function on the first row creates a new array where each element
 *   is the result of the inner map, which assembles a new row for the
 *   transposed matrix.
 *
 * @param matrix - The matrix to be transposed, represented as an array of arrays.
 * @returns The transposed matrix.
 */
export function transpose(matrix: number[][]): number[][] {
  return matrix[0].map((_, colIndex) => matrix.map((row) => row[colIndex]));
}

tf_idf_test.ts

import {
  computeIDF,
  computeTF,
  computeTfIdfMatrix,
  TermColumn,
  type TfVector,
} from "../src/tf_idf.ts";
import {type Corpus} from "../src/types.d.ts";
import {assertEquals} from "./deps.ts";

/*
Some useful test data:

corpus 
[
  ['ngram', 'one'], 
  ['ngram', 'two'], 
  ['ngram', 'three'],
  ['ngram', 'eight'], 
  ['two', 'one'], 
  ['ngram'],
  ['ngram', 'two'], 
  ['eight']
];

terms Map(5) {
  "ngram" => 0,
  "one" => 1,
  "two" => 2,
  "three" => 3,
  "eight" => 4
}

tfs 
[
  [ 1, 1, 0, 0, 0 ],
  [ 1, 0, 1, 0, 0 ],
  [ 1, 0, 0, 1, 0 ],
  [ 1, 0, 0, 0, 1 ],
  [ 0, 1, 1, 0, 0 ],
  [ 1, 0, 0, 0, 0 ],
  [ 1, 0, 1, 0, 0 ],
  [ 0, 0, 0, 0, 1 ]
]

transposed tfs 
[
  [ 1, 1, 1, 1, 0, 1, 1, 0 ],
  [ 1, 0, 0, 0, 1, 0, 0, 0 ],
  [ 0, 1, 0, 0, 1, 0, 1, 0 ],
  [ 0, 0, 1, 0, 0, 0, 0, 0 ],
  [ 0, 0, 0, 1, 0, 0, 0, 1 ]
]

df scores 
[ 6, 2, 3, 1, 2 ]

idf scores:
1.2876820724517808,
2.386294361119891,
1.9808292530117262,
3.0794415416798357,
2.386294361119891

similarity matrix:
  0    1    2    3    4    5    6    7
0 1.00 0.26 0.18 0.23 0.68 0.47 0.26 0.00
1 0.26 1.00 0.21 0.26 0.54 0.55 1.00 0.00
2 0.18 0.21 1.00 0.18 0.00 0.39 0.21 0.00
3 0.23 0.26 0.18 1.00 0.00 0.47 0.26 0.88
4 0.68 0.54 0.00 0.00 1.00 0.00 0.54 0.00
5 0.47 0.55 0.39 0.47 0.00 1.00 0.55 0.00
6 0.26 1.00 0.21 0.26 0.54 0.55 1.00 0.00
7 0.00 0.00 0.00 0.88 0.00 0.00 0.00 1.00

breakdown:
0,1 ngram one -> ngram two: 0.2588281488091952
0,2 ngram one -> ngram three: 0.18320413303646593
0,3 ngram one -> ngram eight: 0.2255177530553173
0,4 ngram one -> two one: 0.6771508628309691
0,5 ngram one -> ngram: 0.4748870950608337
0,6 ngram one -> ngram two: 0.2588281488091952
0,7 ngram one -> eight: 0

1,2 ngram two -> ngram three: 0.2102645400000537
1,3 ngram two -> ngram eight: 0.2588281488091952
1,4 ngram two -> two one: 0.5355034322447702
1,5 ngram two -> ngram: 0.5450309168246379
1,6 ngram two -> ngram two: 1
1,7 ngram two -> eight: 0

2,3 ngram three -> ngram eight: 0.18320413303646593
2,4 ngram three -> two one: 0
2,5 ngram three -> ngram: 0.385784610577799
2,6 ngram three -> ngram two: 0.2102645400000537
2,7 ngram three -> eight: 0

3,4 ngram eight -> two one: 0
3,5 ngram eight -> ngram: 0.4748870950608337
3,6 ngram eight -> ngram two: 0.2588281488091952
3,7 ngram eight -> eight: 0.8800467299778363

4,5 two one -> ngram: 0
4,6 two one -> ngram two: 0.5355034322447702
4,7 two one -> eight: 0

5,6 ngram -> ngram two: 0.5450309168246379
5,7 ngram -> eight: 0

6,7 ngram two -> eight: 0

top terms:
[
  [ [ 0, 4 ], [ "ngram one", "two one" ], 0.6771508628309691 ],
  [ [ 1, 5 ], [ "ngram two", "ngram" ], 0.5450309168246379 ],
  [ [ 2, 5 ], [ "ngram three", "ngram" ], 0.385784610577799 ],
  [ [ 3, 7 ], [ "ngram eight", "eight" ], 0.8800467299778363 ],
  [ [ 4, 6 ], [ "two one", "ngram two" ], 0.5355034322447702 ],
  [ [ 5, 6 ], [ "ngram", "ngram two" ], 0.5450309168246379 ],
  [ [ 6, 7 ], [ "ngram two", "eight" ], 0 ]
]

*/

test("calculateTf should return TF vectors", () => {
  const corpus = [["term1", "term2", "term3"]];
  const terms = new Map([
    ["term1", 0],
    ["term2", 1],
    ["term3", 2],
  ]);
  const tfVectors = computeTF(corpus, terms);
  const expectedOutput = [[1, 1, 1]];
  assertEquals(tfVectors, expectedOutput);
});

test("calculateTf should handle empty document", () => {
  const corpus: Corpus = [];
  const terms = new Map();
  const tfVectors = computeTF(corpus, terms);
  const expectedOutput: TfVector[] = [];
  assertEquals(tfVectors, expectedOutput);
});

test("calculateTf should handle repeated single term", () => {
  const corpus = [["term1", "term1", "term1"]];
  const terms = new Map([["term1", 0]]);
  const tfVectors = computeTF(corpus, terms);
  const expectedOutput = [[3]];
  assertEquals(tfVectors, expectedOutput);
});

test("calculateTf should be case sensitive", () => {
  const corpus = [["term1", "Term1", "TERM1"]];
  const terms = new Map([
    ["term1", 0],
    ["Term1", 1],
    ["TERM1", 2],
  ]);
  const tfVectors = computeTF(corpus, terms);
  const expectedOutput = [[1, 1, 1]];
  assertEquals(tfVectors, expectedOutput);
});

// TODO: Renable and fix
// test('calculateIdf should return correct inverse document frequencies', () => {
// 	const termColumns: TermColumn = new Map([['term1', 0], ['term2', 1]]);
// 	// document 1: term2 term1, document 2: term1
// 	const tfVectors = [[1, 2], [1, 0]];
// 	const idfs = computeIDF(termColumns, tfVectors);
// 	const expected = [
// 		[0.6931471805599453, 0.6931471805599453],
// 		[0.6931471805599453, 0],
// 	];
// 	assertEquals(idfs[0][0], expected[0][0]);
// 	assertEquals(idfs[0][1], expected[0][1]);
// 	assertEquals(idfs[1][0], expected[1][0]);
// 	assertEquals(idfs[1][1], expected[1][1]);
// });
//
// test('calculateTfIdf should handle empty corpus', () => {
// 	const corpus: Corpus = [];
// 	const tfIdfs = computeTfIdfMatrix(corpus);
// 	const expected: TfIdfResult = {
// 		tfIdfVectors: [],
// 		terms: new Map(),
// 	};
// 	assertEquals(tfIdfs, expected);
// });
//
// test('calculateTfIdf with varying document lengths', () => {
// 	const corpus = [['term1', 'term2', 'term3'], ['term1', 'term2']];
// 	const { tfIdfVectors, terms } = computeTfIdfMatrix(corpus);
//
// 	assertEquals(terms.size, 3);
// 	assertEquals(tfIdfVectors.length, 2);
// 	assertEquals(tfIdfVectors[0].length, 3);
//
// 	const expected = [
// 		[0.6931471805599453, 0.6931471805599453, 0.6931471805599453],
// 		[0.6931471805599453, 0.6931471805599453, 0],
// 	];
// 	assertEquals(tfIdfVectors[0][0], expected[0][0]);
// });