Textrank

Posted by Dustin Boston in .


Source Code Listing

textrank.ts

/**
 * Textrank Algorithm Implementation
 * @see Mihalcea, Rada, and Tarau, Paul. (2004). _TextRank: Bringing Order
 *     into Text._ Association for Computational Linguistics.
 * @see https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
 */

import {Vert} from "../../../../data/algorithms/nlp/vert.ts";
import {type Weights, type Ngram} from "../../../../data/algorithms/nlp/types";
import {type ExtractionStrategy} from "../../../../data/algorithms/nlp/extraction_strategy.ts";

export const DAMPEN = 0.85;

/**
 * Implementation of the Textrank algorithm. The strategy pattern is
 * used to direct whether we are ranking keywords/co-references or
 * sentences/similarity.
 */
export class Textrank {
  constructor(public extractionStrategy: ExtractionStrategy) {}

  extractData(ngrams: Ngram[]): Vert[] {
    const graphData = this.createGraph(ngrams);
    const {vertices, adj} = graphData;
    this.extractionStrategy.collect(graphData);
    this.scoreVertices(vertices, adj);
    return this.sortResult(vertices);
  }

  createGraph(ngrams: Ngram[]) {
    const corpus = new Map<string, number>();
    const vertices: Vert[] = []; // Indices are ids
    const tokenVec: number[] = [];

    for (const ngram of ngrams) {
      const text = this.extractionStrategy.transform(ngram);
      let id = corpus.get(text);
      if (id === undefined) {
        id = corpus.size;
        corpus.set(text, id);
        const vert = new Vert(id, text);
        vert.ngram = ngram;
        vertices.push(vert);
      }

      tokenVec.push(id);
    }

    // Initialize a sparse adjacency matrix - stores the number of occurances of an edge.
    const adj: number[][] = [];
    for (let i = 0; i < corpus.size; i++) adj[i] = [];
    return {tokenVec, vertices, adj};
  }

  scoreVertices(vertices: Vert[], adj: number[][]) {
    const threshhold = 0.0001;
    const maxIterations = 100;

    let converged = 0;
    let iterations = 0;

    while (converged < vertices.length && iterations < maxIterations) {
      converged = 0;
      iterations++;
      const scores = Array.from({length: vertices.length});
      for (const [i, vi] of vertices.entries()) {
        const previous = vi.score;
        scores[i] = this.weightedScore(adj, vi);
        const errorRate = scores[i] - previous;
        if (errorRate <= threshhold) {
          converged++;
        }
      }

      // Update all scores
      for (const [i, score] of scores.entries()) vertices[i].score = score;
    }
  }

  sortResult(vertices: Vert[]) {
    return vertices.toSorted((a, b) => b.score - a.score);
  }

  weightedScore(weights: Weights, vert: Vert): number {
    let sum = 0;
    for (const inbound of vert.inbound) {
      let denom = 0;
      for (const outbound of inbound.outbound) {
        denom += weights[inbound.id][outbound.id];
      }

      // Skip if there are no outbound vertices, avoiding division by zero
      if (denom === 0) continue;
      sum += (weights[inbound.id][vert.id] / denom) * inbound.score;
    }

    // Return the score, but wait until after convergence to set it.
    const score = 1 - DAMPEN + DAMPEN * sum;
    return score;
  }
}