Suffix Array

Posted by Dustin Boston in and .


The suffix array algorithm constructs a sorted array of all suffixes of a given string, enabling efficient substring searches, pattern matching, and text analysis. It uses sorting and optional auxiliary structures like the LCP array for additional optimizations.

Source Code Listing

code.ts

/**
 * @file Suffix Array
 * @author Dustin Boston
 * @see R. Sedgewick and K. Wayne, Algorithms, 4th ed. Upper Saddle River, NJ:
 *   Addison-Wesley, 2011. [Online]. Available: https://algs4.cs.princeton.edu/home/
 */

/** Represents a single suffix in a suffix array. */
/** Represents a single suffix in a suffix array. */
export class Suffix {
  constructor(
    public text: string,
    public index: number,
  ) {}

  /**
   * Gets the length of the suffix.
   *
   * @returns Text length minus the index
   */
  public get length(): number {
    return this.text.length - this.index;
  }

  /**
   * Returns the Unicode value of the character at the specified position or
   * undefined if the position is out of bounds.
   *
   * @param i The index of the character in the suffix.
   * @returns The Unicode value of the character.
   */
  public getCodePointAt(i: number): number {
    return this.text.codePointAt(this.index + i) ?? 0;
  }

  /**
   * Gets a character from the text by its index.
   *
   * @param i The index of the character.
   * @returns The character at the specified index.
   */
  public getCharAt(i: number): string {
    return this.text.charAt(this.index + i);
  }

  /**
   * Compares this suffix with another suffix.
   *
   * @param that The other suffix to compare.
   * @returns 0 if they are the same, -1 if this < that, and 1 if this > that.
   */
  public compareTo(that: Suffix): number {
    if (Object.is(this, that)) return 0; // Optimization
    const n = Math.min(this.length, that.length);
    for (let i = 0; i < n; i++) {
      if (this.getCodePointAt(i) < that.getCodePointAt(i)) return -1;
      if (this.getCodePointAt(i) > that.getCodePointAt(i)) return 1;
    }

    return this.length - that.length;
  }

  public toString(): string {
    return this.text.slice(Math.max(0, this.index));
  }
}

/** Suffix array. A suffix array is a sorted array of all suffixes of a string. */
export class SuffixArray {
  private readonly suffixes: Suffix[] = [];

  /**
   * Initializes a suffix array for the given `text` string.
   *
   * @param text The input string
   */
  constructor(text: string) {
    const n = text.length;
    for (let i = 0; i < n; i++) {
      this.suffixes[i] = new Suffix(text, i);
    }

    this.suffixes.sort((a, b) => a.compareTo(b));
  }

  /**
   * Gets the number of suffixes.
   *
   * @returns The number of suffixes
   */
  public get length(): number {
    return this.suffixes.length;
  }

  /**
   * Gets the index of the i-th smallest suffix in the original text.
   *
   * @param i The index of the suffix to retrieve.
   * @returns The start index of the i-th smallest suffix.
   * @throws If i is out of bounds.
   */
  public index(i: number): number {
    if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
    return this.suffixes[i].index;
  }

  /**
   * Computes the longest common prefix (LCP) between the i-th smallest suffix
   * and the previous suffix.
   *
   * @param i The index of the suffix.
   * @returns The length of the LCP.
   * @throws If i is out of bounds.
   */
  public lcp(i: number): number {
    if (i < 1 || i >= this.length) throw new RangeError("Index out of bounds");
    return this.lcpSuffix(this.suffixes[i], this.suffixes[i - 1]);
  }

  /**
   * Returns the i-th smallest suffix as a string.
   *
   * @param i The index
   * @returns The i-th smallest suffex of the string.
   * @throws If i is out of bounds.
   */
  public select(i: number) {
    if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
    return this.suffixes[i].toString();
  }

  /**
   * Gets the number of suffixes less than the query with binary search.
   *
   * @param query The query string
   * @returns The number of suffixes less than the query.
   */
  public rank(query: string): number {
    let lo = 0;
    let hi = this.length - 1;
    while (lo <= hi) {
      const mid = Math.floor(lo + (hi - lo) / 2);
      const cmp = this.compare(query, this.suffixes[mid]);
      if (cmp < 0) hi = mid - 1;
      else if (cmp > 0) lo = mid + 1;
      else return mid;
    }

    return lo;
  }

  /**
   * Computes the longest common prefix between two suffixes.
   *
   * @param a The first suffix.
   * @param b The second suffix.
   * @returns The length of the longest common prefix.
   */
  private lcpSuffix(a: Suffix, b: Suffix): number {
    const n = Math.min(a.length, b.length);
    for (let i = 0; i < n; i++) {
      if (a.getCodePointAt(i) !== b.getCodePointAt(i)) return i;
    }

    return n;
  }

  /**
   * Compares a query string with a suffix.
   *
   * @param query The query string.
   * @param suffix The suffix to compare with.
   * @returns -1, 0, or 1 depending on the comparison result.
   * @throws If suffix is undefined.
   */
  private compare(query: string, suffix: Suffix): number {
    // If (!suffix) throw new Error('Suffix is undefined');
    const n = Math.min(query.length, suffix.length);
    for (let i = 0; i < n; i++) {
      if ((query.codePointAt(i) ?? 0) < suffix.getCodePointAt(i)) return -1;
      if ((query.codePointAt(i) ?? 0) > suffix.getCodePointAt(i)) return 1;
    }

    return query.length - suffix.length;
  }
}

/**
 * Finds the longest repeated substring (LRS) in a given text that appears at
 * least twice. The repetitions can overlap if they are distinct.
 *
 * @param text The string in which to find the longest repeated substring.
 * @returns The longest repeated substring or empty string if not found.
 */
export function longestRepeatedSubstring(text: string): string {
  const n: number = text.length;
  const sa: SuffixArray = new SuffixArray(text);
  let lrs = "";
  for (let i = 1; i < n; i++) {
    const length = sa.lcp(i);
    if (length > lrs.length) {
      lrs = text.slice(Number(sa.index(i)), Number(sa.index(i)) + length);
    }
  }

  return lrs;
}

export type DebugRank = Array<{
  i: number;
  index: number;
  lcp: number;
  rank: number;
  select: string;
}>;

/**
 * Gets an array of objects containing rank, index, and lcp for each suffix in
 * the given string.
 *
 * @param s The string to rank.
 * @returns An array of objects with detailed suffix information.
 */
export function debugRank(s: string): DebugRank {
  const sa = new SuffixArray(s);
  const ranked: DebugRank = [];
  for (let i = 0; i < s.length; i++) {
    const index = sa.index(i);
    const select = s.slice(index, Math.min(index + 50, s.length));
    const rank = sa.rank(s.slice(Math.max(0, index)));
    if (i === 0) {
      ranked.push({i, index, lcp: -1, rank, select});
    } else {
      const lcp = sa.lcp(i);
      ranked.push({i, index, lcp, rank, select});
    }
  }

  return ranked;
}

(() => {
  // Define an example string
  const text = "banana";

  // Create a suffix array from the string
  const suffixArray = new SuffixArray(text);

  // Retrieve and print the sorted suffixes with their indices
  for (let i = 0; i < suffixArray.length; i++) {
    const index = suffixArray.index(i);
    const suffix = suffixArray.select(i);
    console.log(`Suffix at index ${index}: "${suffix}"`);
  }

  // Demonstrate LCP (Longest Common Prefix) functionality
  if (suffixArray.length > 1) {
    console.log(`LCP of suffixes 1 and 2: ${suffixArray.lcp(1)}`);
  }

  // Rank an arbitrary substring
  const substring = "ana";
  console.log(
    `Rank of substring "${substring}": ${suffixArray.rank(substring)}`,
  );

  const repeated = longestRepeatedSubstring("abcabcabc");
  console.log(`Longest repeated substring: '${repeated}'`);
})();

suffix_array_lrs_test.ts

import {longestRepeatedSubstring} from "../../../../data/algorithms/src/suffix_array_lrs.ts";
import {assertEquals} from "$assert";

test(function testLrsAcaag() {
  const s = "aacaagtttacaagc";
  const lrs = longestRepeatedSubstring(s);
  assertEquals(lrs, "acaag");
});

test(function testTinyTale() {
  const tinyTale = Deno.readTextFileSync("./test/fixtures/tiny_tale.txt");
  const expected = "st of times it was the ";
  const actual = longestRepeatedSubstring(tinyTale.replaceAll(/\s+/g, " "));
  assertEquals(actual, expected);
});

test(function testMobyDick() {
  const mobyDick = Deno.readTextFileSync("./test/fixtures/moby_dick.txt");
  const expected =
    ",- Such a funny, sporty, gamy, jesty, " +
    "joky, hoky-poky lad, is the Ocean, oh! Th";
  const actual = longestRepeatedSubstring(mobyDick.replaceAll(/\s+/g, " "));
  assertEquals(actual, expected);
});

test(function testAaaaaaaaa() {
  const expected = "aaaaaaaa";
  const actual = longestRepeatedSubstring("aaaaaaaaa");
  assertEquals(actual, expected);
});

suffix_array_test.ts

import {
  type DebugRank,
  debugRank,
  Suffix,
  SuffixArray,
} from "../../../../data/algorithms/src/suffix_array.ts";
import {
  assertEquals,
  assertGreater,
  assertLess,
  assertObjectMatch,
  assertThrows,
} from "$assert";

/* ========================================================================
   SUFFIX CLASS TESTS
   ======================================================================== */

test("test Suffix length", () => {
  const suffix = new Suffix("example", 2);
  assertEquals(suffix.length, 5);
});

test(function testSuffixCharAt() {
  const suffix = new Suffix("example", 3);
  assertEquals(suffix.charAt(0), "m");
});

test(function testSuffixToString() {
  const suffix = new Suffix("example", 3);
  assertEquals(suffix.toString(), "mple");
});

test(function testSuffixCompareToEq() {
  const suffix1 = new Suffix("example", 2);
  const suffix2 = new Suffix("example", 2);
  assertEquals(suffix1.compareTo(suffix2), 0);
});

test(function testSuffixCompareToLt() {
  const suffix1 = new Suffix("example", 2);
  const suffix2 = new Suffix("example", 3);
  assertLess(suffix1.compareTo(suffix2), 0);
});

test(function testSuffixCompareToGt() {
  const suffix1 = new Suffix("example", 3);
  const suffix2 = new Suffix("example", 2);
  assertGreater(suffix1.compareTo(suffix2), 0);
});

test(function testSuffixLengths() {
  const suffix1 = new Suffix("hello", 2); // "llo"
  const suffix2 = new Suffix("hello world", 2); // "llo world"
  assertLess(suffix1.compareTo(suffix2), 0);
});

/* ========================================================================
   SUFFIX ARRAY CLASS TESTS
   ======================================================================== */

test(function testConstructor() {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  assertEquals(suffixArray.length, 6);
  assertEquals(suffixArray.index(0), 5);
  assertEquals(suffixArray.index(1), 3);
  assertEquals(suffixArray.index(2), 1);
  // And so on for sorted order of suffixes
});

test(function testIndex() {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  assertThrows(() => suffixArray.index(-1), RangeError);
  assertThrows(() => suffixArray.index(6), RangeError);
  assertEquals(suffixArray.index(2), 1);
});

test(function testLcp() {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  assertEquals(suffixArray.lcp(2), 3);
  assertEquals(suffixArray.lcp(1), 1);
  assertThrows(() => suffixArray.lcp(0), RangeError);
  assertThrows(() => suffixArray.lcp(6), RangeError);
});

test(function testSelect() {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  assertThrows(() => suffixArray.select(-1), RangeError);
  assertThrows(() => suffixArray.select(6), RangeError);
  assertEquals(suffixArray.select(0), "a");
  assertEquals(suffixArray.select(1), "ana");
});

test(function testRank() {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  assertEquals(suffixArray.rank("banana"), 3);
  assertEquals(suffixArray.rank("apple"), 3);
  assertEquals(suffixArray.rank("zoo"), 6);
});

/* ========================================================================
   DEBUG RANK
   ======================================================================== */

test(function testDebugRank() {
  const actual = debugRank("ABRACADABRA!");
  const expected: DebugRank = [
    {i: 0, index: 11, lcp: -1, rank: 0, select: "!"},
    {i: 1, index: 10, lcp: 0, rank: 1, select: "A!"},
    {i: 2, index: 7, lcp: 1, rank: 2, select: "ABRA!"},
    {i: 3, index: 0, lcp: 4, rank: 3, select: "ABRACADABRA!"},
    {i: 4, index: 3, lcp: 1, rank: 4, select: "ACADABRA!"},
    {i: 5, index: 5, lcp: 1, rank: 5, select: "ADABRA!"},
    {i: 6, index: 8, lcp: 0, rank: 6, select: "BRA!"},
    {i: 7, index: 1, lcp: 3, rank: 7, select: "BRACADABRA!"},
    {i: 8, index: 4, lcp: 0, rank: 8, select: "CADABRA!"},
    {i: 9, index: 6, lcp: 0, rank: 9, select: "DABRA!"},
    {i: 10, index: 9, lcp: 0, rank: 10, select: "RA!"},
    {i: 11, index: 2, lcp: 2, rank: 11, select: "RACADABRA!"},
  ];
  assertObjectMatch({rank: actual}, {rank: expected});
});