Suffix Array

Posted by Dustin Boston in and .

The suffix array algorithm constructs a sorted array of all suffixes of a given string, enabling efficient substring searches, pattern matching, and text analysis. It uses sorting and optional auxiliary structures like the LCP array for additional optimizations.

Source Code Listing

code.ts

/**
 * @file Suffix Array
 * @author Dustin Boston
 * @see R. Sedgewick and K. Wayne, Algorithms, 4th ed. Upper Saddle River, NJ:
 *   Addison-Wesley, 2011. [Online]. Available: https://algs4.cs.princeton.edu/home/
 */

/** Represents a single suffix in a suffix array. */
/** Represents a single suffix in a suffix array. */
export class Suffix {
  constructor(
    public text: string,
    public index: number,
  ) {}

  /**
   * Gets the length of the suffix.
   *
   * @returns Text length minus the index
   */
  public get length(): number {
    return this.text.length - this.index;
  }

  /**
   * Returns the Unicode value of the character at the specified position or
   * undefined if the position is out of bounds.
   *
   * @param i The index of the character in the suffix.
   * @returns The Unicode value of the character.
   */
  public getCodePointAt(i: number): number {
    return this.text.codePointAt(this.index + i) ?? 0;
  }

  /**
   * Gets a character from the text by its index.
   *
   * @param i The index of the character.
   * @returns The character at the specified index.
   */
  public getCharAt(i: number): string {
    return this.text.charAt(this.index + i);
  }

  /**
   * Compares this suffix with another suffix.
   *
   * @param that The other suffix to compare.
   * @returns 0 if they are the same, -1 if this < that, and 1 if this > that.
   */
  public compareTo(that: Suffix): number {
    if (Object.is(this, that)) return 0; // Optimization
    const n = Math.min(this.length, that.length);
    for (let i = 0; i < n; i++) {
      if (this.getCodePointAt(i) < that.getCodePointAt(i)) return -1;
      if (this.getCodePointAt(i) > that.getCodePointAt(i)) return 1;
    }

    return this.length - that.length;
  }

  public toString(): string {
    return this.text.slice(Math.max(0, this.index));
  }
}

/** Suffix array. A suffix array is a sorted array of all suffixes of a string. */
export class SuffixArray {
  private readonly suffixes: Suffix[] = [];

  /**
   * Initializes a suffix array for the given `text` string.
   *
   * @param text The input string
   */
  constructor(text: string) {
    const n = text.length;
    for (let i = 0; i < n; i++) {
      this.suffixes[i] = new Suffix(text, i);
    }

    this.suffixes.sort((a, b) => a.compareTo(b));
  }

  /**
   * Gets the number of suffixes.
   *
   * @returns The number of suffixes
   */
  public get length(): number {
    return this.suffixes.length;
  }

  /**
   * Gets the index of the i-th smallest suffix in the original text.
   *
   * @param i The index of the suffix to retrieve.
   * @returns The start index of the i-th smallest suffix.
   * @throws If i is out of bounds.
   */
  public index(i: number): number {
    if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
    return this.suffixes[i].index;
  }

  /**
   * Computes the longest common prefix (LCP) between the i-th smallest suffix
   * and the previous suffix.
   *
   * @param i The index of the suffix.
   * @returns The length of the LCP.
   * @throws If i is out of bounds.
   */
  public lcp(i: number): number {
    if (i < 1 || i >= this.length) throw new RangeError("Index out of bounds");
    return this.lcpSuffix(this.suffixes[i], this.suffixes[i - 1]);
  }

  /**
   * Returns the i-th smallest suffix as a string.
   *
   * @param i The index
   * @returns The i-th smallest suffex of the string.
   * @throws If i is out of bounds.
   */
  public select(i: number) {
    if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
    return this.suffixes[i].toString();
  }

  /**
   * Gets the number of suffixes less than the query with binary search.
   *
   * @param query The query string
   * @returns The number of suffixes less than the query.
   */
  public rank(query: string): number {
    let lo = 0;
    let hi = this.length - 1;
    while (lo <= hi) {
      const mid = Math.floor(lo + (hi - lo) / 2);
      const cmp = this.compare(query, this.suffixes[mid]);
      if (cmp < 0) hi = mid - 1;
      else if (cmp > 0) lo = mid + 1;
      else return mid;
    }

    return lo;
  }

  /**
   * Computes the longest common prefix between two suffixes.
   *
   * @param a The first suffix.
   * @param b The second suffix.
   * @returns The length of the longest common prefix.
   */
  private lcpSuffix(a: Suffix, b: Suffix): number {
    const n = Math.min(a.length, b.length);
    for (let i = 0; i < n; i++) {
      if (a.getCodePointAt(i) !== b.getCodePointAt(i)) return i;
    }

    return n;
  }

  /**
   * Compares a query string with a suffix.
   *
   * @param query The query string.
   * @param suffix The suffix to compare with.
   * @returns -1, 0, or 1 depending on the comparison result.
   * @throws If suffix is undefined.
   */
  private compare(query: string, suffix: Suffix): number {
    // If (!suffix) throw new Error('Suffix is undefined');
    const n = Math.min(query.length, suffix.length);
    for (let i = 0; i < n; i++) {
      if ((query.codePointAt(i) ?? 0) < suffix.getCodePointAt(i)) return -1;
      if ((query.codePointAt(i) ?? 0) > suffix.getCodePointAt(i)) return 1;
    }

    return query.length - suffix.length;
  }
}

/**
 * Finds the longest repeated substring (LRS) in a given text that appears at
 * least twice. The repetitions can overlap if they are distinct.
 *
 * @param text The string in which to find the longest repeated substring.
 * @returns The longest repeated substring or empty string if not found.
 */
export function longestRepeatedSubstring(text: string): string {
  const n: number = text.length;
  const sa: SuffixArray = new SuffixArray(text);
  let lrs = "";
  for (let i = 1; i < n; i++) {
    const length = sa.lcp(i);
    if (length > lrs.length) {
      lrs = text.slice(Number(sa.index(i)), Number(sa.index(i)) + length);
    }
  }

  return lrs;
}

export type DebugRank = Array<{
  i: number;
  index: number;
  lcp: number;
  rank: number;
  select: string;
}>;

/**
 * Gets an array of objects containing rank, index, and lcp for each suffix in
 * the given string.
 *
 * @param s The string to rank.
 * @returns An array of objects with detailed suffix information.
 */
export function debugRank(s: string): DebugRank {
  const sa = new SuffixArray(s);
  const ranked: DebugRank = [];
  for (let i = 0; i < s.length; i++) {
    const index = sa.index(i);
    const select = s.slice(index, Math.min(index + 50, s.length));
    const rank = sa.rank(s.slice(Math.max(0, index)));
    if (i === 0) {
      ranked.push({i, index, lcp: -1, rank, select});
    } else {
      const lcp = sa.lcp(i);
      ranked.push({i, index, lcp, rank, select});
    }
  }

  return ranked;
}

code_test.ts

import {describe, test, expect} from "bun:test";
import {type DebugRank, debugRank, Suffix, SuffixArray} from "./code";
import * as path from "path";
import {longestRepeatedSubstring} from "./code";

/* ========================================================================
   SUFFIX CLASS TESTS
   ======================================================================== */

test("test Suffix length", () => {
  const suffix = new Suffix("example", 2);
  expect(suffix.length).toBe(5);
});

test("testSuffixCharAt", () => {
  const suffix = new Suffix("example", 3);
  expect(suffix.getCharAt(0)).toBe("m");
});

test("testSuffixToString", () => {
  const suffix = new Suffix("example", 3);
  expect(suffix.toString()).toBe("mple");
});

test("testSuffixCompareToEq", () => {
  const suffix1 = new Suffix("example", 2);
  const suffix2 = new Suffix("example", 2);
  expect(suffix1.compareTo(suffix2)).toBe(0);
});

test("testSuffixCompareToLt", () => {
  const suffix1 = new Suffix("example", 2);
  const suffix2 = new Suffix("example", 3);
  expect(suffix1.compareTo(suffix2)).toBeLessThan(0);
});

test("testSuffixCompareToGt", () => {
  const suffix1 = new Suffix("example", 3);
  const suffix2 = new Suffix("example", 2);
  expect(suffix1.compareTo(suffix2)).toBeGreaterThan(0);
});

test("testSuffixLengths", () => {
  const suffix1 = new Suffix("hello", 2); // "llo"
  const suffix2 = new Suffix("hello world", 2); // "llo world"
  expect(suffix1.compareTo(suffix2)).toBeLessThan(0);
});

/* ========================================================================
   SUFFIX ARRAY CLASS TESTS
   ======================================================================== */

test("testConstructor", () => {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  expect(suffixArray.length).toBe(6);
  expect(suffixArray.index(0)).toBe(5);
  expect(suffixArray.index(1)).toBe(3);
  expect(suffixArray.index(2)).toBe(1);
  // And so on for sorted order of suffixes
});

test("testIndex", () => {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  expect(() => suffixArray.index(-1)).toThrow(RangeError);
  expect(() => suffixArray.index(6)).toThrow(RangeError);
  expect(suffixArray.index(2)).toBe(1);
});

test("testLcp", () => {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  expect(suffixArray.lcp(2)).toBe(3);
  expect(suffixArray.lcp(1)).toBe(1);
  expect(() => suffixArray.lcp(0)).toThrow(RangeError);
  expect(() => suffixArray.lcp(6)).toThrow(RangeError);
});

test("testSelect", () => {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  expect(() => suffixArray.select(-1)).toThrow(RangeError);
  expect(() => suffixArray.select(6)).toThrow(RangeError);
  expect(suffixArray.select(0)).toBe("a");
  expect(suffixArray.select(1)).toBe("ana");
});

test("testRank", () => {
  const text = "banana";
  const suffixArray = new SuffixArray(text);
  expect(suffixArray.rank("banana")).toBe(3);
  expect(suffixArray.rank("apple")).toBe(3);
  expect(suffixArray.rank("zoo")).toBe(6);
});

/* ========================================================================
   DEBUG RANK
   ======================================================================== */

test("testDebugRank", () => {
  const actual = debugRank("ABRACADABRA!");
  const expected: DebugRank = [
    {i: 0, index: 11, lcp: -1, rank: 0, select: "!"},
    {i: 1, index: 10, lcp: 0, rank: 1, select: "A!"},
    {i: 2, index: 7, lcp: 1, rank: 2, select: "ABRA!"},
    {i: 3, index: 0, lcp: 4, rank: 3, select: "ABRACADABRA!"},
    {i: 4, index: 3, lcp: 1, rank: 4, select: "ACADABRA!"},
    {i: 5, index: 5, lcp: 1, rank: 5, select: "ADABRA!"},
    {i: 6, index: 8, lcp: 0, rank: 6, select: "BRA!"},
    {i: 7, index: 1, lcp: 3, rank: 7, select: "BRACADABRA!"},
    {i: 8, index: 4, lcp: 0, rank: 8, select: "CADABRA!"},
    {i: 9, index: 6, lcp: 0, rank: 9, select: "DABRA!"},
    {i: 10, index: 9, lcp: 0, rank: 10, select: "RA!"},
    {i: 11, index: 2, lcp: 2, rank: 11, select: "RACADABRA!"},
  ];
  expect(actual).toEqual(expected);
});

/* ========================================================================
   LONGEST REPEATED SUBSTRING TESTS
   ======================================================================== */

describe("longestRepeatedSubstring", () => {
  test("should find LRS in a simple string", () => {
    const s = "aacaagtttacaagc";
    const lrs = longestRepeatedSubstring(s);
    expect(lrs).toBe("acaag");
  });

  test("should find LRS in tiny_tale.txt", async () => {
    const tinyTale = Bun.file(path.join(import.meta.dir, "tiny-tale.txt"));
    const expected = "st of times, it was the ";
    const actual = longestRepeatedSubstring(await tinyTale.text());
    expect(actual).toBe(expected);
  });

  test("should find LRS in moby_dick.txt", async () => {
    const tinyTale = Bun.file(path.join(import.meta.dir, "moby-dick.txt"));
    const expected =
      ",— Such a funny, sporty, gamy, jesty, " +
      "joky, hoky-poky lad, is the Ocean, oh! Th";
    const text = await tinyTale.text();
    const actual = longestRepeatedSubstring(text.replaceAll(/\s+/g, " "));
    expect(actual).toBe(expected);
  });

  test("should find LRS in a string of same characters", () => {
    const expected = "aaaaaaaa";
    const actual = longestRepeatedSubstring("aaaaaaaaa");
    expect(actual).toBe(expected);
  });
});