Suffix Array
Posted by Dustin Boston in Algorithms and Data Structures.
The suffix array algorithm constructs a sorted array of all suffixes of a given string, enabling efficient substring searches, pattern matching, and text analysis. It uses sorting and optional auxiliary structures like the LCP array for additional optimizations.
Source Code Listing
code.ts
/**
* @file Suffix Array
* @author Dustin Boston
* @see R. Sedgewick and K. Wayne, Algorithms, 4th ed. Upper Saddle River, NJ:
* Addison-Wesley, 2011. [Online]. Available: https://algs4.cs.princeton.edu/home/
*/
/** Represents a single suffix in a suffix array. */
/** Represents a single suffix in a suffix array. */
export class Suffix {
constructor(
public text: string,
public index: number,
) {}
/**
* Gets the length of the suffix.
*
* @returns Text length minus the index
*/
public get length(): number {
return this.text.length - this.index;
}
/**
* Returns the Unicode value of the character at the specified position or
* undefined if the position is out of bounds.
*
* @param i The index of the character in the suffix.
* @returns The Unicode value of the character.
*/
public getCodePointAt(i: number): number {
return this.text.codePointAt(this.index + i) ?? 0;
}
/**
* Gets a character from the text by its index.
*
* @param i The index of the character.
* @returns The character at the specified index.
*/
public getCharAt(i: number): string {
return this.text.charAt(this.index + i);
}
/**
* Compares this suffix with another suffix.
*
* @param that The other suffix to compare.
* @returns 0 if they are the same, -1 if this < that, and 1 if this > that.
*/
public compareTo(that: Suffix): number {
if (Object.is(this, that)) return 0; // Optimization
const n = Math.min(this.length, that.length);
for (let i = 0; i < n; i++) {
if (this.getCodePointAt(i) < that.getCodePointAt(i)) return -1;
if (this.getCodePointAt(i) > that.getCodePointAt(i)) return 1;
}
return this.length - that.length;
}
public toString(): string {
return this.text.slice(Math.max(0, this.index));
}
}
/** Suffix array. A suffix array is a sorted array of all suffixes of a string. */
export class SuffixArray {
private readonly suffixes: Suffix[] = [];
/**
* Initializes a suffix array for the given `text` string.
*
* @param text The input string
*/
constructor(text: string) {
const n = text.length;
for (let i = 0; i < n; i++) {
this.suffixes[i] = new Suffix(text, i);
}
this.suffixes.sort((a, b) => a.compareTo(b));
}
/**
* Gets the number of suffixes.
*
* @returns The number of suffixes
*/
public get length(): number {
return this.suffixes.length;
}
/**
* Gets the index of the i-th smallest suffix in the original text.
*
* @param i The index of the suffix to retrieve.
* @returns The start index of the i-th smallest suffix.
* @throws If i is out of bounds.
*/
public index(i: number): number {
if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
return this.suffixes[i].index;
}
/**
* Computes the longest common prefix (LCP) between the i-th smallest suffix
* and the previous suffix.
*
* @param i The index of the suffix.
* @returns The length of the LCP.
* @throws If i is out of bounds.
*/
public lcp(i: number): number {
if (i < 1 || i >= this.length) throw new RangeError("Index out of bounds");
return this.lcpSuffix(this.suffixes[i], this.suffixes[i - 1]);
}
/**
* Returns the i-th smallest suffix as a string.
*
* @param i The index
* @returns The i-th smallest suffex of the string.
* @throws If i is out of bounds.
*/
public select(i: number) {
if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
return this.suffixes[i].toString();
}
/**
* Gets the number of suffixes less than the query with binary search.
*
* @param query The query string
* @returns The number of suffixes less than the query.
*/
public rank(query: string): number {
let lo = 0;
let hi = this.length - 1;
while (lo <= hi) {
const mid = Math.floor(lo + (hi - lo) / 2);
const cmp = this.compare(query, this.suffixes[mid]);
if (cmp < 0) hi = mid - 1;
else if (cmp > 0) lo = mid + 1;
else return mid;
}
return lo;
}
/**
* Computes the longest common prefix between two suffixes.
*
* @param a The first suffix.
* @param b The second suffix.
* @returns The length of the longest common prefix.
*/
private lcpSuffix(a: Suffix, b: Suffix): number {
const n = Math.min(a.length, b.length);
for (let i = 0; i < n; i++) {
if (a.getCodePointAt(i) !== b.getCodePointAt(i)) return i;
}
return n;
}
/**
* Compares a query string with a suffix.
*
* @param query The query string.
* @param suffix The suffix to compare with.
* @returns -1, 0, or 1 depending on the comparison result.
* @throws If suffix is undefined.
*/
private compare(query: string, suffix: Suffix): number {
// If (!suffix) throw new Error('Suffix is undefined');
const n = Math.min(query.length, suffix.length);
for (let i = 0; i < n; i++) {
if ((query.codePointAt(i) ?? 0) < suffix.getCodePointAt(i)) return -1;
if ((query.codePointAt(i) ?? 0) > suffix.getCodePointAt(i)) return 1;
}
return query.length - suffix.length;
}
}
/**
* Finds the longest repeated substring (LRS) in a given text that appears at
* least twice. The repetitions can overlap if they are distinct.
*
* @param text The string in which to find the longest repeated substring.
* @returns The longest repeated substring or empty string if not found.
*/
export function longestRepeatedSubstring(text: string): string {
const n: number = text.length;
const sa: SuffixArray = new SuffixArray(text);
let lrs = "";
for (let i = 1; i < n; i++) {
const length = sa.lcp(i);
if (length > lrs.length) {
lrs = text.slice(Number(sa.index(i)), Number(sa.index(i)) + length);
}
}
return lrs;
}
export type DebugRank = Array<{
i: number;
index: number;
lcp: number;
rank: number;
select: string;
}>;
/**
* Gets an array of objects containing rank, index, and lcp for each suffix in
* the given string.
*
* @param s The string to rank.
* @returns An array of objects with detailed suffix information.
*/
export function debugRank(s: string): DebugRank {
const sa = new SuffixArray(s);
const ranked: DebugRank = [];
for (let i = 0; i < s.length; i++) {
const index = sa.index(i);
const select = s.slice(index, Math.min(index + 50, s.length));
const rank = sa.rank(s.slice(Math.max(0, index)));
if (i === 0) {
ranked.push({i, index, lcp: -1, rank, select});
} else {
const lcp = sa.lcp(i);
ranked.push({i, index, lcp, rank, select});
}
}
return ranked;
}
code_test.ts
import {describe, test, expect} from "bun:test";
import {type DebugRank, debugRank, Suffix, SuffixArray} from "./code";
import * as path from "path";
import {longestRepeatedSubstring} from "./code";
/* ========================================================================
SUFFIX CLASS TESTS
======================================================================== */
test("test Suffix length", () => {
const suffix = new Suffix("example", 2);
expect(suffix.length).toBe(5);
});
test("testSuffixCharAt", () => {
const suffix = new Suffix("example", 3);
expect(suffix.getCharAt(0)).toBe("m");
});
test("testSuffixToString", () => {
const suffix = new Suffix("example", 3);
expect(suffix.toString()).toBe("mple");
});
test("testSuffixCompareToEq", () => {
const suffix1 = new Suffix("example", 2);
const suffix2 = new Suffix("example", 2);
expect(suffix1.compareTo(suffix2)).toBe(0);
});
test("testSuffixCompareToLt", () => {
const suffix1 = new Suffix("example", 2);
const suffix2 = new Suffix("example", 3);
expect(suffix1.compareTo(suffix2)).toBeLessThan(0);
});
test("testSuffixCompareToGt", () => {
const suffix1 = new Suffix("example", 3);
const suffix2 = new Suffix("example", 2);
expect(suffix1.compareTo(suffix2)).toBeGreaterThan(0);
});
test("testSuffixLengths", () => {
const suffix1 = new Suffix("hello", 2); // "llo"
const suffix2 = new Suffix("hello world", 2); // "llo world"
expect(suffix1.compareTo(suffix2)).toBeLessThan(0);
});
/* ========================================================================
SUFFIX ARRAY CLASS TESTS
======================================================================== */
test("testConstructor", () => {
const text = "banana";
const suffixArray = new SuffixArray(text);
expect(suffixArray.length).toBe(6);
expect(suffixArray.index(0)).toBe(5);
expect(suffixArray.index(1)).toBe(3);
expect(suffixArray.index(2)).toBe(1);
// And so on for sorted order of suffixes
});
test("testIndex", () => {
const text = "banana";
const suffixArray = new SuffixArray(text);
expect(() => suffixArray.index(-1)).toThrow(RangeError);
expect(() => suffixArray.index(6)).toThrow(RangeError);
expect(suffixArray.index(2)).toBe(1);
});
test("testLcp", () => {
const text = "banana";
const suffixArray = new SuffixArray(text);
expect(suffixArray.lcp(2)).toBe(3);
expect(suffixArray.lcp(1)).toBe(1);
expect(() => suffixArray.lcp(0)).toThrow(RangeError);
expect(() => suffixArray.lcp(6)).toThrow(RangeError);
});
test("testSelect", () => {
const text = "banana";
const suffixArray = new SuffixArray(text);
expect(() => suffixArray.select(-1)).toThrow(RangeError);
expect(() => suffixArray.select(6)).toThrow(RangeError);
expect(suffixArray.select(0)).toBe("a");
expect(suffixArray.select(1)).toBe("ana");
});
test("testRank", () => {
const text = "banana";
const suffixArray = new SuffixArray(text);
expect(suffixArray.rank("banana")).toBe(3);
expect(suffixArray.rank("apple")).toBe(3);
expect(suffixArray.rank("zoo")).toBe(6);
});
/* ========================================================================
DEBUG RANK
======================================================================== */
test("testDebugRank", () => {
const actual = debugRank("ABRACADABRA!");
const expected: DebugRank = [
{i: 0, index: 11, lcp: -1, rank: 0, select: "!"},
{i: 1, index: 10, lcp: 0, rank: 1, select: "A!"},
{i: 2, index: 7, lcp: 1, rank: 2, select: "ABRA!"},
{i: 3, index: 0, lcp: 4, rank: 3, select: "ABRACADABRA!"},
{i: 4, index: 3, lcp: 1, rank: 4, select: "ACADABRA!"},
{i: 5, index: 5, lcp: 1, rank: 5, select: "ADABRA!"},
{i: 6, index: 8, lcp: 0, rank: 6, select: "BRA!"},
{i: 7, index: 1, lcp: 3, rank: 7, select: "BRACADABRA!"},
{i: 8, index: 4, lcp: 0, rank: 8, select: "CADABRA!"},
{i: 9, index: 6, lcp: 0, rank: 9, select: "DABRA!"},
{i: 10, index: 9, lcp: 0, rank: 10, select: "RA!"},
{i: 11, index: 2, lcp: 2, rank: 11, select: "RACADABRA!"},
];
expect(actual).toEqual(expected);
});
/* ========================================================================
LONGEST REPEATED SUBSTRING TESTS
======================================================================== */
describe("longestRepeatedSubstring", () => {
test("should find LRS in a simple string", () => {
const s = "aacaagtttacaagc";
const lrs = longestRepeatedSubstring(s);
expect(lrs).toBe("acaag");
});
test("should find LRS in tiny_tale.txt", async () => {
const tinyTale = Bun.file(path.join(import.meta.dir, "tiny-tale.txt"));
const expected = "st of times, it was the ";
const actual = longestRepeatedSubstring(await tinyTale.text());
expect(actual).toBe(expected);
});
test("should find LRS in moby_dick.txt", async () => {
const tinyTale = Bun.file(path.join(import.meta.dir, "moby-dick.txt"));
const expected =
",— Such a funny, sporty, gamy, jesty, " +
"joky, hoky-poky lad, is the Ocean, oh! Th";
const text = await tinyTale.text();
const actual = longestRepeatedSubstring(text.replaceAll(/\s+/g, " "));
expect(actual).toBe(expected);
});
test("should find LRS in a string of same characters", () => {
const expected = "aaaaaaaa";
const actual = longestRepeatedSubstring("aaaaaaaaa");
expect(actual).toBe(expected);
});
});