Suffix Array
Posted by Dustin Boston in Algorithm and Data Structure.
The suffix array algorithm constructs a sorted array of all suffixes of a given string, enabling efficient substring searches, pattern matching, and text analysis. It uses sorting and optional auxiliary structures like the LCP array for additional optimizations.
Source Code Listing
code.ts
/**
* @file Suffix Array
* @author Dustin Boston
* @see R. Sedgewick and K. Wayne, Algorithms, 4th ed. Upper Saddle River, NJ:
* Addison-Wesley, 2011. [Online]. Available: https://algs4.cs.princeton.edu/home/
*/
/** Represents a single suffix in a suffix array. */
/** Represents a single suffix in a suffix array. */
export class Suffix {
constructor(
public text: string,
public index: number,
) {}
/**
* Gets the length of the suffix.
*
* @returns Text length minus the index
*/
public get length(): number {
return this.text.length - this.index;
}
/**
* Returns the Unicode value of the character at the specified position or
* undefined if the position is out of bounds.
*
* @param i The index of the character in the suffix.
* @returns The Unicode value of the character.
*/
public getCodePointAt(i: number): number {
return this.text.codePointAt(this.index + i) ?? 0;
}
/**
* Gets a character from the text by its index.
*
* @param i The index of the character.
* @returns The character at the specified index.
*/
public getCharAt(i: number): string {
return this.text.charAt(this.index + i);
}
/**
* Compares this suffix with another suffix.
*
* @param that The other suffix to compare.
* @returns 0 if they are the same, -1 if this < that, and 1 if this > that.
*/
public compareTo(that: Suffix): number {
if (Object.is(this, that)) return 0; // Optimization
const n = Math.min(this.length, that.length);
for (let i = 0; i < n; i++) {
if (this.getCodePointAt(i) < that.getCodePointAt(i)) return -1;
if (this.getCodePointAt(i) > that.getCodePointAt(i)) return 1;
}
return this.length - that.length;
}
public toString(): string {
return this.text.slice(Math.max(0, this.index));
}
}
/** Suffix array. A suffix array is a sorted array of all suffixes of a string. */
export class SuffixArray {
private readonly suffixes: Suffix[] = [];
/**
* Initializes a suffix array for the given `text` string.
*
* @param text The input string
*/
constructor(text: string) {
const n = text.length;
for (let i = 0; i < n; i++) {
this.suffixes[i] = new Suffix(text, i);
}
this.suffixes.sort((a, b) => a.compareTo(b));
}
/**
* Gets the number of suffixes.
*
* @returns The number of suffixes
*/
public get length(): number {
return this.suffixes.length;
}
/**
* Gets the index of the i-th smallest suffix in the original text.
*
* @param i The index of the suffix to retrieve.
* @returns The start index of the i-th smallest suffix.
* @throws If i is out of bounds.
*/
public index(i: number): number {
if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
return this.suffixes[i].index;
}
/**
* Computes the longest common prefix (LCP) between the i-th smallest suffix
* and the previous suffix.
*
* @param i The index of the suffix.
* @returns The length of the LCP.
* @throws If i is out of bounds.
*/
public lcp(i: number): number {
if (i < 1 || i >= this.length) throw new RangeError("Index out of bounds");
return this.lcpSuffix(this.suffixes[i], this.suffixes[i - 1]);
}
/**
* Returns the i-th smallest suffix as a string.
*
* @param i The index
* @returns The i-th smallest suffex of the string.
* @throws If i is out of bounds.
*/
public select(i: number) {
if (i < 0 || i >= this.length) throw new RangeError("Index out of bounds");
return this.suffixes[i].toString();
}
/**
* Gets the number of suffixes less than the query with binary search.
*
* @param query The query string
* @returns The number of suffixes less than the query.
*/
public rank(query: string): number {
let lo = 0;
let hi = this.length - 1;
while (lo <= hi) {
const mid = Math.floor(lo + (hi - lo) / 2);
const cmp = this.compare(query, this.suffixes[mid]);
if (cmp < 0) hi = mid - 1;
else if (cmp > 0) lo = mid + 1;
else return mid;
}
return lo;
}
/**
* Computes the longest common prefix between two suffixes.
*
* @param a The first suffix.
* @param b The second suffix.
* @returns The length of the longest common prefix.
*/
private lcpSuffix(a: Suffix, b: Suffix): number {
const n = Math.min(a.length, b.length);
for (let i = 0; i < n; i++) {
if (a.getCodePointAt(i) !== b.getCodePointAt(i)) return i;
}
return n;
}
/**
* Compares a query string with a suffix.
*
* @param query The query string.
* @param suffix The suffix to compare with.
* @returns -1, 0, or 1 depending on the comparison result.
* @throws If suffix is undefined.
*/
private compare(query: string, suffix: Suffix): number {
// If (!suffix) throw new Error('Suffix is undefined');
const n = Math.min(query.length, suffix.length);
for (let i = 0; i < n; i++) {
if ((query.codePointAt(i) ?? 0) < suffix.getCodePointAt(i)) return -1;
if ((query.codePointAt(i) ?? 0) > suffix.getCodePointAt(i)) return 1;
}
return query.length - suffix.length;
}
}
/**
* Finds the longest repeated substring (LRS) in a given text that appears at
* least twice. The repetitions can overlap if they are distinct.
*
* @param text The string in which to find the longest repeated substring.
* @returns The longest repeated substring or empty string if not found.
*/
export function longestRepeatedSubstring(text: string): string {
const n: number = text.length;
const sa: SuffixArray = new SuffixArray(text);
let lrs = "";
for (let i = 1; i < n; i++) {
const length = sa.lcp(i);
if (length > lrs.length) {
lrs = text.slice(Number(sa.index(i)), Number(sa.index(i)) + length);
}
}
return lrs;
}
export type DebugRank = Array<{
i: number;
index: number;
lcp: number;
rank: number;
select: string;
}>;
/**
* Gets an array of objects containing rank, index, and lcp for each suffix in
* the given string.
*
* @param s The string to rank.
* @returns An array of objects with detailed suffix information.
*/
export function debugRank(s: string): DebugRank {
const sa = new SuffixArray(s);
const ranked: DebugRank = [];
for (let i = 0; i < s.length; i++) {
const index = sa.index(i);
const select = s.slice(index, Math.min(index + 50, s.length));
const rank = sa.rank(s.slice(Math.max(0, index)));
if (i === 0) {
ranked.push({i, index, lcp: -1, rank, select});
} else {
const lcp = sa.lcp(i);
ranked.push({i, index, lcp, rank, select});
}
}
return ranked;
}
(() => {
// Define an example string
const text = "banana";
// Create a suffix array from the string
const suffixArray = new SuffixArray(text);
// Retrieve and print the sorted suffixes with their indices
for (let i = 0; i < suffixArray.length; i++) {
const index = suffixArray.index(i);
const suffix = suffixArray.select(i);
console.log(`Suffix at index ${index}: "${suffix}"`);
}
// Demonstrate LCP (Longest Common Prefix) functionality
if (suffixArray.length > 1) {
console.log(`LCP of suffixes 1 and 2: ${suffixArray.lcp(1)}`);
}
// Rank an arbitrary substring
const substring = "ana";
console.log(
`Rank of substring "${substring}": ${suffixArray.rank(substring)}`,
);
const repeated = longestRepeatedSubstring("abcabcabc");
console.log(`Longest repeated substring: '${repeated}'`);
})();
suffix_array_lrs_test.ts
import {longestRepeatedSubstring} from "../../../../data/algorithms/src/suffix_array_lrs.ts";
import {assertEquals} from "$assert";
test(function testLrsAcaag() {
const s = "aacaagtttacaagc";
const lrs = longestRepeatedSubstring(s);
assertEquals(lrs, "acaag");
});
test(function testTinyTale() {
const tinyTale = Deno.readTextFileSync("./test/fixtures/tiny_tale.txt");
const expected = "st of times it was the ";
const actual = longestRepeatedSubstring(tinyTale.replaceAll(/\s+/g, " "));
assertEquals(actual, expected);
});
test(function testMobyDick() {
const mobyDick = Deno.readTextFileSync("./test/fixtures/moby_dick.txt");
const expected =
",- Such a funny, sporty, gamy, jesty, " +
"joky, hoky-poky lad, is the Ocean, oh! Th";
const actual = longestRepeatedSubstring(mobyDick.replaceAll(/\s+/g, " "));
assertEquals(actual, expected);
});
test(function testAaaaaaaaa() {
const expected = "aaaaaaaa";
const actual = longestRepeatedSubstring("aaaaaaaaa");
assertEquals(actual, expected);
});
suffix_array_test.ts
import {
type DebugRank,
debugRank,
Suffix,
SuffixArray,
} from "../../../../data/algorithms/src/suffix_array.ts";
import {
assertEquals,
assertGreater,
assertLess,
assertObjectMatch,
assertThrows,
} from "$assert";
/* ========================================================================
SUFFIX CLASS TESTS
======================================================================== */
test("test Suffix length", () => {
const suffix = new Suffix("example", 2);
assertEquals(suffix.length, 5);
});
test(function testSuffixCharAt() {
const suffix = new Suffix("example", 3);
assertEquals(suffix.charAt(0), "m");
});
test(function testSuffixToString() {
const suffix = new Suffix("example", 3);
assertEquals(suffix.toString(), "mple");
});
test(function testSuffixCompareToEq() {
const suffix1 = new Suffix("example", 2);
const suffix2 = new Suffix("example", 2);
assertEquals(suffix1.compareTo(suffix2), 0);
});
test(function testSuffixCompareToLt() {
const suffix1 = new Suffix("example", 2);
const suffix2 = new Suffix("example", 3);
assertLess(suffix1.compareTo(suffix2), 0);
});
test(function testSuffixCompareToGt() {
const suffix1 = new Suffix("example", 3);
const suffix2 = new Suffix("example", 2);
assertGreater(suffix1.compareTo(suffix2), 0);
});
test(function testSuffixLengths() {
const suffix1 = new Suffix("hello", 2); // "llo"
const suffix2 = new Suffix("hello world", 2); // "llo world"
assertLess(suffix1.compareTo(suffix2), 0);
});
/* ========================================================================
SUFFIX ARRAY CLASS TESTS
======================================================================== */
test(function testConstructor() {
const text = "banana";
const suffixArray = new SuffixArray(text);
assertEquals(suffixArray.length, 6);
assertEquals(suffixArray.index(0), 5);
assertEquals(suffixArray.index(1), 3);
assertEquals(suffixArray.index(2), 1);
// And so on for sorted order of suffixes
});
test(function testIndex() {
const text = "banana";
const suffixArray = new SuffixArray(text);
assertThrows(() => suffixArray.index(-1), RangeError);
assertThrows(() => suffixArray.index(6), RangeError);
assertEquals(suffixArray.index(2), 1);
});
test(function testLcp() {
const text = "banana";
const suffixArray = new SuffixArray(text);
assertEquals(suffixArray.lcp(2), 3);
assertEquals(suffixArray.lcp(1), 1);
assertThrows(() => suffixArray.lcp(0), RangeError);
assertThrows(() => suffixArray.lcp(6), RangeError);
});
test(function testSelect() {
const text = "banana";
const suffixArray = new SuffixArray(text);
assertThrows(() => suffixArray.select(-1), RangeError);
assertThrows(() => suffixArray.select(6), RangeError);
assertEquals(suffixArray.select(0), "a");
assertEquals(suffixArray.select(1), "ana");
});
test(function testRank() {
const text = "banana";
const suffixArray = new SuffixArray(text);
assertEquals(suffixArray.rank("banana"), 3);
assertEquals(suffixArray.rank("apple"), 3);
assertEquals(suffixArray.rank("zoo"), 6);
});
/* ========================================================================
DEBUG RANK
======================================================================== */
test(function testDebugRank() {
const actual = debugRank("ABRACADABRA!");
const expected: DebugRank = [
{i: 0, index: 11, lcp: -1, rank: 0, select: "!"},
{i: 1, index: 10, lcp: 0, rank: 1, select: "A!"},
{i: 2, index: 7, lcp: 1, rank: 2, select: "ABRA!"},
{i: 3, index: 0, lcp: 4, rank: 3, select: "ABRACADABRA!"},
{i: 4, index: 3, lcp: 1, rank: 4, select: "ACADABRA!"},
{i: 5, index: 5, lcp: 1, rank: 5, select: "ADABRA!"},
{i: 6, index: 8, lcp: 0, rank: 6, select: "BRA!"},
{i: 7, index: 1, lcp: 3, rank: 7, select: "BRACADABRA!"},
{i: 8, index: 4, lcp: 0, rank: 8, select: "CADABRA!"},
{i: 9, index: 6, lcp: 0, rank: 9, select: "DABRA!"},
{i: 10, index: 9, lcp: 0, rank: 10, select: "RA!"},
{i: 11, index: 2, lcp: 2, rank: 11, select: "RACADABRA!"},
];
assertObjectMatch({rank: actual}, {rank: expected});
});