"Common string algorithms summary"

Common string algorithm Summary

String Hash

A string is typically hashed with a polynomial \(\mathrm{Hash}\): every character is given a weight and the string is mapped to a non-negative integer modulo \(p\).

\[f(s)=\sum_{i=1}^{|s|} s_i\times P^{|s|-i} \pmod{p}\]

It supports appending a character at the end in \(O(1)\) and extracting the \(\mathrm{Hash}\) value of any substring in \(O(1)\).

The collision probability of each query is roughly \(\frac{1}{p}\); if queries are frequent, a double-modulus \(\mathrm{Hash}\) can be used.

#include <bits/stdc++.h>
using namespace std;
// Array capacity, hash modulus and polynomial base.
const int N = 1e6 + 20;
const int Mod = 998244353;
const int P = 131;

// Returns a + b reduced by one subtraction of Mod when the sum reaches Mod.
inline int inc(int a,int b)
{
    int sum = a + b;
    if ( sum >= Mod ) sum -= Mod;
    return sum;
}

// Returns a * b modulo Mod; the product is formed in 64 bits.
inline int mul(int a,int b)
{
    const long long product = 1LL * a * b;
    return static_cast<int>( product % Mod );
}

// Returns a - b, adding Mod back once when the difference is negative.
inline int dec(int a,int b)
{
    int diff = a - b;
    if ( diff < 0 ) diff += Mod;
    return diff;
}

// In-place counterparts of inc / mul / dec: the result is stored into a.
inline void Inc(int &a,int b)
{
    a += b;
    if ( a >= Mod ) a -= Mod;
}
inline void Mul(int &a,int b)
{
    a = static_cast<int>( 1LL * a * b % Mod );
}
inline void Dec(int &a,int b)
{
    a -= b;
    if ( a < 0 ) a += Mod;
}

// Pow[i] holds P^i and val[i] the hash of the first i characters (both filled by main).
int Pow[N];
int val[N];
int n;
int m;
char s[N];

// Hash of positions l..r: the prefix hash val[r] minus val[l-1] shifted by P^(r-l+1).
inline int GetHash(int l,int r)
{
    const int length = r - l + 1;
    const int shiftedPrefix = mul( val[l-1] , Pow[length] );
    return dec( val[r] , shiftedPrefix );
}
// Reads a lowercase string and m queries "l1 r1 l2 r2" (1-indexed, inclusive),
// and prints "Yes" when the two substrings are considered equal, "No" otherwise.
int main(void)
{
    if ( scanf( "%s" , s+1 ) != 1 ) return 0;
    n = strlen( s + 1 );
    Pow[0] = 1;
    for (int i = 1; i <= n; i++)
    {
        Pow[i] = mul( Pow[i-1] , P );
        // Characters are mapped to 1..26 rather than 0..25: with 'a' -> 0 every
        // run of 'a's ("a", "aa", ...) would hash to 0 regardless of its length.
        val[i] = inc( s[i] - 'a' + 1 , mul( val[i-1] , P ) );
    }
    if ( scanf( "%d" , &m ) != 1 ) return 0;
    for (int i = 1; i <= m; i++)
    {
        int l1,l2,r1,r2;
        if ( scanf( "%d%d%d%d" , &l1 , &r1 , &l2 , &r2 ) != 4 ) break;
        // Substrings of different lengths can never be equal; skip the hash comparison.
        const bool same = ( r1 - l1 == r2 - l2 ) && GetHash(l1,r1) == GetHash(l2,r2);
        puts( same ? "Yes" : "No" );
    }
    return 0;
}

Trie tree

A deterministic finite automaton that recognizes exactly the strings of a given set \(S\).

It supports inserting a string \(s\) in \(O(|s|)\) and looking up a string \(s\) in \(O(|s|)\).

#include <bits/stdc++.h>
using namespace std;
const int N = 1e6 + 20;

// Trie over the letters 'a'..'z'. Node 1 is the root; a child index of 0 means "no edge".
// Both methods read the word from s[1] up to the terminating '\0' (s[0] is not used).
struct Trie
{
    int e[N][26],end[N],tot;
    Trie(void) : tot(1) {}

    // Adds the word to the trie, creating missing nodes, and marks its last node as a word end.
    inline void Insert(char *s)
    {
        int node = 1;
        for (char *ch = s + 1; *ch != '\0'; ++ch)
        {
            int &child = e[node][ *ch - 'a' ];
            if ( child == 0 ) child = ++tot;
            node = child;
        }
        end[node] = true;
    }

    // Returns true when the walk for the word exists and finishes on a node marked by Insert.
    inline bool Query(char *s)
    {
        int node = 1;
        for (char *ch = s + 1; *ch != '\0'; ++ch)
        {
            const int child = e[node][ *ch - 'a' ];
            if ( child == 0 ) return false;
            node = child;
        }
        return end[node];
    }
};

Knuth-Morris-Pratt algorithm

A \(\mathrm{Border}\) of a string is a proper prefix of the string that is also a suffix of it.

The prefix function of a string is defined as \[\pi(p)=\max_{0\le t<p,\ s(1,t)=s(p-t+1,p)}\{t\}\]

In other words, \(\pi(p)\) is the length of the longest \(\mathrm{Border}\) of the prefix \(s_p\) of the string \(s\). Traverse the string; at each position, start from the longest \(\mathrm{Border}\) of the previous position and try to extend it by one character. If the match fails, jump to the next shorter \(\mathrm{Border}\), and repeat until the match succeeds. This yields the prefix function at every position.

Define the potential function \(\Phi(p)\) as the length of the longest \(\mathrm{Border}\) of the prefix \(s_p\). By the \(\mathrm{Knuth-Morris-Pratt}\) algorithm we have \(\Phi(p)\leq\Phi(p-1)+1\), and every brute-force jump along a \(\mathrm{Border}\) decreases the potential, so the overall time complexity is \(O(n)\).

Once the prefix function of the pattern is known, single-pattern string matching follows directly: on a mismatch, matching resumes from the longest \(\mathrm{Border}\). By a similar analysis the time complexity is \(O(n+m)\).

#include <bits/stdc++.h>
using namespace std;
const int N = 1000020;
int n,m,fail[N]; char s[N],t[N];

// Reads a text s and a pattern t, prints every 1-indexed start position at which
// t occurs in s (one per line), then prints fail[1..m] separated by spaces.
int main(void)
{
    scanf( "%s\n%s" , s+1 , t+1 );
    n = strlen( s + 1 );
    m = strlen( t + 1 );

    // fail[i]: length of the border carried over after processing t[1..i].
    int border = 0;
    for (int i = 2; i <= m; i++)
    {
        while ( border > 0 && t[border+1] != t[i] ) border = fail[border];
        if ( t[border+1] == t[i] ) ++border;
        fail[i] = border;
    }

    // Scan the text; after a full match (matched == m) fall back before extending.
    int matched = 0;
    for (int i = 1; i <= n; i++)
    {
        while ( matched > 0 && ( matched == m || t[matched+1] != s[i] ) )
            matched = fail[matched];
        if ( t[matched+1] == s[i] ) ++matched;
        if ( matched == m ) printf( "%d\n" , i - m + 1 );
    }

    for (int i = 1; i <= m; i++)
        printf( "%d%c" , fail[i] , i == m ? '\n' : ' ' );
    return 0;
}

Knuth-Morris-Pratt automaton

For a string \(s\), its \(\mathrm{KMP}\) automaton is defined by the following properties:

\(1.\) It has \(n+1\) states.
\(2.\) It recognizes all prefixes of \(s\).
\(3.\) The transition function \(\delta(p,c)\) leads from state \(p\) to the state of the longest prefix of \(s\) that is a suffix of the prefix represented by \(p\) followed by the character \(c\), i.e. the position given by the longest \(\mathrm{Border}\).

It is constructed in the same way as in the \(\mathrm{Knuth-Morris-Pratt}\) algorithm; the time complexity is \(O(n\Sigma)\).

#include <bits/stdc++.h>
using namespace std;
const int N = 1e6 + 20;

// KMP automaton of a lowercase pattern. State i (0..n) means "the last i characters
// read equal the first i characters of the pattern"; trans[i][c] is the state reached
// from i on letter 'a' + c.
struct KMPAutomaton
{
    int trans[N][26],n;

    // Builds the automaton for the pattern stored in s[1..] ('\0'-terminated, s[0] unused).
    inline void Build(char *s)
    {
        n = strlen( s + 1 );
        // Row 0 is only partially written below, so reset it to keep rebuilds clean.
        memset( trans[0] , 0 , sizeof trans[0] );
        if ( n == 0 ) return;
        trans[0][ s[1] - 'a' ] = 1;
        // j is the fallback state of i (longest border of the first i characters).
        for (int i = 1 , j = 0; i <= n; i++)
        {
            // On a mismatch, state i behaves exactly like its fallback state.
            memcpy( trans[i] , trans[j] , sizeof trans[i] );
            if ( i < n )
            {
                // The forward edge out of state i is labelled with the NEXT pattern
                // character s[i+1]; state n has no forward edge.
                const int c = s[i+1] - 'a';
                trans[i][c] = i + 1;
                j = trans[j][c];
            }
        }
    }
};

Aho-Corasick automaton

A deterministic finite automaton that recognizes all strings having some string of the given set \(S\) as a suffix.

First, we initialize the \(\mathrm{Aho-Corasick}\) automaton as the \(\mathrm{Trie}\) of the given string set \(S\); then we construct the transition function \(\delta\) in \(\mathrm{bfs}\) order.

Every state has a \(\mathrm{fail}\) pointer: \(\mathrm{fail}(x)=y\) if and only if the string represented by state \(y\) is a proper suffix of the string represented by state \(x\), and among all such states \(y\) represents the longest string.

We simply \(\mathrm{bfs}\) over the original \(\mathrm{Trie}\). When node \(x\) has an outgoing edge on character \(c\) in the \(\mathrm{Trie}\), we set \(\delta(x,c)=\mathrm{Trie}(x,c)\) and set the \(\mathrm{fail}\) pointer of that child to \(\delta(\mathrm{fail}(x),c)\); otherwise we set \(\delta(x,c)=\delta(\mathrm{fail}(x),c)\). Correctness is easy to see.

Both constructing the \(\mathrm{Aho-Corasick}\) automaton and matching a text against multiple patterns take linear time. (Note that if contributions are counted by brute-force jumping along \(\mathrm{fail}\) pointers, this time complexity is no longer guaranteed.)

The \(\mathrm{Knuth-Morris-Pratt}\) automaton is the \(\mathrm{Aho-Corasick}\) automaton of a single string.

#include <bits/stdc++.h>
using namespace std;
const int N = 2e5 + 20;

// Aho-Corasick automaton over 'a'..'z'. Node 0 is the root. trans starts out as the
// trie built by insert(); build() then fills the missing edges and the fail pointers.
struct AhoCorasickautomaton
{
    int trans[N][26],fail[N],end[N],q[N],tot,head,tail;

    // Inserts the pattern stored in s[1..] ('\0'-terminated) and records in end[id]
    // the node at which pattern number id finishes.
    inline void insert(char *s,int id)
    {
        int node = 0;
        for (char *ch = s + 1; *ch != '\0'; ++ch)
        {
            int &child = trans[node][ *ch - 'a' ];
            if ( child == 0 ) child = ++tot;
            node = child;
        }
        end[id] = node;
    }

    // Breadth-first pass over the trie using q[head..tail] as the queue.
    inline void build(void)
    {
        head = 1;
        tail = 0;
        // Seed the queue with the existing children of the root.
        for (int c = 0; c < 26; c++)
        {
            if ( trans[0][c] == 0 ) continue;
            q[++tail] = trans[0][c];
        }
        while ( head <= tail )
        {
            const int x = q[head++];
            const int fallback = fail[x];
            for (int c = 0; c < 26; c++)
            {
                int &to = trans[x][c];
                if ( to == 0 )
                {
                    // No trie edge: borrow the transition of the fail state.
                    to = trans[fallback][c];
                    continue;
                }
                // Trie edge: the child's fail pointer is where the fail state goes on c.
                fail[to] = trans[fallback][c];
                q[++tail] = to;
            }
        }
    }
};

Sequence automaton

A deterministic finite automaton that recognizes exactly the subsequences of a string.

By definition, one can construct an automaton with \(|s|+1\) states by scanning the string in reverse and adding the edges; every state can serve as a final state. The time complexity is \(O(n\Sigma)\).

#include <bits/stdc++.h>
using namespace std;
const int N = 1e6;

// Subsequence automaton of a lowercase string. State i (0..n) means "the first i
// characters have been consumed"; trans[i][c] is the smallest position p > i with
// s[p] == 'a' + c, or 0 when that letter does not occur after position i.
struct SequenceAutomaton
{
    int trans[N][26],next[26];

    // Builds the automaton for the string stored in s[1..] ('\0'-terminated, s[0] unused).
    inline void Build(char *s)
    {
        int n = strlen( s + 1 );
        memset( next , 0 , sizeof next );
        // The loop below never writes the row of state n; clear it so that a rebuild
        // with a different string does not keep stale edges there.
        memset( trans[n] , 0 , sizeof trans[n] );
        // Scan right to left: next[c] is the nearest occurrence of c at or after i.
        for (int i = n; i >= 1; i--)
        {
            next[ s[i] - 'a' ] = i;
            for (int j = 0; j < 26; j++)
                trans[i-1][j] = next[j];
        }
    }
};

Minimum representation

Find the lexicographically smallest among all cyclic rotations of a string \(s\).

Scan with two pointers \(i,j\), comparing the cyclic rotations that start at positions \(i\) and \(j\) character by character until a length \(k\) is found such that \(s_{i+k}>s_{j+k}\). Then we can directly set \(i=i+k+1\): for every \(p\in[0,k]\), the rotation starting at \(i+p\) is worse than the rotation starting at \(j+p\), so none of them needs to be compared any further.

The time complexity is easily seen to be \(O(n)\).

#include <bits/stdc++.h>
using namespace std;
const int N = 3e5 + 20;
int n,s[N<<1];

// Reads n integers forming a cyclic sequence and prints its lexicographically
// smallest rotation, each value followed by a space, then a newline.
int main(void)
{
    scanf( "%d" , &n );
    // Store the sequence twice so that a rotation starting at p is s[p..p+n-1].
    for (int i = 1; i <= n; i++)
        scanf( "%d" , &s[i] ) , s[i+n] = s[i];
    // i and j are two distinct candidate start positions.
    int i = 1 , j = 2;
    while ( i <= n && j <= n )
    {
        int k = 0;
        while ( k < n && s[i+k] == s[j+k] ) k++;
        if ( k == n ) break; // both rotations are identical
        // Exactly one candidate loses; everything from it up to the mismatch is skipped.
        // (Advancing both, or re-reading s[i+k] after i moved, would be wrong.)
        if ( s[i+k] > s[j+k] ) i += k + 1;
        else j += k + 1;
        // Keep the candidates distinct. This has to be a separate statement: folded
        // into the compound assignment, the comparison is evaluated before the update.
        if ( i == j ) j++;
    }
    i = min( i , j ) , j = i + n - 1;
    for (int p = i; p <= j; p++) printf( "%d " , s[p] );
    return puts("") , 0;
}

Suffix automata

A deterministic finite automaton that recognizes exactly the suffixes of a string.

It is built incrementally; see "Introduction to the Suffix Automaton (SuffixAutomaton)".

Storing the transitions in static arrays gives time and space complexity \(O(n\Sigma)\); with linked lists it can be optimized to \(O(n)\). Storing the transitions in balanced trees gives time complexity \(O(n\log\Sigma)\) and space complexity \(O(n)\).

struct SuffixAutomaton
{
    int trans[N][26],link[N],maxlen[N],tot,last;
    // trans is the transition function, link the suffix link, and maxlen the length
    // of the longest string inside a state.
    // tot is the total number of nodes, last the index of the terminal state.
    SuffixAutomaton () : tot(1) , last(1) {} // initialisation: node 1 is the start state

    // Appends character index c to the string represented by the automaton.
    inline void Extend(int c)
    {
        // Create node cur for the whole extended string.
        const int cur = ++tot;
        maxlen[cur] = maxlen[last] + 1;

        // Walk the suffix-link path; every state without a c-transition gets one to cur.
        int p = last;
        last = cur;
        while ( p != 0 && trans[p][c] == 0 )
        {
            trans[p][c] = cur;
            p = link[p];
        }

        // Case 1: the walk fell off the root.
        if ( p == 0 )
        {
            link[cur] = 1;
            return;
        }

        // Case 2: the existing target q is exactly one character longer than p.
        const int q = trans[p][c];
        if ( maxlen[q] == maxlen[p] + 1 )
        {
            link[cur] = q;
            return;
        }

        // Case 3: split q by cloning it with the shorter length.
        const int cl = ++tot;
        maxlen[cl] = maxlen[p] + 1;
        memcpy( trans[cl] , trans[q] , sizeof trans[q] );
        for ( ; p != 0 && trans[p][c] == q; p = link[p] )
            trans[p][c] = cl;
        link[cl] = link[q];
        link[q] = link[cur] = cl;
    }
};

Generalized suffix automata

A deterministic finite automaton that recognizes exactly the suffixes of all strings in a given set \(S\).

The construction is similar to that of the ordinary (narrow-sense) suffix automaton; when the required transition already exists and conflicts, it suffices to split the node.

Its time and space complexity are the same as those of the suffix automaton.

It is worth mentioning that if segment-tree merging is used on a generalized suffix automaton to maintain the \(\mathrm{endpos}\) sets, the merging must be done by a \(\mathrm{dfs}\) traversal of the \(\mathrm{Parent}\) tree; it cannot be done in the topological order obtained by radix sort.

// Suffix automaton whose Extend takes the state to extend from explicitly and
// returns the resulting state. Node 1 is the start state.
struct SuffixAutomaton
{
    int trans[N][26],link[N],maxlen[N],tot;
    SuffixAutomaton () : tot(1) {}

    // Appends character index c after state pre and returns the state reached.
    inline int Extend(int c,int pre)
    {
        const int existing = trans[pre][c];
        if ( existing != 0 )
        {
            // The transition is already there: reuse its target when the lengths
            // line up, otherwise split the target and return the clone.
            if ( maxlen[existing] == maxlen[pre] + 1 ) return existing;
            const int cl = ++tot;
            maxlen[cl] = maxlen[pre] + 1;
            memcpy( trans[cl] , trans[existing] , sizeof trans[existing] );
            for ( ; pre != 0 && trans[pre][c] == existing; pre = link[pre] )
                trans[pre][c] = cl;
            link[cl] = link[existing];
            link[existing] = cl;
            return cl;
        }

        // No transition yet: create a fresh node and hook it into the link path.
        const int cur = ++tot;
        maxlen[cur] = maxlen[pre] + 1;
        int p = pre;
        while ( p != 0 && trans[p][c] == 0 )
        {
            trans[p][c] = cur;
            p = link[p];
        }
        if ( p == 0 )
        {
            link[cur] = 1;
            return cur;
        }
        const int q = trans[p][c];
        if ( maxlen[q] == maxlen[p] + 1 )
        {
            link[cur] = q;
            return cur;
        }
        const int cl = ++tot;
        maxlen[cl] = maxlen[p] + 1;
        memcpy( trans[cl] , trans[q] , sizeof trans[q] );
        for ( ; p != 0 && trans[p][c] == q; p = link[p] )
            trans[p][c] = cl;
        link[cl] = link[q];
        link[q] = link[cur] = cl;
        return cur;
    }
};

Suffix tree

Insert all suffixes of a string \(s\) into a \(\mathrm{Trie}\); the virtual tree of this \(\mathrm{Trie}\) over all of its leaf nodes is called the suffix tree of the string.

From the definition and properties of the \(\mathrm{endpos}\) equivalence classes, it is easy to see that the \(\mathrm{Parent}\) tree of the suffix automaton built from the reversed string is the suffix tree of the original string, so the suffix automaton construction can be used to obtain the suffix tree.

The time and space complexity are the same as those of the suffix automaton; along the way the suffix array can be obtained in \(O(n)\) time.

#include <bits/stdc++.h>
using namespace std;
const int N = 2e5+20;

// Suffix automaton that additionally records, per state, a start position (id) and
// whether the state was created for a real suffix (flag). Build() feeds the string
// from its last character to its first, stores the link tree in trie[][] keyed by a
// character, Dfs() numbers the flagged states in child-letter order into sa/rk, and
// Calcheight() fills hei from sa/rk.
struct SuffixAutomaton
{
    int trans[N][26],link[N],maxlen[N],tot,last;
    int id[N],flag[N],trie[N][26],sa[N],rk[N],hei[N],cnt;
    // id: which suffix this state stands for; flag: whether the state corresponds
    // to a suffix that really exists.
    SuffixAutomaton () : tot(1) , last(1) {}

    // Appends character index c; pos is stored as the id of the new state.
    inline void Extend(int c,int pos)
    {
        const int cur = ++tot;
        id[cur] = pos;
        flag[cur] = true;
        maxlen[cur] = maxlen[last] + 1;

        int p = last;
        last = cur;
        while ( p != 0 && trans[p][c] == 0 )
        {
            trans[p][c] = cur;
            p = link[p];
        }
        if ( p == 0 )
        {
            link[cur] = 1;
            return;
        }
        const int q = trans[p][c];
        if ( maxlen[q] == maxlen[p] + 1 )
        {
            link[cur] = q;
            return;
        }
        // Split q; the clone inherits q's id but is not flagged.
        const int cl = ++tot;
        maxlen[cl] = maxlen[p] + 1;
        memcpy( trans[cl] , trans[q] , sizeof trans[q] );
        for ( ; p != 0 && trans[p][c] == q; p = link[p] )
            trans[p][c] = cl;
        link[cl] = link[q];
        id[cl] = id[q];
        link[q] = link[cur] = cl;
    }

    // Records y as the child of x reached by letter c.
    inline void insert(int x,int y,char c)
    {
        trie[x][c-'a'] = y;
    }

    // Extends with s[n], s[n-1], ..., s[1], then links every state below its
    // link parent under the character s[ id + maxlen[parent] ].
    inline void Build(char *s,int n)
    {
        for (int i = n; i >= 1; i--)
            Extend( s[i]-'a' , i );
        for (int v = 2; v <= tot; v++)
        {
            const int parent = link[v];
            insert( parent , v , s[ id[v] + maxlen[parent] ] );
        }
    }

    // Pre-order walk in letter order; each flagged state receives the next rank.
    inline void Dfs(int x)
    {
        if ( flag[x] )
        {
            ++cnt;
            rk[ id[x] ] = cnt;
            sa[cnt] = id[x];
        }
        for (int c = 0; c < 26; c++)
        {
            const int y = trie[x][c];
            if ( y != 0 ) Dfs(y);
        }
    }

    // For every position i, extends the common prefix of s[i..] and s[j..], where j
    // is the entry preceding i in sa, reusing the previous length minus one.
    inline void Calcheight(char *s,int n)
    {
        int k = 0;
        for (int i = 1; i <= n; i++)
        {
            if ( k > 0 ) --k;
            const int j = sa[ rk[i] - 1 ];
            while ( s[ i+k ] == s[ j+k ] ) ++k;
            hei[ rk[i] ] = k;
        }
    }
};
SuffixAutomaton T; char s[N];

// Reads a string, builds T from it, then prints T.sa[1..n] on one line and
// T.hei[2..n] on the next, values separated by single spaces.
int main(void)
{
    scanf( "%s" , s+1 );
    const int n = strlen( s+1 );

    T.Build( s , n );
    T.Dfs(1);
    T.Calcheight( s , n );

    for (int i = 1; i <= n; i++)
        printf( "%d%c" , T.sa[i] , i == n ? '\n' : ' ' );
    for (int i = 2; i <= n; i++)
        printf( "%d%c" , T.hei[i] , i == n ? '\n' : ' ' );
    return 0;
}

Palindrome automata

A deterministic finite automaton that recognizes exactly the right halves of all palindromic substrings of a string \(s\).

Since palindromes come in odd and even lengths, the palindrome automaton has two initial states, representing the odd-length palindromes and the even-length palindromes respectively.

It can be proved by induction that a string \(s\) has at most \(|s|\) essentially different palindromic substrings, so each state of the palindrome automaton represents one palindrome. Each transition edge of the palindrome automaton adds the same character to both ends of the original string, so the string is still a palindrome after the transition; this also explains why the palindrome automaton recognizes only the right half of each palindrome.

The palindrome automaton is also built incrementally. For each state we additionally record the state of its longest palindromic proper suffix, called the \(\mathrm{link}\) function. When a character is appended to the end of the string, we start from the last state of the original string and jump along \(\mathrm{link}\) until the new character can be added on both sides to form a palindrome, which determines the new state.

For the new state, we keep jumping along \(\mathrm{link}\) to find its longest palindromic suffix.

The palindrome automaton can be viewed as two trees, which is why it is also called the palindromic tree. The \(\mathrm{link}\) pointers also form a tree, which may be called the palindromic suffix tree. Define the potential function \(\Phi(p)\) as the depth of state \(p\) in the palindromic suffix tree. From the construction it is easy to see that \(\Phi(p)\leq\Phi(\mathrm{link}(p))+1\), and every jump along \(\mathrm{link}\) decreases the potential. Since the palindrome automaton has \(O(n)\) states, the maximum depth of the palindromic suffix tree is \(n\), so the time complexity of the construction does not exceed \(O(n)\).

The space complexity is \(O(n\Sigma)\). Storing the edges in adjacency lists raises the time complexity to \(O(n\Sigma)\) and reduces the space complexity to \(O(n)\). If the edges are stored in a \(\mathrm{Hash}\) table, both the time and the space complexity are reduced to \(O(n)\).

Since the longest palindromic proper suffix of a palindrome is necessarily one of its \(\mathrm{Border}\)s, \(\mathrm{dp}\) on the palindromic tree can exploit the arithmetic-progression property of the \(\mathrm{Border\ Series}\). The palindrome automaton records two additional parameters, \(\mathrm{dif}\) and \(\mathrm{slink}\): \(\mathrm{dif}(x)=\mathrm{len}(x)-\mathrm{len}(\mathrm{link}(x))\), and \(\mathrm{slink}(x)\) is the deepest ancestor of \(x\) in the palindromic suffix tree satisfying \(\mathrm{dif}(\mathrm{slink}(x))\neq\mathrm{dif}(x)\). Both can be maintained during construction.

#include <bits/stdc++.h>
using namespace std;
const int N = 1e6 + 20;
const int Mod = 1e9 + 7;

// Palindrome automaton over character indices 0..25. Node 0 has length 0 and node 1
// has length -1; link[0] points to node 1. s[1..n] stores the characters appended so
// far and s[0] holds -1 as a sentinel that matches no character.
struct PalindromesAutomaton
{
    int n,tot,last,link[N],slink[N],trans[N][26],len[N],dif[N],s[N];

    PalindromesAutomaton(void)
    {
        last = 0;
        len[0] = 0;
        link[0] = 1;
        len[1] = -1;
        tot = 1;
        s[0] = -1;
    }

    // Appends character index c and moves last to the node found or created for it.
    inline void Extend(int c)
    {
        s[++n] = c;

        // Follow links from last until the character in front of that palindrome is c.
        int p = last;
        while ( s[ n - len[p] - 1 ] != c ) p = link[p];

        if ( trans[p][c] != 0 )
        {
            last = trans[p][c];
            return;
        }

        const int cur = ++tot;
        len[cur] = len[p] + 2;

        // Same search, starting one link further, to locate the link target of cur.
        int q = link[p];
        while ( s[ n - len[q] - 1 ] != c ) q = link[q];

        // trans[q][c] has to be read before the edge p -> cur is written.
        const int target = trans[q][c];
        link[cur] = target;
        trans[p][c] = cur;

        dif[cur] = len[cur] - len[target];
        slink[cur] = ( dif[cur] != dif[target] ) ? target : slink[target];
        last = cur;
    }
};