CodeForces - 427D Match & Catch（后缀自动机）

Police headquarter is monitoring signal on different frequency levels. They have got two suspiciously encoded strings s1 and s2 from two different frequencies as signals. They are suspecting that these two strings are from two different criminals and they are planning to do some evil task.

Now they are trying to find a common substring of minimum length between these two strings. The substring must occur only once in the first string, and also it must occur only once in the second string.

Given two strings s1 and s2 consist of lowercase Latin letters, find the smallest (by length) common substring p of both s1 and s2, where p is a unique substring in s1 and also in s2. See notes for formal definition of substring and uniqueness.

Input
The first line of input contains s1 and the second line contains s2 (1 ≤ |s1|, |s2| ≤ 5000). Both strings consist of lowercase Latin letters.

Output
Print the length of the smallest common unique substring of s1 and s2. If there are no common unique substrings of s1 and s2 print -1.

题目大意：
给出两个小写字母组成的字符串，求在这两个字符串中都只出现过一次的公共子串的最小长度。

今天好困，摸鱼，不想打题，拿之前做的题写个博客划水。

后缀自动机裸题，学过sam的都知道，后缀自动机可以维护endpos（或者用陈立杰的叫法，right集合），代表当前状态表达子串每次出现的最后一个位置。用endpos可以轻易完成子串出现次数的统计。

对于本题，给出两份代码，两种做法。

首先第一种，我一上来就想到的做法，两个串分别建两个自动机，然后分别求出endpos，再用一个串在另一个串的自动机上跑模式匹配，并且让该串自己的自动机也随着进行状态转移，遇到一个两个自动机的endpos计数都为1的节点时，更新答案并且终止匹配（因为是求最小长度，再往下匹配下去只会求出更大的答案）。

ac代码：

#include<bits/stdc++.h>
using namespace std;

const int maxn = 5005;
char s[maxn];

/*
 * Suffix automaton over the lowercase alphabet, built from the global
 * buffer `s`.
 *
 * State numbering: 1 is the initial state, 0 means "no state".
 *   next[v][c] - transition of v on letter c (0 when absent)
 *   link[v]    - suffix link of v (link[1] == 0)
 *   step[v]    - length of the longest string recognised by v
 *   endpos[v]  - after build(): how many times the strings of v occur
 *   a, b       - scratch arrays of the counting sort by step
 */
struct Sam {
	int next[maxn << 1][26];
	int link[maxn << 1];
	int step[maxn << 1];
	int a[maxn << 1];
	int b[maxn << 1];
	int endpos[maxn << 1];
	int sz;
	int last;

	/* Appends letter c (0..25) to the string represented so far. */
	void add(int c) {
		const int cur = ++sz;
		int from = last;
		last = cur;

		// A freshly created prefix state owns exactly one end position.
		endpos[cur] = 1;
		step[cur] = step[from] + 1;

		// Hand the new transition to every suffix state that lacks it.
		for (; from != 0 && next[from][c] == 0; from = link[from]) {
			next[from][c] = cur;
		}

		if (from == 0) {
			// Ran off the top of the link chain: hang below the root.
			link[cur] = 1;
			return;
		}

		const int target = next[from][c];
		if (step[target] == step[from] + 1) {
			link[cur] = target;
			return;
		}

		// target mixes strings of different "freshness": split off a clone
		// that keeps only those of length <= step[from] + 1.
		const int clone = ++sz;
		memcpy(next[clone], next[target], sizeof(next[target]));
		step[clone] = step[from] + 1;
		link[clone] = link[target];
		link[target] = clone;
		link[cur] = clone;
		for (; from != 0 && next[from][c] == target; from = link[from]) {
			next[from][c] = clone;
		}
	}

	/*
	 * Builds the automaton for the first n characters of `s` and
	 * accumulates occurrence counts into endpos[].
	 */
	void build(int n) {
		sz = 1;
		last = 1;
		for (int pos = 0; pos < n; ++pos) {
			add(s[pos] - 'a');
		}

		// Counting sort of the states by step; b[1..sz] ends up ordered by
		// non-decreasing step.
		for (int v = 1; v <= sz; ++v) {
			a[step[v]] += 1;
		}
		for (int d = 1; d <= n; ++d) {
			a[d] += a[d - 1];
		}
		for (int v = 1; v <= sz; ++v) {
			b[a[step[v]]] = v;
			a[step[v]] -= 1;
		}

		// Longest states first, so every state is complete before it is
		// pushed into its suffix-link parent.
		for (int rank = sz; rank >= 1; --rank) {
			const int v = b[rank];
			endpos[link[v]] += endpos[v];
		}
	}

} sam[2];

/*
 * Reads s1 and s2, builds one suffix automaton per string, and prints the
 * length of the shortest substring that occurs exactly once in s1 and
 * exactly once in s2 (or -1 when there is none).
 *
 * For every start position i of s2 the substring s2[i..j] is extended one
 * character at a time while both automata are advanced in lock-step, so
 * `p` (in s1's automaton) and `p2` (in s2's automaton) always stand for the
 * very same string s2[i..j].
 */
int main() {
	const int INF = 6000; // larger than any possible answer (|s| <= 5000)
	int ans = INF;

	scanf("%s", s);
	sam[0].build(strlen(s));
	scanf("%s", s); // from here on `s` holds the second string
	int len = strlen(s);
	sam[1].build(len);

	for(int i = 0; i < len; i++) {
		int p = 1, p2 = 1;
		for(int j = i; j < len; j++) {
			int cnt = j - i + 1;
			if(cnt >= ans) {
				// Anything found from here on cannot improve the answer.
				break;
			}

			int c = s[j] - 'a';
			p = sam[0].next[p][c];
			if(p == 0) {
				// s2[i..j] is not a substring of s1, and neither is any
				// longer extension. Falling back along suffix links here
				// (as the previous version did) would move p and p2 to
				// strings of different lengths and could accept a string
				// that is not unique in s2, e.g. "cabab" / "babcc".
				break;
			}
			// s2[i..j] is a substring of s2, so this transition exists.
			p2 = sam[1].next[p2][c];

			if(sam[0].endpos[p] == 1 && sam[1].endpos[p2] == 1) {
				// Occurrence counts only shrink as the string grows, so the
				// first hit is the shortest one for this start position.
				ans = cnt;
				break;
			}
		}
	}
	printf("%d\n", ans == INF ? -1 : ans);
	return 0;
}

然后是第二种，感谢权哥提供思路。
将两个串一起建自动机，中间以一个没出现过的字符分隔。然后给节点打上标记，由第一个串建立的新节点标1，第二个串建立的节点标2（分隔字符无所谓，标998都行），在从后往前拓扑求endpos时，顺便前推标记。
遇到“由第一个串建立且被第二个串更新过”的节点时，检查其endpos是否为2，如是则更新答案。这种写法没有进行模式匹配，更新答案要用到另外一条后缀自动机的性质：当前状态包含的最短子串长度minlen(s)，是其后缀链接指向状态的maxlen(s) + 1。
至于维护标记的方法，巧妙的使用了或运算操作，被后一个串更新过的节点在或运算之后标记肯定会变成3，而这个3可以继续用或运算前推（%%权哥）。
这种写法比我的写法快了100多ms，当然，两种自动机写法都比dp写法快了很多倍。
顺带一提，分隔字符用‘{’，是因为它刚好是字符‘a’ + 26后得到的字符；如果处理的是数字0-9的字符，则可以用‘:’，因为它是字符‘0’ + 10后得到的字符。

ac代码2：

#include<bits/stdc++.h>
using namespace std;

const int maxn = 5005;

/*
 * One suffix automaton over "s1{s2" (alphabet: 26 letters plus the
 * separator '{', which maps to index 26).
 *
 * State numbering: 1 is the initial state, 0 means "no state".
 *   next/link/step - transitions, suffix links, longest length per state
 *   endpos[v]      - number of end positions, completed bottom-up in solve()
 *   mark[v]        - bit 1: has an end position created while reading s1,
 *                    bit 2: has one created while reading '{' or s2
 *   a, b           - scratch arrays of the counting sort by step
 */
struct Sam {
	int next[maxn << 2][27];
	int link[maxn << 2];
	int step[maxn << 2];
	int a[maxn << 2];
	int b[maxn << 2];
	int endpos[maxn << 2];
	int mark[maxn << 2];
	int sz;
	int last;
	int len;
	char s[maxn << 1];

	/* Appends symbol c (0..26); the new prefix state is tagged with m. */
	void add(int c,int m) {
		const int cur = ++sz;
		int from = last;
		last = cur;

		endpos[cur] = 1;
		mark[cur] = m;
		step[cur] = step[from] + 1;

		// Hand the new transition to every suffix state that lacks it.
		for (; from != 0 && next[from][c] == 0; from = link[from]) {
			next[from][c] = cur;
		}

		if (from == 0) {
			link[cur] = 1;
			return;
		}

		const int target = next[from][c];
		if (step[target] == step[from] + 1) {
			link[cur] = target;
			return;
		}

		// Split target: the clone keeps the strings of length
		// <= step[from] + 1. It starts with endpos 0 and mark 0.
		const int clone = ++sz;
		memcpy(next[clone], next[target], sizeof(next[target]));
		step[clone] = step[from] + 1;
		link[clone] = link[target];
		link[target] = clone;
		link[cur] = clone;
		for (; from != 0 && next[from][c] == target; from = link[from]) {
			next[from][c] = clone;
		}
	}

	/*
	 * Reads both strings from stdin, feeds "s1{s2" into the automaton and
	 * fills b[1..sz] with the states ordered by non-decreasing step.
	 */
	void build(){
		sz = 1;
		last = 1;

		// First string, tagged 1.
		scanf("%s",s);
		len = strlen(s);
		for (const char *ch = s; ch != s + len; ++ch) {
			add(*ch - 'a', 1);
		}

		// Separator followed by the second string, both tagged 2.
		char *sep = s + len;
		*sep = '{';
		scanf("%s", sep + 1);
		len += strlen(sep);
		for (int pos = step[last]; pos < len; ++pos) {
			add(s[pos] - 'a', 2);
		}

		// Counting sort by step.
		for (int v = 1; v <= sz; ++v) {
			a[step[v]] += 1;
		}
		for (int d = 1; d <= len; ++d) {
			a[d] += a[d - 1];
		}
		for (int v = 1; v <= sz; ++v) {
			b[a[step[v]]] = v;
			a[step[v]] -= 1;
		}
	}


	/*
	 * Builds the automaton, then walks the states from longest to shortest.
	 * A state seen from both sides (mark 3) with exactly two end positions
	 * is a common substring that is unique in each string; its shortest
	 * member has length step[link] + 1. Prints the minimum, or -1.
	 */
	void solve() {
		build();
		const int none = 6000;
		int best = none;
		for (int rank = sz; rank >= 2; --rank) {
			const int v = b[rank];
			const int parent = link[v];
			if (endpos[v] == 2 && mark[v] == 3) {
				const int shortest = step[parent] + 1;
				if (shortest < best) {
					best = shortest;
				}
			}
			// Push tags and counts up the suffix-link tree.
			mark[parent] |= mark[v];
			endpos[parent] += endpos[v];
		}
		if (best == none) {
			printf("%d\n", -1);
		} else {
			printf("%d\n", best);
		}
	}

} sam;


/* Entry point: all input, computation and output happen in Sam::solve(). */
int main() {
	sam.solve();
}

9月25日更新：
学会了广义后缀自动机。
回头来看当时刚学，第一种双自动机的解法不仅思路很挫打的也很挫…那时候连跳link来考虑后缀都不会，强行开了两重循环orz…
不想改了，就让它成为历史留着以后笑自己菜吧。
下面是广义后缀自动机的解法，分开记录endpos，连标记都不用打了，而且很快。
ac代码：

#include<bits/stdc++.h>
using namespace std;

const int maxn = 5005;

/*
 * Generalized suffix automaton holding both input strings at once.
 *
 * State numbering: 1 is the initial state, 0 means "no state".
 *   next/link/step - transitions, suffix links, longest length per state
 *   endpos[k][v]   - end positions of state v that come from string k
 *                    (k = 0 for the first string, 1 for the second),
 *                    completed bottom-up in solve()
 *   a, b           - scratch arrays of the counting sort by step
 */
struct Sam {
	int next[maxn << 2][26];
	int link[maxn << 2];
	int step[maxn << 2];
	int a[maxn << 2];
	int b[maxn << 2];
	int endpos[2][maxn << 2];
	char s[maxn];
	int last;
	int sz;
	int len;

	/*
	 * Appends letter c (0..25) after the state `last`. When the wanted
	 * state already exists with exactly the right length it is reused
	 * instead of creating a new one.
	 */
	void add(int c) {
		const int reuse = next[last][c];
		if (reuse != 0 && step[reuse] == step[last] + 1) {
			last = reuse;
			return;
		}

		const int cur = ++sz;
		int from = last;
		last = cur;

		step[cur] = step[from] + 1;
		for (; from != 0 && next[from][c] == 0; from = link[from]) {
			next[from][c] = cur;
		}

		if (from == 0) {
			link[cur] = 1;
			return;
		}

		const int target = next[from][c];
		if (step[target] == step[from] + 1) {
			link[cur] = target;
			return;
		}

		// Split target: the clone keeps the strings of length
		// <= step[from] + 1.
		const int clone = ++sz;
		memcpy(next[clone], next[target], sizeof(next[target]));
		step[clone] = step[from] + 1;
		link[clone] = link[target];
		link[target] = clone;
		link[cur] = clone;
		for (; from != 0 && next[from][c] == target; from = link[from]) {
			next[from][c] = clone;
		}
	}

	/*
	 * Reads the two strings from stdin and inserts each of them starting
	 * from the initial state, recording per-string end positions. Then
	 * fills b[1..sz] with the states ordered by non-decreasing step.
	 */
	void build() {
		sz = 1;
		for (int which = 0; which < 2; ++which) {
			scanf("%s", s);
			last = 1; // every string starts again at the initial state
			len = strlen(s);
			for (int pos = 0; pos < len; ++pos) {
				add(s[pos] - 'a');
				endpos[which][last] = 1;
			}
		}

		// Counting sort by step.
		for (int v = 1; v <= sz; ++v) {
			a[step[v]] += 1;
		}
		for (int d = 1; d <= sz; ++d) {
			a[d] += a[d - 1];
		}
		for (int v = 1; v <= sz; ++v) {
			b[a[step[v]]] = v;
			a[step[v]] -= 1;
		}
	}


	/*
	 * Builds the automaton, then walks the states from longest to shortest.
	 * A state with exactly one end position from each string is a common
	 * substring unique in both; its shortest member has length
	 * step[link] + 1. Prints the minimum, or -1.
	 */
	void solve() {
		build();
		const int none = 6000;
		int best = none;
		for (int rank = sz; rank >= 2; --rank) {
			const int v = b[rank];
			const int parent = link[v];
			const bool onceInFirst = (endpos[0][v] == 1);
			const bool onceInSecond = (endpos[1][v] == 1);
			if (onceInFirst && onceInSecond) {
				const int shortest = step[parent] + 1;
				if (shortest < best) {
					best = shortest;
				}
			}
			// Push both counters up the suffix-link tree.
			for (int which = 0; which < 2; ++which) {
				endpos[which][parent] += endpos[which][v];
			}
		}
		if (best == none) {
			printf("%d\n", -1);
		} else {
			printf("%d\n", best);
		}
	}

} sam;


/* Entry point: all input, computation and output happen in Sam::solve(). */
int main() {
	sam.solve();
}

CodeForces - 427D Match & Catch（后缀自动机）

猜你喜欢