（模板）字符串Hash算法

题目链接：https://www.luogu.com.cn/problem/P3370

题意：给n个字符串，求其中不同的字符串有多少个。（n<=1e4，字符串长度<=1500）

思路：

　　字符串模板。

　　ACM中广泛使用的一种字符串Hash算法是“BKDR Hash”。其主要思路是：选择合适的进制base（base应大于字符串中元素的最大值，我取的是131），把字符串的每个字符看成一个大数上的一位数字，在取模的前提下计算这个base进制数的值，作为字符串的Hash值。Hash可能发生碰撞（不同字符串的Hash值相同），不同做法发生碰撞的可能性不同，好的Hash算法应保证碰撞的可能性尽可能小。下面列出4种Hash做法：自然溢出、单Hash、双Hash、模1e18级别的质数（推荐），详见代码。

AC code：　　

　　1. 自然溢出（100分）。

/*
 * 自然溢出，用ull，溢出时自动对2^64取模
 * 可以卡，方法见BZOJ 3097 Hash Killer I
*/

#include<cstdio>
#include<cstring>
#include<algorithm>
using namespace std;

typedef unsigned long long ull;

ull base=131;       // radix of the polynomial hash
ull a[10005];       // one hash value per input string
char s[10005];      // buffer holding the string currently being read
int n,ans;          // number of strings / number of distinct hashes

/*
 * Polynomial ("BKDR") hash of the NUL-terminated string s.
 *
 * Every character is treated as one digit of a number written in radix
 * `base`. No explicit modulus is applied: unsigned 64-bit arithmetic wraps
 * around, so the result is implicitly reduced modulo 2^64.
 *
 * Returns 0 for the empty string.
 */
ull gethash(char s[]){
    ull h=0;
    for(const char *p=s;*p!='\0';++p){
        h*=base;
        h+=(ull)*p;
    }
    return h;
}

/*
 * Reads a count n followed by n whitespace-delimited strings from stdin,
 * hashes each string with gethash and prints the number of distinct hash
 * values.
 *
 * Prints 0 when the count is missing, malformed or not positive. Returns 1
 * without printing a count when n does not fit in the hash array.
 */
int main(){
    // a[0] is unused, so one slot fewer than the array length is available.
    const int cap=static_cast<int>(sizeof(a)/sizeof(a[0]))-1;
    if(scanf("%d",&n)!=1||n<0)
        n=0;
    if(n>cap){
        fprintf(stderr,"n=%d exceeds the supported maximum of %d\n",n,cap);
        return 1;
    }
    int cnt=0;
    for(int i=1;i<=n;++i){
        // Width 10004 leaves room for the terminating NUL in s[10005];
        // a longer token is split instead of overrunning the buffer.
        if(scanf("%10004s",s)!=1)
            break;      // input ended early: keep what was read so far
        a[++cnt]=gethash(s);
    }
    n=cnt;
    std::sort(a+1,a+n+1);
    // Equal hashes are adjacent after sorting; unique() compacts them and
    // returns the end of the distinct prefix. An empty range yields 0.
    ans=static_cast<int>(std::unique(a+1,a+n+1)-(a+1));
    printf("%d\n",ans);
    return 0;
}

　　2. 单hash（80分）

/*
 * 单hash，不建议
 * 根据生日悖论找上sqrt(MOD)个字符串时
 * 有很大概率碰撞
 * 参见BZOJ 3098 Hash Killer II
 */

#include<cstdio>
#include<cstring>
#include<algorithm>
using namespace std;

typedef unsigned long long ull;

ull base=131;       // radix of the polynomial hash
ull MOD=19260817;   // modulus applied after every step
ull a[10005];       // one hash value per input string
char s[10005];      // buffer holding the string currently being read
int n,ans;          // number of strings / number of distinct hashes

/*
 * Polynomial hash of the NUL-terminated string s, reduced modulo MOD.
 *
 * The running value is multiplied by `base`, the next character is added,
 * and the sum is reduced modulo MOD, once per character.
 *
 * Returns 0 for the empty string.
 */
ull gethash(char s[]){
    const int len=strlen(s);
    ull acc=0;
    int pos=0;
    while(pos<len){
        const ull shifted=acc*base;
        acc=(shifted+(ull)s[pos])%MOD;
        ++pos;
    }
    return acc;
}

/*
 * Reads a count n followed by n whitespace-delimited strings from stdin,
 * hashes each string with gethash and prints the number of distinct hash
 * values.
 *
 * Prints 0 when the count is missing, malformed or not positive. Returns 1
 * without printing a count when n does not fit in the hash array.
 */
int main(){
    // a[0] is unused, so one slot fewer than the array length is available.
    const int cap=static_cast<int>(sizeof(a)/sizeof(a[0]))-1;
    if(scanf("%d",&n)!=1||n<0)
        n=0;
    if(n>cap){
        fprintf(stderr,"n=%d exceeds the supported maximum of %d\n",n,cap);
        return 1;
    }
    int cnt=0;
    for(int i=1;i<=n;++i){
        // Width 10004 leaves room for the terminating NUL in s[10005];
        // a longer token is split instead of overrunning the buffer.
        if(scanf("%10004s",s)!=1)
            break;      // input ended early: keep what was read so far
        a[++cnt]=gethash(s);
    }
    n=cnt;
    std::sort(a+1,a+n+1);
    // Equal hashes are adjacent after sorting; unique() compacts them and
    // returns the end of the distinct prefix. An empty range yields 0.
    ans=static_cast<int>(std::unique(a+1,a+n+1)-(a+1));
    printf("%d\n",ans);
    return 0;
}

　　3. 双Hash（100分）

/*
 * 双hash，常数大
 * 模两个不同的质数（原文称1e9级别；代码中实际取19260817和19660813，约为2e7级别）
 * 仅当模两个质数的结果都相等时才认为字符串相等
 * 除了卡时间之外，目前没有办法卡这种做法
 * 见BZOJ 3099 Hash Killer III
 */
#include<cstdio>
#include<cstring>
#include<algorithm>
using namespace std;

typedef unsigned long long ull;
const ull MOD1=19260817;    // modulus used for node::x
const ull MOD2=19660813;    // modulus used for node::y
ull base=131;               // radix of the polynomial hash
char s[10005];              // buffer holding the string currently being read
int n,ans;                  // number of strings / number of distinct hash pairs

// The two hashes of one string; main stores the MOD1 hash in x and the
// MOD2 hash in y.
struct node{
    ull x,y;
}a[10005];

/*
 * Ordering used to sort the hash pairs: lexicographic on (x, y).
 *
 * Comparing x alone leaves nodes with equal x but different y in an
 * unspecified relative order, so identical (x, y) pairs could end up
 * separated and an adjacent-element distinct count would overcount.
 * Breaking ties on y makes identical pairs adjacent after sorting.
 */
bool cmp(node a,node b){
    if(a.x!=b.x)
        return a.x<b.x;
    return a.y<b.y;
}

/*
 * Polynomial hash of the NUL-terminated string s, reduced modulo MOD.
 *
 * The running value is multiplied by `base`, the next character is added,
 * and the sum is reduced modulo MOD, once per character.
 *
 * Returns 0 for the empty string.
 */
ull gethash(char s[],ull MOD){
    int len=strlen(s);
    ull ret=0;
    for(int i=0;i<len;++i)
        ret=(ret*base+(ull)s[i])%MOD;
    return ret;
}

int main(){
    scanf("%d",&n);
    for(int i=1;i<=n;++i){
        scanf("%s",s);
        a[i].x=gethash(s,MOD1);
        a[i].y=gethash(s,MOD2);
    }
    sort(a+1,a+n+1,cmp);
    ans=1;
    for(int i=2;i<=n;++i)
        if(a[i].x!=a[i-1].x||a[i].y!=a[i-1].y)
            ++ans;
    printf("%d\n",ans);
    return 0;
}

　　4. 模1e18级别的质数（100分，推荐做法）

/*
 * 模1e18级别的质数（代码中取212370440130137957，18位，约2.1e17）
 * 推荐
 * 既难以被卡，且常数小
 */
#include<cstdio>
#include<cstring>
#include<algorithm>
using namespace std;

typedef unsigned long long ull;
// 18-digit modulus (about 2.1e17), described by the original author as a
// prime. It is below 2^63, which mulmod() relies on.
const ull MOD=212370440130137957ULL;
ull base=131;       // radix of the polynomial hash
ull a[10005];       // one hash value per input string
char s[10005];      // buffer holding the string currently being read
int n,ans;          // number of strings / number of distinct hashes

/*
 * Returns (x * y) % m without overflowing 64 bits, by double-and-add.
 *
 * Requires x < m and m < 2^63, so that each intermediate sum of two values
 * below m still fits in an unsigned 64-bit integer.
 */
static ull mulmod(ull x,ull y,ull m){
    ull res=0;
    while(y){
        if(y&1){
            res+=x;
            if(res>=m)
                res-=m;
        }
        x+=x;
        if(x>=m)
            x-=m;
        y>>=1;
    }
    return res;
}

/*
 * Polynomial hash of the NUL-terminated string s, reduced modulo MOD.
 *
 * A plain `ret*base` can reach about 2.78e19 here, which exceeds 2^64, so
 * the product would wrap before the reduction and the result would no
 * longer be the polynomial value modulo MOD. The multiplication therefore
 * goes through mulmod(). Characters are read as unsigned so that a negative
 * `char` cannot wrap the addition either.
 *
 * Returns 0 for the empty string.
 */
ull gethash(char s[]){
    ull ret=0;
    for(const char *p=s;*p!='\0';++p){
        ret=mulmod(ret,base,MOD);
        // ret < MOD and the character is below 256, so the sum cannot overflow.
        ret=(ret+static_cast<ull>(static_cast<unsigned char>(*p)))%MOD;
    }
    return ret;
}

/*
 * Reads a count n followed by n whitespace-delimited strings from stdin,
 * hashes each string with gethash and prints the number of distinct hash
 * values.
 *
 * Prints 0 when the count is missing, malformed or not positive. Returns 1
 * without printing a count when n does not fit in the hash array.
 */
int main(){
    // a[0] is unused, so one slot fewer than the array length is available.
    const int cap=static_cast<int>(sizeof(a)/sizeof(a[0]))-1;
    if(scanf("%d",&n)!=1||n<0)
        n=0;
    if(n>cap){
        fprintf(stderr,"n=%d exceeds the supported maximum of %d\n",n,cap);
        return 1;
    }
    int cnt=0;
    for(int i=1;i<=n;++i){
        // Width 10004 leaves room for the terminating NUL in s[10005];
        // a longer token is split instead of overrunning the buffer.
        if(scanf("%10004s",s)!=1)
            break;      // input ended early: keep what was read so far
        a[++cnt]=gethash(s);
    }
    n=cnt;
    std::sort(a+1,a+n+1);
    // Equal hashes are adjacent after sorting; unique() compacts them and
    // returns the end of the distinct prefix. An empty range yields 0.
    ans=static_cast<int>(std::unique(a+1,a+n+1)-(a+1));
    printf("%d\n",ans);
    return 0;
}

（模板）字符串Hash算法

猜你喜欢