HASH算法模板以及简单的入门题总结

Hash算法模板

//暂时没用到双hash，用到会过来补充
//hash一般用来解决字符串判重/字符串匹配问题
//遇见不定长问题可通过二分+hash降低复杂度
//遇见定长字符串问题可通过尺取+hash来降低复杂度
//二维hash的时候尺取方法就是把之前不需要的都变为0再加上当前行，将匹配字符串整体下移，来验证hash值是否相等
#include<string.h>
typedef unsigned long long ull;
const int maxn=1e5+5;
ull hash_[maxn],xp[maxn];
// Fills xp[] with successive powers of the hash base: xp[k] = 13331^k,
// relying on unsigned wraparound of the ull element type.
void init()
{
    xp[0]=1;
    int k=0;
    while(++k<maxn)
        xp[k]=13331*xp[k-1];
}
// Builds suffix hashes of the NUL-terminated string str into hash_[]:
// hash_[len] = 0 and hash_[i] = hash_[i+1]*13331 + (str[i]-'A'+1).
void make_hash(char str[])
{
    int pos=strlen(str);
    hash_[pos]=0;
    // Walk from the last character back to the first.
    while(pos-->0)
        hash_[pos]=hash_[pos+1]*13331+str[pos]-'A'+1;
}
// Hash of the substring that starts at index i and has length L,
// derived from the suffix hashes in hash_[] and the base powers in xp[].
ull Get_hash(int i,int L)
{
    const ull tail=hash_[i+L]*xp[L];  // contribution of everything after the substring
    return hash_[i]-tail;
}

Hash第一题

HDU1686
题意就是找A串在B串中的出现次数（可重叠）。其实就是一个kmp模板题，但是我们可以用hash很容易的解决
我们得到A串的hash值，然后在B中枚举起点，长度为lena的子串，检验hash值是否相同就可以了。
HDU1686代码

#include<stdio.h>
#include<iostream>
#include<algorithm>
#include<string.h>
using namespace std;
typedef unsigned long long ull;
const int maxn = 1e6+5;
ull xp[maxn],hash_1[maxn],hash_2[maxn];
// Precomputes xp[k] = 13331^k for every k below maxn (unsigned wraparound).
void init()
{
    xp[0]=1;
    for(int k=0;k+1<maxn;++k)
        xp[k+1]=xp[k]*13331;
}
// Returns the hash of the length-L substring starting at position i,
// using the suffix-hash table passed in hash_ and the global base powers xp[].
ull get_hash(int i,int L,ull hash_[])
{
    const ull head=hash_[i];
    const ull tail=hash_[i+L]*xp[L];
    return head-tail;
}
// Fills hash_[0..len] with suffix hashes of the NUL-terminated string str
// (hash_[len] = 0, hash_[i] = hash_[i+1]*13331 + (str[i]-'a'+1)) and
// returns the string length.
int make_hash(char str[],ull hash_[])
{
    const int len=strlen(str);
    hash_[len]=0;
    int pos=len;
    while(pos-->0)
        hash_[pos]=hash_[pos+1]*13331+(str[pos]-'a'+1);
    return len;
}
char str[maxn],str2[maxn];
int main()
{
    init();
    int t;
    scanf("%d",&t);
    while(t--)
    {
        int ans=0;
        scanf("%s%s",str,str2);
        int len1=make_hash(str,hash_1);
        int len2=make_hash(str2,hash_2);
        ull tmp=get_hash(0,len1,hash_1);
        for(int i=0;i<len2-len1+1;i++)//注意枚举时的边界问题
        {
            if(get_hash(i,len1,hash_2)==tmp)
                ans++;
        }
        printf("%d\n",ans);
    }
    return 0;
}

Hash第二题

POJ2774
题意就是求两个串的最长公共子串，正常做法应该是后缀数组跑一下height数组，这里说一下hash的做法。
由于没有给定长度，要求的是长度，这时就要想答案是否具有二分的性质。发现答案是具有二分性质的，所以我们可以二分答案：把A串中所有长度为mid的子串的hash值放进一个数组，sort一下，然后对于B串产生的每个hash值用lower_bound查询是否出现过，若出现过则直接返回true。复杂度是 $O(len \cdot \log^2 len)$，跑了1250ms；用map判断是否出现过就会超时，这暂且作为hash的一个优化方式吧。
POJ2774代码

#include<stdio.h>
#include<iostream>
#include<algorithm>
#include<string.h>
#include<map>
using namespace std;
typedef unsigned long long ull;
const int maxn = 1e6+5;
ull xp[maxn],hash_1[maxn],hash_2[maxn];
ull a[maxn];
int len1,len2;
// Builds the table of base powers: xp[k] = 13331^k with unsigned wraparound.
void init()
{
    xp[0]=1;
    int k=1;
    while(k<maxn)
    {
        xp[k]=xp[k-1]*13331;
        ++k;
    }
}
// Hash of the L characters starting at index i, computed from the given
// suffix-hash table hash_ and the global power table xp[].
ull get_hash(int i,int L,ull hash_[])
{
    const ull suffixPart=hash_[i+L]*xp[L];
    return hash_[i]-suffixPart;
}
// Computes suffix hashes of str into hash_ (hash_[len] = 0,
// hash_[i] = hash_[i+1]*13331 + (str[i]-'a'+1)); returns strlen(str).
int make_hash(char str[],ull hash_[])
{
    const int len=strlen(str);
    hash_[len]=0;
    for(int back=1;back<=len;++back)
    {
        const int i=len-back;
        hash_[i]=hash_[i+1]*13331+(str[i]-'a'+1);
    }
    return len;
}
char str[maxn],str2[maxn];
// Returns true when some length-L substring of the first string has the same
// hash as some length-L substring of the second string.
// The hashes of all length-L windows of the first string are collected into
// a[0..cnt), sorted, and each window hash of the second string is looked up
// by binary search.
bool check(int L)
{
    int cnt=0;
    for(int i=0;i+L<=len1;i++)
    {
        a[cnt++]=get_hash(i,L,hash_1);
    }
    sort(a,a+cnt);
    for(int i=0;i+L<=len2;i++)
    {
        ull tmp=get_hash(i,L,hash_2);
        int pos=lower_bound(a,a+cnt,tmp)-a;
        // lower_bound returns cnt when every stored hash is smaller than tmp;
        // a[cnt] is not part of this round's data (it may hold a value left by
        // an earlier call), so it must not be compared.
        if(pos<cnt&&a[pos]==tmp) return true;
    }
    return false;
}
// Reads pairs of strings until EOF and prints, for each pair, the largest
// length for which check() reports a common substring hash, found by binary
// search over the length.
int main()
{
    init();
    while(scanf("%s%s",str,str2)!=EOF)
    {
        len1=make_hash(str,hash_1);
        len2=make_hash(str2,hash_2);
        int lo=0;
        int hi=min(len1,len2);
        while(lo<=hi)
        {
            const int mid=(lo+hi)>>1;
            if(check(mid))
                lo=mid+1;   // length mid works, try longer
            else
                hi=mid-1;   // length mid fails, try shorter
        }
        printf("%d\n",hi);
    }
    return 0;
}

Hash第三题

POJ3261
本题就是求字符串中至少出现过k次的最长子串。
正解应该是后缀数组，二分长度之后在height上分块去做即可
Hash的做法也很明显，二分长度，之后去计算当前长度出现次数是否有超过k的子串就可以了
POJ3261代码

#include<stdio.h>
#include<iostream>
#include<algorithm>
#include<string.h>
#include<map>
using namespace std;
typedef unsigned long long ull;
const int maxn = 1e5+5;
ull xp[maxn],hash_[maxn];
ull a[maxn];
int len1,len2;
map<ull ,int> mp;
void init()
{
    xp[0]=1;
    for(int i=1;i<maxn;i++)
        xp[i]=xp[i-1]*13331;
}
// Hash of the L consecutive elements starting at index i, from the global
// suffix hashes hash_[] and base powers xp[].
ull get_hash(int i,int L)
{
    ull value=hash_[i];
    value-=hash_[i+L]*xp[L];
    return value;
}
// Builds suffix hashes of the numeric sequence str[0..len) into hash_[]:
// hash_[len] = 0 and hash_[i] = hash_[i+1]*13331 + str[i].
void make_hash(ull str[],int len)
{
    hash_[len]=0;
    int pos=len;
    while(pos-->0)
        hash_[pos]=hash_[pos+1]*13331+str[pos];
}
ull str[maxn];
bool check(int L,int len,int k)
{
    mp.clear();
    for(int i=0;i<len-L+1;i++)
    {
        ull tmp=get_hash(i,L);
        mp[tmp]++;
        if(mp[tmp]>=k) return true;
    }
    return false;
}
int n,k;
// Reads n and k followed by n values until EOF, hashes the sequence, and
// binary-searches the largest length whose check() succeeds; prints it.
int main()
{
    init();
    while(scanf("%d%d",&n,&k)!=EOF)
    {
        for(int idx=0;idx<n;++idx)
            scanf("%llu",&str[idx]);
        make_hash(str,n);
        int lo=0;
        int hi=n;
        while(lo<=hi)
        {
            const int mid=(lo+hi)>>1;
            if(check(mid,n,k))
                lo=mid+1;   // some window of length mid repeats often enough
            else
                hi=mid-1;
        }
        printf("%d\n",hi);
    }
    return 0;
}

Hash第四题

UVA-257
本题题意是给你一堆字符串，将其中含有两个长度不小于3的回文串的字符串输出。
要求这两个回文串互不包含（其中一个不能被另一个完全覆盖），但可以部分重叠，而且这两个回文串不能相同。
由于要求长度不小于3而且可以重合，我们只需要考虑长度为3或4的回文串就可以了：当以某个位置为中心存在长度>=3的回文串时，如果回文串长度为奇数就取长度为3的，否则取长度为4的，然后把遍历指针i往后多挪一位（去掉覆盖的情况），之后用hash判重就可以了。
比如AAAA
我们找到第一个AAA 然后把i++，这样就避免了AAAA覆盖AAA的情况
因为我们只考虑长度为3/4的回文串，所以只考虑这一种覆盖情况即可。
求某个位置为中心回文串的长度时，可以用manacher算法来计算出每个位置的回文半径。
UVA-257代码

#include<stdio.h>
#include<iostream>
#include<algorithm>
#include<string.h>
#include<sstream>
#include<map>
using namespace std;
typedef unsigned long long ull;
const int maxn=110010;
char s[maxn];
map<ull,int> mp;
char Ma[maxn*2];
int Mp[maxn*2];
string line;
string str;
ull hash_[maxn],xp[maxn];
// Fills xp[] so that xp[k] holds 13331^k (unsigned wraparound).
void init()
{
    xp[0]=1;
    int k=0;
    while(++k<maxn)
        xp[k]=xp[k-1]*13331;
}
// Computes suffix hashes of the NUL-terminated string str into hash_[]:
// hash_[len] = 0, hash_[i] = hash_[i+1]*13331 + (str[i]-'A'+1).
void make_hash(char str[])
{
    const int len=strlen(str);
    hash_[len]=0;
    for(int back=1;back<=len;++back)
    {
        const int i=len-back;
        hash_[i]=hash_[i+1]*13331+str[i]-'A'+1;
    }
}
// Hash of the substring of length L that begins at index i, from the
// global suffix hashes hash_[] and power table xp[].
ull Get_hash(int i,int L)
{
    const ull tail=hash_[i+L]*xp[L];
    return hash_[i]-tail;
}
// Manacher's algorithm on s[0..len).
// Builds the transformed string Ma = "$#s0#s1#...#" (NUL-terminated) and fills
// Mp[i] with the palindrome radius at position i of Ma, i.e. the number of
// offsets k (starting from 0) for which Ma[i-k] == Ma[i+k].
// s[k] lands at Ma[2k+2]; separators '#' sit at odd indices.
// Assumes s contains neither '$' nor '#', which act as sentinels/separators.
void Manacher(char s[],int len)
{
    int l=0;
    Ma[l++]='$';
    Ma[l++]='#';
    for(int i=0;i<len;i++)
    {
        Ma[l++]=s[i];
        Ma[l++]='#';
    }
    Ma[l]=0;
    // Position 0 is the '$' sentinel. Expanding around it would read Ma[-1],
    // so its radius is fixed to 1 and the scan starts at position 1.
    Mp[0]=1;
    int mx=0,id=0;   // mx: one past the right edge of the furthest-reaching palindrome, id: its center
    for(int i=1;i<l;i++)
    {
        // Reuse the mirrored position's radius while i lies inside the known palindrome.
        Mp[i]=mx>i?min(Mp[2*id-i],mx-i):1;
        // The '$' at index 0 and the NUL at index l stop the expansion at both ends.
        while(Ma[i+Mp[i]]==Ma[i-Mp[i]])
            Mp[i]++;
        if(i+Mp[i]>mx)
        {
            mx=i+Mp[i];
            id=i;
        }
    }
}
int main()
{
    int ans=0;
    init();
    while(scanf("%s",s)!=EOF)
    {
        mp.clear();
        int len=strlen(s);
        make_hash(s);
        Manacher(s,len);//manacher之后，Mp[i]-1为i位置的回文半径
        int cnt=0;
        for(int i=0;i<2*len+2;i++)
        {
            if(Mp[i]-1>=3)
            {
                if(Mp[i]%2==1)//回文串为偶数，取长度四的回文串
                {
                    int st=(i-1)/2-2;
                    int le=4;
                    ull tmp=Get_hash(st,le);
                    mp[tmp]++;
                }
                else//回文串为奇数，取长度三的回文串
                {
                    int st=i/2-2;
                    int le=3;
                    ull tmp=Get_hash(st,le);
                    mp[tmp]++;
                }
                i++;//当前位置存在大于三的回文串，避免覆盖后移一位。
            }
        }
        if(mp.size()>=2) printf("%s\n",s);
    }
    return 0;
}

Hash第五题

UVA-11019
本题题意就是给你A、B两个字符矩阵，问你B矩阵在A矩阵中的出现次数。
我们可以进行二维hash，其实就是把n个横向串连在一起hash。
注意判相等的时候，我们不断进行尺取+hash，尺取的过程，我们删除当前第一行的hash值加上最后一行的hash值，删除第一行的hash值直接删去就可以
例如
$AAA$
$BBB$
$CCC$
我们删去第一行的hash值相当于把矩阵变成了
$000$
$BBB$
$CCC$
此时我们再添加最后一行
$000$
$BBB$
$CCC$
$DDD$
如果这时候的B矩阵是
$BBB$
$CCC$
$DDD$
这两个矩阵的hash值不同的，为了处理这种情况，我们把B矩阵相应的添加前几行
变成
$000$
$BBB$
$CCC$
$DDD$
这样再去匹配就可以了。
以上就是二维hash大概的处理方法（是我自己想的做法，如果有其他好的尺取方法欢迎指教）。
掌握了这个做法，我们就可以枚举矩阵的左上角，然后对于当前列数的矩阵从上向下进行尺取，hash判断就可以了。
UVA-11019代码

#include<stdio.h>
#include<algorithm>
#include<iostream>
using namespace std;
const int maxn = 1e3+5;
const int MAXN = 1e6+5;
typedef unsigned long long ull;
ull hash_[maxn][maxn],xp[MAXN];
char str[maxn][maxn];
char str2[maxn][maxn];
// Precomputes xp[k] = 13331^k for all k below MAXN (unsigned wraparound).
void init()
{
    xp[0]=1;
    int k=0;
    while(++k<MAXN)
        xp[k]=13331*xp[k-1];
}
// Hash of the l characters of row i that start at column j, taken from the
// per-row suffix hashes hash_[i][] and the power table xp[].
ull Get_hash(int i,int j,int l)
{
    const ull tail=hash_[i][j+l]*xp[l];
    return hash_[i][j]-tail;
}
// For each test case reads an n x m text matrix and an x x y pattern matrix
// and prints how many times the pattern occurs in the text.
// Each text row is hashed separately; an x-row window is hashed as
// sum(rowHash(r) * xp[r*y]). The window slides downward by removing the top
// row's term and adding the new bottom row's term, while the pattern hash is
// multiplied by xp[y] per step so both sides stay aligned.
int main()
{
    init();
    int t;
    scanf("%d",&t);
    while(t--)
    {
        int ans=0;
        int n,m,x,y;
        scanf("%d%d",&n,&m);
        for(int i=0;i<n;i++)
        {
            scanf("%s",str[i]);
        }
        for(int i=0;i<n;i++)
        {
            hash_[i][m]=0;
            for(int j=m-1;j>=0;j--)
            {
                hash_[i][j]=hash_[i][j+1]*13331+str[i][j]-'a'+1;   // suffix hashes of each row
            }
        }
        scanf("%d%d",&x,&y);
        for(int i=0;i<x;i++)
            scanf("%s",str2[i]);
        ull tmp=0;
        for(int i=x-1;i>=0;i--)
        {
            for(int j=y-1;j>=0;j--)
            {
                tmp=tmp*13331+str2[i][j]-'a'+1;   // hash of the whole pattern, rows concatenated
            }
        }
        // A pattern taller or wider than the text cannot occur. Without this
        // guard the initial window for x>n would read hash_ rows >= n, which
        // still hold values from an earlier test case.
        if(x<=n&&y<=m)
        {
            for(int i=0;i<=m-y;i++)   // left column of the candidate window
            {
                ull tt=tmp;
                ull tmp2=0;
                for(int j=x-1;j>=0;j--)
                {
                    tmp2=tmp2*xp[y]+Get_hash(j,i,y);
                }
                if(tt==tmp2) ans++;
                for(int j=x;j<n;j++)
                {
                    tmp2-=Get_hash(j-x,i,y)*xp[(j-x)*y];   // drop the top row (turn it into zeros)
                    tmp2+=Get_hash(j,i,y)*xp[j*y];         // append the new bottom row
                    tt=tt*xp[y];                           // prepend one zero row to the pattern
                    if(tmp2==tt) ans++;
                }
            }
        }
        printf("%d\n",ans);
    }
    return 0;
}

Hash第六题

HDU4821
本题题意就是给你一个字符串，问你该字符串具有多少个子串满足
长度为m*l,而且可以拆成m个长度为l的不相同子串。
定长字符串问题，很明显是可以hash+尺取的，我们可以枚举l个子串的起点，之后算出当前m*l的字符串的情况，之后不断往后尺取就可以了
复杂度是 $O(l \cdot (len/l) \cdot \log m)$，即 $O(len \cdot \log m)$：尺取的过程每次向后跳l个字符，log是map带来的，所以复杂度是可以的
然而如果对所有起点都重新验证一遍m*l的字符串，复杂度就会变成 $O(len \cdot m \cdot \log m)$，是会超时的
这就是利用hash+尺取带来的优势。
HDU4821代码

#include<stdio.h>
#include<iostream>
#include<string.h>
#include<map>
using namespace std;
typedef unsigned long long ull;
#define dbg(x) cout<<#x<<" = "<<x<<endl;
// Hash of the length-L substring starting at index i; expands against the
// hash_ and xp arrays visible at the point of use. The whole expansion and
// every argument are parenthesized so the macro stays correct inside larger
// expressions and with compound arguments.
#define Get_hash(i,L) (hash_[(i)]-hash_[(i)+(L)]*xp[(L)])
const int maxn = 1e5+5;
ull xp[maxn],hash_[maxn];
ull x[maxn];
char str[maxn];
// Precomputes the base powers xp[k] = 13331^k (unsigned wraparound).
void init()
{
    xp[0]=1;
    for(int k=0;k+1<maxn;++k)
        xp[k+1]=13331*xp[k];
}
// Builds suffix hashes of the NUL-terminated string str into hash_[]
// (hash_[len] = 0, hash_[i] = hash_[i+1]*13331 + (str[i]-'a'+1)) and returns
// the string length.
int Make_hash(char str[])
{
    const int len=strlen(str);
    hash_[len]=0;
    int pos=len;
    while(pos-->0)
    {
        hash_[pos]=hash_[pos+1]*13331+(str[pos]-'a'+1);
    }
    return len;
}
map<ull,int> mp;
// Reads (m, l, string) triples until EOF and prints the number of substrings
// of length m*l that split into m consecutive length-l pieces with pairwise
// different hashes.
// x[i] caches the hash of the length-l piece starting at i. For every start
// offset i < l the window of m pieces slides forward by l characters at a
// time; mp counts each piece hash inside the window and ans tracks how many
// distinct hashes it currently holds.
int main()
{
  init();
  int m,l;
  while(scanf("%d%d",&m,&l)!=EOF)
  {
      int cnt=0;
      scanf("%s",str);
      int len=Make_hash(str);
      // m*l is evaluated in 64 bits: the int product can overflow for large
      // inputs. Non-positive m or l, or a window longer than the string,
      // cannot produce any valid substring, so the scan is skipped.
      const long long total=1LL*m*l;
      if(m>=1&&l>=1&&total<=len)
      {
          const int span=static_cast<int>(total);
          for(int i=0;i+l<=len;i++)
          {
              x[i]=Get_hash(i,l);
          }
          for(int i=0;i<l&&i+span<=len;i++)   // start offset of the first window
          {
              mp.clear();
              int ans=0;
              // Count distinct piece hashes in the first full window.
              for(int j=i;j<i+span;j+=l)
              {
                  if(mp[x[j]]==0)
                  {
                      ans++;
                  }
                  mp[x[j]]++;
              }
              if(ans==m)  cnt++;
              // Slide by one piece: drop the leftmost piece, add the piece starting at j.
              for(int j=i+span;j+l<=len;j+=l)
              {
                  ull tmp=x[j-span];
                  mp[tmp]--;
                  if(mp[tmp]==0) ans--;
                  tmp=x[j];
                  if(mp[tmp]==0)
                  {
                      ans++;
                  }
                  mp[tmp]++;
                  if(ans==m) cnt++;   // all m pieces in the window differ
              }
          }
      }
      printf("%d\n",cnt);
  }
  return 0;
}

未完待续…