暴雪hash murmurhash3 cityhash 性能对比

前些天要写一个小功能，想到了之前看到的暴雪hash，同事推荐murmurhash，于是写了点简单的代码测试了一下。后来在网上又找到了cityhash，于是加了进去测了一下。

结论：暴雪hash算法 murmurhash64A算法 cityhash算法中，性能最高的是cityhash算法

cityhash网址:https://github.com/google/cityhash linux可以很容易的configure make make install

murmurhash64a在网上找的

暴雪hash算法网上找的

最初我是直接用的网上一个叫陈相礼封装的暴雪hash进行对比的，但是发现性能差的巨大(比murmurhash64a)，后来实际分析了一下，发现性能是损耗在了冲突上面。

暴雪hash做了两件事：第一、hash(并且是三次，防止冲突) 第二、把它放到散列表中(这样就可以下标访问了)。随着数量越来越多，散列出来的冲突概率也就越来越大，于是乎性能完全不在一个层次上了。原因在于它处理冲突的方式是直接找下一个一直到没有为止。

改进了直接进行暴雪的一次hash，性能对比：

before initialization:1481569294902902us diff:0us
150
size:6250000
after initialization:1481569302088635us diff:7185733us
MurmurHash64A:1481569362733291us diff:60644656us
CityHash64:1481569402254517us diff:39521226us
blizzard hash initialize:1481569403307466us diff:1052949us
blizzard hash:1481569477163901us diff:73856435us

对625万个汉字组合进行100次hash，

murmurhash64a用了60秒

cityhash64用了39秒

暴雪hash用了73秒
测试代码：

main.cpp

#define TESTNUM (50 * 51 * 52 * 53)
#define TESTLOOP 100

#include <iostream>
#include <fstream>
#include <list>
#include <vector>
#include "hash_algo.h"
#include <string>
#include <sys/time.h>

#include <city.h>
#include "murmurhash.h"

using namespace std;

long getTimeUS()
{
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return tv.tv_sec * 1000000 + tv.tv_usec;
}

int main( int argc, char**argv )
{
	long uslast = getTimeUS();
	long uscurr = getTimeUS();

#define PRINTTIME(str) uscurr = getTimeUS();\
	cout<<str<<uscurr<<"us diff:"<<uscurr - uslast<<"us"<<endl;\
	uslast = uscurr;

	PRINTTIME("before initialization:");

	vector<string> dic;
	do{
		string sks = "我人有的和主产不为这工要在地一上是中国经以发了民同金八月白禾言立水火之啊木大土王目日口田给又女子已山";
		cout<<sks.size()<<endl;
		for(int i = 0;i < sks.size() / 3;++i)
		{
			dic.push_back(sks.substr(i*3, 3));
		}
	}while(false);
#if 0
	for(int i = 0;i < dic.size();++i)
	{
		cout<<dic[i]<<endl;
	}
#endif

	list<string> ks;

	for(int i = 0;i < dic.size();++i)
	{
		/*
		   ks.push_back(dic[i]);
		   */
		for(int j = 0;j < dic.size();++j)
		{
			/*
			   string s2 = dic[i] + dic[j];
			   ks.push_back(s2);
			   */
			for(int k = 0;k < dic.size();++k)
			{
				/*
				   string s3 = dic[i] + dic[j] + dic[k];
				   ks.push_back(s3);
				   */
				for(int l = 0;l < dic.size();++l)
				{
					string s4 = dic[i] + dic[j] + dic[k] + dic[l];
					ks.push_back(s4);
					/*
					*/
					/*
					   for(int m = 0;m < dic.size();++m)
					   {
					   string s5 = dic[i] + dic[j] + dic[k] + dic[l] + dic[m];
					   ks.push_back(s5);
					 *                        for(int n = 0;n < dic.size();++n)
					 *                                               {
					 *                                                                      string s6 = dic[i] + dic[j] + dic[k] + dic[l] + dic[m] + dic[n];
					 *                                                                                             ks.push_back(s6);
					 *                                                                                                                    }
					 }
					 *                                                                                                                    */
				}
			}
		}
	}

	cout<<"size:"<<ks.size()<<endl;

	PRINTTIME("after initialization:");

#if 1
	for(auto idx = 0;idx < TESTLOOP;++idx)
		for(auto it = ks.begin();it != ks.end();++it)
			MurmurHash64A((*it).c_str(), (*it).size(), 32);

	PRINTTIME("MurmurHash64A:");
#endif

#if 1
	for(auto idx = 0;idx < TESTLOOP;++idx)
		for(auto it = ks.begin();it != ks.end();++it)
			CityHash64((*it).c_str(), (*it).size());

	PRINTTIME("CityHash64:");
#endif

#if 1
	CHashAlgo hash_test( ks.size() );

	PRINTTIME("blizzard hash initialize:");

	for(auto idx = 0;idx < TESTLOOP;++idx)
		for(auto it = ks.begin();it != ks.end();++it)
		{
			/*
			   bool is_success = hash_test.SetHashTable( (*it).c_str() );
			   if ( is_success )
			   {
			   }
			   else
			   {
			   cout <<"hash failed!"<< endl;
			   }
			   */
			hash_test.HashString( (*it).c_str(), 0 );
		}

	PRINTTIME("blizzard hash:");
#endif
#if 0
	bool is_success = hash_test.SetHashTable( "test" );
	if ( is_success )
	{
		cout <<"散列结果一：成功！"<< endl;
	}
	else
	{
		cout <<"散列结果一：失败！"<< endl;
	}

	is_success = hash_test.SetHashTable( "测试" );
	if ( is_success )
	{
		cout <<"散列结果二：成功！"<< endl;
	}
	else
	{
		cout <<"散列结果二：失败！"<< endl;
	}

	long pos = hash_test.GetHashTablePos( "test" );
	cout <<"查找测试字符串：\"test\" 的散列位置："<< pos << endl;
	pos = hash_test.GetHashTablePos( "测试" );
	cout <<"查找测试字符串：“测试” 的散列位置："<< pos << endl;

	for ( int i =0; i < TESTNUM; i++ )
	{
		char buff[32];
		sprintf(buff, "abcdefg%d.", i);
		is_success = hash_test.SetHashTable(buff);
		is_success ? cout << buff <<"散列结果：成功！位置："<< endl : cout << buff <<"散列结果：失败！"<< endl;
	}
#endif
#if 0
	int iii;
	cin>> iii;
	for ( int i =0; i < TESTNUM; i++ )
	{
		char buff[32];
		sprintf(buff, "abcdefg%d.", i);
		pos = hash_test.GetHashTablePos( buff );
		pos !=-1?  cout <<"查找测试字符串："<< buff <<" 的散列位置："<< pos << endl : cout << buff <<"存在冲突！"<< endl;
	}

	cin>> iii;
#endif

	return 0;
}

murmurhash.h:

#pragma once

static uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
{
	const uint64_t m = 0xc6a4a7935bd1e995;
	const int r = 47;

	uint64_t h = seed ^ (len * m);

	const uint64_t * data = (const uint64_t *)key;
	const uint64_t * end = data + (len/8);

	while(data != end)
	{
		uint64_t k = *data++;

		k *= m;
		k ^= k >> r;
		k *= m;

		h ^= k;
		h *= m;
	}

	const unsigned char * data2 = (const unsigned char*)data;

	switch(len & 7)
	{
		case 7: h ^= uint64_t(data2[6]) << 48;
		case 6: h ^= uint64_t(data2[5]) << 40;
		case 5: h ^= uint64_t(data2[4]) << 32;
		case 4: h ^= uint64_t(data2[3]) << 24;
		case 3: h ^= uint64_t(data2[2]) << 16;
		case 2: h ^= uint64_t(data2[1]) << 8;
		case 1: h ^= uint64_t(data2[0]);
				h *= m;
	};

	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

hash_algo.h:

#pragma once

#define MAXFILENAME 255     // 最大文件名长度
#define MAXTABLELEN 1024    // 默认哈希索引表大小
struct MPQHASHTABLE
{
	long nHashA;
	long nHashB;
	bool bExists;
};

class CHashAlgo
{
	public:
		CHashAlgo( const long nTableLength = MAXTABLELEN )// 创建指定大小的哈希索引表，不带参数的构造函数创建默认大小的哈希索引表
		{
			prepareCryptTable();
			m_tablelength = nTableLength;

			m_HashIndexTable =new MPQHASHTABLE[nTableLength];
			for ( int i =0; i < nTableLength; i++ )
			{
				m_HashIndexTable[i].nHashA =-1;
				m_HashIndexTable[i].nHashB =-1;
				m_HashIndexTable[i].bExists =false;
			}
		}
		void prepareCryptTable();                                              // 对哈希索引表预处理
		unsigned long HashString(const char*lpszFileName, unsigned long dwHashType); // 求取哈希值
		unsigned long GetHashTablePos(const char* lpszString );                      // 得到在定长表中的位置
		bool SetHashTable( const char*lpszString );                                  // 将字符串散列到哈希表中
		unsigned long GetTableLength(void){return m_tablelength;}
		void SetTableLength( const unsigned long nLength ){m_tablelength = nLength;}
		~CHashAlgo()
		{
			if ( nullptr != m_HashIndexTable )
			{
				delete []m_HashIndexTable;
				m_HashIndexTable = nullptr;
				m_tablelength =0;
			}
		}
	protected:
	private:
		unsigned long cryptTable[0x500];
		unsigned long m_tablelength;    // 哈希索引表长度
		MPQHASHTABLE *m_HashIndexTable;
};

hash_algo.cpp:

#include "hash_algo.h"
#include <ctype.h> // for toupper
#include <stdio.h> // for printf

void CHashAlgo::prepareCryptTable()
{
	unsigned long seed = 0x00100001, index1 = 0, index2 = 0, i;

	for( index1 =0; index1 < 0x100; index1++ )
	{
		for( index2 = index1, i = 0; i < 5; i++, index2 += 0x100 )
		{
			unsigned long temp1, temp2;
			seed = (seed * 125 + 3) % 0x2AAAAB;
			temp1 = (seed & 0xFFFF) << 0x10;
			seed = (seed * 125 + 3) % 0x2AAAAB;
			temp2 = (seed & 0xFFFF);
			cryptTable[index2] = ( temp1 | temp2 );
		}
	}
}

unsigned long CHashAlgo::HashString(const char*lpszFileName, unsigned long dwHashType)
{
	unsigned char*key = (unsigned char*)lpszFileName;
	unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE;
	int ch;

	while(*key !=0)
	{
		//ch = toupper(*key++);
		ch = *key++;

		seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
		seed2 = ch + seed1 + seed2 + (seed2 << 5) +3;
	}
	return seed1;
}

unsigned long CHashAlgo::GetHashTablePos(const char*lpszString)
{
	const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
	unsigned long nHash = HashString(lpszString, HASH_OFFSET);
	unsigned long nHashA = HashString(lpszString, HASH_A);
	unsigned long nHashB = HashString(lpszString, HASH_B);
	unsigned long nHashStart = nHash % m_tablelength;
	unsigned long nHashPos = nHashStart;

	while ( m_HashIndexTable[nHashPos].bExists)
	{
		if (m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHashB)
			return nHashPos;
		else
			nHashPos = (nHashPos + 1) % m_tablelength;

		if (nHashPos == nHashStart)
			break;
	}

	return -1; //没有找到
}
bool CHashAlgo::SetHashTable( const char* lpszString )
{
	const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
	unsigned long nHash = HashString(lpszString, HASH_OFFSET);
	unsigned long nHashA = HashString(lpszString, HASH_A);
	unsigned long nHashB = HashString(lpszString, HASH_B);
	unsigned long nHashStart = nHash % m_tablelength,
				  nHashPos = nHashStart;

	int __debug__conflict = 0;
	while ( m_HashIndexTable[nHashPos].bExists)
	{
		++__debug__conflict;
		nHashPos = (nHashPos + 1) % m_tablelength;
		if (nHashPos == nHashStart)
		{
			return false;
		}
	}

	if(__debug__conflict > 0) printf("conflict %lu, %lu\n", nHashStart, __debug__conflict);

	m_HashIndexTable[nHashPos].bExists = true;
	m_HashIndexTable[nHashPos].nHashA = nHashA;
	m_HashIndexTable[nHashPos].nHashB = nHash;

	return true;
}

编译脚本：

g++ -g -pg -std=c++11 main.cpp hash_algo.cpp -lcityhash

暴雪hash murmurhash3 cityhash 性能对比

猜你喜欢