simhash值的海明距离计算
二进制串A 和 二进制串B 的海明距离 就是 A xor B
后二进制中1的个数。
举例如下:
A = 100111;
B = 101010;
hamming_distance(A, B) = count_1(A xor B) = count_1(001101) = 3;
当我们算出所有doc的simhash值之后,需要计算doc A和doc B之间是否相似的条件是:
A和B的海明距离是否小于等于n,这个n值根据经验一般取值为3,
simhash本质上是局部敏感性的hash,和md5之类的不一样。 正因为它的局部敏感性,所以我们可以使用海明距离来衡量simhash值的相似度。
高效计算二进制序列中1的个数
/* src/Simhasher.hpp */
bool isEqual(uint64_t lhs, uint64_t rhs, unsigned short n = 3)
{
unsigned short cnt = 0;
lhs ^= rhs;
while(lhs && cnt <= n)
{
lhs &= lhs - 1;
cnt++;
}
if(cnt <= n)
{
return true;
}
return false;
}
由上式这个函数来计算的话,时间复杂度是 O(n); 这里的n默认取值为3。由此可见还是蛮高效的。
对比其他算法
百度的去重算法
百度的去重算法最简单,就是直接找出此文章的最长的n句话,做一遍hash签名。n一般取3。 工程实现巨简单,据说准确率和召回率都能到达80%以上。
shingle算法
shingle原理略复杂,不细说。 shingle算法我认为过于学院派,对于工程实现不够友好,速度太慢,基本上无法处理海量数据。
参考
- Similarity estimation techniques from rounding algorithms
- simhash与Google的网页去重
- 海量数据相似度计算之simhash和海明距离
Python的代码实现如下:
Python Code
1
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
#!/usr/bin/python # coding=utf-8 class simhash: #构造函数 def __init__(self, tokens= '', hashbits= 128): self.hashbits = hashbits self. hash = self.simhash(tokens); #toString函数 def __str__(self): return str(self. hash) #生成simhash值 def simhash(self, tokens): v = [ 0] * self.hashbits for t in [self._string_hash(x) for x in tokens]: #t为token的普通hash值 for i in range(self.hashbits): bitmask = 1 << i if t & bitmask : v[i] += 1 #查看当前bit位是否为1,是的话将该位+1 else: v[i] -= 1 #否则的话,该位-1 fingerprint = 0 for i in range(self.hashbits): if v[i] >= 0: fingerprint += 1 << i return fingerprint #整个文档的fingerprint为最终各个位>=0的和 #求海明距离 def hamming_distance(self, other): x = (self. hash ^ other. hash) & (( 1 << self.hashbits) - 1) tot = 0; while x : tot += 1 x &= x - 1 return tot #求相似度 def similarity (self, other): a = float(self. hash) b = float(other. hash) if a > b : return b / a else: return a / b #针对source生成hash值 (一个可变长度版本的Python的内置散列) def _string_hash(self, source): if source == "": return 0 else: x = ord(source[ 0]) << 7 m = 1000003 mask = 2 ** self.hashbits - 1 for c in source: x = ((x * m) ^ ord(c)) & mask x ^= len(source) if x == - 1: x = - 2 return x if __name__ == '__main__': s = 'This is a test string for testing' hash1 = simhash(s. split()) s = 'This is a test string for testing also' hash2 = simhash(s. split()) s = 'nai nai ge xiong cao' hash3 = simhash(s. split()) print(hash1.hamming_distance(hash2) , " " , hash1.similarity(hash2)) print(hash1.hamming_distance(hash3) , " " , hash1.similarity(hash3)) |
Java的代码如下如下:
Java Code
1
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import java.math.BigInteger;
import java.util.StringTokenizer; public class SimHash { private String tokens; private BigInteger strSimHash; private int hashbits = 128; public SimHash( String tokens) { this.tokens = tokens; this.strSimHash = this.simHash(); } public SimHash( String tokens, int hashbits) { this.tokens = tokens; this.hashbits = hashbits; this.strSimHash = this.simHash(); } public BigInteger simHash() { int[] v = new int[ this.hashbits]; StringTokenizer stringTokens = new StringTokenizer( this.tokens); while (stringTokens.hasMoreTokens()) { String temp = stringTokens.nextToken(); BigInteger t = this.hash(temp); for ( int i = 0; i < this.hashbits; i++) { BigInteger bitmask = new BigInteger( "1").shiftLeft(i); if (t.and(bitmask).signum() != 0) { v[i] += 1; } else { v[i] -= 1; } } } BigInteger fingerprint = new BigInteger( "0"); for ( int i = 0; i < this.hashbits; i++) { if (v[i] >= 0) { fingerprint = fingerprint.add( new BigInteger( "1").shiftLeft(i)); } } return fingerprint; } private BigInteger hash( String source) { if (source == null || source.length() == 0) { return new BigInteger( "0"); } else { char[] sourceArray = source.toCharArray(); BigInteger x = BigInteger.valueOf((( long) sourceArray[ 0]) << 7); BigInteger m = new BigInteger( "1000003"); BigInteger mask = new BigInteger( "2").pow( this.hashbits).subtract( new BigInteger( "1")); for ( char item : sourceArray) { BigInteger temp = BigInteger.valueOf(( long) item); x = x.multiply(m).xor(temp).and(mask); } x = x.xor( new BigInteger( String.valueOf(source.length()))); if (x.equals( new BigInteger( "-1"))) { x = new BigInteger( "-2"); } return x; } } public int hammingDistance(SimHash other) { BigInteger m = new BigInteger( "1").shiftLeft( this.hashbits).subtract( new BigInteger( "1")); BigInteger x = this.strSimHash.xor(other.strSimHash).and(m); int tot = 0; while (x.signum() != 0) { tot += 1; x = x.and(x.subtract( new BigInteger( "1"))); } return tot; } public static void main( String[] args) { String s = "This is a test string for testing"; SimHash hash1 = new SimHash(s, 128); System.out.println(hash1.strSimHash + " " + hash1.strSimHash.bitLength()); s = "This is a test string for testing also"; SimHash hash2 = new SimHash(s, 128); System.out.println(hash2.strSimHash + " " + hash2.strSimHash.bitCount()); s = "This is a test string for testing als"; SimHash hash3 = new SimHash(s, 128); System.out.println(hash3.strSimHash + " " + hash3.strSimHash.bitCount()); System.out.println( "============================"); System.out.println(hash1.hammingDistance(hash2)); System.out.println(hash1.hammingDistance(hash3)); } } |
另外说明,位运算只适合整数哦。。。因为浮点的存储方案决定不能位运算,如果非要位运算,就需要Float.floatToIntBits,运算完,再通过Float.intBitsToFloat转化回去。(java默认的float,double的hashcode其实就是对应的floatToIntBits的int值)