<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.0</version>
</dependency>
/**
 * Small demo of Guava's {@code BloomFilter}: fills a filter with the integers
 * {@code [0, CAPACITY)}, times a single lookup of a key that was inserted, and then
 * measures how many of 10,000 keys that were never inserted are still reported as
 * "might contain" (false positives).
 */
public class TestSomething {

    /** Expected number of insertions passed to the filter; also the number of keys actually inserted. */
    private static final int CAPACITY = 1000000;

    /** A key inside {@code [0, CAPACITY)}, i.e. one that has been inserted. */
    private static final int KEY = 999998;

    /** First key (inclusive) of the probe window; every key in the window is >= CAPACITY, so none was inserted. */
    private static final int PROBE_START = CAPACITY + 20000;

    /** End (exclusive) of the probe window. */
    private static final int PROBE_END = CAPACITY + 30000;

    private static final BloomFilter<Integer> BLOOM_FILTER =
            BloomFilter.create(Funnels.integerFunnel(), CAPACITY);

    static {
        for (int i = 0; i < CAPACITY; i++) {
            BLOOM_FILTER.put(i);
        }
    }

    public static void main(String[] args) {
        // System.nanoTime() is a high-resolution timer whose unit is nanoseconds (not microseconds).
        long start = System.nanoTime();
        if (BLOOM_FILTER.mightContain(KEY)) {
            System.out.println("成功过滤到" + KEY);
        }
        long end = System.nanoTime();
        System.out.println("布隆过滤器消耗时间:" + (end - start));

        // Count keys that were never inserted but are nevertheless reported as present.
        int falsePositives = 0;
        for (int i = PROBE_START; i < PROBE_END; i++) {
            if (BLOOM_FILTER.mightContain(i)) {
                falsePositives++;
            }
        }

        // The label announces a rate, so report false positives as a fraction of the probes
        // rather than the raw hit count.
        double falsePositiveRate = (double) falsePositives / (PROBE_END - PROBE_START);
        System.out.println("错判率为:" + falsePositiveRate);
    }
}
布隆过滤器实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都远远超过一般的算法,缺点是有一定的误识别率和删除困难
当一个元素被加入集合时,通过K个散列函数将这个元素映射成一个位数组中的K个点,把它们置为1。检索时,我们只要看看这些点是不是都是1就(大约)知道集合中有没有它了:如果这些点有任何一个0,则被检元素一定不在;如果都是1,则被检元素很可能在。由于存在哈希冲突,会有3%左右的误判:不存在的元素可能被误判为存在,但已存在的元素一定不会被判为不存在。
因此BloomFilter最理想的应用场景是在一些复杂的查询时,在DB上做一层BloomFilter判断,如果BloomFilter判断不存在,则没必要到DB去查了。顶多就是出现误判时,多到DB查询一下,而这个概率是很低的。利用Redis的高性能,并通过pipeline将多条bit操作命令批量提交,可以实现多机共享BloomFilter的bit数据;需要注意Redis单个字符串(位图)的大小上限为512MB。
在Java中直接(基于进程内存)实现存在2个问题:1. 数据量大时容易OOM(内存溢出);2. 数据无法持久化。
整合redis如下
@Component @Scope("prototype") public class BloomFilter<E> { @Autowired private RedisUtil redisUtil; @Value("${bloomfilter.expireDays}") private long expireDays; private String redisKey = "DEFAULT"; /** * 总长度 */ private int sizeOfBloomFilter; /** * 预估过滤数 */ private int expectedNumberOfFilterElements; /** * 哈希次数 */ private int numberOfHashFunctions; private final Charset charset = Charset.forName("UTF-8"); private static final String hashName = "MD5"; private static final MessageDigest digestFunction; // The digest method is reused between instances static { MessageDigest tmp; try { tmp = java.security.MessageDigest.getInstance(hashName); } catch (NoSuchAlgorithmException e) { tmp = null; } digestFunction = tmp; } public BloomFilter() { this(0.0001, 600000); } /** * Constructs an empty Bloom filter. * * @param m is the total length of the Bloom filter. * @param n is the expected number of elements the filter will contain. * @param k is the number of hash functions used. */ public BloomFilter(int m, int n, int k) { this.sizeOfBloomFilter = m; this.expectedNumberOfFilterElements = n; this.numberOfHashFunctions = k; } /** * Constructs an empty Bloom filter with a given false positive probability. * The size of bloom filter and the number of hash functions is estimated * to match the false positive probability. * 给定期望的错误率,过滤量 * * @param falsePositiveProbability is the desired false positive probability. * @param expectedNumberOfElements is the expected number of elements in the Bloom filter. */ public BloomFilter(double falsePositiveProbability, int expectedNumberOfElements) { // m = ceil(kn/ln2) k = ceil(-ln(f)/ln2) this((int) Math.ceil((int) Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) * expectedNumberOfElements / Math.log(2)), expectedNumberOfElements, (int) Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))); } /** * Adds all elements from a Collection to the Bloom filter. * * @param c Collection of elements. 
*/ public void addAll(Collection<? extends E> c) { for (E element : c) { add(element); } } /** * Adds an object to the Bloom filter. The output from the object's * toString() method is used as input to the hash functions. * 添加元素 * * @param element is an element to register in the Bloom filter. */ public void add(E element) { add(element.toString().getBytes(charset)); } /** * Adds an array of bytes to the Bloom filter. * * @param bytes array of bytes to add to the Bloom filter. */ public void add(byte[] bytes) { if (redisUtil.get(redisKey) == null) { redisUtil.setBit(redisKey, 0, false); redisUtil.expire(redisKey, expireDays); } int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { redisUtil.setBit(redisKey, Math.abs(hash % sizeOfBloomFilter), true); } } /** * Returns true if the element could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param element element to check. * @return true if the element could have been inserted into the Bloom filter. */ public boolean contains(E element) { return contains(element.toString().getBytes(charset)); } /** * Returns true if the array of bytes could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param bytes array of bytes to check. * @return true if the array could have been inserted into the Bloom filter. */ public boolean contains(byte[] bytes) { int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { if (!redisUtil.getBit(redisKey, Math.abs(hash % sizeOfBloomFilter))) { return false; } } return true; } /** * Returns true if all the elements of a Collection could have been inserted * into the Bloom filter. Use getFalsePositiveProbability() to calculate the * probability of this being correct. * * @param c elements to check. 
* @return true if all the elements in c could have been inserted into the Bloom filter. */ public boolean containsAll(Collection<? extends E> c) { for (E element : c) { if (!contains(element)) { return false; } } return true; } /** * Generates digests based on the contents of an array of bytes and splits the result into 4-byte int's and store them in an array. The * digest function is called until the required number of int's are produced. For each call to digest a salt * is prepended to the data. The salt is increased by 1 for each call. * * @param data specifies input data. * @param hashes number of hashes/int's to produce. * @return array of int-sized hashes */ public static int[] createHashes(byte[] data, int hashes) { int[] result = new int[hashes]; int k = 0; byte salt = 0; while (k < hashes) { byte[] digest; synchronized (digestFunction) { digestFunction.update(salt); salt++; digest = digestFunction.digest(data); } for (int i = 0; i < digest.length / 4 && k < hashes; i++) { int h = 0; for (int j = (i * 4); j < (i * 4) + 4; j++) { h <<= 8; h |= ((int) digest[j]) & 0xFF; } result[k] = h; k++; } } return result; } public int getSizeOfBloomFilter() { return this.sizeOfBloomFilter; } public int getExpectedNumberOfElements() { return this.expectedNumberOfFilterElements; } public int getNumberOfHashFunctions() { return this.numberOfHashFunctions; } /** * Compares the contents of two instances to see if they are equal. * * @param obj is the object to compare to. * @return True if the contents of the objects are equal. 
*/ @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final BloomFilter<E> other = (BloomFilter<E>) obj; if (this.sizeOfBloomFilter != other.sizeOfBloomFilter) { return false; } if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { return false; } if (this.numberOfHashFunctions != other.numberOfHashFunctions) { return false; } return true; } public String getRedisKey() { return redisKey; } public void setRedisKey(String redisKey) { this.redisKey = redisKey; } /** * Calculates a hash code for this class. * * @return hash code representing the contents of an instance of this class. */ @Override public int hashCode() { int hash = 7; hash = 61 * hash + this.sizeOfBloomFilter; hash = 61 * hash + this.expectedNumberOfFilterElements; hash = 61 * hash + this.numberOfHashFunctions; return hash; } }
使用场景:黑名单,URL重复检查,字典纠错,垃圾邮件,缓存穿透: 将数据库中所有的查询条件,放到布隆过滤器中。当一个查询请求来临的时候,先经过布隆过滤器进行检查,如果请求存在这个条件中,那么继续执行,如果不在,直接丢弃。