位图
- 位图就是用一个比特位来表示两种状态:0或者1。这样做可以大大减少空间的使用;一个整型数据在32位机中占4个字节(即32个比特位),因此用一个整型数据就可以表示32个“事件”的状态。位图适用于数据规模很大、但每个数据的状态又不多的情况,通常用来判断某个数据存不存在。
代码实现如下:
BitMap.h
#ifndef __BITMAP_H__
#define __BITMAP_H__
#include <stdio.h>
#include <malloc.h>
#include <assert.h>
// A bitmap: a sequence of bits packed into a heap-allocated array of ints.
typedef struct BitMap
{
int* _bmp; // word array holding the bits; allocated by BitMapInit, released by BitMapDestroy
int _capacity; // number of int elements allocated for _bmp
int _size; // number of addressable bits (the bitcount passed to BitMapInit)
}BMap;
// Allocate a zeroed bitmap able to hold `bitcount` bits.
void BitMapInit(BMap* bp, int bitcount);
// Set bit `which` to 1; an index >= the bitmap size is ignored.
void BitMapSet(BMap* bp, int which);
// Clear bit `which` to 0; an index >= the bitmap size is ignored.
void BitMapReSet(BMap* bp, int which);
// Return non-zero when bit `which` is set, 0 when it is clear or out of range.
int BitMapTest(BMap* bp, int which);
// Return the number of addressable bits.
int BitMapSize(BMap* bp);
// Return how many bits are currently set to 1.
int BitMapCount(BMap* bp);
// Free the bit storage and reset all fields to zero/NULL.
void BitMapDestroy(BMap* bp);
#endif
BitMap.c
#include "BitMap.h"
// Initialise `bp` as a zero-filled bitmap of `bitcount` addressable bits.
//
// bp       - bitmap to initialise (must not be NULL).
// bitcount - number of bits the map must hold.
//
// Storage is allocated in 32-bit words (bitcount / 32 + 1 of them) and must be
// released with BitMapDestroy. If `bitcount` is negative or the allocation
// fails, the bitmap is left in a well-defined empty state (_bmp == NULL,
// _capacity == 0, _size == 0) so that later Set/ReSet/Test calls are harmless
// no-ops instead of operating on uninitialised fields.
void BitMapInit(BMap* bp, int bitcount)
{
    int words = 0;
    assert(bp);

    // Start from the empty state so every early return below is safe.
    bp->_bmp = NULL;
    bp->_capacity = 0;
    bp->_size = 0;

    // A negative count could turn into a negative (i.e. huge) calloc request.
    if (bitcount < 0)
        return;

    words = bitcount / 32 + 1;
    bp->_bmp = calloc((size_t)words, sizeof *bp->_bmp);
    if (NULL == bp->_bmp)
    {
        // Out of memory: trap in debug builds, stay empty in release builds.
        assert(0);
        return;
    }
    bp->_capacity = words;
    bp->_size = bitcount;
}
// Set bit number `which` to 1.
//
// bp    - bitmap to modify (must not be NULL).
// which - zero-based bit index; indices outside [0, _size) are ignored.
void BitMapSet(BMap* bp, int which)
{
    unsigned int *words = NULL;
    assert(bp);
    // Reject negative indices too: they would address memory before the array.
    if (which < 0 || which >= bp->_size)
        return;
    // Work on the words as unsigned: `1 << 31` on a signed int is undefined
    // behaviour, whereas `1u << 31` is well defined.
    words = (unsigned int *)bp->_bmp;
    words[which / 32] |= 1u << (which % 32);
}
// Clear bit number `which` to 0.
//
// bp    - bitmap to modify (must not be NULL).
// which - zero-based bit index; indices outside [0, _size) are ignored.
void BitMapReSet(BMap* bp, int which)
{
    unsigned int *words = NULL;
    assert(bp);
    // Reject negative indices too: they would address memory before the array.
    if (which < 0 || which >= bp->_size)
        return;
    // Build the mask in unsigned arithmetic: `1 << 31` on a signed int is
    // undefined behaviour.
    words = (unsigned int *)bp->_bmp;
    words[which / 32] &= ~(1u << (which % 32));
}
// Report whether bit number `which` is set.
//
// bp    - bitmap to query (must not be NULL).
// which - zero-based bit index.
//
// Returns 1 when the bit is set and 0 when it is clear. An index outside
// [0, _size) prints a diagnostic to stdout and also yields 0.
int BitMapTest(BMap* bp, int which)
{
    const unsigned int *words = NULL;
    assert(bp);
    // Negative indices are out of range as well; without this check they
    // would read memory before the array.
    if (which < 0 || which >= bp->_size)
    {
        printf("which超过表示范围!\n");
        return 0;
    }
    // Shift the word down instead of shifting a signed 1 up (undefined for
    // bit 31), and normalise the answer to exactly 0 or 1.
    words = (const unsigned int *)bp->_bmp;
    return (int)((words[which / 32] >> (which % 32)) & 1u);
}
// Number of addressable bits in the bitmap (the value stored in _size).
int BitMapSize(BMap* bp)
{
    int bits = 0;
    assert(bp);
    bits = bp->_size;
    return bits;
}
// Count how many bits in the bitmap are set to 1.
//
// bp - bitmap to inspect (must not be NULL).
//
// Every allocated word is scanned four bits at a time using a 16-entry
// lookup table that gives the population count of each nibble value.
int BitMapCount(BMap* bp)
{
    static const unsigned char nibbleBits[16] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
    };
    int count = 0;
    int i = 0;
    assert(bp);
    for (i = 0; i < bp->_capacity; i++)
    {
        // Use an unsigned copy: right-shifting a negative signed value is
        // implementation-defined, an unsigned shift always shifts in zeros.
        unsigned int word = (unsigned int)bp->_bmp[i];
        while (word != 0)
        {
            count += nibbleBits[word & 0x0fu];
            word >>= 4;
        }
    }
    return count;
}
// Release the bit storage and return the bitmap to an empty state
// (NULL storage pointer, zero capacity, zero size).
void BitMapDestroy(BMap* bp)
{
    assert(bp);
    free(bp->_bmp);
    bp->_size = 0;
    bp->_capacity = 0;
    bp->_bmp = NULL;
}
test.c
#include "BitMap.h"
#include "BloomFilter.h"
// Smoke test for the bitmap: create a 28-bit map, set bits 3, 4 and 1,
// clear bit 1 again, then print the state of bits 0 and 1 and the number
// of set bits before destroying the map.
void TestBitMap()
{
    static const int toSet[] = { 3, 4, 1 };
    const int setCount = (int)(sizeof(toSet) / sizeof(toSet[0]));
    BMap map;
    int k = 0;
    int bit0 = 0;
    int bit1 = 0;
    int ones = 0;

    BitMapInit(&map, 28);
    for (k = 0; k < setCount; k++)
    {
        BitMapSet(&map, toSet[k]);
    }
    BitMapReSet(&map, 1);

    bit0 = BitMapTest(&map, 0);
    printf("第零位为:%d\n", bit0);
    bit1 = BitMapTest(&map, 1);
    printf("第一位为:%d\n", bit1);
    ones = BitMapCount(&map);
    printf("位图中共有1:%d个\n", ones);

    BitMapDestroy(&map);
}
// Program entry point: runs the bitmap smoke test and exits successfully.
int main()
{
    TestBitMap();
    return 0;
}
布隆过滤器
- 布隆过滤器用于在海量数据中判断某个元素是否存在,例如:一个单词是否在已知的词典当中;一个嫌疑人的名字是否已经在嫌疑人名单当中。这些数据的特点是占用空间大,如果直接用哈希表存储,不好处理,这样我们就很容易想到用上面的位图来解决这些问题。但是,数据该怎么放呢?我们可以采用哈希函数,将数据映射到对应的比特位,查找的时候用同样的方法。但是,这又存在一个问题,那就是哈希冲突:不同的数据可能被映射到同一个比特位。降低哈希冲突影响的办法是使用多个哈希函数来映射:只要其中有一个哈希函数对应的比特位为0,那么该元素就肯定不在集合中;如果所有哈希函数对应的比特位都为1,该元素仍有可能不在集合中(即误判),但这种概率相对较低。
代码实现如下:
BloomFilter.h
#pragma once
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include "BitMap.h"
#include "comm.h"
// Element type stored in the Bloom filter: a C string.
typedef char* DataType;
// Pointer to a hash function that maps an element to an unsigned value.
typedef unsigned(*PSTI)(DataType str);
// Bloom filter state: a bitmap plus the hash functions used to index it.
typedef struct BloomFilter
{
BMap _bp; // bitmap holding the filter's bits
int _size; // number of insert calls performed so far
PSTI _STI[5]; // the five hash functions applied to every element
}BFilter;
// Initialise the filter: the bitmap gets capacity * 5 bits and the first
// `size` hash functions from pSTI are copied into the filter.
void BloomFilterInit(BFilter* bf, int capacity, PSTI* pSTI, int size);
// Insert `data`: set the bit selected by each of the five hash functions.
void BloomFilterInseret(BFilter* bf, DataType data);
// Return 0 when `data` is definitely absent, 1 when all of its bits are set
// (present, or a false positive).
int BloomFilterIsIn(BFilter* bf, DataType data);
// Return the number of insert calls performed so far.
int BloomFilterSize(BFilter* bf);
// Release the underlying bitmap and reset the insert counter.
void BloomFilterDestroy(BFilter* bf);
BloomFilter.c
#include "BloomFilter.h"
#include "BitMap.h"
#include "comm.h"
// Initialise a Bloom filter.
//
// bf       - filter to initialise (must not be NULL).
// capacity - expected number of elements; the bitmap gets capacity * 5 bits.
// pSTI     - array of hash functions to use (must not be NULL).
// size     - number of entries in pSTI (must be at least 1).
//
// The filter always evaluates every slot of its _STI table on insert and
// lookup, so every slot is given a valid function here: at most as many
// entries as the table holds are taken from pSTI, and if fewer are supplied
// they are reused cyclically. Repeating a hash function only sets/tests the
// same bit again, so the filter stays correct with fewer distinct hashes.
void BloomFilterInit(BFilter* bf, int capacity, PSTI* pSTI, int size)
{
    enum { BITS_PER_ELEMENT = 5 };
    const int slots = (int)(sizeof(bf->_STI) / sizeof(bf->_STI[0]));
    int i = 0;
    assert(bf);
    assert(pSTI);
    assert(size > 0);

    BitMapInit(&bf->_bp, capacity * BITS_PER_ELEMENT);
    bf->_size = 0;

    // Never write past the end of the _STI table.
    if (size > slots)
        size = slots;

    for (i = 0; i < slots; i++)
    {
        // Without any usable hash function (release build, asserts off) leave
        // the slot NULL rather than uninitialised.
        bf->_STI[i] = (pSTI != NULL && size > 0) ? pSTI[i % size] : NULL;
    }
}
// Insert an element into the Bloom filter.
//
// bf   - filter to update (must not be NULL).
// data - element to insert.
//
// Each hash function in the filter's table is applied to `data`; the result,
// reduced modulo the bitmap size, selects one bit to set. The insert counter
// is then incremented. If the bitmap holds no bits (for example because its
// allocation failed) the call does nothing.
void BloomFilterInseret(BFilter* bf, DataType data)
{
    const int slots = (int)(sizeof(bf->_STI) / sizeof(bf->_STI[0]));
    unsigned bits = 0;
    int i = 0;
    assert(bf);

    // An empty bitmap would make the modulo below a division by zero.
    if (bf->_bp._size <= 0)
        return;
    bits = (unsigned)bf->_bp._size;

    for (i = 0; i < slots; i++)
    {
        unsigned hashaddr = bf->_STI[i](data) % bits;
        BitMapSet(&bf->_bp, (int)hashaddr);
    }
    bf->_size++;
}
// Test whether an element may be present in the Bloom filter.
//
// bf   - filter to query (must not be NULL).
// data - element to look up.
//
// Returns 0 as soon as one of the element's bits is found clear (the element
// was definitely never inserted) and 1 when all of its bits are set (the
// element was inserted, or this is a false positive). A filter whose bitmap
// holds no bits contains nothing, so 0 is returned.
int BloomFilterIsIn(BFilter* bf, DataType data)
{
    const int slots = (int)(sizeof(bf->_STI) / sizeof(bf->_STI[0]));
    unsigned bits = 0;
    int i = 0;
    assert(bf);

    // An empty bitmap would make the modulo below a division by zero.
    if (bf->_bp._size <= 0)
        return 0;
    bits = (unsigned)bf->_bp._size;

    for (i = 0; i < slots; i++)
    {
        unsigned hashaddr = bf->_STI[i](data) % bits;
        if (!BitMapTest(&bf->_bp, (int)hashaddr))
            return 0;
    }
    return 1;
}
// Number of insert calls recorded by the filter (the value stored in _size).
int BloomFilterSize(BFilter* bf)
{
    int inserted = 0;
    assert(bf);
    inserted = bf->_size;
    return inserted;
}
// Tear down the filter: reset the insert counter and release the bitmap.
void BloomFilterDestroy(BFilter* bf)
{
    assert(bf);
    bf->_size = 0;
    BitMapDestroy(&bf->_bp);
}
comm.h
#pragma once
// String hash: multiply-by-131 accumulation, result masked to 31 bits.
unsigned StrToInt1(const char * str);
// String hash: shift-and-subtract accumulation, result masked to 31 bits.
unsigned int StrToInt2(char *str);
// String hash: accumulation with a multiplier that changes per character, result masked to 31 bits.
unsigned int StrToInt3(char *str);
// String hash: shift/XOR mixing from a fixed non-zero seed, result masked to 31 bits.
unsigned int StrToInt4(char *str);
// String hash: 4-bit shift accumulation that folds the top nibble back in, result masked to 31 bits.
unsigned int StrToInt5(char *str);
// Return the integer converted to unsigned.
unsigned IntToInt(int data);
comm.c
#include "comm.h"
// Hash a NUL-terminated string by repeatedly multiplying the running value
// by 131 and adding the next character (this matches the commonly published
// BKDR string hash). The result is masked to its low 31 bits.
unsigned StrToInt1(const char * str)
{
    const unsigned int multiplier = 131;
    unsigned int acc = 0;
    const char *cursor = str;

    for (; *cursor != '\0'; ++cursor)
    {
        acc = acc * multiplier + *cursor;
    }
    return acc & 0x7FFFFFFF;
}
// Identity "hash" for integers: the value converted to unsigned.
unsigned IntToInt(int data)
{
    const unsigned converted = (unsigned)data;
    return converted;
}
// Hash a NUL-terminated string with the update
//   h = c + (h << 6) + (h << 16) - h
// per character (this matches the commonly published SDBM string hash).
// The result is masked to its low 31 bits.
unsigned int StrToInt2(char *str)
{
    unsigned int acc = 0;
    char ch = '\0';

    while ((ch = *str++) != '\0')
    {
        acc = ch + (acc << 6) + (acc << 16) - acc;
    }
    return acc & 0x7FFFFFFF;
}
// Hash a NUL-terminated string: each character is added after multiplying
// the running value by a factor that starts at 63689 and is itself
// multiplied by 378551 after every character (this matches the commonly
// published RS string hash). The result is masked to its low 31 bits.
unsigned int StrToInt3(char *str)
{
    const unsigned int step = 378551;
    unsigned int factor = 63689;
    unsigned int acc = 0;

    for (; *str != '\0'; ++str)
    {
        acc = acc * factor + *str;
        factor *= step;
    }
    return acc & 0x7FFFFFFF;
}
// Hash a NUL-terminated string starting from the seed 1315423911 and, for
// each character c, XOR-ing in (h << 5) + c + (h >> 2) (this matches the
// commonly published JS string hash). The result is masked to its low 31 bits.
unsigned int StrToInt4(char *str)
{
    unsigned int acc = 1315423911;

    for (; *str != '\0'; ++str)
    {
        const unsigned int mix = (acc << 5) + *str + (acc >> 2);
        acc ^= mix;
    }
    return acc & 0x7FFFFFFF;
}
// Hash a NUL-terminated string: shift the running value left by four bits,
// add the next character, and whenever the top nibble becomes non-zero fold
// it back into the low bits and clear it (an ELF/PJW-style string hash).
// The result is masked to its low 31 bits.
unsigned int StrToInt5(char *str)
{
    unsigned int acc = 0;

    while (*str)
    {
        unsigned int top = 0;

        acc = (acc << 4) + *str;
        ++str;

        top = acc & 0xF0000000L;
        if (top == 0)
        {
            continue;
        }
        acc ^= (top >> 24);
        acc &= ~top;
    }
    return acc & 0x7FFFFFFF;
}