大数据去重——位图

100亿整型数据去重?


32位整型最多只有2^32(约42.9亿)个不同的取值,所以100亿个整型数据中一定存在重复。用位图为每个可能的取值分配1个bit,共需要2^32 bit = 2^29 字节 = 512MB内存。

下面是去重算法:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Largest value the bitmap can represent; the bitmap needs (MAX >> 3) + 1 bytes. */
#define MAX (0xFFFFFFFFu)

/*
 * Mark the value `num` as present in the bitmap `buf`.
 *
 * Bit (num % 8) of byte (num / 8) is switched on; every other bit is left
 * untouched. The caller must supply a buffer of at least (num / 8) + 1 bytes.
 */
void setBuf(char *buf, unsigned int num)
{
    const unsigned int byteIndex = num / 8u;
    const int bitMask = 1 << (num % 8u);

    buf[byteIndex] = (char)(buf[byteIndex] | bitMask);
}
/*
 * Test whether the value `num` is marked in the bitmap `buf`.
 *
 * Returns 1 if bit (num % 8) of byte (num / 8) is set, 0 otherwise.
 * The caller must supply a buffer of at least (num / 8) + 1 bytes.
 *
 * `num` is unsigned: with a signed parameter, values >= 2^31 arrive as
 * negative numbers and `num >> 3` yields a negative offset, reading memory
 * in front of the buffer.
 */
unsigned int getBuf(char *buf, unsigned int num)
{
    /* Read the byte as unsigned so the top bit is not sign-extended. */
    const unsigned char byte = (unsigned char)buf[num >> 0x3];

    return (byte >> (num & 0x7)) & 0x1u;
}

/*
 * Parse one command-line argument as a decimal value in [0, MAX].
 *
 * Returns 1 and stores the value in *out on success. Returns 0 if the text
 * is empty, negative, not fully numeric, or out of range.
 */
static int parseValue(const char *text, unsigned int *out)
{
    char *end = NULL;
    unsigned long value;

    while (*text == ' ' || *text == '\t')
    {
        ++text;
    }
    /* strtoul silently negates "-N" into a huge value, so reject it here. */
    if (*text == '\0' || *text == '-')
    {
        return 0;
    }
    errno = 0;
    value = strtoul(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || value > MAX)
    {
        return 0;
    }
    *out = (unsigned int)value;
    return 1;
}

/*
 * Deduplicate the unsigned integers given on the command line with a bitmap.
 *
 * Every valid argument sets one bit in a (MAX / 8) + 1 byte bitmap; the
 * bitmap is then scanned from 0 up to the largest value seen and each set
 * bit is printed once, in ascending order. Invalid arguments are reported
 * on stderr and skipped.
 *
 * Returns 0 on success (or when only the usage text is printed), 1 if the
 * bitmap cannot be allocated or no argument was a valid number.
 */
int main(int argc,char **argv)
{
    if(argc < 2)
    {
        printf("usage:./a {0-9}*\n");
        return 0;
    }
    unsigned int index;
    unsigned int num;
    unsigned int max = 0;
    unsigned int accepted = 0;
    int arg;
    char *buf = calloc((size_t)(MAX>>0x3)+1,sizeof(char));
    if(buf == NULL)
    {
        fprintf(stderr,"error: cannot allocate bitmap\n");
        return 1;
    }
    for(arg = 1; arg < argc; ++arg)
    {
        if(!parseValue(argv[arg],&num))
        {
            fprintf(stderr,"warning: ignoring invalid value '%s'\n",argv[arg]);
            continue;
        }
        max = max>num? max:num;
        setBuf(buf,num);
        ++accepted;
    }
    if(accepted == 0)
    {
        free(buf);
        return 1;
    }
    /* Stop explicitly at max: "index <= max" never ends when max == MAX. */
    for(index = 0; ; index++)
    {
        if(getBuf(buf,index) == 1)
        {
           /* Go through unsigned char so a byte like 0x80 is not sign-extended. */
           printf("id:%-10u flag:0x%-16x value:%-10u state:%-2u\n",
                   index>>0x3,
                   (unsigned int)(unsigned char)buf[index>>0x3],
                   index,
                   getBuf(buf,index));
        }
        /* max == 0 means the scan is a single step; avoid dividing by zero. */
        printf("process[%u]:%.2f%%\r",index,
               max == 0 ? 100.0 : (double)index/max*100);
        if(index == max)
        {
            break;
        }
    }
    printf("\n");
    free(buf);
    return 0;
}

测试结果(flag 列中出现的 0xffffff80 是字节 0x80 在 char 为有符号类型的平台上被符号扩展后打印出来的结果,对应的实际字节值为 0x80):

[root@centos code]# ./a.out 100 100 45 4 53 4 23 23 23 24 35 454 6 4 6543  3242 2 324 54 6 23 23 2 32 4 354 654 65 6 1000
id:0          flag:0x54               value:2          state:1 
id:0          flag:0x54               value:4          state:1 
id:0          flag:0x54               value:6          state:1 
id:2          flag:0xffffff80         value:23         state:1 
id:3          flag:0x1                value:24         state:1 
id:4          flag:0x9                value:32         state:1 
id:4          flag:0x9                value:35         state:1 
id:5          flag:0x20               value:45         state:1 
id:6          flag:0x60               value:53         state:1 
id:6          flag:0x60               value:54         state:1 
id:8          flag:0x2                value:65         state:1 
id:12         flag:0x10               value:100        state:1 
id:40         flag:0x10               value:324        state:1 
id:44         flag:0x4                value:354        state:1 
id:56         flag:0x40               value:454        state:1 
id:81         flag:0x40               value:654        state:1 
id:125        flag:0x1                value:1000       state:1 
id:405        flag:0x4                value:3242       state:1 
id:817        flag:0xffffff80         value:6543       state:1 
process[6543]:100.00%


猜你喜欢

转载自blog.csdn.net/weixin_38679007/article/details/80623262