100亿整型数据去重?
32位整型最多只有2^32(约42.9亿)个不同的取值,所以100亿个整型数据中一定存在重复。用位图(bitmap)表示时,每个取值只占1 bit,总共需要2^32 bit = 2^29字节 = 512MB内存。
下面是去重算法:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Largest value the bitmap can represent (every 32-bit unsigned integer). */
#define MAX (0xffffffff)

/*
 * Mark `num` as present in the bitmap.
 *
 * buf: bitmap holding at least (num >> 3) + 1 bytes; bit (num & 7) of byte
 *      (num >> 3) stands for the value `num`.
 * num: value to record. Recording the same value twice leaves the bitmap
 *      unchanged, which is what removes duplicates.
 */
void setBuf(char *buf, unsigned int num)
{
    /* Use unsigned bytes so setting bit 7 never goes through a signed char. */
    unsigned char *bytes = (unsigned char *)buf;

    bytes[num >> 0x3] |= (unsigned char)(0x1u << (num & 0x7));
}

/*
 * Report whether `num` is marked in the bitmap.
 *
 * `num` is unsigned: with a signed parameter, values above INT_MAX turned
 * negative and `num >> 3` indexed memory before the start of the buffer.
 *
 * Returns 1 when the bit for `num` is set, 0 otherwise.
 */
unsigned int getBuf(char *buf, unsigned int num)
{
    const unsigned char *bytes = (const unsigned char *)buf;

    return (bytes[num >> 0x3] >> (num & 0x7)) & 0x1u;
}

/*
 * Parse `text` as a decimal integer in the range [0, MAX].
 *
 * Returns 0 and stores the value in *out on success. Returns -1 (leaving
 * *out untouched) when `text` does not start with a digit, has trailing
 * characters, or is larger than MAX.
 */
static int parseValue(const char *text, unsigned int *out)
{
    char *end = NULL;
    unsigned long parsed;

    /* strtoul accepts a leading sign and negates the result; reject it. */
    if (text[0] < '0' || text[0] > '9') {
        return -1;
    }

    errno = 0;
    parsed = strtoul(text, &end, 10);
    if (errno == ERANGE || *end != '\0' || parsed > MAX) {
        return -1;
    }

    *out = (unsigned int)parsed;
    return 0;
}

/*
 * Define BITMAP_DEDUP_NO_MAIN to compile only the helpers above, e.g. when
 * embedding them in another program or a test harness.
 */
#ifndef BITMAP_DEDUP_NO_MAIN
/*
 * Read unsigned integers from the command line, record each one in a
 * 512 MiB bitmap, then print every distinct value once in ascending order
 * together with the index and contents of the bitmap byte that holds it.
 *
 * Arguments that are not valid 32-bit unsigned decimals are reported on
 * stderr and skipped.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        printf("usage:./a {0-9}*\n");
        return 0;
    }

    /* One bit per possible 32-bit value: 2^32 bits = 2^29 bytes. */
    size_t bitmap_bytes = (size_t)(MAX >> 0x3) + 1;
    char *buf = calloc(bitmap_bytes, sizeof *buf);
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes for the bitmap\n",
                bitmap_bytes);
        return EXIT_FAILURE;
    }

    unsigned int max = 0;
    int seen_any = 0;
    for (int arg = 1; arg < argc; ++arg) {
        unsigned int num;

        if (parseValue(argv[arg], &num) != 0) {
            fprintf(stderr, "skipping invalid value: %s\n", argv[arg]);
            continue;
        }
        if (num > max) {
            max = num;
        }
        setBuf(buf, num);
        seen_any = 1;
    }

    if (seen_any) {
        /*
         * The loop exits with an explicit test at the bottom: a condition
         * of `index <= max` can never become false when max == MAX because
         * the unsigned counter wraps around to 0.
         */
        for (unsigned int index = 0;; index++) {
            unsigned int state = getBuf(buf, index);

            if (state == 1) {
                printf("id:%-10u flag:0x%-16x value:%-10u state:%-2d\n",
                       index >> 0x3,
                       (unsigned int)(unsigned char)buf[index >> 0x3],
                       index,
                       (int)state);
            }

            /* With max == 0 the only index is already the last one. */
            float percent = (max == 0) ? 100.0f : (float)index / max * 100;
            printf("process[%u]:%.2f%%\r", index, percent);

            if (index == max) {
                break;
            }
        }
    }

    printf("\n");
    free(buf);
    return 0;
}
#endif /* BITMAP_DEDUP_NO_MAIN */
测试结果:
[root@centos code]# ./a.out 100 100 45 4 53 4 23 23 23 24 35 454 6 4 6543 3242 2 324 54 6 23 23 2 32 4 354 654 65 6 1000 id:0 flag:0x54 value:2 state:1 id:0 flag:0x54 value:4 state:1 id:0 flag:0x54 value:6 state:1 id:2 flag:0xffffff80 value:23 state:1 id:3 flag:0x1 value:24 state:1 id:4 flag:0x9 value:32 state:1 id:4 flag:0x9 value:35 state:1 id:5 flag:0x20 value:45 state:1 id:6 flag:0x60 value:53 state:1 id:6 flag:0x60 value:54 state:1 id:8 flag:0x2 value:65 state:1 id:12 flag:0x10 value:100 state:1 id:40 flag:0x10 value:324 state:1 id:44 flag:0x4 value:354 state:1 id:56 flag:0x40 value:454 state:1 id:81 flag:0x40 value:654 state:1 id:125 flag:0x1 value:1000 state:1 id:405 flag:0x4 value:3242 state:1 id:817 flag:0xffffff80 value:6543 state:1 process[6543]:100.00%