1、(topK问题)海量日志数据,提取出某日访问百度次数最多的10个IP。

首先从这一天的日志中筛选出访问百度的记录,把其中的IP逐个写入一个大文件。IP是32位的,最多有2^32个不同的取值。接着采用映射(散列)的方法,例如按IP值模1000,把整个大文件映射为1000个小文件;同一个IP必然落入同一个小文件,因此各个小文件可以独立统计。然后对每个小文件用hash_map进行频率统计,找出其中出现频率最大的10个IP及相应的频率。最后在这1000组候选(至多1000×10个IP)中,再找出频率最大的10个IP,即为所求。


#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <hash_map>
#include <iostream>
#include <string>




// The code below uses names from std (fstream, string, ...) and hash_map
// unqualified, so both namespaces are opened at file scope.
using namespace std;
using namespace __gnu_cxx;


// Boolean constants; not referenced by the code visible in this file.
#define TRUE 1
#define FALSE 0
// Inclusive bounds of the numeric range from which generatelogfile() draws
// its random address values.
#define UP 184549374 /* 10.255.255.254 */
#define LOW 167772161 /* 10.0.0.1 */
// Number of bucket files; an address goes to bucket (value % DIVIDENUM).
#define DIVIDENUM 1000
// Path of the sample log written by generatelogfile() and read by dividelogfile().
#define USERLOGFILE "userlog.dat"


/*
 * Writes the sample log USERLOGFILE: 128 * 1024 batches of 1024 randomly
 * generated IPv4 addresses, one dotted-quad string per line, every line
 * terminated by "\r\n".  The random generator is seeded from the current
 * time, so each run produces different data.
 *
 * Returns 0 on success, -1 if the file cannot be opened or a write fails.
 */
int generatelogfile(void)
{
    /* The file is only written here, so an output-only stream is enough. */
    std::ofstream userlog(USERLOGFILE, std::ios::out | std::ios::trunc);
    if (!userlog)
    {
        std::cerr << "cannot open " << USERLOGFILE << " for writing\n";
        return -1;
    }

    srandom(static_cast<unsigned int>(time(NULL)));

    /* One buffer reused for every batch: clear() keeps the capacity. */
    std::string batch;
    for (unsigned int i = 1; i <= 128 * 1024; i++)
    {
        batch.clear();
        for (unsigned int j = 1; j <= 1024; j++)
        {
            struct in_addr userip;

            /* Draw a value from the inclusive range [LOW, UP]. */
            userip.s_addr = (random() % (UP - LOW + 1)) + LOW;

            /*
             * NOTE(review): the adjustments below operate on the value
             * AFTER htonl(), and the value is converted back with ntohl()
             * before inet_ntoa(), which expects network byte order.  On
             * hosts where htonl() swaps bytes, the masks therefore test
             * different octets than on hosts where it does not, and the
             * printed octet order is reversed relative to the LOW/UP
             * comments.  The arithmetic is kept exactly as it was because
             * the other stages of this program consume these files;
             * confirm the intended byte order before changing it.
             */
            userip.s_addr = htonl(userip.s_addr);

            /* Nudge a low byte of 0x00 or 0xFF to the neighbouring value. */
            if ((userip.s_addr & 0xFF) == 0)
            {
                userip.s_addr += 1;
            }
            if ((userip.s_addr & 0xFF) == 0xFF)
            {
                userip.s_addr -= 1;
            }

            /* A top byte of 0x00 is raised by 0x0A. */
            if ((userip.s_addr & 0xFF000000) == 0)
            {
                userip.s_addr += 0x0A000000;
            }

            /* A top byte of 0xE0 or above is lowered by 0xDF. */
            if ((userip.s_addr & 0xFF000000) >= 0xE0000000)
            {
                userip.s_addr -= 0xDF000000;
            }

            userip.s_addr = ntohl(userip.s_addr);

            /* Append in place instead of building str + a + b temporaries. */
            batch += inet_ntoa(userip);
            batch += "\r\n";
        }

        userlog << batch;
        if (!userlog)
        {
            std::cerr << "write to " << USERLOGFILE << " failed\n";
            return -1;
        }
    }

    /* The stream is flushed and closed by its destructor. */
    return 0;
}


void dividelogfile(void)
{
unsigned int i;
char line[256];
char filename[50];
fstream userlog;
fstream userlogfile[DIVIDENUM];
struct in_addr userip;


userlog.open(USERLOGFILE, ios::in);


for (i = 0; i < DIVIDENUM; i++)
{
memset(filename, 0, sizeof(filename));
sprintf(filename, "./data/%s_%d.dat", "userlog", i);
userlogfile[i].open(filename, ios::out | ios::in | ios::trunc);
}


do
{
memset(line, 0, sizeof(line));
userlog.getline(line, 256);
inet_aton(line, &userip);
//printf("%s\n", line);
userip.s_addr = htonl(userip.s_addr);
//userlogfile[userip.s_addr % DIVIDENUM] << line << endl;
userlogfile[userip.s_addr % DIVIDENUM] << line << "\n";
//printf("0x%X\n", userip.s_addr);
} while (!userlog.eof());


userlog.close();
for (i = 0; i < DIVIDENUM; i++)
{
userlogfile[i].close();
}
}


/* One heap entry: an IPv4 address together with its occurrence count. */
struct __min_heap
{
    struct in_addr userip; /* the address this entry describes */
    unsigned count;        /* ordering key used by the heap routines */
};
typedef struct __min_heap MIN_HEAP;


/*
 * Sifts the entry at index i down within the first n entries of minheap
 * until it is no larger (by count) than both of its children.
 *
 * Precondition: the subtrees below i already satisfy min-heap order, which
 * is how makeMinHeap() and minHeapSort() call it.
 *
 * minheap  array holding the heap
 * i        index of the entry to sift down
 * n        number of entries that belong to the heap
 */
void ajustDown(MIN_HEAP *minheap, int i, int n)
{
    int j = 2 * i + 1; /* left child of i */

    while (j < n)
    {
        /* Pick the smaller of the two children, if a right child exists. */
        if ((j + 1) < n && (minheap[j + 1].count < minheap[j].count))
        {
            j++;
        }

        /*
         * Heap order already holds here, so nothing below can be out of
         * place: stop instead of walking the rest of the path for nothing.
         */
        if (minheap[i].count <= minheap[j].count)
        {
            break;
        }

        const MIN_HEAP tmpnode = minheap[i];
        minheap[i] = minheap[j];
        minheap[j] = tmpnode;

        i = j;
        j = 2 * i + 1;
    }
}


/*
 * Arranges the first len entries of minheap into a min-heap keyed on count
 * by sifting down every entry that has at least one child, starting from
 * the last such entry and moving back to the root.
 */
void makeMinHeap(MIN_HEAP *minheap, int len)
{
    int parent = len / 2 - 1; /* last index that has a child */

    while (parent >= 0)
    {
        ajustDown(minheap, parent, len);
        --parent;
    }
}


/*
 * Heap-sorts the first len entries of minheap in place.  Because the
 * smallest remaining entry is moved to the back on every pass, the result
 * is ordered by descending count: the largest entry ends up at index 0 and
 * the smallest at index len - 1.
 */
void minHeapSort(MIN_HEAP *minheap, int len)
{
    makeMinHeap(minheap, len);

    int last = len - 1; /* end of the still-unsorted prefix */
    while (last >= 0)
    {
        /* Move the current minimum (the root) behind the unsorted prefix. */
        const MIN_HEAP smallest = minheap[0];
        minheap[0] = minheap[last];
        minheap[last] = smallest;

        /* Restore heap order in the shortened prefix [0, last). */
        ajustDown(minheap, 0, last);
        --last;
    }
}


/*
 * Reads the DIVIDENUM bucket files "./data/userlog_<n>.dat", counts how
 * often each IPv4 address occurs, and prints the ten most frequent
 * addresses with their counts (most frequent first) once all buckets have
 * been processed.  For every non-empty bucket the number of distinct
 * addresses is printed as a progress line.
 *
 * Blank or unparsable lines and the address 0.0.0.0 are ignored.  A bucket
 * file that cannot be opened is reported on stderr and skipped.
 */
void parse(void)
{
    const int TOPK = 10;

    /* Candidate list, kept in descending count order; the last slot is the weakest. */
    MIN_HEAP top_k[TOPK];
    memset(top_k, 0, sizeof(top_k));

    __gnu_cxx::hash_map<int, int> hm; /* host-order address -> occurrences */
    std::string line;

    for (int i = 0; i < DIVIDENUM; i++)
    {
        char filename[50];
        snprintf(filename, sizeof(filename), "./data/%s_%d.dat", "userlog", i);

        /* Scoped to one iteration, so only one bucket is open at a time. */
        std::ifstream bucket(filename);
        if (!bucket)
        {
            std::cerr << "cannot open " << filename << " for reading\n";
            continue;
        }

        hm.clear();

        /*
         * Loop on the read itself rather than on eof(): a failed read ends
         * the loop, and the empty read at end of file is never counted.
         */
        while (std::getline(bucket, line))
        {
            /* Drop the '\r' left behind by "\r\n" line endings. */
            if (!line.empty() && line[line.size() - 1] == '\r')
            {
                line.erase(line.size() - 1);
            }

            struct in_addr userip;
            if (inet_aton(line.c_str(), &userip) == 0)
            {
                /* Not an address: do not count the previous one again. */
                continue;
            }

            const unsigned int hostorder = ntohl(userip.s_addr);
            if (hostorder == 0x00)
            {
                continue;
            }
            ++hm[static_cast<int>(hostorder)];
        }

        if (hm.size() > 0)
        {
            printf("hm.size = %lu\n", static_cast<unsigned long>(hm.size()));

            for (__gnu_cxx::hash_map<int, int>::iterator it = hm.begin(); it != hm.end(); ++it)
            {
                const unsigned int occurrences = static_cast<unsigned int>(it->second);
                if (occurrences > top_k[TOPK - 1].count)
                {
                    /*
                     * Replace the weakest candidate and re-sort.  The
                     * address is stored in network byte order because that
                     * is what inet_ntoa() expects when it is printed.
                     */
                    top_k[TOPK - 1].userip.s_addr = htonl(static_cast<unsigned int>(it->first));
                    top_k[TOPK - 1].count = occurrences;
                    minHeapSort(top_k, TOPK);
                }
            }
        }
    }

    /* Report once, after every bucket has contributed its candidates. */
    for (int j = 0; j < TOPK; j++)
    {
        if (top_k[j].count == 0)
        {
            /* Unused slot: fewer than TOPK distinct addresses were seen. */
            continue;
        }
        printf("%s %u\n", inet_ntoa(top_k[j].userip), top_k[j].count);
    }
}




/*
 * Program entry point.  The pipeline has three stages; only the last one
 * is enabled, so the bucket files under ./data must already exist.
 */
int main(int argc, char *argv[])
{
    /* The command line is not used. */
    (void)argc;
    (void)argv;

    /* Stage 1: generate the sample data (disabled). */
    //generatelogfile();

    /* Stage 2: divide the log into bucket files (disabled). */
    //dividelogfile();

    /* Stage 3: count occurrences and report. */
    parse();

    return 0;
}


猜你喜欢

转载自blog.csdn.net/zhangyun75/article/details/80272222
今日推荐