Reprinted from: Hash-based divide and conquer - finding the 10 most frequently occurring IPs in massive data
The code is as follows:
// Find the 10 most frequently occurring IPs in a massive IP log.
//
// Approach: divide and conquer.
//   - The scenario in the article is a ~10 GB IP log (about one billion
//     entries) with a 64 MB memory limit.
//   - Every IP is hashed (ip % 500) into one of 500 small files, so identical
//     IPs always land in the same file; an average small file is ~20 MB and
//     therefore fits within the memory limit.
//   - For each small file the occurrences of every IP are counted with a
//     binary search tree; a traversal returns the IP that occurs most often.
//   - The full IPv4 space has 2^32 addresses, so guaranteeing duplicates would
//     need a ~64 GB file. To get duplicates out of a much smaller file the
//     generated addresses are restricted to 0.0.0.0 - 100.100.100.100.
//   - The article notes that fopen() can keep only about 507 files open at
//     once, which is why 500 buckets are used.
//
// Postscript from the article: once the main log has been split into 500
// small files (divide), those files could be shipped to other machines,
// processed there (conquer) and the results returned to the main program,
// which is similar to how Google's MapReduce works.
//
// NOTE: the working paths and the system() commands below are Windows
// specific (C:\ paths, rmdir /s /q, attrib). The code compiles elsewhere but
// is only expected to run correctly on Windows.
//
// NOTE: each bucket contributes only its single most frequent IP, so two IPs
// that both belong in the global top 10 but hash to the same bucket cannot
// both be reported.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef _WIN32
#include <io.h> // _access()
#define dir_exists(p) (_access((p), 0) == 0)
#else
#include <unistd.h> // access()
#define dir_exists(p) (access((p), F_OK) == 0)
#endif

enum {
    BUCKETS = 500, // number of small hash files
    TOP = 10       // how many IPs are reported
};

const int num = 1000000; // number of IPs to generate; edit manually

unsigned int iton(const char *ip);   // dotted-quad string -> integer
void ntoi(unsigned int n, char *ip); // integer -> dotted-quad string
int fileexist(const char *path);     // 1 if the file can be opened
void fclose_all(FILE **t);           // close every open bucket file
int random_write(const char *path);  // write `num` random IPs to path
// find the most frequent IP stored in hashfile
void count(const char *hashfile, unsigned int *data, unsigned int *n);
void sort(unsigned int *max, unsigned int *ip, int n); // descending sort

// Bucket index for an IP: always in the range [0, BUCKETS).
static inline unsigned int hash(unsigned int ip)
{
    return ip % BUCKETS;
}

// Binary search tree node keyed by IP.
typedef struct node {
    unsigned int ip;    // the IP value
    unsigned int n;     // number of occurrences
    struct node *left;
    struct node *right;
} node;

int main(void)
{
    const char *path = "C:\\ip_data.dat";
    FILE *in = NULL;
    FILE *result = NULL;
    FILE *bucket[BUCKETS];
    char hashfile[50];
    char buf[20];
    unsigned int add, data, n;
    unsigned int ip[TOP], max[TOP]; // the current top entries and their counts
    clock_t t1, t2, s, e;           // timing
    long now;                       // number of IPs hashed so far
    int i, j;

    printf("Generating data %s\n\n", path);
    if (!random_write(path)) {
        perror(path);
        return EXIT_FAILURE;
    }

    // Start from an empty, hidden working directory.
    if (dir_exists("C:\\hashfile"))
        system("rmdir /s /q C:\\hashfile");
    system("mkdir C:\\hashfile");
    system("attrib +h C:\\hashfile");

    in = fopen(path, "rt");
    if (in == NULL) {
        perror(path);
        return EXIT_FAILURE;
    }
    for (i = 0; i < BUCKETS; i++)
        bucket[i] = NULL;

    // Divide: distribute every IP into its bucket file.
    printf("\rHashing %s\n\n", path);
    e = s = t1 = clock();
    now = 0;
    while (fscanf(in, "%19s", buf) == 1) { // width keeps buf from overflowing
        data = iton(buf);
        add = hash(data);
        if (bucket[add] == NULL) {
            // Bucket files are opened lazily and kept open until the end.
            snprintf(hashfile, sizeof hashfile,
                     "C:\\hashfile\\hash_%u.~tmp", add);
            bucket[add] = fopen(hashfile, "a");
            if (bucket[add] == NULL) {
                perror(hashfile);
                fclose(in);
                fclose_all(bucket);
                return EXIT_FAILURE;
            }
        }
        fprintf(bucket[add], "%u\n", data);
        now++;
        e = clock();
        if (e - s > CLOCKS_PER_SEC) { // report progress about once a second
            printf("\rProgress %0.2f %%\t", (now * 100.0) / num);
            s = e;
        }
    }
    fclose(in);
    fclose_all(bucket);
    remove(path);

    // Conquer: take the most frequent IP of every bucket and keep the TOP
    // largest counts. This part could be spread over several machines.
    for (i = 0; i < TOP; i++) {
        max[i] = 0;
        ip[i] = 0;
    }
    for (i = 0; i < BUCKETS; i++) {
        unsigned int min;
        int pos;

        snprintf(hashfile, sizeof hashfile, "C:\\hashfile\\hash_%d.~tmp", i);
        if (!fileexist(hashfile))
            continue;
        printf("\rProcessing hash_%d.~tmp\t", i);
        count(hashfile, &data, &n);

        // Only TOP slots exist, so a selection-sort style scan for the
        // smallest slot is enough; a heap would suit a larger TOP.
        min = max[0];
        pos = 0;
        for (j = 1; j < TOP; j++) {
            if (max[j] < min) {
                min = max[j];
                pos = j;
            }
        }
        if (n > min) {
            max[pos] = n;
            ip[pos] = data;
        }
    }
    t2 = clock();
    sort(max, ip, TOP);

    // The result is printed and also recorded in C:\ip_result.txt.
    result = fopen("C:\\ip_result.txt", "wt");
    if (result == NULL)
        perror("C:\\ip_result.txt");
    else {
        fprintf(result, "\nTop 10 most visited IPs:\n\n");
        fprintf(result, "%-20s%s\n", "IP", "visits");
    }
    printf("\n\nTop 10 most visited IPs:\n\n");
    printf("%-20s%s\n", "IP", "visits");
    for (i = 0; i < TOP; i++) {
        if (max[i] == 0)
            continue; // unused slot: fewer than TOP buckets held data
        ntoi(ip[i], buf);
        printf("%-20s%u\n", buf, max[i]);
        if (result != NULL)
            fprintf(result, "%-20s%u\n", buf, max[i]);
    }
    // Elapsed time covers hashing and counting, not data generation.
    printf("\n--- took %0.3f seconds\n\n",
           (double)(t2 - t1) / CLOCKS_PER_SEC);
    if (result != NULL) {
        fprintf(result, "\n--- took %0.3f seconds\n",
                (double)(t2 - t1) / CLOCKS_PER_SEC);
        fclose(result);
    }

    system("rmdir /s /q C:\\hashfile");
    return 0;
}

// Close every open file in an array of BUCKETS handles and reset the slots.
void fclose_all(FILE **t)
{
    int i;

    for (i = 0; i < BUCKETS; i++) {
        if (t[i]) {
            fclose(t[i]);
            t[i] = NULL;
        }
    }
}

// Write `num` random dotted-quad IPs, one per line, to `path`.
// Every component is in 0..100 (not 0..255) so that duplicates are likely.
// Returns 1 on success and 0 if the file could not be opened or flushed.
int random_write(const char *path)
{
    FILE *out = NULL;
    clock_t s, e;
    int i, j;
    int part[4];

    out = fopen(path, "wt");
    if (out == NULL)
        return 0;
    srand((unsigned int)time(NULL));
    e = s = clock();
    for (i = 0; i < num; i++) {
        e = clock();
        if (e - s > CLOCKS_PER_SEC) { // report progress about once a second
            printf("\rProgress %0.2f %%\t", (i * 100.0) / num);
            s = e;
        }
        // Draw the components in separate statements: the evaluation order
        // of function arguments is unspecified.
        for (j = 0; j < 4; j++)
            part[j] = rand() % 101;
        fprintf(out, "%d.%d.%d.%d\n", part[0], part[1], part[2], part[3]);
    }
    // fclose flushes buffered data; a failure here means the data is incomplete.
    return fclose(out) == 0;
}

// Insert `ip` into the binary search tree rooted at *tree, or increment its
// counter if it is already present. Exits the program if memory runs out.
void insert(node **tree, unsigned int ip)
{
    // Walk down iteratively so that tree depth cannot exhaust the stack.
    while (*tree != NULL) {
        if ((*tree)->ip == ip) {
            (*tree)->n++;
            return;
        }
        tree = (ip < (*tree)->ip) ? &(*tree)->left : &(*tree)->right;
    }
    *tree = malloc(sizeof **tree);
    if (*tree == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(EXIT_FAILURE);
    }
    (*tree)->ip = ip;
    (*tree)->n = 1;
    (*tree)->left = (*tree)->right = NULL;
}

unsigned int maxn; // input/output of max_n: largest count seen so far
node *max_node;    // output of max_n: node holding that count

// Traverse the tree and record the node with the largest count in
// max_node / maxn. The caller resets both before the first call.
void max_n(node *tree)
{
    if (tree) {
        if (tree->n > maxn) {
            maxn = tree->n;
            max_node = tree;
        }
        max_n(tree->left);
        max_n(tree->right);
    }
}

// Free every node of the tree (the name keeps the original spelling).
void destory(node *tree)
{
    if (tree) {
        destory(tree->left);
        destory(tree->right);
        free(tree);
    }
}

// Find the IP that occurs most often in `hashfile`, a text file holding one
// unsigned decimal IP per line. On return *data is that IP and *n its count;
// both are 0 if the file cannot be opened or holds no entries.
void count(const char *hashfile, unsigned int *data, unsigned int *n)
{
    FILE *in = NULL;
    node *tree = NULL;
    unsigned int ip;

    *data = 0;
    *n = 0;
    in = fopen(hashfile, "rt");
    if (in == NULL)
        return;
    while (fscanf(in, "%u", &ip) == 1)
        insert(&tree, ip);
    fclose(in);

    maxn = 0;
    max_node = NULL; // stays NULL when the tree is empty
    max_n(tree);
    if (max_node != NULL) {
        *n = max_node->n;
        *data = max_node->ip;
    }
    destory(tree);
}

// Insertion sort of max[0..n) into descending order; ip[] is permuted in
// parallel so that ip[i] stays paired with max[i].
void sort(unsigned int *max, unsigned int *ip, int n)
{
    int i, j;
    unsigned int tmpm, tmpi;

    for (i = 1; i < n; i++) {
        tmpm = max[i];
        tmpi = ip[i];
        for (j = i; j > 0 && max[j - 1] < tmpm; j--) {
            max[j] = max[j - 1];
            ip[j] = ip[j - 1];
        }
        max[j] = tmpm;
        ip[j] = tmpi;
    }
}

// Convert a dotted-quad string ("a.b.c.d") into an integer with `a` in the
// most significant byte. Returns 0 if the string does not contain four
// numeric components; each component is masked to 8 bits.
unsigned int iton(const char *ip)
{
    unsigned int a = 0, b = 0, c = 0, d = 0;

    if (sscanf(ip, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
        return 0;
    return ((a & 0xFFu) << 24) | ((b & 0xFFu) << 16) |
           ((c & 0xFFu) << 8) | (d & 0xFFu);
}

// Convert an integer back into a dotted-quad string, most significant byte
// first. `ip` must have room for at least 16 characters.
void ntoi(unsigned int n, char *ip)
{
    sprintf(ip, "%u.%u.%u.%u",
            (n >> 24) & 0xFFu, (n >> 16) & 0xFFu,
            (n >> 8) & 0xFFu, n & 0xFFu);
}

// Return 1 if `path` can be opened for reading, 0 otherwise.
int fileexist(const char *path)
{
    FILE *fp = fopen(path, "rt");

    if (fp) {
        fclose(fp);
        return 1;
    }
    return 0;
}