Hash Divide and Conquer

Reprinted from: Hash divide and conquer - finding the 10 most frequent IPs in massive data

The code is as follows:

// Find the 10 most frequently occurring IPs in massive data.
// Algorithm idea: divide and conquer.
// The IP log file is 10 GB (about one billion IPs); the memory limit is 64 MB.
// Each IP in the log file is taken modulo 500 and hashed into one of 500 small files.
// Identical IPs are always hashed to the same file.
// A small file averages 20 MB, which fits within the 64 MB limit.
// The occurrences of each IP in every small file are then counted with a binary tree.

// The binary tree is traversed to return the IP with the most occurrences.
// Because there are 2^32 possible IPs, guaranteeing duplicates would require generating 2^32 IPs,
// i.e. a file of about 64 GB; so that a 10 GB file is certain to contain duplicate IPs,
// the IP range is restricted to 0.0.0.0 - 100.100.100.100.
// fopen() can only keep up to 507 files open at the same time.

// Postscript: once the master log file has been broken down into 500 small files (divide),
// these small files can be sent over the network to other computers to be counted (conquer),
// and the results returned to the main program, i.e. similar to how Google's MapReduce works.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <io.h>			// access()

// Number of IP records that random_write() generates; edit by hand to scale the run.
const int num = 1000000;

// Convert a dotted-decimal IP string into an integer.
unsigned int ITON(char *ip);
// Convert an integer back into a dotted-decimal IP string.
void ntoi(unsigned int num, char *ip);
// Determine whether a file exists.
int fileexist(char *path);
// Close all open hash files.
void fclose_all(FILE **t);
// Write randomly generated IPs to the file at path.
int random_write(char *path);
// Find the IP that occurs most often in hashfile.
void count(char *hashfile, unsigned int *data, unsigned int *n);
// Sort the top-IP table by descending visit count.
void Sort(unsigned int *max, unsigned int *ip, int n);
// Hash function: maps an integer IP to a bucket index in the range 0..499.
// Declared `static inline` because a plain `inline` definition provides no
// external definition in C99/C11 and can fail to link when not inlined.
static inline unsigned int hash(unsigned int ip)
{
	return ip % 500;
}

// Binary search tree node holding one IP and its occurrence count.
// The struct carries a tag so that the child pointers can refer to the
// type before the typedef name `node` has been introduced.
typedef struct node
{
	unsigned int ip;		// the IP, as an integer
	unsigned int n;			// number of occurrences
	struct node *left;		// subtree with smaller IPs
	struct node *right;		// subtree with larger IPs
} node;

main int (void) 
{ 
	the FILE * in = NULL; 
	the FILE * tmpfile [505]; 
	char * path = "C: \\ ip_data.dat"; 
	char hashfile [50]; 
	char buf [20 is]; 
	unsigned int the Add, Data , n-; 
	unsigned int IP [10], max [10]; // record the top ten the IP 
	unsigned int T1, T2, S, E; // recording time 
	int i, j, len, now ; // IP number 

	printf ( "S is generated data% \ n-\ n-", path); 
	iF return 0 (random_write (path)!); // record file randomly generated IP 

	// determines folder exists, access () == 0 exists 
	IF (Access ( "C: \\ hashfile", 0) == 0) 
		System ( "rmdir / S / QC: \\ hashfile"); 
	System ( "mkdir C: \\ hashfile"); // build working directory 
	system ( "attrib + hc: \\ hashfile"); // hidden directory 

	in = fopen (path, "rt ");// open IP log file 
	if (in == NULL) return 0 ; 
	for (i = 0; i <505; i ++) tmpfile [i] = NULL; 

	// IP will be 670 million to 505 small hash file 
	printf ( "\ r is S hash% \ n-\ n-", path); 
	E = S = T1 = Clock (); 
	now = 0; 
	the while (the fscanf (in,"% S ", buf)! = the EOF) 
	{ 
		Data = ITON (buf ); // IP digitized 
		add = hash (data); // compute the hash address, the file address 
		sprintf (hashfile, "C:. hashfile \\ \\ hash_% U ~ tmp", the Add); 
		IF (tmpfile [the Add ] == NULL) 
			tmpfile [the Add] = the fopen (hashfile, "A"); 
		sprintf (buf, "% U \ n-", Data); 
		len = strlen (buf); 
		// file is written to the IP , repeated slow read and write disk 
		fwrite (buf, len,. 1, tmpfile [the Add]); 
		now ++; 
		E = Clock (); 
		IF (E - S>1000) to calculate progress // 
		{
			printf ( "\ r %% progress of 0.2f% \ T", (now *) / NUM 100.0); 
			S = E; 
		} 
	} 
	fclose (in); 
	fclose_all (tmpfile); 
	Remove (path); 

	// every small file statistics duplicate highest IP, may be a plurality of computer processing 
	for (I = 0; I <10; I ++) max [I] = 0; 
	for (I = 0; I <500; I ++) 
	{ 
		sprintf (hashfile, "C:. hashfile \\ \\ D ~ tmp% hash_", I); 
		IF (fileexist (hashfile)) 
		{ 
			the printf (. "\ R & lt processing hash_% d ~ tmp \ t" , i); 
			// count the small file the largest number of IP 
			COUNT (hashfile, & the Data, the n-&); 
			// because only 10 elements, with the idea of recording the largest selection sort 10 
			// if too many elements, you can use the insertion sort thought to find a stack or 
			unsigned int min = 0xFFFFFFFF, POS; 
			for (J = 0; J <10; J ++) 
			{ 
				IF (max [J] <I)
				{ 
					Min = max [J]; 
					POS = J; 
				} 
			} 
			IF (n-> min) 
			{ 
				max [POS] = n-; 
				IP [POS] = Data; 
			} 
		} 
	} 
	T2 = Clock (); 
	Sort (max, IP, 10); 

	the FILE * log = NULL; // while in C: \ ip_result.txt record result 
	log = the fopen ( "C: \\ ip_result.txt", "wt"); 
	fprintf (log, "\ n up visits the first 10 IP: \ the n-\ the n-"); 
	fprintf (log,"% -15s% S \ the n-"," IP "," visits "); 
	printf (" the n-\ the n-number of \ the top 10 most visited a the IP: \ n-\ n-"); 
	the printf ("% -15s% S \ n-"," the IP "," visits "); 
	for (I = 0; I <10; I ++) 
	{ 
		ntoi (IP [I ], buf);Decoding // 
		printf ( "% -20s% u \ n", buf, max [i]);
		fprintf (log, "% U% -20s \ n-", buf, max [I]);
	}
	fprintf (log, "\ n ---% 0.3f seconds when using \ n-", (T2 - T1) / 1000.0); 
	the printf ( "\ n----% 0.3f seconds when using \ n \ n", (t2 - T1) / 1000.0); 
	fclose (log); 
	System ( "rmdir / S / QC: \\ hashfile"); 

	return 0; 
} 

// Close every open stream among the first 500 slots of t and reset each
// closed slot to NULL. Slots that are already NULL are skipped.
//
// t: array of at least 500 FILE pointers.
void fclose_all(FILE **t)
{
	int i;

	for (i = 0; i < 500; i++)
	{
		if (t[i])
		{
			fclose(t[i]);
			t[i] = NULL;
		}
	}
}

// randomly generated 670000000 IP, about 10 GB 
int random_write (char * path) 
{ 
	the FILE * OUT = NULL; 
	int I, J, B; 
	char buf [20 is]; 
	char * CUR; 
	unsigned int S, E; 

	OUT = the fopen (path, "wt"); 
	IF (OUT == NULL) return 0; 
	srand (Time (NULL)); 
	E = S = Clock (); 
	for (I = 0; I <NUM; I ++) 
	{ 
		Clock = E (); 
		IF (E - S> 1000) progress // calculate 
		{ 
			the printf ( "\ %% R & lt progress of 0.2f% \ T", (I * 100.0) / NUM); 
			S = E; 
		} 
		for (J = 0; J <20 is; J ++) buf [J] = '\ 0'; 
		CUR = buf; 
		for (J = 0; J <. 4; J ++) 
		{ 
			// dotted notation should be generated 0-255 value 
			// value generated here is 0-101 
			B = RAND ()% 101; 
			sprintf (CUR, "% D.", B); 
			the while (* CUR = '\ 0'!) CUR ++; 
		} 
		* (CUR - . 1) = '\ n-'; 
		fwrite (buf, cur (char *) buf,. 1, OUT); 
	}
	fclose (out); // remember to close the 
	return. 1; 
}

// binary insertion 
void INSERT (Tree Node **, unsigned int IP) 
{ 
	IF ((* Tree) == NULL) 
	{ 
		// new_node 
		(Tree *) = (Node *) the malloc (the sizeof (Node)); 
		(* Tree) -> IP = IP; 
		(* Tree) -> n-=. 1; 
		(* Tree) -> left = (* Tree) -> right = NULL; 
	} 
	the else IF ((* tree) -> IP == IP) 
	{ 
		(* tree) -> n-++; 
		return; 
	} 
	the else IF (IP <(* tree) -> IP) // left subtree 
		insert (& (( tree *) -> left), IP); 
	the else INSERT (& ((* tree) -> right), IP); // right subtree 
} 

// State shared between max_n() and its caller.
unsigned int maxn;		// input: largest count seen so far (caller resets it before the search)
node *max_node;			// output: the node that holds that count
// Search the tree for the node with the largest count n.
// Nodes are visited in pre-order (node, left subtree, right subtree). Whenever
// a count strictly greater than maxn is met, maxn and max_node are updated.
void max_n(node *tree)
{
	// Walking down the right links in a loop stands in for the second
	// recursive call; the visiting order stays the same.
	for (; tree != NULL; tree = tree->right)
	{
		if (maxn < tree->n)
		{
			max_node = tree;
			maxn = tree->n;
		}
		max_n(tree->left);
	}
}

// Release the tree: both subtrees are freed first, then the node itself.
void destory(node *tree)
{
	node *lhs;
	node *rhs;

	if (tree == NULL)
		return;

	lhs = tree->left;
	rhs = tree->right;
	destory(lhs);
	destory(rhs);
	free(tree);
}

// Find the IP that occurs most often in hashfile.
//
// Each unsigned integer read from the file is inserted into a binary search
// tree with insert(); max_n() then locates the node with the largest count,
// and the tree is released.
//
// hashfile: path of the bucket file to scan.
// data:     receives the most frequent IP (0 if the file is missing or empty).
// n:        receives its occurrence count (0 if the file is missing or empty).
void count(char *hashfile, unsigned int *data, unsigned int *n)
{
	FILE *in = NULL;
	node *tree = NULL;
	unsigned int ip;

	*data = 0;
	*n = 0;

	in = fopen(hashfile, "r");
	if (in == NULL)
		return;
	// "== 1" rather than "!= EOF": a malformed token makes fscanf return 0
	// without consuming input, which would otherwise loop forever.
	while (fscanf(in, "%u", &ip) == 1)
	{
		insert(&tree, ip);
	}
	fclose(in);

	maxn = 0;
	max_node = NULL;	// do not reuse a node left over from an earlier call
	max_n(tree);		// the result is left in max_node
	if (max_node != NULL)
	{
		*n = max_node->n;
		*data = max_node->ip;
	}
	destory(tree);
	max_node = NULL;	// the node it pointed to has just been freed
}

// Insertion sort of two parallel arrays, in descending order of max.
//
// max: visit counts; sorted in place, largest first.
// ip:  the IP that belongs to each count; reordered together with max.
// n:   number of elements in both arrays.
//
// An element only moves past strictly smaller counts, so entries with equal
// counts keep their relative order.
void Sort(unsigned int *max, unsigned int *ip, int n)
{
	int i, j;
	unsigned int tmpm, tmpi;

	for (i = 1; i < n; i++)
	{
		if (max[i - 1] < max[i])
		{
			tmpm = max[i];
			tmpi = ip[i];
			// Shift smaller entries one slot to the right until the
			// insertion point for (tmpm, tmpi) is found.
			for (j = i; j > 0; j--)
			{
				if (max[j - 1] < tmpm)
				{
					max[j] = max[j - 1];
					ip[j] = ip[j - 1];
				}
				else break;
			}
			max[j] = tmpm;
			ip[j] = tmpi;
		}
	}
}

// NOTE(review): in the text this was restored from, the body of ITON() and
// everything of ntoi() except its final statement were lost. Both functions
// below are reconstructed from their prototypes and comments; confirm them
// against the original source if it is available.

// Convert a dotted-decimal IP string into an integer.
//
// ip: text of the form "a.b.c.d".
// Returns (a << 24) | (b << 16) | (c << 8) | d, or 0 when ip is NULL, does
// not contain four numeric fields, or a field is larger than 255.
unsigned int ITON(char *ip)
{
	unsigned int a, b, c, d;

	if (ip == NULL || sscanf(ip, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
		return 0;
	if (a > 255 || b > 255 || c > 255 || d > 255)
		return 0;
	return (a << 24) | (b << 16) | (c << 8) | d;
}

// Convert an integer IP back into dotted-decimal text.
//
// num: the IP as an integer, most significant octet first.
// ip:  destination buffer; must have room for at least 16 bytes
//      ("255.255.255.255" plus the terminator).
void ntoi(unsigned int num, char *ip)
{
	sprintf(ip, "%u.%u.%u.%u",
	        (num >> 24) & 0xFFu,
	        (num >> 16) & 0xFFu,
	        (num >> 8) & 0xFFu,
	        num & 0xFFu);
}

// Determine whether a file exists by trying to open it for reading.
//
// path: file to probe.
// Returns 1 if the file could be opened, 0 otherwise (a file that exists
// but is not readable is therefore also reported as 0).
int fileexist(char *path)
{
	FILE *fp = NULL;

	fp = fopen(path, "r");
	if (fp)
	{
		fclose(fp);
		return 1;
	}
	else return 0;
}

  

Guess you like

Origin www.cnblogs.com/iupoint/p/11576879.html