Result file field description:
ip: 106.39.41.166 (city)
date: 10/Nov/2016:00:01:02 +0800 (access date)
day: 10 (day of month)
traffic: 54 (traffic)
type: video (type: "video" or "article")
id: 8701 (id of the video or article)
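For reference, a raw log line in this dataset would look roughly like the following (a hypothetical reconstruction from the field description and the parsing code in Phase One; the resource path is elided because the exact URL layout is not shown):

106.39.41.166 - - [10/Nov/2016:00:01:02 +0800] "GET /... HTTP/1.1" 200 54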
Testing requirements:
1. Data cleaning: clean the data as described below and import the cleaned data into a Hive data warehouse.
The cleaning is done in two stages:
(1) First stage: extract the required fields from the raw log:
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
article: article/11325
video: video/3235
(2) Second stage: refine the extracted fields:
ip --> city (resolved from the ip)
date --> time: 2016-11-10 00:01:03
day: 10
traffic: 62
type: article/video
id: 11325
(3) Hive table structure (a note on the row format follows the DDL):
create table data (ip string, time string, day string, traffic bigint,
    type string, id string);
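Since the cleaned records come out of MapReduce as tab-separated text, the table will likely need a matching row format for Hive to parse the loaded file; a minimal sketch of the same DDL with an assumed tab delimiter:

create table data (ip string, time string, day string, traffic bigint,
    type string, id string)
row format delimited fields terminated by '\t'
stored as textfile;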
2. Data processing (HiveQL sketches follow the list below):
· Count the Top 10 most-visited videos/articles (video/article)
· Find the Top 10 most popular courses by city (ip)
· Find the Top 10 most popular courses by traffic (traffic)
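As a sketch, the three statistics could be written in HiveQL against the table above roughly as follows (the grouping keys are assumptions read off the requirement text; the city query groups by ip since the schema has no city column):

-- Top 10 most-visited videos/articles
select type, id, count(*) as visits
from data
group by type, id
order by visits desc
limit 10;

-- Top 10 cities by visits (ip as a proxy for city)
select ip, count(*) as visits
from data
group by ip
order by visits desc
limit 10;

-- Top 10 by total traffic
select type, id, sum(traffic) as total_traffic
from data
group by type, id
order by total_traffic desc
limit 10;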
3. Data visualization: export the statistical results into a MySQL database and present them graphically (one possible export route is sketched below).
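One common route for pouring Hive/HDFS results into MySQL is Sqoop; a hypothetical invocation (the database name, table name, credentials, and export directory below are placeholders, and the result table is assumed to be stored as tab-separated text):

sqoop export \
  --connect jdbc:mysql://localhost:3306/logdb \
  --username root -P \
  --table top10_visits \
  --export-dir /user/hive/warehouse/top10_visits \
  --input-fields-terminated-by '\t'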
Phase One:
/** First clean the log file with MapReduce, then import the result into Hive. */
package classtest3;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Result {

    // Time format used in the raw log
    public static final SimpleDateFormat FORMAT =
            new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    // Target time format after cleaning (matches the required "2016-11-10 00:01:03" layout)
    public static final SimpleDateFormat dateformat1 =
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // Parse the log's time string into a Date
    private Date parseDateFormat(String string) {
        Date parse = null;
        try {
            parse = FORMAT.parse(string);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parse;
    }

    // Clean one line of data by splitting it into a string array
    public String[] parse(String line) {
        String ip = parseIP(line);            // ip
        String time = parseTime(line);        // time
        String url = parseURL(line);          // url
        String status = parseStatus(line);    // status code
        String traffic = parseTraffic(line);  // traffic
        return new String[] { ip, time, url, status, traffic };
    }

    private String parseTraffic(String line) {  // traffic
        final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
        return trim.split(" ")[1];
    }

    private String parseStatus(String line) {   // status code
        final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
        return trim.split(" ")[0];
    }

    private String parseURL(String line) {      // url, between the first and last quote
        final int first = line.indexOf("\"");
        final int last = line.lastIndexOf("\"");
        return line.substring(first + 1, last);
    }

    private String parseTime(String line) {     // time, between "[" and "+0800]"
        final int first = line.indexOf("[");
        final int last = line.indexOf("+0800]");
        String time = line.substring(first + 1, last).trim();
        Date date = parseDateFormat(time);
        return dateformat1.format(date);
    }

    private String parseIP(String line) {       // ip, before the "- -" separator
        return line.split("- -")[0].trim();
    }

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input record arrives as a line of plain text
            Text outputValue = new Text();
            String line = value.toString();
            Result aa = new Result();
            StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n"); // process each line separately
            while (tokenizerArticle.hasMoreElements()) {
                String strArticle = tokenizerArticle.nextToken();
                String[] newstr = aa.parse(strArticle);
                if (newstr[2].startsWith("GET /")) {          // strip the leading request method
                    newstr[2] = newstr[2].substring("GET /".length());
                } else if (newstr[2].startsWith("POST /")) {
                    newstr[2] = newstr[2].substring("POST /".length());
                }
                if (newstr[2].endsWith(" HTTP/1.1")) {        // strip the trailing protocol
                    newstr[2] = newstr[2].substring(0, newstr[2].length() - " HTTP/1.1".length());
                }
                String[] words = newstr[2].split("/");
                if (words.length == 4) {
                    outputValue.set(newstr[0] + "\t" + newstr[1] + "\t" + words[0] + "\t"
                            + words[1] + "\t" + words[2] + "\t" + words[3] + "\t" + "0");
                    context.write(outputValue, new IntWritable(1));
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Implement the reduce function: sum the counts for each key
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            Iterator<IntWritable> iterator = values.iterator();
            while (iterator.hasNext()) {
                sum += iterator.next().get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Result.class);
        // Set the Map, Combine and Reduce classes
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        // Set the output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // The input is split into blocks read by a RecordReader implementation
        job.setInputFormatClass(TextInputFormat.class);
        // A RecordWriter implementation is responsible for writing the output
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the input and output paths
        FileInputFormat.addInputPath(job,
                new Path("hdfs://192.168.57.128:9000/MyMapReduce/classtest3/name.txt"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://192.168.57.128:9000/MyMapReduce/classtest3/test1result"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
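After packaging the class into a jar, the job can be submitted in the usual way (the jar name below is an assumption; the input and output paths are hard-coded in main):

hadoop jar Result.jar classtest3.Result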
Hive import statement:
load data inpath '/MyMapReduce/classtest3/test1result/part-r-00000' into table acc_log;