MapReduce data cleansing (revised)

package test;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Filter {
    
    public static class Map extends Mapper<Object, Text, Text, NullWritable> {
        private static Text newKey = new Text();

        /*public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            System.out.println(line);
            String arr[] = line.split(" ");
            newKey.set(arr[1]);
            context.write(newKey, NullWritable.get());
            System.out.println(newKey);
        }
    }*/
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String S1 = value.toString();
            LogParser parser = new LogParser();
            final String[] array = parser.parse(S1);
            System.out.println(S1);
            /*System.out.format(
                    "Parse result: ip=%s, time=%s, day=%s, traffic=%s, type=%s, id=%s",
                    array[0], array[1], array[2], array[3], array[4], array[5]);*/
            String a = array[0];
            String u = array[1];
            String c = array[2];
            String d = array[3];
            String e = array[4];
            String f = array[5];

            // Re-join the parsed fields with commas so the output matches the Hive table delimiter.
            String str = a + "," + u + "," + c + "," + d + "," + e + "," + f;

            newKey.set(str);
            context.write(newKey, NullWritable.get());
            System.out.println(newKey);
        }
    }

    public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        System.out.println("start");
        
    
        Job job = new Job(conf, "filter");
        job.setJarByClass(Filter.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        Path in = new Path("hdfs://localhost:9000/user/hadoop/in/Result");
        Path out = new Path("hdfs://localhost:9000/user/hadoop/out");
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    static class MyMapper extends
            Mapper<LongWritable, Text, LongWritable, Text> {
        LogParser logParser = new LogParser();
        Text outputValue = new Text();

        protected void map(
                LongWritable key,
                Text value,
                org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                throws java.io.IOException, InterruptedException {
            final String[] parsed = logParser.parse(value.toString());

            // Step 1: filter out requests for static resources.
            if (parsed[2].startsWith("GET /static/")
                    || parsed[2].startsWith("GET /uc_server")) {
                return;
            }
            // Step 2: strip the specified prefix ("GET /" or "POST /").
            if (parsed[2].startsWith("GET /")) {
                parsed[2] = parsed[2].substring("GET /".length());
            } else if (parsed[2].startsWith("POST /")) {
                parsed[2] = parsed[2].substring("POST /".length());
            }
            // Step 3: strip the specified suffix (" HTTP/1.1").
            if (parsed[2].endsWith(" HTTP/1.1")) {
                parsed[2] = parsed[2].substring(0, parsed[2].length()
                        - " HTTP/1.1".length());
            }
            // Step 4: write out only the first three fields.
            outputValue.set(parsed[0] + "\t" + parsed[1] + "\t" + parsed[2]);
            context.write(key, outputValue);
        }
    }

    static class MyReducer extends
            Reducer<LongWritable, Text, Text, NullWritable> {
        protected void reduce(
                LongWritable k2,
                java.lang.Iterable<Text> v2s,
                org.apache.hadoop.mapreduce.Reducer<LongWritable, Text, Text, NullWritable>.Context context)
                throws java.io.IOException, InterruptedException {
            for (Text v2 : v2s) {
                context.write(v2, NullWritable.get());
            }
        }
    }

    /*
     * Log parser class
     */
    static class LogParser {
        public static final SimpleDateFormat FORMAT = new SimpleDateFormat(
                "d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        public static final SimpleDateFormat dateformat1 = new SimpleDateFormat(
                "yyyy-MM-dd HH:mm:ss");
        /**
         * Parse an English-format time string.
         *
         * @param string
         * @return the parsed Date, or null if parsing fails
         */
        private Date parseDateFormat(String string) {
            Date parse = null;
            try {
                parse = FORMAT.parse(string);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            return parse;
        }

        /**
         * Parse one log line.
         *
         * @param line
         * @return an array of six elements: ip, time, day, traffic, type, id
         */
        public String[] parse(String line) {
            String ip = parseIP(line);
            String time = parseTime(line);
            String day = parseday(line);
            String traffic = parseTraffic(line);
            String type = parsertype(line);
            String id = parseid(line);

            return new String[] { ip, time, day, traffic, type, id };
        }
        private String parseIP(String line) {
            String ip = line.split(",")[0].trim();
            return ip;
        }

        private String parseTime(String line) {
            final int first = line.indexOf(",");
            final int last = line.indexOf(" +0800,");
            String time = line.substring(first + 1, last).trim();
            Date date = parseDateFormat(time);
            return dateformat1.format(date);
        }

        private String parseday(String line) {
            String riqi = line.split(",")[2].trim();
            return riqi;
        }

        private String parseTraffic(String line) {
            String riqi = line.split(",")[3].trim();
            return riqi;
        }

        //private String parseTraffic(String line) {
        //    final String trim = line.substring(line.lastIndexOf(",") + 1)
        //            .trim();
        //    String traffic = trim.split(" ")[0];
        //    return traffic;
        //}

        //private String parsertype(String line) {
        //    final int first = line.indexOf(",");
        //    final int last = line.lastIndexOf(",");
        //    String url = line.substring(first + 1, last);
        //    return url;
        //}

        private String parsertype(String line) {
            String riqi = line.split(",")[4].trim();
            return riqi;
        }

        private String parseid(String line) {
            final String trim = line.substring(line.lastIndexOf(",") + 1)
                    .trim();
            String id = trim.split(" ")[0];
            return id;
        }
    }

    
    
}
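
For a quick local check of the parsing logic, the small driver below can be used (a sketch only, not part of the original job; the LogParserDemo class and the sample log line are made up to illustrate the comma-separated input format the parser assumes):

package test;

public class LogParserDemo {
    public static void main(String[] args) {
        Filter.LogParser parser = new Filter.LogParser();
        // Hypothetical input record: ip, raw time with +0800 offset, day, traffic, type, id
        String line = "10.0.0.1,10/Nov/2016:00:01:02 +0800,10,54,video,8";
        String[] fields = parser.parse(line);
        // Expected cleansed output: 10.0.0.1,2016-11-10 00:01:02,10,54,video,8
        System.out.println(String.join(",", fields));
    }
}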

After cleansing, the output fields are separated by ",", so the Hive table is created with a comma as the field delimiter.

Table creation statement:

create table if not exists hive.data(ip string,`time` string,day string,traffic bigint,type string,id string) row format delimited fields terminated by ',';

Import the cleansed data into the Hive table:

load data inpath 'hdfs://localhost:9000/user/hadoop/out/part-r-00000' overwrite into table data;
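
To confirm the load, a couple of sanity-check queries can be run (a hypothetical check; the actual row contents and counts depend on your cleansed data):

select count(*) from data;
select * from data limit 10;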

 

 

Origin www.cnblogs.com/zlj843767688/p/11854709.html