A thorough understanding of Hadoop serialization

1. Serialization Overview

  1.1 What is serialization

    Serialization converts objects in memory into a sequence of bytes (or another data transfer protocol) so that they can be stored on disk (persisted) and transmitted over the network;

    Deserialization is the reverse: it takes a received sequence of bytes (or other data transfer protocol), or data persisted on disk, and turns it back into objects in memory;

  1.2 Why serialize

    In general, "live" objects exist only in memory and are gone once the machine shuts down; moreover, a "live" object can only be used by the local process and cannot be sent to another computer on the network. Serialization, however, lets you store a "live" object and send it to a remote computer;

  1.3 Why not use Java serialization

    Java's built-in serialization (Serializable) is a heavyweight framework: a serialized object carries a lot of extra information (various checks, headers, the inheritance hierarchy), which makes it inefficient to transmit over the network, so Hadoop developed its own serialization mechanism (Writable);

  1.4 Characteristics of Hadoop serialization

    1.4.1 Compact: efficient use of storage space;

    1.4.2 Fast: low overhead when reading and writing data;

    1.4.3 Extensible: can evolve along with upgrades to the communication protocol;

    1.4.4 Interoperable: supports interaction between multiple languages;

2. Implementing serialization for a custom bean object (Writable)

  In enterprise development the common basic serialization types often cannot meet every requirement, for example when a bean object has to be passed around inside the Hadoop framework; such an object then needs to implement the serialization interface;

  2.1 The class must implement the Writable interface;

  2.2 During deserialization the framework uses reflection to call the no-argument constructor, so the bean must provide one;

public FlowBean() {
     super();      
}

  2.3 Override the serialization method

    /* Serialization method
     * dataOutput: the output sink the framework provides to us
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

  2.4 Override the deserialization method

    /* Deserialization method
     * dataInput: the data source the framework provides
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upFlow = dataInput.readLong();
        downFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }

3. Case study

  3.1 Write FlowwBean

package com.wn.flow;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowwBean implements Writable {
    private long upFlow;
    private long downFlow;
    private long sumFlow;

    public FlowwBean() {
    }

    @Override
    public String toString() {
        return "FlowwBean{" +
                "upFlow=" + upFlow +
                ", downFlow=" + downFlow +
                ", sumFlow=" + sumFlow +
                '}';
    }

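    // Convenience setter: sumFlow is always derived as upFlow + downFlow, so
    // callers only need to supply the two raw values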
    public void set(long upFlow, long downFlow){
        this.upFlow=upFlow;
        this.downFlow=downFlow;
        this.sumFlow=upFlow+downFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    /* Serialization method
     * dataOutput: the output sink the framework provides to us
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

    /* The read order must be exactly the same as the write order */

    /* Deserialization method
     * dataInput: the data source the framework provides
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upFlow=dataInput.readLong();
        downFlow=dataInput.readLong();
        sumFlow=dataInput.readLong();
    }
}
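
  A quick way to confirm that write and readFields stay in step is to round-trip a bean through an in-memory byte stream; the readLong calls must consume the fields in exactly the order the writeLong calls produced them. The sketch below is not part of the original case, and the class name FlowwBeanRoundTrip is made up for illustration; it only assumes the FlowwBean shown above.

package com.wn.flow;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowwBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowwBean original = new FlowwBean();
        original.set(1024, 2048);   // upFlow and downFlow; sumFlow is derived

        // Serialize: write() appends the three long fields in a fixed order
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize: readFields() must read the fields back in the same order
        FlowwBean copy = new FlowwBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Both lines should print the same field values
        System.out.println(original);
        System.out.println(copy);
    }
}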

  3.2 Write FlowMapper

package com.wn.flow;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text,Text,FlowwBean> {

    private Text phone=new Text();
    private FlowwBean flow=new FlowwBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
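        // The input line is tab-separated: column 2 holds the phone number, and the
        // upstream/downstream traffic sit in the 3rd-from-last and 2nd-from-last columns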
        String[] split = value.toString().split("\t");
        phone.set(split[1]);
        long upFlow = Long.parseLong(split[split.length - 3]);
        long downFlow = Long.parseLong(split[split.length - 2]);
        flow.set(upFlow,downFlow);
        context.write(phone,flow);
    }
}
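
  Note that phone and flow are created once and reused for every input record; this works because the framework serializes the key and value at the moment context.write is called, so reusing the same objects does not corrupt earlier output, and it avoids allocating a new Text and FlowwBean for every line.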

  3.3 Write FlowReducer

package com.wn.flow;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.io.Text;
import java.io.IOException;

public class FlowReducer extends Reducer<Text,FlowwBean,Text,FlowwBean> {

    private FlowwBean sumFlow=new FlowwBean();

    @Override
    protected void reduce(Text key, Iterable<FlowwBean> values, Context context) throws IOException, InterruptedException {
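        // All FlowwBean values for one phone number arrive together; add up their
        // upstream and downstream traffic before writing a single summary bean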
        long sumUpFlow=0;
        long sumDownFlow=0;
        for (FlowwBean value:values){
            sumUpFlow+=value.getUpFlow();
            sumDownFlow+=value.getDownFlow();
        }
        sumFlow.set(sumUpFlow,sumDownFlow);
        context.write(key,sumFlow);
    }
}

  3.4 Write FlowDriver

package com.wn.flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Get an instance of Job
        Job job = Job.getInstance(new Configuration());

        // Set the classpath
        job.setJarByClass(FlowDriver.class);

        // Set the mapper and the reducer
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        // Set the mapper and reducer output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowwBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowwBean.class);

        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }

}
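
  To run the job, package the project and submit it with the standard hadoop jar command, passing the input directory as the first argument and a not-yet-existing output directory as the second (the jar name and paths here are only examples): hadoop jar flow.jar com.wn.flow.FlowDriver /input/flow /output/flow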
