MapReduce Custom Data Types
Requirement: count the occurrences of each word in the input file, and emit each result as a custom data type that carries both the word and its count (printed as word--count).
Implementation steps
(1) Create a class that implements WritableComparable (or Writable)
(2) Define the fields you need and generate getters/setters
(3) Provide two constructors: a no-arg constructor and a parameterized one
(4) Implement the serialization (write) and deserialization (readFields) methods
(5) Implement the comparison method compareTo
(6) Implement toString
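Two details are easy to miss: Hadoop instantiates the type via reflection, so the no-arg constructor is mandatory, and readFields must read the fields in exactly the same order that write wrote them. These steps are realized in the UserTestWritable class shown below.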
Mapper code:
package com.huadian.bigdata.usertest;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class UserMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text mapOutKey = new Text();
    private final static IntWritable mapOutValue = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Business logic: split each line into words and emit (word, 1) pairs
        String row = value.toString(); // line content, e.g. "hadoop java spring springMvc"
        String[] strs = row.split(" ");
        for (String str : strs) {
            mapOutKey.set(str);
            // Emit the map result through the context
            context.write(mapOutKey, mapOutValue);
        }
    }
}
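For example, given the input line hadoop java spring springMvc, this Mapper emits the pairs (hadoop,1), (java,1), (spring,1), (springMvc,1).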
Reducer code:
package com.huadian.bigdata.usertest;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class UserReducer extends Reducer<Text, IntWritable, UserTestWritable, NullWritable> {

    private UserTestWritable outKey = new UserTestWritable();
    private NullWritable outValue = NullWritable.get();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all counts for this word
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // Pack the word and its total into the custom output type
        outKey.setFirstKey(key.toString());
        outKey.setSecondKey(sum);
        context.write(outKey, outValue);
    }
}
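Note the output value type: since the word and its count are both packed into the composite key, the value carries no information, so the singleton NullWritable.get() is used as a zero-byte placeholder.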
Custom data type:
package com.huadian.bigdata.usertest;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class UserTestWritable implements WritableComparable<UserTestWritable> {

    private String firstKey;
    private int secondKey;

    // No-arg constructor is required: Hadoop creates instances via reflection
    public UserTestWritable() {
    }

    public UserTestWritable(String firstKey, int secondKey) {
        this.firstKey = firstKey;
        this.secondKey = secondKey;
    }

    @Override
    public int compareTo(UserTestWritable o) {
        // Order by firstKey first, then by secondKey when the first keys are equal
        int comp = this.getFirstKey().compareTo(o.getFirstKey());
        if (comp == 0) {
            return Integer.compare(this.secondKey, o.secondKey);
        }
        return comp;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization: the field order here must match readFields()
        out.writeUTF(firstKey);
        out.writeInt(secondKey);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialization: read the fields in the same order they were written
        this.firstKey = in.readUTF();
        this.secondKey = in.readInt();
    }

    public String getFirstKey() {
        return firstKey;
    }

    public void setFirstKey(String firstKey) {
        this.firstKey = firstKey;
    }

    public int getSecondKey() {
        return secondKey;
    }

    public void setSecondKey(int secondKey) {
        this.secondKey = secondKey;
    }

    @Override
    public String toString() {
        return firstKey + "--" + secondKey;
    }
}
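As a quick local sanity check, the write/readFields pair can be exercised with plain java.io streams, without running a MapReduce job. The following is a minimal sketch; the class name UserTestWritableCheck is made up for illustration:
package com.huadian.bigdata.usertest;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class UserTestWritableCheck {
    public static void main(String[] args) throws IOException {
        UserTestWritable original = new UserTestWritable("hadoop", 3);

        // Serialize: write() emits firstKey (UTF) followed by secondKey (int)
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize into a fresh instance created via the no-arg constructor
        UserTestWritable copy = new UserTestWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy);                     // hadoop--3
        System.out.println(original.compareTo(copy)); // 0
    }
}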
Driver code:
package com.huadian.bigdata.usertest;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class UserMapReducerDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "UserMapReducerDriver");
        job.setJarByClass(UserMapReducerDriver.class);

        // Input
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inputPath);

        // Map phase
        job.setMapperClass(UserMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce phase
        job.setReducerClass(UserReducer.class);
        job.setOutputKeyClass(UserTestWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(2);

        // Delete the output path if it already exists, otherwise the job fails
        FileSystem hdfs = FileSystem.get(this.getConf());
        Path outputPath = new Path(args[1]);
        if (hdfs.exists(outputPath)) {
            hdfs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        try {
            int status = ToolRunner.run(configuration, new UserMapReducerDriver(), args);
            System.exit(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
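Extending Configured and implementing Tool lets ToolRunner handle generic Hadoop options (for example -D property overrides) before run() is invoked, which is why main() delegates to ToolRunner.run instead of calling run() directly.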
Once the code is written, package it into a jar and run it on Hadoop on Linux.
Because job.setNumReduceTasks(2) is set,
the reduce output is split across two part files.
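A typical launch command looks like the following; the jar name and HDFS paths are placeholders to adapt:
hadoop jar usertest.jar com.huadian.bigdata.usertest.UserMapReducerDriver /input/words.txt /output/usertest
After a successful run, the output directory contains one file per reducer, part-r-00000 and part-r-00001, plus a _SUCCESS marker.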