Hadoop Fundamentals Tutorial - Chapter 7: Advanced MapReduce (7.4 Custom Key Types)

Chapter 7: Advanced MapReduce


Original article: http://blog.csdn.net/chengyuqiang/article/details/73441493

7.4 Custom Key Types

Hadoop ships with a number of basic Writable types (IntWritable, LongWritable, DoubleWritable, Text, and so on), but in real-world development these built-in types do not always fit the data. In those cases you define a custom Writable type for the problem at hand.
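
A custom key must implement org.apache.hadoop.io.WritableComparable, which combines Hadoop's serialization contract (write/readFields) with Java's Comparable, so the framework can both ship the key between map and reduce tasks and sort it during the shuffle. Below is a minimal sketch of that contract; MyKey and its single field are placeholders for illustration, not part of any Hadoop API.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

// Minimal skeleton of a custom key type; "field" stands in for your own state.
public class MyKey implements WritableComparable<MyKey> {
    private int field;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(field);                // serialize the fields in a fixed order
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.field = in.readInt();          // deserialize them in exactly the same order
    }

    @Override
    public int compareTo(MyKey other) {
        return Integer.compare(this.field, other.field);  // default sort order in the shuffle
    }
}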

7.4.1 Problem Description

Given the weather data provided by NCDC, find the maximum temperature for each month of 2016 and 2017.

7.4.2 Uploading the Data

hdfs dfs -mkdir -p input 
hdfs dfs -put /root/data/ncdc.txt input

[root@node1 ~]# hdfs dfs -mkdir -p input
[root@node1 ~]# hdfs dfs -put /root/data/ncdc.txt input
[root@node1 ~]# hdfs dfs -ls input
Found 1 items
-rw-r--r--   3 root hbase  871353053 2017-06-21 20:32 input/ncdc.txt

7.4.3 Defining a Custom Key

How do we tell the records apart, that is, what should the key type be? In this data set a (year, month) pair identifies the group of readings we want to aggregate, so we define a Weather key that carries the year, the month, and the temperature reading.

package cn.hadron.mr.ncdc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class Weather implements WritableComparable<Weather> {
    private int year;
    private int month;
    private double hot;

    public Weather() {
    }

    public Weather(int year, int month, double hot) {
        this.year = year;
        this.month = month;
        this.hot = hot;
    }

    @Override
    public String toString() {
        return "[year=" + year + ", month=" + month + "]";
    }

    /**
     * Deserialize this object by reading its fields from the input stream in
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.year = in.readInt();
        this.month = in.readInt();
        this.hot = in.readDouble();
    }

    /**
     * Serialize this object by writing its fields to the output stream out
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);
        out.writeInt(month);
        out.writeDouble(hot);
    }

    // Defines how keys are ordered when this class is used as the map output key
    @Override
    public int compareTo(Weather t) {
        int r1 = Integer.compare(this.year, t.getYear());
        if (r1 == 0) {
            // If the years are equal, compare the months
            int r2 = Integer.compare(this.month, t.getMonth());
            if (r2 == 0) {
                return Double.compare(this.hot, t.getHot());
            } else {
                return r2;
            }
        } else {
            return r1;
        }
    }

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public double getHot() {
        return hot;
    }

    public void setHot(double hot) {
        this.hot = hot;
    }
}
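
Note that Weather does not override hashCode() and equals(). With the default HashPartitioner, partitioning is based on the key's hashCode(), so logically equal Weather keys emitted by different map() calls could end up in different reducers; this example sidesteps the issue with the custom Partitioner shown next. If you also want Weather to behave correctly with the default partitioner or inside Java collections, the usual overrides would look roughly like the following sketch (an optional addition, not part of the original code):

    // Hypothetical additions to Weather: make logical equality and hashing
    // consistent with compareTo().
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Weather)) return false;
        Weather other = (Weather) o;
        return year == other.year && month == other.month
                && Double.compare(hot, other.hot) == 0;
    }

    @Override
    public int hashCode() {
        int result = Integer.hashCode(year);
        result = 31 * result + Integer.hashCode(month);
        result = 31 * result + Double.hashCode(hot);
        return result;
    }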

Custom Partitioner

package cn.hadron.mr.ncdc;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class MyPartitioner extends HashPartitioner<Weather, DoubleWritable> {
    // Called once per map output record, so it should execute as quickly as possible
    @Override
    public int getPartition(Weather key, DoubleWritable value, int numReduceTasks) {
        // Partition by year
        return key.getYear() % numReduceTasks;
    }
    }
}
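
With numReduceTasks set to 2 in the driver below, 2016 % 2 = 0 and 2017 % 2 = 1, so each year is handled by its own reduce task and written to its own output file (part-r-00000 for 2016, part-r-00001 for 2017), which matches the results shown at the end of this section.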

Custom Comparator

package cn.hadron.mr.ncdc;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class MyComparator extends WritableComparator {

    protected MyComparator() {
        super(Weather.class, true);
    }

    @Override
    public int compare(WritableComparable k1, WritableComparable k2) {
        Weather key1=(Weather)k1;
        Weather key2=(Weather)k2;
        int r1 = Integer.compare(key1.getYear(), key2.getYear());
        if (r1 == 0) {
            // If the years are equal, compare the months
            return Integer.compare(key1.getMonth(), key2.getMonth());
        } else {
            return r1;
        }
    }
}
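
The driver below registers this class with setSortComparatorClass only. Because MapReduce falls back to the sort comparator for grouping when no grouping comparator is configured, values are also grouped by (year, month), so each reduce() call receives every temperature reading for one month, which is exactly what we need here. To make the grouping explicit, the driver could additionally call:

    // Optional: state the (year, month) grouping explicitly in the driver.
    job.setGroupingComparatorClass(MyComparator.class);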

Driver, Mapper, and Reducer

package cn.hadron.mr.ncdc;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RunJob {

    public static void main(String[] args) {
        // Run the job as HDFS user root (Hadoop honors the HADOOP_USER_NAME property)
        System.setProperty("HADOOP_USER_NAME", "root");
        // The Configuration object holds the Hadoop configuration
        Configuration config = new Configuration();
        // Point fs.defaultFS at the HDFS NameNode
        config.set("fs.defaultFS", "hdfs://192.168.1.117:8020");
        // Set the YARN ResourceManager host
        config.set("yarn.resourcemanager.hostname", "node1");
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJarByClass(RunJob.class);
            job.setJobName("weather");
            job.setMapperClass(WeatherMapper.class);
            job.setReducerClass(WeatherReducer.class);
            job.setMapOutputKeyClass(Weather.class);
            job.setMapOutputValueClass(DoubleWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(DoubleWritable.class);
            job.setPartitionerClass(MyPartitioner.class);
            job.setSortComparatorClass(MyComparator.class);
            // The data covers only two years, so use 2 reduce tasks
            job.setNumReduceTasks(2);
            FileInputFormat.addInputPath(job, new Path("/user/root/input/ncdc.txt"));
            Path outpath = new Path("/user/root/output");
            if (fs.exists(outpath)) {
                fs.delete(outpath, true);
            }
            FileOutputFormat.setOutputPath(job, outpath);
            System.out.println(job.waitForCompletion(true));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class WeatherMapper extends Mapper<LongWritable, Text, Weather, DoubleWritable> {
        // The data set uses 9999.9 as the sentinel for a missing temperature reading
        private static final String MISSING = "9999.9";

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String val = value.toString();
            try {
                // Fixed-width fields: year in columns 14-18, month in 18-20, temperature in 102-108
                int year = Integer.parseInt(val.substring(14, 18));
                int month = Integer.parseInt(val.substring(18, 20));
                String hotStr = val.substring(102, 108);
                if (!MISSING.equals(hotStr)) {
                    double hot = Double.parseDouble(hotStr);
                    context.write(new Weather(year, month, hot), new DoubleWritable(hot));
                }
            } catch (Exception e) {
                // Skip header lines and malformed records
                System.out.println(e);
            }
        }
    }

    public static class WeatherReducer extends Reducer<Weather, DoubleWritable, Text, DoubleWritable> {
        @Override
        protected void reduce(Weather key, Iterable<DoubleWritable> values, Context context)
                throws IOException, InterruptedException {
            // Start below any possible reading so months with sub-zero maxima are handled correctly
            double maxValue = Double.NEGATIVE_INFINITY;
            for (DoubleWritable value : values) {
                maxValue = Math.max(maxValue, value.get());
            }
            context.write(new Text(key.toString()), new DoubleWritable(maxValue));
        }
    }
}
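
The listing above is driven from Eclipse, but the same job can be packaged into a jar and submitted from the command line. A sketch, where the jar name ncdc-weather.jar is only an illustrative choice:

# Build the project into a jar with your build tool, then submit the job
hadoop jar ncdc-weather.jar cn.hadron.mr.ncdc.RunJob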

Running the Job

Result of running in Eclipse

log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
true

Check the results on HDFS:
hdfs dfs -ls /user/root/output

[root@hds117 data]# hdfs dfs -ls /user/root/output
Found 3 items
-rw-r--r--   3 root hbase          0 2017-06-27 16:29 /user/root/output/_SUCCESS
-rw-r--r--   3 root hbase        327 2017-06-27 16:29 /user/root/output/part-r-00000
-rw-r--r--   3 root hbase        162 2017-06-27 16:29 /user/root/output/part-r-00001

hdfs dfs -cat /user/root/output/part-r-00000 
hdfs dfs -cat /user/root/output/part-r-00001

[root@hds117 data]# hdfs dfs -cat /user/root/output/part-r-00000
[year=2016, month=1]    119.5
[year=2016, month=2]    118.6
[year=2016, month=3]    122.0
[year=2016, month=4]    120.2
[year=2016, month=5]    126.5
[year=2016, month=6]    129.0
[year=2016, month=7]    127.2
[year=2016, month=8]    127.4
[year=2016, month=9]    124.2
[year=2016, month=10]   121.1
[year=2016, month=11]   114.1
[year=2016, month=12]   126.9
[root@hds117 data]# hdfs dfs -cat /user/root/output/part-r-00001
[year=2017, month=1]    116.1
[year=2017, month=2]    117.3
[year=2017, month=3]    123.8
[year=2017, month=4]    129.6
[year=2017, month=5]    129.2
[year=2017, month=6]    123.6