Hadoop | Code: find the transaction with the largest amount in each order

Copyright notice: This is an original article by the author; do not reproduce without permission. https://blog.csdn.net/fragrant_no1/article/details/86309767

Order data:

order id,product id,amount
10000,p1,100
10000,p2,230
10001,p1,120
10001,p2,650
10001,p3,101
10002,p1,102

Analysis

  • Sorting
    For efficiency, it is best to use what the framework already provides: the map task sorts its output by key automatically, but the default ordering does not fit this problem, so the key needs a custom compareTo that sorts by order id and, within the same order, by amount.

  • Partitioning
    The default partitioner distributes records by the key's hashCode, and different bean instances have different hash codes, so records of the same order would not be guaranteed to reach the same reducer. A custom Partitioner is needed here, one that partitions by the hashCode of the order id alone.

  • Grouping
    The reduce task groups records by key, but with a bean as the key the default grouping cannot group by order id, because distinct objects form distinct groups. A custom GroupingComparator is needed that groups by order id alone (see the worked illustration after this list).
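To make these three pieces concrete, here is a hand-worked view of the sample data after map-side sorting and reduce-side grouping (an illustration, not program output):

sorted keys:  (10000,p2,230) (10000,p1,100) (10001,p2,650) (10001,p1,120) (10001,p3,101) (10002,p1,102)
groups:       [10000: p2,230 first] [10001: p2,650 first] [10002: p1,102]

Because the largest amount of each order sorts to the front of its group, the reducer only has to write the key it is handed.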

Code implementation

package com.example.demo.ordermax;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * The partitioner component used on the map side.
 */
public class OrderPartitioner extends Partitioner<OrderBean,NullWritable> {
    //Take the order id's hashCode modulo the number of reduce tasks, so that beans with the same order id are sent to the same reducer
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int reduceTasks) {
        //& Integer.MAX_VALUE clears the sign bit, so the result stays non-negative (hashCode() may be negative)
        //beans with the same order id go to the same partition, and the number of partitions matches the number of reduce tasks the user configured
        return (orderBean.getOrderId().hashCode() & Integer.MAX_VALUE) % reduceTasks;
    }
}
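One detail worth spelling out: String.hashCode() can be negative, and Java's % keeps the sign of its left operand, so masking with Integer.MAX_VALUE is what keeps the partition number valid. A minimal standalone sketch (PartitionDemo is a hypothetical class, not part of the job):

public class PartitionDemo {
    public static void main(String[] args) {
        int hash = -7;                 // stand-in for a negative hashCode()
        System.out.println(hash % 3);  // -1: not a valid partition number
        // Clearing the sign bit keeps the result in [0, numReduceTasks):
        System.out.println((hash & Integer.MAX_VALUE) % 3); // 2147483641 % 3 = 1
    }
}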

package com.example.demo.ordermax;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator used on the reduce side:
 * treats a run of beans with the same order id as one key, i.e. one group.
 */
public class OrderGroupingComparet extends WritableComparator {

    //Pass in the class of the key bean, plus `true` to let the framework create instances by reflection for comparison. This constructor is required; without it the custom grouping does not take effect.
    protected OrderGroupingComparet() {
        super(OrderBean.class, true);
    }

    /**
     * The actual grouping comparison.
     * The WritableComparable arguments are the deserialized key objects.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean a1 = (OrderBean) a;
        OrderBean b1 = (OrderBean) b;

        //When comparing two beans, look only at their orderId
        return a1.getOrderId().compareTo(b1.getOrderId());
    }
}
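A quick sanity check of the comparator outside the framework (GroupingDemo is a hypothetical class, assumed to sit in com.example.demo.ordermax so the protected constructor is accessible):

package com.example.demo.ordermax;

public class GroupingDemo {
    public static void main(String[] args) {
        OrderBean a = new OrderBean();
        a.set("10001", "p2", 650);
        OrderBean b = new OrderBean();
        b.set("10001", "p3", 101);
        // Same orderId, so the grouping comparator puts them in one group...
        System.out.println(new OrderGroupingComparet().compare(a, b)); // 0
        // ...while compareTo still orders them, largest amount first.
        System.out.println(a.compareTo(b)); // negative: a sorts before b
    }
}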

package com.example.demo.ordermax;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;

public class OrderBean implements WritableComparable<OrderBean> {

    private String orderId;
    private String pid;
    private int sumMoney;

    //No-arg constructor, required so the framework can instantiate the bean during deserialization
    public OrderBean() {
    }

    public void set(String orderId, String pid, int sumMoney) {
        this.orderId = orderId;
        this.pid = pid;
        this.sumMoney = sumMoney;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public int getSumMoney() {
        return sumMoney;
    }

    public void setSumMoney(int sumMoney) {
        this.sumMoney = sumMoney;
    }

    /**
     * Keys are sorted with this method during the spill, the merge of spill files,
     * and the reduce-side merge/grouping: a negative return value sorts this bean
     * before o, a positive one after it.
     */
    @Override
    public int compareTo(OrderBean o) {
        //Compare by order id first, ascending
        int flag = this.orderId.compareTo(o.getOrderId());

        //Same order id: compare by amount, descending (equal amounts compare as 0, as the compareTo contract requires)
        if (flag == 0) {
            flag = Integer.compare(o.getSumMoney(), this.sumMoney);
        }
        return flag;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(orderId);
        dataOutput.writeUTF(pid);
        dataOutput.writeInt(sumMoney);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.pid = dataInput.readUTF();
        this.sumMoney = dataInput.readInt();
    }

    @Override
    public String toString() {
        return "OrderBean{" +
                "orderId='" + orderId + '\'' +
                ", pid='" + pid + '\'' +
                ", sumMoney=" + sumMoney +
                '}';
    }
}
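To watch compareTo produce the shuffle order on the sample records, a small local sort (SortDemo is a hypothetical helper in the same package; no Hadoop involved):

package com.example.demo.ordermax;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortDemo {
    public static void main(String[] args) {
        String[] lines = {"10000,p1,100", "10000,p2,230", "10001,p1,120",
                          "10001,p2,650", "10001,p3,101", "10002,p1,102"};
        List<OrderBean> beans = new ArrayList<>();
        for (String line : lines) {
            String[] f = line.split(",");
            OrderBean bean = new OrderBean(); // a fresh bean per record
            bean.set(f[0], f[1], Integer.parseInt(f[2]));
            beans.add(bean);
        }
        Collections.sort(beans);
        // Expected: orders ascending, amounts descending within each order:
        // 230, 100 | 650, 120, 101 | 102
        beans.forEach(System.out::println);
    }
}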

package com.example.demo.ordermax;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * An efficiency note: whatever the framework can do for you (sort, partition,
 * group) is fast; equivalent logic hand-written inside map/reduce code is
 * comparatively slow.
 */
public class OrderMax {

    static class OrderMaxMap extends Mapper<LongWritable,Text,OrderBean,NullWritable>{
        //One reusable bean per mapper; safe because context.write() serializes it immediately
        private OrderBean bean = new OrderBean();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //Each input line: orderId,productId,amount
            String[] split = value.toString().split(",");
            bean.set(split[0], split[1], Integer.parseInt(split[2]));
            context.write(bean,NullWritable.get());
        }
    }

    static class OrderMaxReduce extends Reducer<OrderBean,NullWritable,OrderBean,NullWritable>{
        //By the time reduce() is called, all beans with the same orderId form one group, and the group's first key is the bean with the largest amount, so writing the key once emits the per-order maximum
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key,NullWritable.get());
        }
    }


    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(OrderMax.class);
        job.setMapperClass(OrderMaxMap.class);
        job.setReducerClass(OrderMaxReduce.class);

        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        //input directory
        FileInputFormat.setInputPaths(job,new Path("E:/abc"));
        //output directory (must not exist before the job runs)
        FileOutputFormat.setOutputPath(job,new Path("E:/abc/output"));
        //register the custom GroupingComparator
        job.setGroupingComparatorClass(OrderGroupingComparet.class);
        //register the custom Partitioner
        job.setPartitionerClass(OrderPartitioner.class);
        //set the number of reduce tasks
        job.setNumReduceTasks(2);
        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);

    }
}

Note:
This job runs in local mode rather than on a cluster; the input and output paths point at the local file system.
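With the sample input above, the two part files together should contain one line per order, formatted by OrderBean.toString() (which part file an order lands in depends on orderId.hashCode() % 2):

OrderBean{orderId='10000', pid='p2', sumMoney=230}
OrderBean{orderId='10001', pid='p2', sumMoney=650}
OrderBean{orderId='10002', pid='p1', sumMoney=102}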
