[MapReduce] Comprehensive case

① Data file

(Screenshot of the two data files omitted.)
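
The column layout of the two files can be inferred from the sample rows quoted in the Mapper code below; treat these rows as illustrative assumptions, not the full data set:

information.txt (tab-separated: hobby, job, id):

游戏	大数据	1

student.txt (tab-separated: id, name, sex):

1	张三	女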


② Specific requirements

  • Upload information.txt to HDFS
  • Keep student.txt on the local file system
  • Read the HDFS data through the distributed cache
  • Encapsulate the HDFS data and the local data into one JavaBean object
  • Build the object on the map side; on the reduce side, count how many of its attributes are Null and output that count as the value, with the Bean object's toString as the output key
  • Set the number of partitions to two, partitioned by gender

③ Concrete realization

• Upload the files

(Screenshot of the file upload omitted.)


• Encapsulate the Bean class

package CSDN综合练习;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Bean implements Writable {

    // Attributes joined from the two data files
    private String id;
    private String name;
    private String sex;
    private String hobby;
    private String job;

    // No-arg constructor, required by Hadoop for deserialization
    public Bean() {
    }

    // toString is what the reduce side writes as the output key
    @Override
    public String toString() {
        return "Bean{" +
                "id='" + id + '\'' +
                ", name='" + name + '\'' +
                ", sex='" + sex + '\'' +
                ", hobby='" + hobby + '\'' +
                ", job='" + job + '\'' +
                '}';
    }

    // Serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(sex);
        dataOutput.writeUTF(hobby);
        dataOutput.writeUTF(job);
    }

    // Deserialization: read the fields in the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readUTF();
        name = dataInput.readUTF();
        sex = dataInput.readUTF();
        hobby = dataInput.readUTF();
        job = dataInput.readUTF();
    }

    // Getters and setters
    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public String getHobby() {
        return hobby;
    }

    public void setHobby(String hobby) {
        this.hobby = hobby;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }
}
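
As a quick local sanity check (a sketch added here, not part of the original post), the Writable contract can be verified with an in-memory round trip; readFields() must read the fields in exactly the order write() wrote them:

package CSDN综合练习;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class BeanRoundTrip {

    public static void main(String[] args) throws Exception {
        // Populate a bean with the sample values from the data files
        Bean in = new Bean();
        in.setId("1");
        in.setName("张三");
        in.setSex("女");
        in.setHobby("游戏");
        in.setJob("大数据");

        // Serialize into an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buffer));

        // Deserialize from the same bytes and print the result
        Bean out = new Bean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(out);  // Bean{id='1', name='张三', sex='女', hobby='游戏', job='大数据'}
    }
}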



• The Mapper class caches information.txt and joins it with student.txt

package CSDN综合练习;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, Bean> {

    Bean bean = new Bean();
    HashMap<String, String> map = new HashMap<>();

    /**
     * Cache the data table from HDFS (runs once per map task, before map()).
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        try {
            // Get the cache files registered in the driver
            URI[] cacheFiles = context.getCacheFiles();
            // Get the path of the first cache file
            String path = cacheFiles[0].getPath();
            System.out.println(path);
            // Read the file line by line
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
            String line;
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                // e.g. 游戏	大数据	1  (hobby \t job \t id)
                // Split one line and index it by id
                String[] fields = line.split("\t");
                map.put(fields[2], fields[0] + "\t" + fields[1]);
            }
            // Close the reader
            IOUtils.closeStream(br);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. 1	张三	女  (id \t name \t sex)
        // Split one line of student.txt
        String line = value.toString();
        String[] fields = line.split("\t");
        // Fill the bean, joining on id against the cached table
        String id = fields[0];
        String name = fields[1];
        String sex = fields[2];
        String[] cached = map.get(id).split("\t");
        String hobby = cached[0];
        String job = cached[1];
        bean.setId(id);
        bean.setName(name);
        bean.setSex(sex);
        bean.setHobby(hobby);
        bean.setJob(job);
        // Emit
        context.write(new Text(id), bean);
    }
}
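
For the sample rows shown earlier (assuming the inferred column layout), one map() call for the input line 1	张三	女 looks up id 1 in the cached table, fills the bean with hobby 游戏 and job 大数据, and emits the pair (1, Bean{id='1', name='张三', sex='女', hobby='游戏', job='大数据'}).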



• The Reducer class counts the object attributes whose value is Null

package CSDN综合练习;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class Reduce extends Reducer<Text, Bean, Bean, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<Bean> values, Context context) throws IOException, InterruptedException {
        // Count the Null-valued attributes of each bean
        for (Bean bean : values) {
            int count = 0;  // reset per bean so counts don't accumulate across values
            if (bean.getName() == null || bean.getName().equals("Null")) {
                count++;
            }
            if (bean.getJob() == null || bean.getJob().equals("Null")) {
                count++;
            }
            if (bean.getHobby() == null || bean.getHobby().equals("Null")) {
                count++;
            }
            if (bean.getSex() == null || bean.getSex().equals("Null")) {
                count++;
            }
            context.write(bean, new LongWritable(count));
        }
    }
}
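
As an illustration (assuming a row whose hobby and job columns hold the literal string "Null"), the reducer would emit that bean's toString as the key and 2 as the value; a fully populated bean is emitted with 0.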



• The Partition class implements partitioning by gender

package CSDN综合练习;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class Partition extends Partitioner<Text, Bean> {

    @Override
    public int getPartition(Text text, Bean bean, int numPartitions) {
        // Read the gender from the bean
        String sex = bean.getSex();
        // Partition 0 for 男, partition 1 for everything else
        // ("男".equals(sex) is null-safe)
        if ("男".equals(sex)) {
            return 0;
        }
        return 1;
    }
}
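
Because the driver below sets setNumReduceTasks(2) to match, rows routed to partition 0 (sex 男) land in the output file part-r-00000 and all other rows in part-r-00001.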



• Driver configuration

package CSDN综合练习;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;

public class Driver {

    public static void main(String[] args) {
        Job job;
        Configuration conf = new Configuration();
        try {
            // Get the job
            job = Job.getInstance(conf);
            // Wire up the classes
            job.setMapperClass(Mapper.class);
            job.setReducerClass(Reduce.class);
            job.setJarByClass(Driver.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Bean.class);
            job.setOutputKeyClass(Bean.class);
            job.setOutputValueClass(LongWritable.class);

            // Register the cache file (the HDFS path below is unused here;
            // it is the one that triggers the error described in the note)
            String path = "hdfs://192.168.64.178:9000/user/MR/input/information.txt";
            job.addCacheFile(new URI("file:///G:/Projects/IdeaProject-C/MapReduce/src/main/java/CSDN综合练习/data/information.txt"));

            // Custom partitioner
            job.setPartitionerClass(Partition.class);
            // Number of reduce tasks, matching the number of partitions
            job.setNumReduceTasks(2);

            // Input and output paths
            FileInputFormat.setInputPaths(job, new Path("G:/Projects/IdeaProject-C/MapReduce/src/main/java/CSDN综合练习/data/student.txt"));
            FileOutputFormat.setOutputPath(job, new Path("G:/Projects/IdeaProject-C/MapReduce/src/main/java/CSDN综合练习/output_withoutReducer"));

            // Submit the job
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
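
One operational caveat (standard MapReduce behavior, not specific to this post): FileOutputFormat requires that the output directory not exist yet, so output_withoutReducer must be deleted before re-running, or the job fails with a FileAlreadyExistsException.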

(Screenshot of the run result omitted.)

Note:

  • When the HDFS file path above is used for the cache file, an error is reported: java.io.FileNotFoundException: \user\MR\input\information.txt (The system cannot find the specified path.). The cause is unknown for now and this will be updated once it is solved~
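
A likely cause (an assumption, not confirmed in the original post): when the job runs locally rather than on a YARN cluster, cache files are not localized to the task's working directory, so the FileInputStream in Mapper.setup() tries to open the HDFS path /user/MR/input/information.txt on the local disk and fails. A minimal sketch of a workaround is to open the file directly through the Hadoop FileSystem API instead:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Inside Mapper.setup() — a sketch, not the original code:
URI uri = context.getCacheFiles()[0];
// Open the file through the FileSystem for that URI (HDFS here),
// instead of FileInputStream, which can only read local paths
FileSystem fs = FileSystem.get(uri, context.getConfiguration());
BufferedReader br = new BufferedReader(
        new InputStreamReader(fs.open(new Path(uri.getPath())), "UTF-8"));
// ...then read lines into the HashMap exactly as before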



Origin blog.csdn.net/qq_45797116/article/details/114045303