Hadoop Learning - MapReduce Quiz


All of the code below is commented.

1. Word Count

1、WordCountDriverNew

package net.sherry.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Function: word count driver class
 * Author: sherry
 * Date: December 7, 2022
 */
public class WordCountDriverNew {

    public static void main(String[] args) throws Exception {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Have the HDFS client address datanodes by hostname
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Get a job instance
        Job job = Job.getInstance(conf);
        // Set the job's driver class
        job.setJarByClass(WordCountDriverNew.class);

        // Set the Mapper class
        job.setMapperClass(WordCountMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // Set the Reducer class
        job.setReducerClass(WordCountReducer.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(IntWritable.class);

        // Set the number of partitions (number of reduce tasks and of result files)
        job.setNumReduceTasks(3);
        // Define the HDFS URI and user
        String uri = "hdfs://hadoop102:8020";
        String user = "sherry";
        Path inputPath = null;
        Path outputPath = null;
        if (args.length == 0) {
            // Default input directory
            inputPath = new Path(uri + "/wordcount/input");
            // Default output directory
            outputPath = new Path(uri + "/wordcount/output");
        } else if (args.length == 2) {
            // Input directory from the first argument
            inputPath = new Path(uri + args[0]);
            // Output directory from the second argument
            outputPath = new Path(uri + args[1]);
        } else {
            // Tell the user the argument count is invalid
            System.out.println("The number of arguments must be either 0 or 2");
            // Exit the application
            return;
        }

        // Get the file system
        FileSystem fs = FileSystem.get(new URI(uri), conf, user);
        // Delete the output directory (the second argument enables recursive deletion)
        fs.delete(outputPath, true);

        // Add the input directory to the job (more than one is allowed)
        FileInputFormat.addInputPath(job, inputPath);
        // Set the job's output directory (only one is allowed)
        FileOutputFormat.setOutputPath(job, outputPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        // Print the results
        System.out.println("====== Results ======");
        FileStatus[] fileStatuses = fs.listStatus(outputPath);
        // Start at index 1 to skip the _SUCCESS marker file
        for (int i = 1; i < fileStatuses.length; i++) {
            // Print the result file path
            System.out.println(fileStatuses[i].getPath());
            // Open an input stream on the result file
            FSDataInputStream in = fs.open(fileStatuses[i].getPath());
            // Copy the result file to the console
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Close the stream explicitly, since copyBytes was told not to
            in.close();
        }
    }
}

2、WordCountMapper

package net.sherry.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Function: word count mapper class
 * Author: sherry
 * Date: December 7, 2022
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line content
        String line = value.toString();
        System.out.println("Source line: " + line);
        // Strip punctuation with a regular expression
        line = line.replaceAll("[\\pP]", "");
        System.out.println("After regex cleanup: " + line);

        // Split on spaces to get the word array
        String[] words = line.split(" ");
        // Iterate over the words and emit a <word, 1> pair for each
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
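
As an optional refinement that is not part of the original quiz code, the mapper could reuse a single Text and IntWritable instance instead of allocating new objects for every word; the framework serializes each pair as soon as it is written, so reuse is safe. A minimal sketch under that assumption (class name WordCountMapperReuse is invented here):

package net.sherry.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Sketch: object-reuse variant of WordCountMapper (assumed refinement, not the original).
 */
public class WordCountMapperReuse extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text outKey = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Same punctuation cleanup and split as the original mapper
        String line = value.toString().replaceAll("[\\pP]", "");
        for (String word : line.split(" ")) {
            outKey.set(word);            // reuse one Text instance
            context.write(outKey, ONE);  // reuse one IntWritable instance
        }
    }
}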

3、WordCountReducer

package net.sherry.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Function: word count reducer class
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Define the word count
        int count = 0;
        // Iterate over the input values
        for (IntWritable value : values) {
            // For generality, accumulate with value.get() rather than assuming each value is 1
            count += value.get();
        }
        // Emit the new key-value pair; the Java int must be wrapped in Hadoop's IntWritable
        context.write(key, new IntWritable(count));
    }
}
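
Because the reduce logic is a plain sum, the same WordCountReducer class could also be registered as a combiner to pre-aggregate counts on the map side and cut shuffle traffic. This is an optional addition, not part of the original driver; it would be a single extra call in WordCountDriverNew:

// Optional (assumed) addition in WordCountDriverNew, next to setReducerClass:
job.setCombinerClass(WordCountReducer.class);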

2. Student Information Sorting

1、Student

package net.sherry.student;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Function: student entity class
 *      implements the WritableComparable (serializable, comparable) interface
 * Author: Sherry
 * Date: December 17, 2022
 */
public class Student implements WritableComparable<Student> {

    private String name;
    private String gender;
    private int age;
    private String phone;
    private String major;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getGender() {
        return gender;
    }

    public void setGender(String gender) {
        this.gender = gender;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public String getMajor() {
        return major;
    }

    public void setMajor(String major) {
        this.major = major;
    }

    @Override
    public String toString() {
        return "Student{" +
                "name='" + name + '\'' +
                ", gender='" + gender + '\'' +
                ", age='" + age + '\'' +
                ", phone='" + phone + '\'' +
                ", major='" + major + '\'' +
                '}';
    }

    @Override
    public int compareTo(Student o) {
        if (this.getGender().compareTo(o.getGender()) == 0) {
            // Same gender: sort by age in descending order
            return o.getAge() - this.getAge();
        } else {
            // Different genders: sort by gender in descending order
            return o.getGender().compareTo(this.getGender());
        }
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields; the order must match readFields()
        out.writeUTF(name);
        out.writeUTF(gender);
        out.writeInt(age);
        out.writeUTF(phone);
        out.writeUTF(major);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize the fields in the same order they were written
        name = in.readUTF();
        gender = in.readUTF();
        age = in.readInt();
        phone = in.readUTF();
        major = in.readUTF();
    }
}
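
A quick way to sanity-check this ordering is to sort a small in-memory list with it. The sketch below is illustrative only (the class name, student names, gender strings, and ages are made up); it relies on nothing but the Student class above.

package net.sherry.student;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class StudentSortCheck {

    public static void main(String[] args) {
        Student a = new Student();
        a.setName("studentA"); a.setGender("male"); a.setAge(19);
        Student b = new Student();
        b.setName("studentB"); b.setGender("male"); b.setAge(22);

        List<Student> list = new ArrayList<>();
        list.add(a);
        list.add(b);
        // Sort with the natural ordering defined by Student.compareTo
        list.sort(Comparator.naturalOrder());

        // Same gender, so the older student sorts first: studentB, then studentA
        list.forEach(System.out::println);
    }
}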

2、StudentDriver

package net.sherry.student;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Function: student information sorting driver class
 * Author: sherry
 * Date: December 7, 2022
 */
public class StudentDriver {

    public static void main(String[] args) throws Exception {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Have the HDFS client address datanodes by hostname
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Get a job instance
        Job job = Job.getInstance(conf);
        // Set the job's driver class
        job.setJarByClass(StudentDriver.class);

        // Set the Mapper class
        job.setMapperClass(StudentMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Student.class);
        // Set the map output value type
        job.setMapOutputValueClass(NullWritable.class);

        // Set the Reducer class
        job.setReducerClass(StudentReducer.class);
        // Set the reduce output key type (the reducer emits Text keys)
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(NullWritable.class);

        // Define the HDFS URI and user
        String uri = "hdfs://hadoop102:8020";
        String user = "sherry";
        // Input file
        Path inputPath = new Path(uri + "/wordcount/input/student.txt");
        // Output directory
        Path outputPath = new Path(uri + "/wordcount/output/student");

        // Get the file system
        FileSystem fs = FileSystem.get(new URI(uri), conf, user);
        // Delete the output directory (the second argument enables recursive deletion)
        fs.delete(outputPath, true);

        // Add the input path to the job (more than one is allowed)
        FileInputFormat.addInputPath(job, inputPath);
        // Set the job's output directory (only one is allowed)
        FileOutputFormat.setOutputPath(job, outputPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        // Print the results
        System.out.println("====== Results ======");
        FileStatus[] fileStatuses = fs.listStatus(outputPath);
        // Start at index 1 to skip the _SUCCESS marker file
        for (int i = 1; i < fileStatuses.length; i++) {
            // Print the result file path
            System.out.println(fileStatuses[i].getPath());
            // Open an input stream on the result file
            FSDataInputStream in = fs.open(fileStatuses[i].getPath());
            // Copy the result file to the console
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Close the stream explicitly, since copyBytes was told not to
            in.close();
        }
    }
}
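
Since StudentDriver leaves the reduce-task count at the default of 1, all records land in a single sorted result file. If the output were to be split across several files while keeping each file sorted, a custom Partitioner could route records, for example by gender, so that all records with the same gender end up in the same reduce task. This is an assumed extension, not part of the original quiz; the class name GenderPartitioner is invented here.

package net.sherry.student;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Hypothetical partitioner (assumed extension): all records with the same gender
 * go to the same reduce task, each output file staying in Student.compareTo order.
 */
public class GenderPartitioner extends Partitioner<Student, NullWritable> {

    @Override
    public int getPartition(Student key, NullWritable value, int numPartitions) {
        // Non-negative hash of the gender string, wrapped to the number of partitions
        return (key.getGender().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

In StudentDriver this would be enabled with job.setPartitionerClass(GenderPartitioner.class) together with job.setNumReduceTasks(2).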

3、StudentMapper

package net.sherry.student;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Function: student information mapper class
 */
public class StudentMapper extends Mapper<LongWritable, Text, Student, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line content
        String line = value.toString();
        System.out.println(line);
        // Split on spaces to get the field array
        String[] fields = line.split(" ");
        // Extract the student attributes
        String name = fields[0];
        String gender = fields[1];
        int age = Integer.parseInt(fields[2]);
        String phone = fields[3];
        String major = fields[4];
        // Create the student object
        Student student = new Student();
        // Populate its attributes
        student.setName(name);
        student.setGender(gender);
        student.setAge(age);
        student.setPhone(phone);
        student.setMajor(major);
        // Emit the <Student, null> key-value pair
        context.write(student, NullWritable.get());
    }
}
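
The mapper assumes every line has exactly five space-separated fields; a blank or malformed line would throw ArrayIndexOutOfBoundsException or NumberFormatException and fail the task. A simple guard, shown here as an assumed addition (only the relevant lines, placed right after the split in map()), could skip such lines instead:

// Assumed defensive check in StudentMapper.map(), directly after the split
if (fields.length < 5) {
    System.err.println("Skipping malformed line: " + line);
    return;
}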

4、StudentReducer

package net.sherry.student;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Function: student information reducer class
 */
public class StudentReducer extends Reducer<Student, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Student key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // The framework refills the key instance as the values iterator advances,
        // so the student fields are read inside the loop
        for (NullWritable value : values) {
            // Get the student object
            Student student = key;
            // Concatenate the student information
            String studentInfo = student.getName() + "\t"
                    + student.getGender() + "\t"
                    + student.getAge() + "\t"
                    + student.getPhone() + "\t"
                    + student.getMajor();
            // Emit the <studentInfo, null> key-value pair
            context.write(new Text(studentInfo), NullWritable.get());
        }
    }
}

3. Student Score Summation

1、ScoreDriver

package net.sherry.sum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Function: score summation driver class
 * Author: sherry
 * Date: December 7, 2022
 */
public class ScoreDriver {

    public static void main(String[] args) throws Exception {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Have the HDFS client address datanodes by hostname
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Get a job instance
        Job job = Job.getInstance(conf);
        // Set the job's driver class
        job.setJarByClass(ScoreDriver.class);

        // Set the Mapper class
        job.setMapperClass(ScoreMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // Set the Reducer class
        job.setReducerClass(ScoreReducer.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(NullWritable.class);

        // Number of partitions (reduce tasks / result files); left at the default here
//        job.setNumReduceTasks(3);
        // Define the HDFS URI and user
        String uri = "hdfs://hadoop102:8020";
        String user = "sherry";
        // Input file
        Path inputPath = new Path(uri + "/wordcount/input/score.txt");
        // Output directory
        Path outputPath = new Path(uri + "/wordcount/output/score");

        // Get the file system
        FileSystem fs = FileSystem.get(new URI(uri), conf, user);
        // Delete the output directory (the second argument enables recursive deletion)
        fs.delete(outputPath, true);

        // Add the input path to the job (more than one is allowed)
        FileInputFormat.addInputPath(job, inputPath);
        // Set the job's output directory (only one is allowed)
        FileOutputFormat.setOutputPath(job, outputPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        // Print the results
        System.out.println("====== Results ======");
        FileStatus[] fileStatuses = fs.listStatus(outputPath);
        // Start at index 1 to skip the _SUCCESS marker file
        for (int i = 1; i < fileStatuses.length; i++) {
            // Print the result file path
            System.out.println(fileStatuses[i].getPath());
            // Open an input stream on the result file
            FSDataInputStream in = fs.open(fileStatuses[i].getPath());
            // Copy the result file to the console
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Close the stream explicitly, since copyBytes was told not to
            in.close();
        }
    }
}

2、ScoreMapper

package net.sherry.sum;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Function: score summation mapper class
 */
public class ScoreMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line content
        String line = value.toString();
        // Split on the two-space delimiter to get the field array
        String[] fields = line.split("  ");
        // Get the name; trim() removes surrounding spaces
        String name = fields[0].trim();
        // Iterate over the per-subject scores
        for (int i = 1; i < fields.length; i++) {
            // Debug: print the number of fields
            System.out.println(fields.length);
            // Get the score
            int score = Integer.parseInt(fields[i]);
            // Emit the <name, score> key-value pair for the next stage
            context.write(new Text(name), new IntWritable(score));
        }
    }
}
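
ScoreMapper splits each line on exactly two spaces, which ties it to one specific input layout. If the column spacing in score.txt varied, a whitespace-regex split would be more forgiving; this is an assumed alternative (only the relevant lines of map() are shown), not what the original code does:

// Assumed variant of the split in ScoreMapper: any run of whitespace acts as the delimiter
String[] fields = line.trim().split("\\s+");
String name = fields[0];  // no extra trim needed once the regex split is used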

3、ScoreReducer

package net.sherry.sum;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.text.DecimalFormat;

/**
 * Function: score summation reducer class
 * Author: sherry
 */
public class ScoreReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;   // number of subjects
        int sum = 0;     // total score
        double avg = 0;  // average score
        // Iterate over the values to compute the total
        for (IntWritable value : values) {
            count++;             // accumulate the subject count
            sum += value.get();  // accumulate the total score
        }
        // Compute the average
        avg = sum * 1.0 / count;
        // Create a decimal format object (at most one decimal place)
        DecimalFormat df = new DecimalFormat("#.#");
        String scoreInfo = key + " " + sum + " " + df.format(avg);
        // Emit the key-value pair
        context.write(new Text(scoreInfo), NullWritable.get());
    }
}
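
For reference, the "#.#" pattern used above keeps at most one decimal place and drops the decimal point entirely for whole numbers. A tiny standalone check (the values are invented for illustration):

import java.text.DecimalFormat;

public class FormatCheck {

    public static void main(String[] args) {
        DecimalFormat df = new DecimalFormat("#.#");
        System.out.println(df.format(84.333)); // prints 84.3
        System.out.println(df.format(85.0));   // prints 85
    }
}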

Origin blog.csdn.net/weixin_53547097/article/details/128356941