Hadoop Learning - MapReduce Quiz


All of the code below is commented.

1. Word Count

1、WordCountDriverNew

package net.sherry.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Function: word count driver class
 * Author: sherry
 * Date: December 7, 2022
 */
public class WordCountDriverNew {

    public static void main(String[] args) throws Exception {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Have the HDFS client address datanodes by hostname
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Get a job instance
        Job job = Job.getInstance(conf);
        // Set the job's driver class
        job.setJarByClass(WordCountDriverNew.class);

        // Set the Mapper class
        job.setMapperClass(WordCountMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // Set the Reducer class
        job.setReducerClass(WordCountReducer.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(IntWritable.class);

        // Set the number of partitions (number of reduce tasks and of result files)
        job.setNumReduceTasks(3);
        // Define the HDFS URI and user
        String uri = "hdfs://hadoop102:8020";
        String user = "sherry";
        Path inputPath = null;
        Path outputPath = null;
        if (args.length == 0) {
            // Default input directory
            inputPath = new Path(uri + "/wordcount/input");
            // Default output directory
            outputPath = new Path(uri + "/wordcount/output");
        } else if (args.length == 2) {
            // Input directory from the first argument
            inputPath = new Path(uri + args[0]);
            // Output directory from the second argument
            outputPath = new Path(uri + args[1]);
        } else {
            // Tell the user the argument count is invalid
            System.out.println("The number of arguments must be either 0 or 2");
            // Exit the application
            return;
        }

        // Get the file system
        FileSystem fs = FileSystem.get(new URI(uri), conf, user);
        // Delete the output directory (the second argument enables recursive deletion)
        fs.delete(outputPath, true);

        // Add the input directory to the job (more than one is allowed)
        FileInputFormat.addInputPath(job, inputPath);
        // Set the job's output directory (only one is allowed)
        FileOutputFormat.setOutputPath(job, outputPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        // Print the results
        System.out.println("====== Results ======");
        FileStatus[] fileStatuses = fs.listStatus(outputPath);
        // Start at index 1 to skip the _SUCCESS marker file
        for (int i = 1; i < fileStatuses.length; i++) {
            // Print the result file path
            System.out.println(fileStatuses[i].getPath());
            // Open an input stream on the result file
            FSDataInputStream in = fs.open(fileStatuses[i].getPath());
            // Copy the result file to the console
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Close the stream explicitly, since copyBytes was told not to
            in.close();
        }
    }
}

2、WordCountMapper

package net.sherry.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Function: word count mapper class
 * Author: sherry
 * Date: December 7, 2022
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line content
        String line = value.toString();
        System.out.println("Source line: " + line);
        // Strip punctuation with a regular expression
        line = line.replaceAll("[\\pP]", "");
        System.out.println("After regex cleanup: " + line);

        // Split on spaces to get the word array
        String[] words = line.split(" ");
        // Iterate over the words and emit a <word, 1> pair for each
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
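
As an optional refinement that is not part of the original quiz code, the mapper could reuse a single Text and IntWritable instance instead of allocating new objects for every word; the framework serializes each pair as soon as it is written, so reuse is safe. A minimal sketch under that assumption (class name WordCountMapperReuse is invented here):

package net.sherry.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Sketch: object-reuse variant of WordCountMapper (assumed refinement, not the original).
 */
public class WordCountMapperReuse extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text outKey = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Same punctuation cleanup and split as the original mapper
        String line = value.toString().replaceAll("[\\pP]", "");
        for (String word : line.split(" ")) {
            outKey.set(word);            // reuse one Text instance
            context.write(outKey, ONE);  // reuse one IntWritable instance
        }
    }
}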

3、WordCountReducer

package net.sherry.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Function: word count reducer class
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Define the word count
        int count = 0;
        // Iterate over the input values
        for (IntWritable value : values) {
            // For generality, accumulate with value.get() rather than assuming each value is 1
            count += value.get();
        }
        // Emit the new key-value pair; the Java int must be wrapped in Hadoop's IntWritable
        context.write(key, new IntWritable(count));
    }
}
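
Because the reduce logic is a plain sum, the same WordCountReducer class could also be registered as a combiner to pre-aggregate counts on the map side and cut shuffle traffic. This is an optional addition, not part of the original driver; it would be a single extra call in WordCountDriverNew:

// Optional (assumed) addition in WordCountDriverNew, next to setReducerClass:
job.setCombinerClass(WordCountReducer.class);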

2. Student Information Sorting

1、Student

package net.sherry.student;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Function: student entity class
 *      implements the WritableComparable (serializable, comparable) interface
 * Author: Sherry
 * Date: December 17, 2022
 */
public class Student implements WritableComparable<Student> {

    private String name;
    private String gender;
    private int age;
    private String phone;
    private String major;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getGender() {
        return gender;
    }

    public void setGender(String gender) {
        this.gender = gender;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public String getMajor() {
        return major;
    }

    public void setMajor(String major) {
        this.major = major;
    }

    @Override
    public String toString() {
        return "Student{" +
                "name='" + name + '\'' +
                ", gender='" + gender + '\'' +
                ", age='" + age + '\'' +
                ", phone='" + phone + '\'' +
                ", major='" + major + '\'' +
                '}';
    }

    @Override
    public int compareTo(Student o) {
        if (this.getGender().compareTo(o.getGender()) == 0) {
            // Same gender: sort by age in descending order
            return o.getAge() - this.getAge();
        } else {
            // Different genders: sort by gender in descending order
            return o.getGender().compareTo(this.getGender());
        }
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields; the order must match readFields()
        out.writeUTF(name);
        out.writeUTF(gender);
        out.writeInt(age);
        out.writeUTF(phone);
        out.writeUTF(major);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize the fields in the same order they were written
        name = in.readUTF();
        gender = in.readUTF();
        age = in.readInt();
        phone = in.readUTF();
        major = in.readUTF();
    }
}
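
A quick way to sanity-check this ordering is to sort a small in-memory list with it. The sketch below is illustrative only (the class name, student names, gender strings, and ages are made up); it relies on nothing but the Student class above.

package net.sherry.student;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class StudentSortCheck {

    public static void main(String[] args) {
        Student a = new Student();
        a.setName("studentA"); a.setGender("male"); a.setAge(19);
        Student b = new Student();
        b.setName("studentB"); b.setGender("male"); b.setAge(22);

        List<Student> list = new ArrayList<>();
        list.add(a);
        list.add(b);
        // Sort with the natural ordering defined by Student.compareTo
        list.sort(Comparator.naturalOrder());

        // Same gender, so the older student sorts first: studentB, then studentA
        list.forEach(System.out::println);
    }
}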

2、StudentDriver

package net.sherry.student;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Function: student information sorting driver class
 * Author: sherry
 * Date: December 7, 2022
 */
public class StudentDriver {

    public static void main(String[] args) throws Exception {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Have the HDFS client address datanodes by hostname
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Get a job instance
        Job job = Job.getInstance(conf);
        // Set the job's driver class
        job.setJarByClass(StudentDriver.class);

        // Set the Mapper class
        job.setMapperClass(StudentMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Student.class);
        // Set the map output value type
        job.setMapOutputValueClass(NullWritable.class);

        // Set the Reducer class
        job.setReducerClass(StudentReducer.class);
        // Set the reduce output key type (the reducer emits Text keys)
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(NullWritable.class);

        // Define the HDFS URI and user
        String uri = "hdfs://hadoop102:8020";
        String user = "sherry";
        // Input file
        Path inputPath = new Path(uri + "/wordcount/input/student.txt");
        // Output directory
        Path outputPath = new Path(uri + "/wordcount/output/student");

        // Get the file system
        FileSystem fs = FileSystem.get(new URI(uri), conf, user);
        // Delete the output directory (the second argument enables recursive deletion)
        fs.delete(outputPath, true);

        // Add the input path to the job (more than one is allowed)
        FileInputFormat.addInputPath(job, inputPath);
        // Set the job's output directory (only one is allowed)
        FileOutputFormat.setOutputPath(job, outputPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        // Print the results
        System.out.println("====== Results ======");
        FileStatus[] fileStatuses = fs.listStatus(outputPath);
        // Start at index 1 to skip the _SUCCESS marker file
        for (int i = 1; i < fileStatuses.length; i++) {
            // Print the result file path
            System.out.println(fileStatuses[i].getPath());
            // Open an input stream on the result file
            FSDataInputStream in = fs.open(fileStatuses[i].getPath());
            // Copy the result file to the console
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Close the stream explicitly, since copyBytes was told not to
            in.close();
        }
    }
}
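
Since StudentDriver leaves the reduce-task count at the default of 1, all records land in a single sorted result file. If the output were to be split across several files while keeping each file sorted, a custom Partitioner could route records, for example by gender, so that all records with the same gender end up in the same reduce task. This is an assumed extension, not part of the original quiz; the class name GenderPartitioner is invented here.

package net.sherry.student;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Hypothetical partitioner (assumed extension): all records with the same gender
 * go to the same reduce task, each output file staying in Student.compareTo order.
 */
public class GenderPartitioner extends Partitioner<Student, NullWritable> {

    @Override
    public int getPartition(Student key, NullWritable value, int numPartitions) {
        // Non-negative hash of the gender string, wrapped to the number of partitions
        return (key.getGender().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

In StudentDriver this would be enabled with job.setPartitionerClass(GenderPartitioner.class) together with job.setNumReduceTasks(2).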

3、StudentMapper

package net.sherry.student;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Function: student information mapper class
 */
public class StudentMapper extends Mapper<LongWritable, Text, Student, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line content
        String line = value.toString();
        System.out.println(line);
        // Split on spaces to get the field array
        String[] fields = line.split(" ");
        // Extract the student attributes
        String name = fields[0];
        String gender = fields[1];
        int age = Integer.parseInt(fields[2]);
        String phone = fields[3];
        String major = fields[4];
        // Create the student object
        Student student = new Student();
        // Populate its attributes
        student.setName(name);
        student.setGender(gender);
        student.setAge(age);
        student.setPhone(phone);
        student.setMajor(major);
        // Emit the <Student, null> key-value pair
        context.write(student, NullWritable.get());
    }
}
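
The mapper assumes every line has exactly five space-separated fields; a blank or malformed line would throw ArrayIndexOutOfBoundsException or NumberFormatException and fail the task. A simple guard, shown here as an assumed addition (only the relevant lines, placed right after the split in map()), could skip such lines instead:

// Assumed defensive check in StudentMapper.map(), directly after the split
if (fields.length < 5) {
    System.err.println("Skipping malformed line: " + line);
    return;
}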

4、StudentReducer

package net.sherry.student;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Function: student information reducer class
 */
public class StudentReducer extends Reducer<Student, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Student key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // The framework refills the key instance as the values iterator advances,
        // so the student fields are read inside the loop
        for (NullWritable value : values) {
            // Get the student object
            Student student = key;
            // Concatenate the student information
            String studentInfo = student.getName() + "\t"
                    + student.getGender() + "\t"
                    + student.getAge() + "\t"
                    + student.getPhone() + "\t"
                    + student.getMajor();
            // Emit the <studentInfo, null> key-value pair
            context.write(new Text(studentInfo), NullWritable.get());
        }
    }
}

3. Student Score Summation

1、ScoreDriver

package net.sherry.sum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Function: score summation driver class
 * Author: sherry
 * Date: December 7, 2022
 */
public class ScoreDriver {

    public static void main(String[] args) throws Exception {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Have the HDFS client address datanodes by hostname
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Get a job instance
        Job job = Job.getInstance(conf);
        // Set the job's driver class
        job.setJarByClass(ScoreDriver.class);

        // Set the Mapper class
        job.setMapperClass(ScoreMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // Set the Reducer class
        job.setReducerClass(ScoreReducer.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(NullWritable.class);

        // Number of partitions (reduce tasks / result files); left at the default here
//        job.setNumReduceTasks(3);
        // Define the HDFS URI and user
        String uri = "hdfs://hadoop102:8020";
        String user = "sherry";
        // Input file
        Path inputPath = new Path(uri + "/wordcount/input/score.txt");
        // Output directory
        Path outputPath = new Path(uri + "/wordcount/output/score");

        // Get the file system
        FileSystem fs = FileSystem.get(new URI(uri), conf, user);
        // Delete the output directory (the second argument enables recursive deletion)
        fs.delete(outputPath, true);

        // Add the input path to the job (more than one is allowed)
        FileInputFormat.addInputPath(job, inputPath);
        // Set the job's output directory (only one is allowed)
        FileOutputFormat.setOutputPath(job, outputPath);

        // Wait for the job to finish
        job.waitForCompletion(true);

        // Print the results
        System.out.println("====== Results ======");
        FileStatus[] fileStatuses = fs.listStatus(outputPath);
        // Start at index 1 to skip the _SUCCESS marker file
        for (int i = 1; i < fileStatuses.length; i++) {
            // Print the result file path
            System.out.println(fileStatuses[i].getPath());
            // Open an input stream on the result file
            FSDataInputStream in = fs.open(fileStatuses[i].getPath());
            // Copy the result file to the console
            IOUtils.copyBytes(in, System.out, 4096, false);
            // Close the stream explicitly, since copyBytes was told not to
            in.close();
        }
    }
}

2、ScoreMapper

package net.sherry.sum;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Function: score summation mapper class
 */
public class ScoreMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line content
        String line = value.toString();
        // Split on the two-space delimiter to get the field array
        String[] fields = line.split("  ");
        // Get the name; trim() removes surrounding spaces
        String name = fields[0].trim();
        // Iterate over the per-subject scores
        for (int i = 1; i < fields.length; i++) {
            // Debug: print the number of fields
            System.out.println(fields.length);
            // Get the score
            int score = Integer.parseInt(fields[i]);
            // Emit the <name, score> key-value pair for the next stage
            context.write(new Text(name), new IntWritable(score));
        }
    }
}
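
ScoreMapper splits each line on exactly two spaces, which ties it to one specific input layout. If the column spacing in score.txt varied, a whitespace-regex split would be more forgiving; this is an assumed alternative (only the relevant lines of map() are shown), not what the original code does:

// Assumed variant of the split in ScoreMapper: any run of whitespace acts as the delimiter
String[] fields = line.trim().split("\\s+");
String name = fields[0];  // no extra trim needed once the regex split is used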

3、ScoreReducer

package net.sherry.sum;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.text.DecimalFormat;

/**
 * Function: score summation reducer class
 * Author: sherry
 */
public class ScoreReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;   // number of subjects
        int sum = 0;     // total score
        double avg = 0;  // average score
        // Iterate over the values to compute the total
        for (IntWritable value : values) {
            count++;             // accumulate the subject count
            sum += value.get();  // accumulate the total score
        }
        // Compute the average
        avg = sum * 1.0 / count;
        // Create a decimal format object (at most one decimal place)
        DecimalFormat df = new DecimalFormat("#.#");
        String scoreInfo = key + " " + sum + " " + df.format(avg);
        // Emit the key-value pair
        context.write(new Text(scoreInfo), NullWritable.get());
    }
}
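
For reference, the "#.#" pattern used above keeps at most one decimal place and drops the decimal point entirely for whole numbers. A tiny standalone check (the values are invented for illustration):

import java.text.DecimalFormat;

public class FormatCheck {

    public static void main(String[] args) {
        DecimalFormat df = new DecimalFormat("#.#");
        System.out.println(df.format(84.333)); // prints 84.3
        System.out.println(df.format(85.0));   // prints 85
    }
}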

Origin blog.csdn.net/weixin_53547097/article/details/128356941