package kaoshi3;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Counts how many times each word appears in each input file, and for every
 * word writes the per-file counts in descending order.
 *
 * <p>Input: space-separated text files under {@code hdfs://hadoop01:9000/ksin02}.
 * Output line format: {@code word<TAB>fileX:bigCount<TAB>fileY:smallCount}.
 *
 * <p>NOTE(review): the reducer assumes exactly two input files and hard-codes
 * the name "mapreduce-4-1.txt" for one of them — confirm against the contents
 * of the input directory before reusing this job elsewhere.
 */
public class wordcount {

    /** Emits (word, sourceFileName) for every word in the input split. */
    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text mk = new Text();
        Text mv = new Text();
        String filename = "";

        /**
         * Runs once per input split; captures the split's file name so that
         * map() can tag every word with the file it came from.
         */
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // InputSplit has no file-name accessor; FileSplit does, so downcast.
            FileSplit split = (FileSplit) context.getInputSplit();
            filename = split.getPath().getName();
        }

        /**
         * Splits each line on single spaces and emits (word, filename).
         * Example input line: "liangchaowei love liujialing"
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String word : value.toString().split(" ")) {
                mk.set(word);
                mv.set(filename);
                context.write(mk, mv); // word as key, file name as value
            }
        }
    }

    /**
     * For each word, tallies occurrences per file and writes both counts with
     * the larger one first: {@code fileX:bigCount<TAB>fileY:smallCount}.
     */
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text t = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // BUG FIX: these counters were instance fields and were never
            // reset, so counts leaked across reduce() calls (one call per
            // word) and every word after the first got inflated totals.
            // They must be per-key locals.
            int sum1 = 0;
            int sum2 = 0;
            String file1 = "";
            String file2 = "";
            for (Text v : values) {
                String name = v.toString();
                // Hard-coded split between the two expected input files.
                if (name.startsWith("mapreduce-4-1.txt")) {
                    file1 = name;
                    sum1++;
                } else {
                    file2 = name;
                    sum2++;
                }
            }
            // Larger count first (descending). A tie takes the else branch
            // (file2 first), matching the original comparison.
            if (sum1 > sum2) {
                t.set(file1 + ":" + sum1 + "\t" + file2 + ":" + sum2);
            } else {
                t.set(file2 + ":" + sum2 + "\t" + file1 + ":" + sum1);
            }
            context.write(key, t);
        }
    }

    /**
     * Configures and submits the job. Deletes the output path up front so
     * reruns do not fail with "output directory already exists".
     */
    public static void main(String[] args)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        // Needed when launching from a local IDE as a different OS user.
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(kaoshi3.wordcount.class); // locate the job jar
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/ksin02"));
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), conf);
        Path outputPath = new Path("/ksout02");
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true); // recursive delete of stale output
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        // Propagate job success/failure to the caller via the exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
// Source: blog.csdn.net/YZY_001/article/details/82289780