Word Count: A Simple Example


A simple word count based on Hadoop MapReduce. The example consists of three classes: a driver (Jobwc) that configures and submits the job, a mapper (Mapwc) that splits each input line into words, and a reducer (Reducewc) that sums the count for each word.


- Jobwc

package com.wordcount;



import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Jobwc {
	public static void main(String[] args) throws IOException {
		//1. Configuration: connect to HDFS
		Configuration configuration = new Configuration();
		configuration.set("fs.defaultFS","hdfs://node01:8020");//the active NameNode
		configuration.set("yarn.resourcemanager.hostname", "node03:8088");//resourceManager节点
	
		//2. Job configuration
		Job job = Job.getInstance(configuration);//load the configuration into the job
		job.setJarByClass(Jobwc.class);//locate the containing jar via this class
		job.setJobName("wc");//job name
		
		job.setMapperClass(Mapwc.class);//mapper: split lines into words
		job.setReducerClass(Reducewc.class);//reducer: aggregate the counts
		
		job.setMapOutputKeyClass(Text.class);//map output key type
		job.setMapOutputValueClass(IntWritable.class);//map output value type
		
		//3. Input: the file on HDFS to process
		FileInputFormat.addInputPaths(job, "/WC/input/word.txt");
		
		//4. Output: write results to the given directory
		Path path = new Path("/WC/output/");//output directory
		FileSystem fs = FileSystem.get(configuration);
		if(fs.exists(path)){ //delete the output directory if it already exists
			fs.delete(path,true);
		}
		FileOutputFormat.setOutputPath(job, path);
		
		//5. Submit the job and wait for it to finish
		boolean f;
		try{
			f = job.waitForCompletion(true);//true: print progress to the console
			if(f){
				System.out.println("job success ~");
			}else{
				System.out.println("------error-------");
			}
		} catch (ClassNotFoundException e){
			e.printStackTrace();
		} catch (InterruptedException e){
			e.printStackTrace();
		}
	}
}
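
The driver above only sets the map output types and relies on the defaults for the final output types. A common refinement for word count, not part of the original post, is to also declare the job output types and reuse the reducer as a combiner so counts are pre-aggregated on the map side; a minimal sketch of those extra lines:

		job.setCombinerClass(Reducewc.class);//optional: pre-aggregate <word, 1> pairs on the map side
		job.setOutputKeyClass(Text.class);//reducer output key type
		job.setOutputValueClass(IntWritable.class);//reducer output value type

Once the three classes are packaged into a jar (say wc.jar, a name chosen here for illustration), the job can be submitted with: hadoop jar wc.jar com.wordcount.Jobwc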

- Mapwc

package com.wordcount;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;


//Mapper: emit <word, 1> for every word in each input line
public class Mapwc extends Mapper<LongWritable, Text, Text, IntWritable> {
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		StringTokenizer words = new StringTokenizer(line);//tokenize on whitespace
		while (words.hasMoreTokens()) {
			context.write(new Text(words.nextToken()), new IntWritable(1));
		}
//		Alternative: split on single spaces with Hadoop's StringUtils
//		String words[] = StringUtils.split(line, ' ');
//		for (String ww : words) {
//			context.write(new Text(ww), new IntWritable(1));
//		}
	}

}
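
To make the mapper's role concrete: for an input line that reads "hello world hello", it emits (hello, 1), (world, 1), (hello, 1). The framework then groups these pairs by key, so the reducer receives each word together with all of its 1s.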

- Reducewc

package com.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Reducer: combine the individual <word, 1> pairs into <word, count>
public class Reducewc extends Reducer<Text, IntWritable, Text, IntWritable>{
	//key: the word; values: the 1s emitted for that word, e.g. {1,1,1,1,1}
	@Override
	protected void reduce(Text words, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
		int sum = 0;
		for(IntWritable s : values){
			sum += s.get();//each value is a 1
		}
		context.write(words, new IntWritable(sum));//emit the aggregated count
	}
}
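
Putting the three pieces together, suppose /WC/input/word.txt contains the following made-up sample:

hello hadoop
hello world

The job would then write a result like this to /WC/output (by default in a file named part-r-00000, tab-separated):

hadoop	1
hello	2
world	1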
