Hadoop_WordCount Word Count

  1. Create a project named hadoop02

  2. The pom file:

<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.blu</groupId>
	<artifactId>hadoop02</artifactId>
	<version>0.0.1-SNAPSHOT</version>

	<dependencies>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>2.9.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>2.9.2</version>
		</dependency>
	
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>2.9.2</version>
		</dependency>
		
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>

		<dependency>
			<groupId>jdk.tools</groupId>
			<artifactId>jdk.tools</artifactId>
			<version>1.8</version>
			<scope>system</scope>
			<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
		</dependency>
	</dependencies>

</project>
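
To check that these dependencies resolve and the project compiles, you can run Maven from the project root (assuming Maven is installed and on the PATH):

mvn clean compile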

  3. MyWordCountMapper
package com.blu.mywordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


/**
 * Input data types:
 * KEYIN: by default, the input KEY is the byte offset at which a given line
 * 			starts in the input data. Its type is Long; Hadoop provides its own
 * 			serialization framework, so LongWritable is used instead of Long.
 * VALUEIN:   by default, the input VALUE is the content of one line. Its type
 * 			is String, so Text should be used here.
 * 
 * Output data types:
 * KEYOUT: the KEY type returned after the map method finishes
 * VALUEOUT: the VALUE type returned after the map method finishes
 * 
 * Input data format:
 * good morning
 * good afternoon
 * 
 * Output data format:
 * good 1
 * morning 1
 * good 1
 * afternoon 1
 * 
 * @author BLU
 */
public class MyWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
	
	private Text text = new Text();
	private IntWritable iw = new IntWritable(1);
	
	
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
			throws IOException, InterruptedException {
		
		//get one line of input
		String content = value.toString(); //e.g. "good morning"
		String[] vals = content.split(" ");
		for(String v : vals) {
			text.set(v);
			context.write(text, iw);
		}
	}
}
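
The mapper's core logic is just splitting each line on single spaces. Here is a minimal sanity check of that splitting logic, using the junit 4.12 dependency already declared in the pom (the test class SplitLogicTest is an illustration, not part of the original project):

package com.blu.mywordcount;

import static org.junit.Assert.assertArrayEquals;

import org.junit.Test;

public class SplitLogicTest {

	//verifies the split(" ") call the mapper relies on
	@Test
	public void splitsLineOnSingleSpaces() {
		String[] vals = "good morning".split(" ");
		assertArrayEquals(new String[] { "good", "morning" }, vals);
	}
}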

  4. MyWordCountReducer
package com.blu.mywordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Input data types: the mapper's output data types
 * KEYIN, VALUEIN
 * Output data types: the final output data types
 * KEYOUT, VALUEOUT
 * 
 * Final output data format:
 * good 2
 * morning 1
 * afternoon 1
 * 
 * @author BLU
 *
 */


public class MyWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
	
	/**
	 * Data coming from the mapper (sorted in dictionary order):
	 * afternoon 1
	 * good 1
	 * good 1
	 * morning 1
	 */
	
	
	IntWritable iwsum = new IntWritable();
	
	@Override
	protected void reduce(Text text, Iterable<IntWritable> value,
			Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
		
		int sum = 0;
		for(IntWritable iw : value) {
			sum += iw.get();
		}
		iwsum.set(sum);
		//data to be written out:
//		afternoon 1
//		good 2
//		morning 1
		context.write(text, iwsum);
		
	}
	
}
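
Between map and reduce, the framework groups the mapper's output by key and sorts the keys, which is why reduce receives each word exactly once, together with all of its 1s. Here is a minimal sketch of that grouping in plain Java (no Hadoop involved; the class ShuffleSketch is an illustration, not part of the original project):

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

public class ShuffleSketch {

	public static void main(String[] args) {
		//raw mapper output: one (word, 1) pair per word occurrence
		String[] mapOutput = { "good", "morning", "good", "afternoon" };
		//a TreeMap keeps its keys in dictionary order, like the shuffle/sort phase
		TreeMap<String, List<Integer>> grouped = new TreeMap<>();
		for (String word : mapOutput) {
			grouped.computeIfAbsent(word, k -> new ArrayList<>()).add(1);
		}
		//each entry corresponds to one reduce() call: a key and all of its values
		grouped.forEach((k, v) -> System.out.println(k + " -> " + v));
	}
}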

  5. MyWordCount
package com.blu.mywordcount;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWordCount {
	
	public static void main(String[] args) {
		
		try {
			Configuration conf = new Configuration();
			Job job = Job.getInstance(conf);
			
			//set the class to run
			job.setJarByClass(MyWordCount.class);
			
			//set the mapper and reducer classes
			job.setMapperClass(MyWordCountMapper.class);
			job.setReducerClass(MyWordCountReducer.class);
			
			//the mapper's output data types and the final output data types
			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(IntWritable.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			
			//set the input and output paths
			// hadoop jar example.jar wordcount /input/a.txt /output
			FileInputFormat.addInputPath(job, new Path(args[0]));
			FileOutputFormat.setOutputPath(job, new Path(args[1]));
			
			//run the job
			boolean flag = job.waitForCompletion(true);
			//0 means a normal exit
			//1 means an abnormal exit
			System.exit(flag ? 0 : 1);
			
			
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}

  6. Create a log4j.properties file under the resources directory
### settings ###
log4j.rootLogger = debug,stdout,D,E

### log to the console ###
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n

### log everything at DEBUG level and above to E://logs/log.log ###
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = E://logs/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG 
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n

### log everything at ERROR level and above to E://logs/error.log ###
log4j.appender.E = org.apache.log4j.DailyRollingFileAppender
log4j.appender.E.File =E://logs/error.log 
log4j.appender.E.Append = true
log4j.appender.E.Threshold = ERROR 
log4j.appender.E.layout = org.apache.log4j.PatternLayout
log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n

  7. Create testdata.txt under D:\data with the following content:
good morning
good afternoon
good evening

  8. Run MyWordCount's main method with program arguments (the input file and an output directory), as sketched below
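
Because no cluster configuration is on the classpath here, the job runs in local mode against the local filesystem, so both arguments can be plain local paths. A plausible run configuration (the exact paths are assumptions based on the D:\data setup above; the output directory must not exist yet):

Main class:        com.blu.mywordcount.MyWordCount
Program arguments: D:/data/testdata.txt D:/data/output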

  9. Results
    An output directory is created under D:\data, containing four files (presumably _SUCCESS and part-r-00000 plus their .crc checksum files, which a local run produces).
    Open the part-r-00000 file with a text editor:

afternoon	1
evening	1
good	3
morning	1

Second way to run

  1. Build the jar
    Right-click the project >> Run As >> Maven build…
    In Goals, enter package as the packaging goal
    (roughly equivalent to the command line shown below)
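A sketch of the equivalent command, run from the project root (assuming Maven is installed and on the PATH):

mvn clean package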
  2. The jar is generated in the project's target directory: hadoop02-0.0.1-SNAPSHOT.jar
  3. Upload the jar to the virtual machine, and upload a testdata.txt file with the following content to HDFS (a sample upload command follows the file contents):
good morning
good afternoon
good evening
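A plausible upload command, assuming testdata.txt is in the current directory on the virtual machine and should land at the HDFS root (matching the /testdata.txt path used in the next step):

hdfs dfs -put testdata.txt /testdata.txt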
  4. Run the following command:
hadoop jar hadoop02-0.0.1-SNAPSHOT.jar com.blu.mywordcount.MyWordCount /testdata.txt /output
The general form is: hadoop jar (path to the jar to run) (fully qualified name of the class whose main method should run) (input file path, on HDFS) (output path, on HDFS; it must be a path that does not exist yet)
  5. An output folder and several files under it are generated in HDFS
  6. Run the following command to download the files into the virtual machine's output folder:
hdfs dfs -get /output/* /output
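Alternatively, the result can be viewed directly on HDFS without downloading it first:

hdfs dfs -cat /output/part-r-00000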
  7. Run the following command to view the part-r-00000 file:
cat output/part-r-00000
  8. Result:
afternoon	1
evening	1
good	3
morning	1

Reposted from blog.csdn.net/BLU_111/article/details/105699551