MapReduce official example: WordCount

wordcountReduce.java

package MaperReduce;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Reduce stage

/*
 * The four generic type parameters:
 *  KEYIN:   the type of K2
 *  VALUEIN: the type of V2
 *
 *  KEYOUT:   the type of K3
 *  VALUEOUT: the type of V3
 */
public class wordcountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

	/*
	 * Parameters:
	 *   key:     the new K2
	 *   values:  the new V2 values
	 *   context: the context object
	 */
	@Override
	protected void reduce(Text key, Iterable<LongWritable> values,
			Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {

		long count = 0;
		// Iterate over the values and add them up to get V3
		for (LongWritable value : values) {
			count += value.get();
		}
		// Write K3 and V3 to the context
		context.write(key, new LongWritable(count));
	}
}

wrodcountMaper.java

package MaperReduce;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Map stage

/*
 * The four generic type parameters:
 *  KEYIN:   the type of K1
 *  VALUEIN: the type of V1
 *
 *  KEYOUT:   the type of K2
 *  VALUEOUT: the type of V2
 */
public class wrodcountMaper extends Mapper<LongWritable, Text, Text, LongWritable> {

	// The map method converts K1 and V1 into K2 and V2
	/*
	 * Parameters:
	 *   key:     K1, the byte offset of the line
	 *   value:   V1, the text of the line (the words to be counted)
	 *   context: the context object
	 */
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
			throws IOException, InterruptedException {

		Text text = new Text();
		LongWritable longWritable = new LongWritable();
		// 1. Split the line of text
		String[] split = value.toString().split(",");
		// 2. Iterate over the array and assemble K2 and V2
		for (String word : split) {
			// Alternatively: context.write(new Text(word), new LongWritable(1));
			// 3. Write K2 and V2 to the context
			text.set(word);
			longWritable.set(1);
			context.write(text, longWritable);
		}
	}
}
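
Since the map method splits each line on commas, the input file is expected to contain comma-separated words. A small hedged sketch of the data flow (the input line is made up):

hello,world,hello                  <- V1 (one input line); K1 is the line's byte offset
(hello,1) (world,1) (hello,1)      <- K2/V2 pairs emitted by the map method
hello 2                            <- K3/V3 pairs written by the reduce method
world 1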

JobMain.java (the driver class)

package MaperReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {

	// This method defines and configures a job
	public int run(String[] args) throws Exception {

		// 1. Create a job object
		Job job = Job.getInstance(super.getConf(), "wordcount");
		// 2. Configure the job (eight steps)

		// Step 1: specify how the input is read and the input path
		job.setInputFormatClass(TextInputFormat.class);
		TextInputFormat.addInputPath(job, new Path("hdfs://192.168.2.101:9000/wordcount"));

		// Step 2: specify the Map stage processing class
		job.setMapperClass(wrodcountMaper.class);
		// Set the type of K2 for the map stage
		job.setMapOutputKeyClass(Text.class);
		// Set the type of V2 for the map stage
		job.setMapOutputValueClass(LongWritable.class);

		// Shuffle stage: steps 3, 4, 5 and 6 use the defaults
		// Step 7: specify the Reduce stage processing class and data types
		job.setReducerClass(wordcountReduce.class);
		// Set the type of K3
		job.setOutputKeyClass(Text.class);
		// Set the type of V3
		job.setOutputValueClass(LongWritable.class);

		// Step 8: set the output format
		job.setOutputFormatClass(TextOutputFormat.class);
		// Set the output path
		TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.2.101:9000/wordcount_out"));

		// Wait for the job to finish
		boolean b1 = job.waitForCompletion(true);

		// Return 0 if b1 is true, otherwise 1
		return b1 ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {

		Configuration configuration = new Configuration();
		// Launch the job
		int run = ToolRunner.run(configuration, new JobMain(), args);
		System.exit(run);
	}
}

Operation mode:
Cluster operation:
1. The MapReduce program is submitted to the YARN cluster and distributed across many nodes for concurrent execution.
2. The input data and the output results are on the HDFS file system.
3. To submit to the cluster, package the program into a jar, upload it, and then start it on the cluster with the hadoop command:

hadoop jar hadoop_hdfs_operate-1.0-SNAPSHOT.jar
cn.itcast.mapreduce.JobMain
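
Before submitting, the input directory used in the driver must already exist on HDFS and contain the input data, and the output directory must not exist yet. A hedged sketch of preparing the input (the file name words.txt is an assumption):

hdfs dfs -mkdir -p /wordcount
hdfs dfs -put words.txt /wordcount

After the job finishes, the result can be viewed with:

hdfs dfs -cat /wordcount_out/part-r-00000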

To build the jar with Maven, you need to add the packaging plugin configuration:

  <build>
		<plugins>
			<plugin>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>2.3.2</version>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
				</configuration>
			</plugin>
			<plugin>
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<descriptorRefs>
						<descriptorRef>jar-with-dependencies</descriptorRef>
					</descriptorRefs>
					<archive>
						<manifest>
							<mainClass>MaperReduce.JobMain</mainClass>
						</manifest>
					</archive>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<goal>single</goal>
						</goals>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>

The <mainClass> element in the configuration above is the part you need to modify: set it to your own main class path (here, MaperReduce.JobMain).
If an error cross appears in the IDE at this point, that is fine; it will disappear shortly.

Packaging:
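If you prefer the command line over the IDE, the project can be packaged from the project root with:

mvn clean package

The runnable jar with dependencies is then generated under the target/ directory.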
Rename the generated jar,
then specify a directory on the server and upload it.
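
For example, the upload could be done with scp (the jar name, the remote user and the target directory are assumptions; the node address is the one used in the driver):

scp wordcount.jar root@192.168.2.101:/root/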


Local operation mode:
1. The MapReduce program runs locally as a single process.
2. The input data and the output results are on the local file system.
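
A minimal sketch of the only lines in the driver that would change for a local run (the local directories are assumptions):

		// Local input and output paths (hypothetical directories)
		TextInputFormat.addInputPath(job, new Path("file:///data/wordcount"));
		TextOutputFormat.setOutputPath(job, new Path("file:///data/wordcount_out"));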

Origin blog.csdn.net/weixin_46457946/article/details/113844962