词频统计

代码文件

继承Mapper，类实现自己的Mapper类，并重写map()方法。

package wc;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WordCountMapper extends Mapper<LongWritable, Text,Text,IntWritable>{
	@Override
	protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException{
		String line = v1.toString();
		String[] words = line.split(" ");
		for(String word:words) {
			context.write(new Text(word), new IntWritable(1));
		}
	}
}

继承Reducer类，实现自己的Reducer类，并重写Reducer()方法。

package wc;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordCountReducer extends Reducer<Text, IntWritable,Text, IntWritable>{
	@Override
	protected void reduce(Text k3, Iterable<IntWritable> v3,Context context) throws IOException, InterruptedException{
		int sum = 0;
		for(IntWritable v:v3) {
			sum += v.get();
		}
		context.write(k3, new IntWritable(sum));
	}
}

程序入口类

package wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountMain {
	public static void main(String[] args) throws Exception{
		//job
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(WordCountMain.class);
		//mapper
		job.setMapperClass(WordCountMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		//reducer
		job.setReducerClass(WordCountReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		//execute job
		job.waitForCompletion(true);
	}

}

计算各部门最高工资

EMPNO	ENAME	JOB	MGR	HIREDATE	SAL	COMM	DEPTION
7369	SMITH	CLERK	7902	######	800		20
7499	ALLEN	SALESMAN	7698	######	1600	300	30
7521	WARD	SALESMAN	7698	######	1250	500	30
7566	JONES	MANAGER	7839	1981/4/2	2975		20
7654	MARTIN	SALESMAN	7698	######	1250	1400	30
7698	BLAKE	MANAGER	7839	1981/5/1	2850		30
7782	CLARK	MANAGER	7839	1981/6/9	2450		10
7788	SCOTT	ANALYST	7566	######	3000		20
7839	KING	PRESIDENT		######	5000		10
7844	TURNER	SALESMAN	7698	1981/9/8	1500	0	30
7876	ADAMS	CLERK	7788	######	1100		20
7900	JAMES	CLERK	7698	######	950		30
7902	FORD	ANALYST	7566	######	3000		20
7934	MILLER	CLERK	7782	######	1300		10

代码文件

Mapper类：

package saltotal;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class STM extends Mapper<LongWritable, Text, Text, IntWritable>{
	@Override
	protected void map(LongWritable key1,Text value1,Context context)
			throws IOException, InterruptedException {
		String data = value1.toString();
		String[] words = data.split(",");
		context.write(new Text(words[2]), new IntWritable(Integer.parseInt(words[5])));
	}
}

Reducer类：

package saltotal;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class STR extends Reducer<Text,IntWritable,Text,IntWritable>{
	@Override
	protected void reduce(Text k3, Iterable<IntWritable> v3,Context context)
			throws IOException, InterruptedException {
		int max = 0;
		for(IntWritable v:v3) {
			if(v.get() > max) {
				max = v.get();
			}
					
		}
		context.write(k3, new IntWritable(max));
	}

	
}

程序入口类：

package saltotal;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SalaryTotalMain {
	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(STM.class);
		job.setMapperClass(STM.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		job.setReducerClass(STR.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job,new Path(args[0]));
		FileOutputFormat.setOutputPath(job,new Path(args[1]));
		
		job.waitForCompletion(true);
	}



}

右键点击“src”

弹窗后选择”Export“

选择"JAR file"后，点击"Next"

选择Java文件，选择Jar包保存路径，点击"Finish"导出jar包

Documents下出现jar包

连接Xshell,使用Xftp传输文件到"Documents"路径下

路径下出现emp.csv文件(在虚拟机可以拖拽到路径下）

输入指令 cd /Documents，进入Documents路径下，ls,发现文件在路径下

启动hdfs

jps,查看节点

查看salarytotal.jar中内容

将emp.csv上传到HDFS下input文件夹中

查看input文件夹下内容

查看user下文件（不能有重复文件）

输入指令，命名文件名为"sc"(最后的文件名可以随意命名，这里的为sc)

qiangzi@qiangzi-virtual-machine:~/Documents$ hadoop jar salarytoatl.jar saltotal.SalaryTotalMain /input/emp.csv /user/sc5
23/04/17 20:14:29 INFO client.RMProxy: Connecting to ResourceManager at qiangzi-virtual-machine/127.0.1.1:8032
23/04/17 20:14:32 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
23/04/17 20:14:35 INFO input.FileInputFormat: Total input files to process : 1
23/04/17 20:14:36 INFO mapreduce.JobSubmitter: number of splits:1
23/04/17 20:14:37 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1681718824501_0006
23/04/17 20:14:38 INFO conf.Configuration: resource-types.xml not found
23/04/17 20:14:38 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
23/04/17 20:14:38 INFO resource.ResourceUtils: Adding resource type - name = memory-mb, units = Mi, type = COUNTABLE
23/04/17 20:14:38 INFO resource.ResourceUtils: Adding resource type - name = vcores, units = , type = COUNTABLE
23/04/17 20:14:40 INFO impl.YarnClientImpl: Submitted application application_1681718824501_0006
23/04/17 20:14:40 INFO mapreduce.Job: The url to track the job: http://qiangzi-virtual-machine:8088/proxy/application_1681718824501_0006/
23/04/17 20:14:40 INFO mapreduce.Job: Running job: job_1681718824501_0006
23/04/17 20:15:10 INFO mapreduce.Job: Job job_1681718824501_0006 running in uber mode : false
23/04/17 20:15:10 INFO mapreduce.Job:  map 0% reduce 0%
23/04/17 20:18:25 INFO mapreduce.Job:  map 100% reduce 0%
23/04/17 20:18:45 INFO mapreduce.Job:  map 100% reduce 100%
23/04/17 20:18:49 INFO mapreduce.Job: Job job_1681718824501_0006 completed successfully
23/04/17 20:18:51 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=200
		FILE: Number of bytes written=417325
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=708
		HDFS: Number of bytes written=66
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=188987
		Total time spent by all reduces in occupied slots (ms)=16669
		Total time spent by all map tasks (ms)=188987
		Total time spent by all reduce tasks (ms)=16669
		Total vcore-milliseconds taken by all map tasks=188987
		Total vcore-milliseconds taken by all reduce tasks=16669
		Total megabyte-milliseconds taken by all map tasks=193522688
		Total megabyte-milliseconds taken by all reduce tasks=17069056
	Map-Reduce Framework
		Map input records=14
		Map output records=14
		Map output bytes=166
		Map output materialized bytes=200
		Input split bytes=105
		Combine input records=0
		Combine output records=0
		Reduce input groups=5
		Reduce shuffle bytes=200
		Reduce input records=14
		Reduce output records=5
		Spilled Records=28
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=2528
		CPU time spent (ms)=4880
		Physical memory (bytes) snapshot=457605120
		Virtual memory (bytes) snapshot=3926859776
		Total committed heap usage (bytes)=277348352
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=603
	File Output Format Counters 
		Bytes Written=66

使用MapReduce进行编程来对数据进行处理和计算

词频统计

代码文件

继承Mapper，类实现自己的Mapper类，并重写map()方法。

继承Reducer类，实现自己的Reducer类，并重写Reducer()方法。

程序入口类

计算各部门最高工资

代码文件

Mapper类：

Reducer类：

程序入口类：

右键点击“src”

弹窗后选择”Export“

选择"JAR file"后，点击"Next"

选择Java文件，选择Jar包保存路径，点击"Finish"导出jar包

Documents下出现jar包

连接Xshell,使用Xftp传输文件到"Documents"路径下

路径下出现emp.csv文件(在虚拟机可以拖拽到路径下）

输入指令 cd /Documents，进入Documents路径下，ls,发现文件在路径下

启动hdfs

jps,查看节点

查看salarytotal.jar中内容

将emp.csv上传到HDFS下input文件夹中

查看input文件夹下内容

查看user下文件（不能有重复文件）

输入指令，命名文件名为"sc"(最后的文件名可以随意命名，这里的为sc)

查看sc中的内容

查看统计内容

实验结果

猜你喜欢