Processing and Computing Data with MapReduce

Word count

Code files

Extend the Mapper class to implement your own mapper, and override the map() method.

package wc;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	@Override
	protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
		// k1 is the byte offset of the line within the file; v1 is the line itself
		String line = v1.toString();
		// split the line on single spaces and emit (word, 1) for every token
		String[] words = line.split(" ");
		for (String word : words) {
			context.write(new Text(word), new IntWritable(1));
		}
	}
}

Extend the Reducer class to implement your own reducer, and override the reduce() method.

package wc;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
	@Override
	protected void reduce(Text k3, Iterable<IntWritable> v3, Context context) throws IOException, InterruptedException {
		// sum all the 1s emitted by the mappers for this word
		int sum = 0;
		for (IntWritable v : v3) {
			sum += v.get();
		}
		context.write(k3, new IntWritable(sum));
	}
}
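
For example, given the input line "hello world hello", the mapper emits (hello,1), (world,1), (hello,1). The shuffle stage then groups the values by key, so the reducer receives (hello,[1,1]) and (world,[1]) and writes out hello 2 and world 1.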

The driver (program entry) class

package wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountMain {
	public static void main(String[] args) throws Exception {
		// create the job and point it at the jar containing this class
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(WordCountMain.class);

		// mapper and its output key/value types
		job.setMapperClass(WordCountMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// reducer and the job's final output key/value types
		job.setReducerClass(WordCountReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// input and output paths come from the command line
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// submit the job and wait for it to finish
		job.waitForCompletion(true);
	}
}
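
Once exported to a jar (the export steps follow below), the job runs from the command line. A minimal sketch, assuming the jar is named wc.jar and the input file has already been uploaded to HDFS (jar name and both paths are placeholders):

hadoop jar wc.jar wc.WordCountMain /input/data.txt /user/wc_out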

Computing the highest salary in each department

EMPNO  ENAME   JOB        MGR   HIREDATE  SAL   COMM  DEPTNO
7369   SMITH   CLERK      7902  ######     800         20
7499   ALLEN   SALESMAN   7698  ######    1600   300   30
7521   WARD    SALESMAN   7698  ######    1250   500   30
7566   JONES   MANAGER    7839  1981/4/2  2975         20
7654   MARTIN  SALESMAN   7698  ######    1250  1400   30
7698   BLAKE   MANAGER    7839  1981/5/1  2850         30
7782   CLARK   MANAGER    7839  1981/6/9  2450         10
7788   SCOTT   ANALYST    7566  ######    3000         20
7839   KING    PRESIDENT        ######    5000         10
7844   TURNER  SALESMAN   7698  1981/9/8  1500     0   30
7876   ADAMS   CLERK      7788  ######    1100         20
7900   JAMES   CLERK      7698  ######     950         30
7902   FORD    ANALYST    7566  ######    3000         20
7934   MILLER  CLERK      7782  ######    1300         10

(###### marks hire dates truncated by the column width in the original spreadsheet; empty cells are null fields.)
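
The mapper below assumes each line of emp.csv holds one employee with comma-separated fields in the column order above, and that empty fields are kept as consecutive commas so the field indices stay fixed. Two illustrative lines (built from rows whose dates are visible above):

7566,JONES,MANAGER,7839,1981/4/2,2975,,20
7844,TURNER,SALESMAN,7698,1981/9/8,1500,0,30

With this layout, words[5] is SAL and words[7] is DEPTNO.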

Code files

Mapper class:

package saltotal;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class STM extends Mapper<LongWritable, Text, Text, IntWritable> {
	@Override
	protected void map(LongWritable key1, Text value1, Context context)
			throws IOException, InterruptedException {
		// each value is one CSV line of the emp table
		String data = value1.toString();
		String[] words = data.split(",");
		// key on the department number DEPTNO (index 7) and emit the salary SAL (index 5)
		context.write(new Text(words[7]), new IntWritable(Integer.parseInt(words[5])));
	}
}

Reducer class:

package saltotal;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class STR extends Reducer<Text, IntWritable, Text, IntWritable> {
	@Override
	protected void reduce(Text k3, Iterable<IntWritable> v3, Context context)
			throws IOException, InterruptedException {
		// track the largest salary seen for this department
		// (0 is a safe starting value because salaries are non-negative)
		int max = 0;
		for (IntWritable v : v3) {
			if (v.get() > max) {
				max = v.get();
			}
		}
		context.write(k3, new IntWritable(max));
	}
}
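
Because taking a maximum is associative and commutative, the same reducer class could also be registered as a combiner to cut shuffle traffic, an optional tweak that the original driver does not use:

job.setCombinerClass(STR.class);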

Driver class:

package saltotal;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SalaryTotalMain {
	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(SalaryTotalMain.class);

		// mapper and its output types
		job.setMapperClass(STM.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// reducer and the final output types
		job.setReducerClass(STR.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		job.waitForCompletion(true);
	}
}

In Eclipse, right-click the "src" folder.

In the pop-up menu, choose "Export".

选择"JAR file"后,点击"Next"

Select the Java source files and the save path for the jar, then click "Finish" to export the jar.

The jar now appears under Documents.

Connect with Xshell and use Xftp to transfer the jar to the "Documents" directory.

The emp.csv file should also be in that directory (in a virtual machine you can simply drag and drop it there).

Run cd ~/Documents to enter the Documents directory, then ls to verify both files are there.

Start HDFS and YARN.
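
On a typical installation the start scripts might be (start-all.sh also works on a pseudo-distributed setup):

start-dfs.sh
start-yarn.sh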

Run jps to check the daemons; on a pseudo-distributed setup you would expect NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager.

Inspect the contents of salarytotal.jar.
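
One way to list the classes packed in the jar (the jar tool ships with the JDK):

jar tf salarytotal.jar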

Upload emp.csv to the input folder on HDFS.

Check the contents of the input folder (see the commands sketched below).
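
Assuming the /input directory already exists on HDFS (otherwise create it first with hdfs dfs -mkdir /input), the upload and check might look like:

hdfs dfs -put emp.csv /input
hdfs dfs -ls /input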

Check the files under /user: the output directory you pass to the job must not already exist there, or the job will fail.
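
For example (removing a leftover output directory is only necessary if an earlier run already created it):

hdfs dfs -ls /user
hdfs dfs -rm -r /user/sc5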

Run the command, naming the output directory as the last argument (any name works; the run below uses sc5):

qiangzi@qiangzi-virtual-machine:~/Documents$ hadoop jar salarytotal.jar saltotal.SalaryTotalMain /input/emp.csv /user/sc5
23/04/17 20:14:29 INFO client.RMProxy: Connecting to ResourceManager at qiangzi-virtual-machine/127.0.1.1:8032
23/04/17 20:14:32 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
23/04/17 20:14:35 INFO input.FileInputFormat: Total input files to process : 1
23/04/17 20:14:36 INFO mapreduce.JobSubmitter: number of splits:1
23/04/17 20:14:37 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1681718824501_0006
23/04/17 20:14:38 INFO conf.Configuration: resource-types.xml not found
23/04/17 20:14:38 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
23/04/17 20:14:38 INFO resource.ResourceUtils: Adding resource type - name = memory-mb, units = Mi, type = COUNTABLE
23/04/17 20:14:38 INFO resource.ResourceUtils: Adding resource type - name = vcores, units = , type = COUNTABLE
23/04/17 20:14:40 INFO impl.YarnClientImpl: Submitted application application_1681718824501_0006
23/04/17 20:14:40 INFO mapreduce.Job: The url to track the job: http://qiangzi-virtual-machine:8088/proxy/application_1681718824501_0006/
23/04/17 20:14:40 INFO mapreduce.Job: Running job: job_1681718824501_0006
23/04/17 20:15:10 INFO mapreduce.Job: Job job_1681718824501_0006 running in uber mode : false
23/04/17 20:15:10 INFO mapreduce.Job:  map 0% reduce 0%
23/04/17 20:18:25 INFO mapreduce.Job:  map 100% reduce 0%
23/04/17 20:18:45 INFO mapreduce.Job:  map 100% reduce 100%
23/04/17 20:18:49 INFO mapreduce.Job: Job job_1681718824501_0006 completed successfully
23/04/17 20:18:51 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=200
		FILE: Number of bytes written=417325
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=708
		HDFS: Number of bytes written=66
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=188987
		Total time spent by all reduces in occupied slots (ms)=16669
		Total time spent by all map tasks (ms)=188987
		Total time spent by all reduce tasks (ms)=16669
		Total vcore-milliseconds taken by all map tasks=188987
		Total vcore-milliseconds taken by all reduce tasks=16669
		Total megabyte-milliseconds taken by all map tasks=193522688
		Total megabyte-milliseconds taken by all reduce tasks=17069056
	Map-Reduce Framework
		Map input records=14
		Map output records=14
		Map output bytes=166
		Map output materialized bytes=200
		Input split bytes=105
		Combine input records=0
		Combine output records=0
		Reduce input groups=5
		Reduce shuffle bytes=200
		Reduce input records=14
		Reduce output records=5
		Spilled Records=28
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=2528
		CPU time spent (ms)=4880
		Physical memory (bytes) snapshot=457605120
		Virtual memory (bytes) snapshot=3926859776
		Total committed heap usage (bytes)=277348352
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=603
	File Output Format Counters 
		Bytes Written=66

View the contents of the sc5 output directory.
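
For example:

hdfs dfs -ls /user/sc5

The listing should contain a _SUCCESS marker and the reducer output file part-r-00000.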

View the computed statistics.
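
For example:

hdfs dfs -cat /user/sc5/part-r-00000

With the mapper keyed on DEPTNO as above, the result worked out by hand from the emp table would be:

10	5000
20	3000
30	2850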

Experiment results


Reposted from blog.csdn.net/m0_74972727/article/details/130208366