MapReduce: how long a user stayed at a location starting from a given time

package kaoshi831;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Meaning of the data: how long a user stayed at a location starting from a given time.
 * Processing logic:
 *   for the same user at the same location, merge the consecutive records into one.
 *   Merge rule: take the earliest start time and sum the stay durations.
 * Input fields: user ID, location ID, start time, stay duration (minutes)
 * e.g. user_a,location_a,2018-01-01 08:00:00,60
 */
public class GroupSort {
	 
	static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{
		Text outkey = new Text();
		Text outvalue = new Text();
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			String[] sp = value.toString().split(",");
			outkey.set(sp[0]+","+sp[1]);		// key: user ID, location ID
			outvalue.set(sp[2]+","+sp[3]);		// value: start time, stay duration (minutes)
			context.write(outkey, outvalue);
		}
	}
	static class MyReducer extends Reducer<Text, Text, Text, Text>{
		List<String> list = new ArrayList<>();
		Text outvalue = new Text();
		int sum=0;
		@Override
		protected void reduce(Text key,
				Iterable<Text> values, 
				Reducer<Text, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {

			//user_a,location_a,2018-01-01 08:00:00,60
			for(Text v:values){
				String[] sp = v.toString().split(",");
				sum+=Integer.parseInt(sp[1]);
				list.add(sp[0]);		// collect the start time
			}
			Collections.sort(list);		// sort the start times; the "2018-01-01 08:00:00" format sorts correctly as a plain string
			outvalue.set(list.get(0)+"\t"+sum);
			context.write(key, outvalue);
			sum=0;
			list.clear();		// reset state because the reducer instance is reused for the next key
		}
	}

	public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		Configuration conf=new Configuration();
		Job job=Job.getInstance(conf);
		
		job.setJarByClass(kaoshi831.GroupSort.class);
		
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);
		
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/ksin"));
		
		FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), conf);	// get an HDFS FileSystem handle
		Path path=new Path("hdfs://hadoop01:9000/ksout001");
		if(fs.exists(path)){			// delete the output path if it already exists
			fs.delete(path, true);
		}
		FileOutputFormat.setOutputPath(job,path);
		
		System.exit(job.waitForCompletion(true) ? 0 : 1);	// submit the job; verbose=true prints progress, exit code reflects success

	}

}

The reducer above buffers every start time for a key in a List and sorts it with Collections.sort.

This approach is not recommended in real production jobs: ArrayList is backed by an array, which can hold at most Integer.MAX_VALUE ((2^31)-1) elements, and its performance drops noticeably once it is roughly half full. Buffering all of a key's values in memory therefore limits how much data the job can handle and has a large impact on performance.
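
Since only the earliest start time and the total stay time are needed, the buffering can be avoided entirely by keeping two running variables. A minimal sketch of such a reducer (the class name StreamingReducer is hypothetical, not part of the original code):

	static class StreamingReducer extends Reducer<Text, Text, Text, Text>{
		Text outvalue = new Text();
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			String earliest = null;		// earliest start time seen so far
			long total = 0;				// accumulated stay time in minutes
			for(Text v : values){
				String[] sp = v.toString().split(",");
				// "2018-01-01 08:00:00" compares correctly as a plain string
				if(earliest == null || sp[0].compareTo(earliest) < 0){
					earliest = sp[0];
				}
				total += Long.parseLong(sp[1]);
			}
			outvalue.set(earliest + "\t" + total);
			context.write(key, outvalue);
		}
	}

This keeps memory use per key constant no matter how many records a user/location pair has.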

// Run results
user_a,location_a	2018-01-01 08:00:00	240
user_a,location_b	2018-01-01 10:00:00	60
user_a,location_c	2018-01-01 08:00:00	180
user_b,location_a	2018-01-01 15:00:00	180
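
Because taking the earliest time and summing durations are both associative and commutative, and the mapper already emits values in the "time,duration" layout, shuffle volume could also be reduced with a combiner. A rough sketch under that assumption (the class name MergeCombiner and the setCombinerClass call are additions, not part of the original job):

	static class MergeCombiner extends Reducer<Text, Text, Text, Text>{
		Text outvalue = new Text();
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			String earliest = null;
			long total = 0;
			for(Text v : values){
				String[] sp = v.toString().split(",");
				if(earliest == null || sp[0].compareTo(earliest) < 0){
					earliest = sp[0];
				}
				total += Long.parseLong(sp[1]);
			}
			outvalue.set(earliest + "," + total);	// keep the "time,duration" layout the reducer expects
			context.write(key, outvalue);
		}
	}

	// in main(), after setting the reducer:
	// job.setCombinerClass(MergeCombiner.class);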

Reposted from blog.csdn.net/YZY_001/article/details/82313268