package kaoshi831;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Merges consecutive stay records for the same user at the same location.
 *
 * Input line format: userId,locationId,startTime,stayMinutes
 *   e.g. user_a,location_a,2018-01-01 08:00:00,60
 *
 * Merge rule: for all records sharing (userId, locationId), keep the
 * earliest start time and sum the stay durations.
 */
public class GroupSort {

    /** Emits key = "userId,locationId", value = "startTime,stayMinutes". */
    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // Skip malformed lines instead of failing the task with
            // ArrayIndexOutOfBoundsException.
            if (fields.length < 4) {
                return;
            }
            outKey.set(fields[0] + "," + fields[1]);   // userId,locationId
            outValue.set(fields[2] + "," + fields[3]); // startTime,stayMinutes
            context.write(outKey, outValue);
        }
    }

    /**
     * For each (userId,locationId) key, writes the earliest start time and the
     * summed stay duration.
     *
     * Tracks a running minimum instead of buffering every value in a list and
     * sorting it: O(1) memory per key, and a lexicographic compare is correct
     * for "yyyy-MM-dd HH:mm:ss" timestamps because the format is fixed-width
     * and ordered most-significant-field first.
     */
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        private final Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;           // long: immune to int overflow on large groups
            String earliest = null; // lexicographic min of the start times
            for (Text v : values) {
                String[] fields = v.toString().split(",");
                sum += Long.parseLong(fields[1]);
                if (earliest == null || fields[0].compareTo(earliest) < 0) {
                    earliest = fields[0];
                }
            }
            outValue.set(earliest + "\t" + sum);
            context.write(key, outValue);
        }
    }

    public static void main(String[] args)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(GroupSort.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/ksin"));
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), conf);
        Path outPath = new Path("hdfs://hadoop01:9000/ksout001");
        // Remove any previous run's output so the job does not abort on an
        // already-existing output directory.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        // Propagate success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
/*
 * Review note: the reducer originally collected all values into an ArrayList
 * and sorted it just to find the earliest timestamp. This is discouraged in
 * production: ArrayList is backed by an array capped at Integer.MAX_VALUE
 * (2^31 - 1) elements, and performance degrades noticeably well before that,
 * so per-key buffering limits how much data can be processed and hurts
 * performance badly.
 *
 * Sample output:
 * user_a,location_a	2018-01-01 08:00:00	240
 * user_a,location_b	2018-01-01 10:00:00	60
 * user_a,location_c	2018-01-01 08:00:00	180
 * user_b,location_a	2018-01-01 15:00:00	180
 */