Map stage:
LogClearMap.java
package etl;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import java.io.IOException;
public class LogClearMap extends Mapper<LongWritable, Text,Text, NullWritable> {
Text t = new Text();
//MultipleOutputs用于简化多文件输出
MultipleOutputs mo = null;
//setup方法
//setup方法一般会在map函数之前执行一些准备工作,如作业的一些配置信息等
@Override
protected void setup(Context context) throws IOException, InterruptedException {
mo = new MultipleOutputs(context);
super.setup(context);
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//如果数据为空,就直接返回了,不进行下面的步骤
if (null == value){
return;
}
//获取原始数据
String line = value.toString();
//转成json类型
JSONObject jsonObject = JSON.parseObject(line);
//获取header
JSONObject header = jsonObject.getJSONObject("header");
//下面都是必选的属性
//sdk_ver
String sdk_ver = header.getString("sdk_ver");
//.trim:去掉空格
if(null == sdk_ver || "".equals(sdk_ver.trim())){
return;
}
//time_zone
String time_zone = header.getString("time_zone");
if(StringUtils.isBlank(time_zone)){
return;
}
//commit_id
String commit_id = header.getString("commit_id");
if(null == commit_id || "".equals(commit_id.trim())){
return;
}
//commit_time
String commit_time = header.getString("commit_time");
if(null == commit_time || "".equals(commit_time.trim())){
return;
}
//pid
String pid = header.getString("pid");
if(StringUtils.isBlank(pid)){
return;
}
//app_token
String app_token = header.getString("app_token");
if(StringUtils.isBlank(app_token)){
return;
}
//app_id
String app_id = header.getString("app_id");
if(StringUtils.isBlank(app_id)){
return;
}
//device_id
String device_id = header.getString("device_id");
if(StringUtils.isBlank(device_id)){
return;
}
//device_id_type
String device_id_type = header.getString("device_id_type");
if(StringUtils.isBlank(device_id_type)){
return;
}
//release_channel
String release_channel = header.getString("release_channel");
if(StringUtils.isBlank(release_channel)){
return;
}
//app_ver_name
String app_ver_name = header.getString("app_ver_name");
if(StringUtils.isBlank(app_ver_name)){
return;
}
//app_ver_code
String app_ver_code = header.getString("app_ver_code");
if(StringUtils.isBlank(app_ver_code)){
return;
}
//os_name
String os_name = header.getString("os_name");
if(StringUtils.isBlank(os_name)){
return;
}
//os_ver
String os_ver = header.getString("os_ver");
if(StringUtils.isBlank(os_ver)){
return;
}
//language
String language = header.getString("language");
if(StringUtils.isBlank(language)){
return;
}
//country
String country = header.getString("country");
if(StringUtils.isBlank(country)){
return;
}
//manufacture
String manufacture = header.getString("manufacture");
if(StringUtils.isBlank(manufacture)){
return;
}
//device_model
String device_model = header.getString("device_model");
if(StringUtils.isBlank(device_model)){
return;
}
//resolution
String resolution = header.getString("resolution");
if(StringUtils.isBlank(resolution)){
return;
}
//net_type
String net_type = header.getString("net_type");
if(StringUtils.isBlank(net_type)){
return;
}
//user_id是我们后来写的,所以初始化为""
String user_id="";
//获取操作系统
if("android".equals(os_name)){
//获取操作系统的android_id
String android_id = header.getString("android_id");
//判断android_id是否为空
//如果为空
//.isBlank就是.isEmpty,只不过包括了空格
if(StringUtils.isBlank(android_id)){
//获取
user_id=device_id;
}else {
user_id=android_id;
}
}else if("ios".equals(os_name)){
user_id=device_id;
}else {
return;
}
//加到header里
header.put("user_id",user_id);
//将header赋值给jsonObject
jsonObject.put("header",header);
//输出
t.set(jsonObject.toString());
if("ios".equals(os_name)){
mo.write(t,NullWritable.get(),"ios/");
}else{
mo.write(t,NullWritable.get(),"android/");
}
}
//cleanup方法
//cleanup方法则是在map方法运行完之后最后执行 的,该方法是完成一些结尾清理的工作,如:资源释放等
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
mo.close();
super.cleanup(context);
}
}
Driver stage:
LogClearDriver.java
package etl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Command-line entry point that configures and submits the log-cleaning job.
 *
 * <p>Usage: {@code LogClearDriver <input path> <output path>}. The process exits with status 0
 * when the job succeeds and 1 when the job fails or the arguments are missing.
 */
public class LogClearDriver {

    /** Local Hadoop installation used when {@code hadoop.home.dir} has not been set by the caller. */
    private static final String DEFAULT_HADOOP_HOME = "D:\\hadoop-2.7.2\\hadoop-2.7.2";

    /**
     * Builds the job from the two path arguments, runs it and exits with its status.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException propagated from job submission
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Only fall back to the built-in location; do not clobber a value passed with -D.
        if (System.getProperty("hadoop.home.dir") == null) {
            System.setProperty("hadoop.home.dir", DEFAULT_HADOOP_HOME);
        }

        // Both an input and an output path are required; report the error with a failing status
        // so that schedulers do not mistake a mis-invocation for a successful run.
        if (args.length < 2) {
            System.err.println("参数错误");
            System.exit(1);
            return;
        }

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Locate the job jar through this class.
        job.setJarByClass(LogClearDriver.class);

        // Only a mapper is configured; no reducer class is set.
        // NOTE(review): the reducer count is left at its default. If the job is meant to be
        // map-only, consider job.setNumReduceTasks(0) - confirm the expected output layout first.
        job.setMapperClass(LogClearMap.class);

        // Key/value types of the job output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and output locations come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit, block until the job finishes, and propagate success/failure as the exit code.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}