MapReduce para la capa de limpieza (ETL) del almacén de datos

Sección Mapper:
LogClearMap.java

package etl;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import java.io.IOException;

/**
 * ETL mapper that validates raw JSON log lines and routes the cleaned records
 * into per-OS output directories ("android/" and "ios/") via MultipleOutputs.
 *
 * A record is kept only when every mandatory header field is present and
 * non-blank; a "user_id" field is then derived (android_id for Android when
 * available, otherwise device_id) and written back into the header.
 */
public class LogClearMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reusable output key — avoids allocating a new Text per record.
    Text t = new Text();

    // MultipleOutputs lets one mapper write to several named output paths.
    MultipleOutputs mo = null;

    // Every header field that must be present and non-blank for the record
    // to survive cleaning. Replaces ~20 copy-pasted per-field checks, which
    // inconsistently mixed manual null/trim tests with StringUtils.isBlank.
    private static final String[] REQUIRED_FIELDS = {
            "sdk_ver", "time_zone", "commit_id", "commit_time", "pid",
            "app_token", "app_id", "device_id", "device_id_type",
            "release_channel", "app_ver_name", "app_ver_code", "os_name",
            "os_ver", "language", "country", "manufacture", "device_model",
            "resolution", "net_type"
    };

    /**
     * Runs once before the first map() call; wires MultipleOutputs to the
     * task context so map() can write to sub-directories of the output path.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        mo = new MultipleOutputs(context);
        super.setup(context);
    }

    /**
     * Cleans one raw log line.
     *
     * @param key   byte offset of the line in the input split (unused)
     * @param value one JSON log record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Nothing to do for an empty record.
        if (null == value) {
            return;
        }
        String line = value.toString();

        // Skip malformed JSON instead of letting the exception kill the task:
        // one bad line in the input must not fail the whole job.
        JSONObject jsonObject;
        try {
            jsonObject = JSON.parseObject(line);
        } catch (Exception e) {
            return;
        }
        if (null == jsonObject) {
            return;
        }

        // The original code dereferenced header without a null check and
        // would NPE on records missing the "header" object.
        JSONObject header = jsonObject.getJSONObject("header");
        if (null == header) {
            return;
        }

        // Drop the record if any mandatory field is missing or blank.
        // StringUtils.isBlank covers null, "" and whitespace-only strings.
        for (String field : REQUIRED_FIELDS) {
            if (StringUtils.isBlank(header.getString(field))) {
                return;
            }
        }

        String os_name = header.getString("os_name");
        String device_id = header.getString("device_id");

        // Derive user_id: Android prefers android_id (falling back to
        // device_id when blank); iOS always uses device_id; anything else
        // is an unsupported platform and is dropped.
        String user_id;
        if ("android".equals(os_name)) {
            String android_id = header.getString("android_id");
            user_id = StringUtils.isBlank(android_id) ? device_id : android_id;
        } else if ("ios".equals(os_name)) {
            user_id = device_id;
        } else {
            return;
        }

        // Write the derived id back into the record before emitting it.
        header.put("user_id", user_id);
        jsonObject.put("header", header);

        // Route the cleaned record to the per-OS output directory.
        t.set(jsonObject.toString());
        if ("ios".equals(os_name)) {
            mo.write(t, NullWritable.get(), "ios/");
        } else {
            mo.write(t, NullWritable.get(), "android/");
        }
    }

    /**
     * Runs once after the last map() call; releases the MultipleOutputs
     * streams so all buffered output is flushed.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Guard against setup() having failed before mo was assigned.
        if (mo != null) {
            mo.close();
        }
        super.cleanup(context);
    }
}

Sección Driver:
LogClearDriver.java

package etl;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the log-cleaning ETL job: configures a map-only MapReduce job
 * that runs {@link LogClearMap} over args[0] and writes to args[1].
 */
public class LogClearDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Windows-only workaround so local runs can locate winutils.exe.
        // TODO: make this configurable instead of hard-coding a local path.
        System.setProperty("hadoop.home.dir", "D:\\hadoop-2.7.2\\hadoop-2.7.2");

        // Require <input path> <output path>.
        if (args.length < 2) {
            System.out.println("参数错误");
            // BUG FIX: the original returned normally here, exiting with
            // status 0 and making schedulers treat the failed run as success.
            System.exit(1);
        }

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Jar containing the job classes.
        job.setJarByClass(LogClearDriver.class);
        // Cleaning is done entirely in the mapper.
        job.setMapperClass(LogClearMap.class);
        // Map-only ETL: skip the default identity reduce and the shuffle.
        job.setNumReduceTasks(0);

        // Output types emitted by the mapper (and thus by the job).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and output locations come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Run synchronously and propagate success/failure as the exit code.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
Ha publicado 189 artículos originales · ganado elogios 13 · vistas 10000 +

Supongo que te gusta

Origin blog.csdn.net/NewBeeMu/article/details/103201896
Recomendado
Clasificación