Flink Practice 1: Examples Worth Borrowing

1. Listening on a socket port, event time, tumbling windows, word count; https://www.jianshu.com/p/7d524ef8143c

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.StringUtils;

public class DataStreamDemo {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);  //use event time

        env.setParallelism(1);  //set parallelism to 1
        env.getConfig().setAutoWatermarkInterval(9000);//emit a watermark every 9 seconds

        DataStream<String> text = env.socketTextStream("localhost", 9900);

        DataStream<Tuple3<String, Long, Integer>> counts = text.filter(new FilterClass()).map(new LineSplitter())
                .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple3<String, Long, Integer>>() {

                    private long currentMaxTimestamp = 0L;

                    private final long maxOutOfOrderness = 10000L;   //controls how much out-of-orderness / lateness is tolerated
                    //extract the event time
                    @Override
                    public long extractTimestamp(Tuple3<String, Long, Integer> element, long previousElementTimestamp) {
                        long timestamp = element.f1;
                        currentMaxTimestamp = Math.max(  timestamp, currentMaxTimestamp);
                        System.out.println(
                                "get timestamp is " + timestamp + " currentMaxTimestamp " + currentMaxTimestamp);
                        return timestamp;
                    }
                    //generate the watermark
                    @Override
                    public Watermark getCurrentWatermark() {
                        System.out.println("wall clock is " + System.currentTimeMillis() + " new watermark "
                                + (currentMaxTimestamp - maxOutOfOrderness));
                        return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
                    }
                }).keyBy(0).timeWindow(Time.seconds(20))
                // .allowedLateness(Time.seconds(10))   
                .sum(2);

        counts.print();
        env.execute("Window WordCount");

    }

//    Alternative: a custom timestamp extractor for ascending timestamps
//    private static class MyTimestamp extends AscendingTimestampExtractor<Tuple3<String, Long, Integer>> {
//
//        private static final long serialVersionUID = 1L;
//
//        public long extractAscendingTimestamp(Tuple3<String, Long, Integer> element) {
//
//            return element.f1;
//        }
//
//    }

    //Build the element together with its event time, and set the count to 1
    public static final class LineSplitter implements MapFunction<String, Tuple3<String, Long, Integer>> {

        @Override
        public Tuple3<String, Long, Integer> map(String value) throws Exception {
            String[] tokens = value.toLowerCase().split("\\W+");

            long eventtime = Long.parseLong(tokens[1]);

            return new Tuple3<String, Long, Integer>(tokens[0], eventtime, 1);
        }
    }

    //Filter out strings that are null or whitespace-only
    public static final class FilterClass implements FilterFunction<String> {

        @Override
        public boolean filter(String value) throws Exception {

            return !StringUtils.isNullOrWhitespaceOnly(value);
        }

    }
}
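
To try this example, note that LineSplitter expects each socket line to be a word followed by an event-time epoch in milliseconds (tokens[0] is the word, tokens[1] the timestamp). Assuming netcat is available locally, start nc -lk 9900 before launching the job and type lines such as:

hello 1000000050000
hello 1000000054000
world 1000000079000

A 20-second event-time window only fires once the watermark (the maximum timestamp seen so far minus the 10-second out-of-orderness bound) passes the window end, so counts are printed only after sufficiently later timestamps have arrived.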

2. Reading JSON data from Kafka, handling late data, and writing the results to Elasticsearch; https://github.com/yezonggang/FlinkProj

The way late data is handled here is worth learning from! (The MyWatermark and MyAggFunction helper classes referenced in the code are not part of this excerpt; a sketch of what they might look like follows the code.)

package xuwei.tech;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;
import org.apache.flink.util.OutputTag;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import xuwei.tech.function.MyAggFunction;
import xuwei.tech.watermark.MyWatermark;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 *
 * bin/kafka-topics.sh  --create --topic lateLog --zookeeper localhost:2181 --partitions 5 --replication-factor 1
 *
 * Created by xuwei.tech on 2018/11/14.
 */
public class DataReport {
    private static Logger logger = LoggerFactory.getLogger(DataReport.class);

    public static void main(String[] args) throws Exception{


        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        //set parallelism
        env.setParallelism(5);

        //use event time
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        //checkpoint configuration
        env.enableCheckpointing(60000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
        env.getCheckpointConfig().setCheckpointTimeout(10000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        //state backend configuration

        //env.setStateBackend(new RocksDBStateBackend("hdfs://hadoop100:9000/flink/checkpoints",true));

        /**
         * Configure the Kafka source
         */
        String topic = "auditLog";
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers","hadoop110:9092");
        prop.setProperty("group.id","con1");

        FlinkKafkaConsumer011<String> myConsumer = new FlinkKafkaConsumer011<>(topic, new SimpleStringSchema(), prop);


        /**
         * Read the data from Kafka
         *
         * Audit record format:
         * // {"dt":"audit time [yyyy-MM-dd HH:mm:ss]","type":"audit type","username":"auditor name","area":"region"}
         *
         */
        DataStreamSource<String> data = env.addSource(myConsumer);


        /**
         * Clean the data
         */
        DataStream<Tuple3<Long, String, String>> mapData = data.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String line) throws Exception {
                JSONObject jsonObject = JSON.parseObject(line);

                String dt = jsonObject.getString("dt");
                long time = 0;
                try {
                    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    Date parse = sdf.parse(dt);
                    time = parse.getTime();
                } catch (ParseException e) {
                    //this log could also be written to some other storage
                    logger.error("Failed to parse timestamp, dt:" + dt, e);
                }

                String type = jsonObject.getString("type");
                String area = jsonObject.getString("area");

                return new Tuple3<>(time, type, area);
            }
        });

        /**
         * Filter out invalid records (time == 0 means the timestamp failed to parse)
         */
        DataStream<Tuple3<Long, String, String>> filterData = mapData.filter(new FilterFunction<Tuple3<Long, String, String>>() {
            @Override
            public boolean filter(Tuple3<Long, String, String> value) throws Exception {
                return value.f0 != 0;
            }
        });

        //tag for capturing data that arrives too late
        OutputTag<Tuple3<Long, String, String>> outputTag = new OutputTag<Tuple3<Long, String, String>>("late-data"){};

        /**
         * Windowed aggregation
         */
        SingleOutputStreamOperator<Tuple4<String, String, String, Long>> resultData = filterData.assignTimestampsAndWatermarks(new MyWatermark())
                .keyBy(1, 2)
                .window(TumblingEventTimeWindows.of(Time.seconds(30)))
                .allowedLateness(Time.seconds(30))//allow data up to 30s late
                .sideOutputLateData(outputTag)//capture data that is too late
                .apply(new MyAggFunction());


        //get the data that arrived too late
        DataStream<Tuple3<Long, String, String>> sideOutput = resultData.getSideOutput(outputTag);

        //write the late data to Kafka
        String outTopic = "lateLog";
        Properties outprop = new Properties();
        outprop.setProperty("bootstrap.servers","hadoop110:9092");
        outprop.setProperty("transaction.timeout.ms",60000*15+"");

        FlinkKafkaProducer011<String> myProducer = new FlinkKafkaProducer011<String>(outTopic, new KeyedSerializationSchemaWrapper<String>(new SimpleStringSchema()), outprop, FlinkKafkaProducer011.Semantic.EXACTLY_ONCE);

        sideOutput.map(new MapFunction<Tuple3<Long,String,String>, String>() {
            @Override
            public String map(Tuple3<Long, String, String> value) throws Exception {
                return value.f0+"\t"+value.f1+"\t"+value.f2;
            }
        }).addSink(myProducer);


        /**
         * Write the computed results to Elasticsearch
         */
        List<HttpHost> httpHosts = new ArrayList<>();
        httpHosts.add(new HttpHost("hadoop100", 9200, "http"));

        ElasticsearchSink.Builder<Tuple4<String, String, String, Long>> esSinkBuilder = new ElasticsearchSink.Builder<Tuple4<String, String, String, Long>>(
                httpHosts,
                new ElasticsearchSinkFunction<Tuple4<String, String, String, Long>>() {
                    public IndexRequest createIndexRequest(Tuple4<String, String, String, Long> element) {
                        Map<String, Object> json = new HashMap<>();
                        json.put("time",element.f0);
                        json.put("type",element.f1);
                        json.put("area",element.f2);
                        json.put("count",element.f3);

                        //use time+type+area to guarantee a unique id
                        String id = element.f0.replace(" ","_")+"-"+element.f1+"-"+element.f2;

                        return Requests.indexRequest()
                                .index("auditindex")
                                .type("audittype")
                                .id(id)
                                .source(json);
                    }

                    @Override
                    public void process(Tuple4<String, String, String, Long> element, RuntimeContext ctx, RequestIndexer indexer) {
                        indexer.add(createIndexRequest(element));
                    }
                }
        );
        //bulk flush threshold; in real-world jobs this value should be set much higher
        esSinkBuilder.setBulkFlushMaxActions(1);
        resultData.addSink(esSinkBuilder.build());


        env.execute("DataReport");


    }
}
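
The MyWatermark and MyAggFunction classes imported above live in the linked repository and are not shown in this excerpt. Below is a minimal sketch of what they could look like, assuming the same periodic-watermark pattern as the first example and a simple per-(type, area) count in each window; the actual implementations in the repository may differ.

import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

//Hypothetical periodic watermark assigner for the (time, type, area) tuples
public class MyWatermark implements AssignerWithPeriodicWatermarks<Tuple3<Long, String, String>> {

    private long currentMaxTimestamp = 0L;
    private final long maxOutOfOrderness = 10000L;  //assumed 10s out-of-orderness bound

    @Override
    public Watermark getCurrentWatermark() {
        return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
    }

    @Override
    public long extractTimestamp(Tuple3<Long, String, String> element, long previousElementTimestamp) {
        currentMaxTimestamp = Math.max(element.f0, currentMaxTimestamp);
        return element.f0;
    }
}

//Hypothetical window function: counts records per (type, area) key in each window
public class MyAggFunction implements WindowFunction<Tuple3<Long, String, String>,
        Tuple4<String, String, String, Long>, Tuple, TimeWindow> {

    @Override
    public void apply(Tuple key, TimeWindow window, Iterable<Tuple3<Long, String, String>> input,
                      Collector<Tuple4<String, String, String, Long>> out) throws Exception {
        String type = key.getField(0);
        String area = key.getField(1);

        long count = 0;
        for (Tuple3<Long, String, String> ignored : input) {
            count++;
        }

        //format the window start as the string time field that the ES sink expects
        String time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date(window.getStart()));
        out.collect(new Tuple4<>(time, type, area, count));
    }
}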

3. Computing Top N: keyed grouping, then a ProcessFunction applied to the window output; http://wuchong.me/blog/2018/11/07/use-flink-calculate-hot-items/

/** Computes the Top N hot (most clicked) items in a window; the key is the window timestamp, and the output is the formatted Top N result string */
public static class TopNHotItems extends KeyedProcessFunction<Tuple, ItemViewCount, String> {

  private final int topSize;

  public TopNHotItems(int topSize) {
    this.topSize = topSize;
  }

  // State for buffering items and their click counts; the Top N computation is triggered only after all data for the same window has been collected
  private ListState<ItemViewCount> itemState;

  @Override
  public void open(Configuration parameters) throws Exception {
    super.open(parameters);
    // register the state
    ListStateDescriptor<ItemViewCount> itemsStateDesc = new ListStateDescriptor<>(
        "itemState-state",
        ItemViewCount.class);
    itemState = getRuntimeContext().getListState(itemsStateDesc);
  }

  @Override
  public void processElement(
      ItemViewCount input,
      Context context,
      Collector<String> collector) throws Exception {

    // buffer every record in state
    itemState.add(input);
    // register an event-time timer at windowEnd + 1; when it fires, all item data belonging to the windowEnd window has been collected
    context.timerService().registerEventTimeTimer(input.windowEnd + 1);
  }

  @Override
  public void onTimer(
      long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
    // collect all item click counts received for this window
    List<ItemViewCount> allItems = new ArrayList<>();
    for (ItemViewCount item : itemState.get()) {
      allItems.add(item);
    }
    // clear the state early to free up space
    itemState.clear();
    // sort by click count in descending order
    allItems.sort(new Comparator<ItemViewCount>() {
      @Override
      public int compare(ItemViewCount o1, ItemViewCount o2) {
        return Long.compare(o2.viewCount, o1.viewCount);
      }
    });
    // format the ranking info as a String for easy printing
    StringBuilder result = new StringBuilder();
    result.append("====================================\n");
    result.append("时间: ").append(new Timestamp(timestamp-1)).append("\n");
    for (int i=0;i<topSize;i++) {
      ItemViewCount currentItem = allItems.get(i);
      // No1:  商品ID=12224  浏览量=2413
      result.append("No").append(i).append(":")
            .append("  商品ID=").append(currentItem.itemId)
            .append("  浏览量=").append(currentItem.viewCount)
            .append("\n");
    }
    result.append("====================================\n\n");

    out.collect(result.toString());
  }
}
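
In the linked article this KeyedProcessFunction runs on the per-window counts produced upstream. A minimal sketch of how it is typically wired in, assuming an upstream DataStream<ItemViewCount> named windowedData whose elements expose the itemId, windowEnd, and viewCount fields used above (the exact upstream pipeline is in the linked post):

// group all ItemViewCount records that belong to the same window, then rank within each window
DataStream<String> topItems = windowedData
        .keyBy("windowEnd")
        .process(new TopNHotItems(3));  // emit the top 3 items once a window's data is complete

topItems.print();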

Reposted from https://blog.csdn.net/yezonggang/article/details/90515431