1. Listening on a socket port, event_time, tumbling window, wordcount; https://www.jianshu.com/p/7d524ef8143c
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.StringUtils;

public class DataStreamDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); // use event time
        env.setParallelism(1);                                         // parallelism 1, so the watermark flow is easy to follow
        env.getConfig().setAutoWatermarkInterval(9000);                // emit a watermark every 9 seconds

        DataStream<String> text = env.socketTextStream("localhost", 9900);

        DataStream<Tuple3<String, Long, Integer>> counts = text.filter(new FilterClass()).map(new LineSplitter())
                .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple3<String, Long, Integer>>() {

                    private long currentMaxTimestamp = 0L;
                    private final long maxOutOfOrderness = 10000L; // how much out-of-orderness (lateness) to tolerate

                    // extract the event time from each element
                    @Override
                    public long extractTimestamp(Tuple3<String, Long, Integer> element, long previousElementTimestamp) {
                        long timestamp = element.f1;
                        currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);
                        System.out.println(
                                "get timestamp is " + timestamp + " currentMaxTimestamp " + currentMaxTimestamp);
                        return timestamp;
                    }

                    // generate the current watermark: max event time seen so far minus the tolerated out-of-orderness
                    @Override
                    public Watermark getCurrentWatermark() {
                        System.out.println("wall clock is " + System.currentTimeMillis() + " new watermark "
                                + (currentMaxTimestamp - maxOutOfOrderness));
                        return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
                    }
                }).keyBy(0).timeWindow(Time.seconds(20))
                // .allowedLateness(Time.seconds(10))
                .sum(2);

        counts.print();
        env.execute("Window WordCount");
    }

    // Custom timestamp extractor (alternative for strictly ascending event times)
    // private static class MyTimestamp extends AscendingTimestampExtractor<Tuple3<String, Long, Integer>> {
    //
    //     private static final long serialVersionUID = 1L;
    //
    //     public long extractAscendingTimestamp(Tuple3<String, Long, Integer> element) {
    //         return element.f1;
    //     }
    // }

    // Build the element together with its event time, and set the count to 1
    public static final class LineSplitter implements MapFunction<String, Tuple3<String, Long, Integer>> {
        @Override
        public Tuple3<String, Long, Integer> map(String value) throws Exception {
            String[] tokens = value.toLowerCase().split("\\W+");
            long eventtime = Long.parseLong(tokens[1]);
            return new Tuple3<String, Long, Integer>(tokens[0], eventtime, 1);
        }
    }

    // Filter out strings that are null or whitespace only
    public static final class FilterClass implements FilterFunction<String> {
        @Override
        public boolean filter(String value) throws Exception {
            return !StringUtils.isNullOrWhitespaceOnly(value);
        }
    }
}
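To try this out, start a socket server with netcat and feed it lines in the `word eventTimeMillis` format that LineSplitter expects (the timestamps below are made up for illustration):

$ nc -lk 9900
hello 1000
hello 5000
world 31000

With a 20-second tumbling window and 10 seconds of tolerated out-of-orderness, the element with event time 31000 pushes the next periodic watermark to 21000 (31000 - 10000). That watermark passes the end of the [0, 20000) window and triggers it, so the job prints (hello,1000,2).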
2. Reading JSON data from Kafka, handling late data, writing the results to ES; https://github.com/yezonggang/FlinkProj
The way late data is handled here is worth studying!
package xuwei.tech;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;
import org.apache.flink.util.OutputTag;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import xuwei.tech.function.MyAggFunction;
import xuwei.tech.watermark.MyWatermark;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
*
* bin/kafka-topics.sh --create --topic lateLog --zookeeper localhost:2181 --partitions 5 --replication-factor 1
*
* Created by xuwei.tech on 2018/11/14.
*/
public class DataReport {

    private static Logger logger = LoggerFactory.getLogger(DataReport.class);

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // set the parallelism
        env.setParallelism(5);

        // use event time
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // checkpoint configuration
        env.enableCheckpointing(60000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
        env.getCheckpointConfig().setCheckpointTimeout(10000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        // state backend
        //env.setStateBackend(new RocksDBStateBackend("hdfs://hadoop100:9000/flink/checkpoints",true));

        /**
         * configure the Kafka source
         */
        String topic = "auditLog";
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "hadoop110:9092");
        prop.setProperty("group.id", "con1");

        FlinkKafkaConsumer011<String> myConsumer = new FlinkKafkaConsumer011<>(topic, new SimpleStringSchema(), prop);

        /**
         * read the data from Kafka
         *
         * audit record format:
         * // {"dt":"audit time [yyyy-MM-dd HH:mm:ss]","type":"audit type","username":"auditor name","area":"region"}
         */
        DataStreamSource<String> data = env.addSource(myConsumer);

        /**
         * clean the data
         */
        DataStream<Tuple3<Long, String, String>> mapData = data.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String line) throws Exception {
                JSONObject jsonObject = JSON.parseObject(line);
                String dt = jsonObject.getString("dt");
                long time = 0;
                try {
                    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    Date parse = sdf.parse(dt);
                    time = parse.getTime();
                } catch (ParseException e) {
                    // the bad record could also be written to some other store
                    logger.error("failed to parse timestamp, dt:" + dt, e);
                }
                String type = jsonObject.getString("type");
                String area = jsonObject.getString("area");
                return new Tuple3<>(time, type, area);
            }
        });

        /**
         * filter out malformed records (time == 0 means the timestamp failed to parse)
         */
        DataStream<Tuple3<Long, String, String>> filterData = mapData.filter(new FilterFunction<Tuple3<Long, String, String>>() {
            @Override
            public boolean filter(Tuple3<Long, String, String> value) throws Exception {
                return value.f0 != 0;
            }
        });

        // side-output tag for data that arrives too late
        OutputTag<Tuple3<Long, String, String>> outputTag = new OutputTag<Tuple3<Long, String, String>>("late-data"){};

        /**
         * windowed aggregation
         */
        SingleOutputStreamOperator<Tuple4<String, String, String, Long>> resultData = filterData.assignTimestampsAndWatermarks(new MyWatermark())
                .keyBy(1, 2)
                .window(TumblingEventTimeWindows.of(Time.seconds(30)))
                .allowedLateness(Time.seconds(30))   // tolerate up to 30s of lateness
                .sideOutputLateData(outputTag)       // capture data that arrives later than that
                .apply(new MyAggFunction());

        // get the data that arrived too late
        DataStream<Tuple3<Long, String, String>> sideOutput = resultData.getSideOutput(outputTag);

        // write the late data to Kafka
        String outTopic = "lateLog";
        Properties outprop = new Properties();
        outprop.setProperty("bootstrap.servers", "hadoop110:9092");
        outprop.setProperty("transaction.timeout.ms", 60000 * 15 + "");

        FlinkKafkaProducer011<String> myProducer = new FlinkKafkaProducer011<String>(outTopic, new KeyedSerializationSchemaWrapper<String>(new SimpleStringSchema()), outprop, FlinkKafkaProducer011.Semantic.EXACTLY_ONCE);
        sideOutput.map(new MapFunction<Tuple3<Long, String, String>, String>() {
            @Override
            public String map(Tuple3<Long, String, String> value) throws Exception {
                return value.f0 + "\t" + value.f1 + "\t" + value.f2;
            }
        }).addSink(myProducer);

        /**
         * write the aggregated results to ES
         */
        List<HttpHost> httpHosts = new ArrayList<>();
        httpHosts.add(new HttpHost("hadoop100", 9200, "http"));

        ElasticsearchSink.Builder<Tuple4<String, String, String, Long>> esSinkBuilder = new ElasticsearchSink.Builder<Tuple4<String, String, String, Long>>(
                httpHosts,
                new ElasticsearchSinkFunction<Tuple4<String, String, String, Long>>() {
                    public IndexRequest createIndexRequest(Tuple4<String, String, String, Long> element) {
                        Map<String, Object> json = new HashMap<>();
                        json.put("time", element.f0);
                        json.put("type", element.f1);
                        json.put("area", element.f2);
                        json.put("count", element.f3);

                        // use time+type+area to keep the document id unique
                        String id = element.f0.replace(" ", "_") + "-" + element.f1 + "-" + element.f2;
                        return Requests.indexRequest()
                                .index("auditindex")
                                .type("audittype")
                                .id(id)
                                .source(json);
                    }

                    @Override
                    public void process(Tuple4<String, String, String, Long> element, RuntimeContext ctx, RequestIndexer indexer) {
                        indexer.add(createIndexRequest(element));
                    }
                }
        );

        // bulk flush buffer size; 1 flushes every record, in real work this should be much larger
        esSinkBuilder.setBulkFlushMaxActions(1);

        resultData.addSink(esSinkBuilder.build());

        env.execute("DataReport");
    }
}
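The MyWatermark and MyAggFunction classes are defined in the linked repo and not reproduced above. Below is a minimal sketch of what they plausibly look like, given how they are used; the 10-second out-of-orderness bound and the time format of the output field are assumptions, not copied from the repo.

import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

// Periodic watermark assigner for the cleaned Tuple3<time, type, area> stream
// (the 10-second out-of-orderness bound is an assumption).
public class MyWatermark implements AssignerWithPeriodicWatermarks<Tuple3<Long, String, String>> {

    private long currentMaxTimestamp = 0L;
    private final long maxOutOfOrderness = 10000L; // assumed bound

    @Override
    public long extractTimestamp(Tuple3<Long, String, String> element, long previousElementTimestamp) {
        currentMaxTimestamp = Math.max(element.f0, currentMaxTimestamp);
        return element.f0;
    }

    @Override
    public Watermark getCurrentWatermark() {
        return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
    }
}

// Window function that counts the elements of each (type, area) window and emits
// (window time, type, area, count). The time string must contain spaces, since the
// ES sink above replaces them when building the document id.
public class MyAggFunction implements WindowFunction<Tuple3<Long, String, String>, Tuple4<String, String, String, Long>, Tuple, TimeWindow> {

    @Override
    public void apply(Tuple key, TimeWindow window, Iterable<Tuple3<Long, String, String>> input,
                      Collector<Tuple4<String, String, String, Long>> out) throws Exception {
        String type = key.getField(0);  // first key field: type
        String area = key.getField(1);  // second key field: area
        long count = 0;
        for (Tuple3<Long, String, String> ignored : input) {
            count++;
        }
        String time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date(window.getEnd()));
        out.collect(new Tuple4<>(time, type, area, count));
    }
}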
3. Computing TopN: group by key, then run a ProcessFunction over the window results; http://wuchong.me/blog/2018/11/07/use-flink-calculate-hot-items/
/** Computes the top-N hot items of a window; the key is the window end timestamp, the output is the formatted TopN result string. */
public static class TopNHotItems extends KeyedProcessFunction<Tuple, ItemViewCount, String> {

    private final int topSize;

    public TopNHotItems(int topSize) {
        this.topSize = topSize;
    }

    // State holding the (item, click count) pairs; TopN is computed once all data of a window has arrived
    private ListState<ItemViewCount> itemState;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        // register the state
        ListStateDescriptor<ItemViewCount> itemsStateDesc = new ListStateDescriptor<>(
                "itemState-state",
                ItemViewCount.class);
        itemState = getRuntimeContext().getListState(itemsStateDesc);
    }

    @Override
    public void processElement(
            ItemViewCount input,
            Context context,
            Collector<String> collector) throws Exception {
        // buffer every record in state
        itemState.add(input);
        // register an event-time timer for windowEnd + 1; when it fires, all data of the windowEnd window has been collected
        context.timerService().registerEventTimeTimer(input.windowEnd + 1);
    }

    @Override
    public void onTimer(
            long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        // collect all item click counts received for this window
        List<ItemViewCount> allItems = new ArrayList<>();
        for (ItemViewCount item : itemState.get()) {
            allItems.add(item);
        }
        // clear the state early to free the space
        itemState.clear();
        // sort by click count, descending (Long.compare avoids the overflow of casting a long difference to int)
        allItems.sort(new Comparator<ItemViewCount>() {
            @Override
            public int compare(ItemViewCount o1, ItemViewCount o2) {
                return Long.compare(o2.viewCount, o1.viewCount);
            }
        });
        // format the ranking into a String for printing
        StringBuilder result = new StringBuilder();
        result.append("====================================\n");
        result.append("time: ").append(new Timestamp(timestamp - 1)).append("\n");
        for (int i = 0; i < Math.min(topSize, allItems.size()); i++) {
            ItemViewCount currentItem = allItems.get(i);
            // e.g.: No1: itemId=12224 views=2413
            result.append("No").append(i + 1).append(":")
                    .append(" itemId=").append(currentItem.itemId)
                    .append(" views=").append(currentItem.viewCount)
                    .append("\n");
        }
        result.append("====================================\n\n");
        out.collect(result.toString());
    }
}
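For context: in the original post, TopNHotItems is the last step of the pipeline. Per-item counts are produced by a sliding window and then re-keyed by windowEnd, so that all counts of one window arrive at the same ProcessFunction instance before its timer fires. A rough sketch of the wiring, assuming the upstream pvData stream (timestamped click events) and the CountAgg / WindowResultFunction helpers defined in the post:

// ItemViewCount as used above: a POJO with public fields, so Flink can key by field name.
public static class ItemViewCount {
    public long itemId;     // item id
    public long windowEnd;  // end timestamp of the window this count belongs to
    public long viewCount;  // click count of the item within that window

    public static ItemViewCount of(long itemId, long windowEnd, long viewCount) {
        ItemViewCount result = new ItemViewCount();
        result.itemId = itemId;
        result.windowEnd = windowEnd;
        result.viewCount = viewCount;
        return result;
    }
}

// Inside main(), after assigning timestamps and watermarks to pvData:
DataStream<String> topItems = pvData
        .keyBy("itemId")                                        // group by item
        .timeWindow(Time.minutes(60), Time.minutes(5))          // 1-hour window, sliding every 5 minutes
        .aggregate(new CountAgg(), new WindowResultFunction())  // -> DataStream<ItemViewCount>
        .keyBy("windowEnd")                                     // gather all counts of the same window
        .process(new TopNHotItems(3));                          // rank them once the window is complete
topItems.print();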