Below is the complete UDTF code. As usual, the explanations are in the comments.
package udf;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.table.functions.FunctionContext;
import org.apache.flink.table.functions.TableFunction;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
// UDTF that deduplicates records by rowkey: if the rowkey already exists in HBase the record is a duplicate
// and (rowkey, 1) is emitted; otherwise the rowkey is written to HBase and (rowkey, -1) is emitted.
public class DeduplicationUDTF extends TableFunction<Tuple2<String, Integer>> {
private static final Logger log = LoggerFactory.getLogger(DeduplicationUDTF.class);
// The HBase client objects are not serializable, so they are marked transient and created in open().
private transient Connection hConnection;
private transient Table table;
private transient BufferedMutator mutator;
private final String zkIp;
private final String zkPort;
private final String tableName;
private final String cf;
private final String col;
public DeduplicationUDTF(String zkIp, String zkPort, String tableName, String cf, String col) {
this.zkIp = zkIp;
this.zkPort = zkPort;
this.tableName = tableName;
this.cf = cf;
this.col = col;
}
// Called once per input row: look up the rowkey in HBase to decide whether it has been seen before.
public void eval(String rowkey) {
Get get = new Get(Bytes.toBytes(rowkey));
try {
Result result = table.get(get);
if (!result.isEmpty()) {
// The rowkey already exists in HBase, so this record is a duplicate: emit flag 1.
collect(Tuple2.of(rowkey, 1));
} else {
// First occurrence of this rowkey: persist it to HBase and emit flag -1.
Put put = new Put(Bytes.toBytes(rowkey));
put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(col), Bytes.toBytes("1"));
mutator.mutate(put);
mutator.flush();
collect(Tuple2.of(rowkey, -1));
}
} catch (IOException e) {
log.error("get from hbase error!", e);
}
}
@Override
public void open(FunctionContext context) throws Exception {
super.open(context);
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", zkIp);
config.set("hbase.zookeeper.property.clientPort", zkPort);
hConnection = ConnectionFactory.createConnection(config);
table = hConnection.getTable(TableName.valueOf(tableName));
// writeBufferSize(-1) keeps the default write buffer size taken from the configuration (hbase.client.write.buffer).
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf(tableName))
.writeBufferSize(-1);
mutator = hConnection.getBufferedMutator(params);
}
@Override
public void close() {
try {
super.close();
} catch (Exception e) {
log.error("super class close error!", e);
throw new RuntimeException(e);
}
if (table != null) {
try {
table.close();
} catch (IOException e) {
log.error("table close error!", e);
throw new RuntimeException(e);
}
}
if (mutator != null) {
try {
mutator.close();
} catch (IOException e) {
log.error("mutator close error!", e);
throw new RuntimeException(e);
}
}
if (hConnection != null) {
try {
hConnection.close();
} catch (IOException e) {
log.error("Connection close error!", e);
throw new RuntimeException(e);
}
}
}
}
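Note that the UDTF assumes the HBase table already exists with the matching column family. A minimal sketch for creating it with the HBase Admin API is shown below; the table name test, column family cf, and ZooKeeper address/port are the same values passed to the UDTF in the main class, and the class name CreateHBaseTable is just an illustrative placeholder.

package udf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateHBaseTable {
    public static void main(String[] args) throws Exception {
        Configuration config = HBaseConfiguration.create();
        // Same ZooKeeper address/port that the UDTF is registered with in the main class.
        config.set("hbase.zookeeper.quorum", "127.0.0.1");
        config.set("hbase.zookeeper.property.clientPort", "2182");
        try (Connection connection = ConnectionFactory.createConnection(config);
             Admin admin = connection.getAdmin()) {
            TableName tableName = TableName.valueOf("test");
            if (!admin.tableExists(tableName)) {
                // One column family "cf", matching the cf/col parameters used by the UDTF.
                HTableDescriptor descriptor = new HTableDescriptor(tableName);
                descriptor.addFamily(new HColumnDescriptor("cf"));
                admin.createTable(descriptor);
            }
        }
    }
}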
Below is the code of the main class.
package tutorial;
import org.apache.flink.table.api.Table;
import org.apache.flink.types.Row;
import udf.DeduplicationUDTF;
import static util.FlinkConstant.*;
public class FlinkSql08 {
public static final String KAFKA_TABLE_SOURCE_DDL = "" +
"CREATE TABLE t1 (\n" +
" user_id BIGINT,\n" +
" item_id BIGINT,\n" +
" category_id BIGINT,\n" +
" behavior STRING,\n" +
" ts BIGINT,\n" +
" p AS PROCTIME()" +
") WITH (\n" +
" 'connector.type' = 'kafka', -- 指定连接类型是kafka\n" +
" 'connector.version' = '0.11', -- 与我们之前Docker安装的kafka版本要一致\n" +
" 'connector.topic' = '08_test', -- 之前创建的topic \n" +
" 'connector.properties.group.id' = '08_test', -- 消费者组,相关概念可自行百度\n" +
" 'connector.startup-mode' = 'earliest-offset', --指定从最早消费\n" +
" 'connector.properties.zookeeper.connect' = 'localhost:2181', -- zk地址\n" +
" 'connector.properties.bootstrap.servers' = 'localhost:9092', -- broker地址\n" +
" 'format.type' = 'json' -- json格式,和topic中的消息格式保持一致\n" +
")";
public static final String MYSQL_TABLE_SINK_DDL = "" +
"CREATE TABLE `t2` (\n" +
" `user_id` BIGINT ,\n" +
" `item_id` BIGINT ,\n" +
" `behavior` STRING ,\n" +
" `category_id` BIGINT ,\n" +
" `ts` BIGINT \n" +
")WITH (\n" +
" 'connector.type' = 'jdbc', -- 连接方式\n" +
" 'connector.url' = 'jdbc:mysql://localhost:3306/test', -- jdbc的url\n" +
" 'connector.table' = 'user_behavior', -- 表名\n" +
" 'connector.driver' = 'com.mysql.jdbc.Driver', -- 驱动名字,可以不填,会自动从上面的jdbc url解析 \n" +
" 'connector.username' = 'root', -- 顾名思义 用户名\n" +
" 'connector.password' = '123456' , -- 密码\n" +
" 'connector.write.flush.max-rows' = '5000', -- 意思是攒满多少条才触发写入 \n" +
" 'connector.write.flush.interval' = '2s' -- 意思是攒满多少秒才触发写入;这2个参数,无论数据满足哪个条件,就会触发写入\n" +
")";
public static final String ES_TABLE_SINK_DDL = "" +
"CREATE TABLE `t3` (\n" +
" `user_id` BIGINT ,\n" +
" `item_id` BIGINT ,\n" +
" `behavior` STRING ,\n" +
" `category_id` BIGINT ,\n" +
" `ts` BIGINT \n" +
")WITH (\n" +
" 'connector.type' = 'elasticsearch', -- required: specify this table type is elasticsearch\n" +
" 'connector.version' = '6', -- required: valid connector versions are \"6\"\n" +
" 'connector.hosts' = 'http://127.0.0.1:9200', -- required: one or more Elasticsearch hosts to connect to\n" +
" 'connector.index' = 'user', -- required: Elasticsearch index\n" +
" 'connector.document-type' = 'user', -- required: Elasticsearch document type\n" +
" 'update-mode' = 'upsert', -- optional: update mode when used as table sink. \n" +
" 'connector.flush-on-checkpoint' = 'false', -- optional: disables flushing on checkpoint (see notes below!)\n" +
" 'connector.bulk-flush.max-actions' = '1', -- optional: maximum number of actions to buffer \n" +
" 'connector.bulk-flush.max-size' = '1 mb', -- optional: maximum size of buffered actions in bytes\n" +
" 'connector.bulk-flush.interval' = '1000', -- optional: bulk flush interval (in milliseconds)\n" +
" 'connector.bulk-flush.backoff.max-retries' = '3', -- optional: maximum number of retries\n" +
" 'connector.bulk-flush.backoff.delay' = '1000', -- optional: delay between each backoff attempt\n" +
" 'format.type' = 'json' -- required: Elasticsearch connector requires to specify a format,\n" +
")";
public static void main(String[] args) throws Exception {
tEnv.sqlUpdate(KAFKA_TABLE_SOURCE_DDL);
tEnv.sqlUpdate(MYSQL_TABLE_SINK_DDL);
tEnv.sqlUpdate(ES_TABLE_SINK_DDL);
// Register the UDTF; the arguments are the ZooKeeper IP and port, the HBase table name, column family and column.
tEnv.registerFunction("deDuplication", new DeduplicationUDTF("127.0.0.1", "2182", "test", "cf", "col"));
Table table = tEnv.sqlQuery("select a.* ,b.* from t1 a , \n" +
"LATERAL TABLE(deDuplication(concat_ws('',cast(a.user_id as varchar)))) as b(rowkey,is_duplicate)");
tEnv.toAppendStream(table, Row.class).print("without where filter").setParallelism(1);
Table where = table.where("is_duplicate = -1");
tEnv.toAppendStream(where, Row.class).print("with where filter").setParallelism(1);
env.execute("FlinkSql08");
}
}
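To try the job end to end you need some messages in the 08_test topic whose JSON fields match the t1 schema. A minimal sketch of a test producer is below, assuming the kafka-clients dependency is on the classpath; the broker address and topic come from the DDL above, while the class name TestDataProducer and the field values in the sample message are made up for illustration.

package tutorial;

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class TestDataProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Broker address from the Kafka DDL of table t1.
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // Field names match the t1 schema; the values are arbitrary sample data.
            String message = "{\"user_id\":1,\"item_id\":2,\"category_id\":3,\"behavior\":\"pv\",\"ts\":1589186680}";
            producer.send(new ProducerRecord<>("08_test", message));
            producer.flush();
        }
    }
}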