Flink: 1.8.2
Kafka: 0.10.1
Hadoop: CDH Hadoop 3.0.0
Sample data (fields are userId, itemId, categoryId, behavior, timestamp):
910690,3118339,3189162,buy,1511682220
953517,2541156,3607361,fav,1511682220
227081,2145764,982926,pv,1511682220
777480,3566166,3645362,fav,1511682220
225452,495241,4697972,pv,1511682220
Implementation code

First, the POJO that maps one record. ParquetAvroWriters.forReflectRecord derives an Avro schema from this class via reflection, so plain fields with getters/setters (and the implicit no-arg constructor) are all it needs:
package pojo;

public class UserBehaviour {
    long userId;
    long itemId;
    long categoryId;
    String behavior;
    long timestamps;

    public long getUserId() {
        return userId;
    }

    public void setUserId(long userId) {
        this.userId = userId;
    }

    public long getItemId() {
        return itemId;
    }

    public void setItemId(long itemId) {
        this.itemId = itemId;
    }

    public long getCategoryId() {
        return categoryId;
    }

    public void setCategoryId(long categoryId) {
        this.categoryId = categoryId;
    }

    public String getBehavior() {
        return behavior;
    }

    public void setBehavior(String behavior) {
        this.behavior = behavior;
    }

    public long getTimestamps() {
        return timestamps;
    }

    public void setTimestamps(long timestamps) {
        this.timestamps = timestamps;
    }
}
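Since the writer derives the Avro schema from this class by reflection, it can help to print that schema once and check the field types before running the job. A minimal sketch, assuming the POJO above (ReflectData comes in transitively via the parquet-avro dependency; the object name SchemaCheck is illustrative):

import org.apache.avro.reflect.ReflectData
import pojo.UserBehaviour

object SchemaCheck {
  def main(args: Array[String]): Unit = {
    // Prints the Avro schema that reflection derives from UserBehaviour's fields.
    println(ReflectData.get().getSchema(classOf[UserBehaviour]).toString(true))
  }
}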
Then the streaming job. Note that SimpleStringSchema should come from org.apache.flink.api.common.serialization; the org.apache.flink.streaming.util.serialization one is deprecated in Flink 1.8:

import java.time.ZoneId
import java.util.Properties

import org.apache.flink.api.common.serialization.{BulkWriter, SimpleStringSchema}
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.Path
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010
import pojo.UserBehaviour
object StreamingFileSinkParquetWriterDemo {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // StreamingFileSink only finalizes part files on completed checkpoints,
    // so checkpointing must be enabled.
    env.enableCheckpointing(60000L)

    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "127.0.0.1:9092,...")
    // A group id is needed for committing offsets on checkpoints
    // (the name here is just a placeholder).
    properties.setProperty("group.id", "user-behaviour-demo")
    properties.setProperty("auto.offset.reset", "earliest")

    val dss: DataStream[UserBehaviour] = env
      .addSource(
        new FlinkKafkaConsumer010[String]("topic_name", new SimpleStringSchema(), properties)
      )
      .map(a => getUserBehaviour(a))

    val ssink = StreamingFileSink
      .forBulkFormat(
        new Path("hdfs://namenode_ip:8020/test..."),
        // The explicit cast helps Scala unify the Java generics of
        // ParquetWriterFactory with BulkWriter.Factory.
        ParquetAvroWriters
          .forReflectRecord(classOf[UserBehaviour])
          .asInstanceOf[BulkWriter.Factory[UserBehaviour]]
      )
      // Hourly buckets named from processing time with the given pattern.
      .withBucketAssigner(
        new DateTimeBucketAssigner[UserBehaviour]("yyyy-MM-dd--HH", ZoneId.of("Asia/Shanghai"))
      )
      .withBucketCheckInterval(6000L) // bucket check interval: 6 s
      .build()

    dss.addSink(ssink)
    env.execute()
  }
  private def getUserBehaviour(value: String): UserBehaviour = {
    val ub = new UserBehaviour
    if (value != null && value.nonEmpty) {
      val strArr = value.split(",")
      ub.setUserId(strArr(0).toLong)
      ub.setItemId(strArr(1).toLong)
      ub.setCategoryId(strArr(2).toLong)
      ub.setBehavior(strArr(3))
      ub.setTimestamps(strArr(4).toLong)
    }
    ub
  }
}
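Two behaviours worth keeping in mind: with a bulk format the sink rolls a new part file on every checkpoint, and part files only move from in-progress/pending to finished when a checkpoint completes, so no readable output appears until then. If you want the checkpoint setup to be more explicit, a minimal sketch (the mode and pause values are assumptions, not from the job above):

import org.apache.flink.streaming.api.CheckpointingMode

// Part files are finalized only on completed checkpoints; make the
// checkpoint cadence an explicit choice rather than a default.
env.enableCheckpointing(60000L, CheckpointingMode.EXACTLY_ONCE)
env.getCheckpointConfig.setMinPauseBetweenCheckpoints(30000L)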
The Maven build needs these two dependencies:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-parquet</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-avro</artifactId>
    <version>1.10.1</version>
    <scope>compile</scope>
</dependency>
Flink's lib directory also needs flink-shaded-hadoop-2-uber-2.8.3-7.0.jar so the HDFS filesystem classes are available at runtime.
If HDFS is secured with Kerberos, you will have to sort out authentication yourself; Flink's security.kerberos.login.keytab and security.kerberos.login.principal settings in flink-conf.yaml are the usual starting point.
One more thing: if the project itself bundles Hadoop jars, submitting to YARN can fail with class conflicts. The usual fix is to mark Hadoop dependencies as provided, as sketched below.
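A minimal sketch of that fix, assuming the POM pulls in hadoop-client directly (the artifact and version here are illustrative, not from the original project):

<!-- Illustrative: keep Hadoop classes out of the job jar; the YARN
     cluster and the shaded uber jar above already provide them. -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.0.0</version>
    <scope>provided</scope>
</dependency>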