- 1: Socket data source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.time.Time

object FlinkSource1 {
  def main(args: Array[String]): Unit = {
    // Get the stream execution environment (the program entry point)
    val streamExecution: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val socketText: DataStream[String] = streamExecution.socketTextStream("node01", 8000)
    // Note: this implicit import is required, otherwise the flatMap call below will not compile
    import org.apache.flink.api.scala._
    val result: DataStream[(String, Int)] = socketText.flatMap(x => x.split(" "))
      .map(x => (x, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(5), Time.seconds(5)) // size == slide, so this is a tumbling window over the last 5 seconds of data
      .sum(1)
    // Print the result
    result.print().setParallelism(1)
    // Execute the job
    streamExecution.execute()
  }
}
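- To try this out, start a socket server on node01 before launching the job, for example with netcat: `nc -lk 8000`, then type lines of space-separated words into the netcat session.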
- 2: File data source
- Step 1: Add the Maven dependencies
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0-mr1-cdh5.14.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.0-cdh5.14.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.6.0-cdh5.14.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.6.0-cdh5.14.2</version>
</dependency>
- Step 2: Code implementation
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object FlinkSource2 {
  def main(args: Array[String]): Unit = {
    val executionEnvironment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Read text data from HDFS
    val hdfsStream: DataStream[String] = executionEnvironment.readTextFile("hdfs://node01:8020/flink_input/")
    val result: DataStream[(String, Int)] = hdfsStream.flatMap(x => x.split(" ")).map(x => (x, 1)).keyBy(0).sum(1)
    result.print().setParallelism(1)
    executionEnvironment.execute("hdfsSource")
  }
}
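- readTextFile goes through Flink's FileSystem abstraction, so the same program can read from the local file system instead of HDFS; a minimal sketch (the file:// path is an assumed example):
val localStream: DataStream[String] = executionEnvironment.readTextFile("file:///tmp/flink_input/words.txt")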
- 3: Getting data from an existing collection
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object FlinkSource3 {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Build a bounded stream directly from in-memory elements
    val value: DataStream[String] = environment.fromElements[String]("hadoop hive", "spark flink")
    val result2: DataStream[(String, Int)] = value.flatMap(x => x.split(" ")).map(x => (x, 1)).keyBy(0).sum(1)
    result2.print().setParallelism(1)
    environment.execute()
  }
}
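- Besides fromElements, a stream can also be built from a regular Scala collection with fromCollection; a minimal sketch (the object name FlinkSource3b is illustrative):
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object FlinkSource3b {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // fromCollection turns an in-memory Seq into a bounded DataStream
    val words: DataStream[String] = environment.fromCollection(List("hadoop hive", "spark flink"))
    words.print().setParallelism(1)
    environment.execute()
  }
}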
- 4: Custom data sources
- If the sources that ship with Flink do not cover your use case, you can define your own.
- Flink provides many ready-made source implementations, and you can also write a custom source: implement the SourceFunction interface for a source without parallelism (parallelism fixed at 1), or implement the ParallelSourceFunction interface or extend RichParallelSourceFunction for a source with parallelism.
- Step 1: Define a class that implements the SourceFunction interface
import org.apache.flink.streaming.api.functions.source.SourceFunction

class MySource extends SourceFunction[String] {
  // volatile so the flag set by cancel() on another thread is visible in run()
  @volatile var isRunning: Boolean = true

  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    // Keep emitting records until cancel() is called
    while (isRunning) {
      sourceContext.collect("hello world")
    }
  }

  override def cancel(): Unit = {
    isRunning = false
  }
}
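- As written, run() emits records in a tight loop, which can flood downstream operators during testing. A throttled variant of run() slows it down (the one-second interval is an arbitrary choice for illustration):
  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      sourceContext.collect("hello world")
      Thread.sleep(1000) // emit roughly one record per second
    }
  }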
- Step 2: Use the custom source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.time.Time

object FlinkSource4 {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val result: DataStream[String] = environment.addSource(new MySource)
    val tupleResult: DataStream[(String, Int)] = result.flatMap(x => x.split(" ")).map(x => (x, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(1)) // keyed tumbling window: aggregate one second of data at a time
      .sum(1)
    tupleResult.print().setParallelism(1)
    environment.execute()
  }
}
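- Note: because MySource implements the plain SourceFunction interface, the resulting source operator is non-parallel; asking for more, e.g. environment.addSource(new MySource).setParallelism(2), is rejected when the job is built.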
- 5: Custom data source with multiple parallelism
- To implement a source with multiple parallelism, implement the ParallelSourceFunction interface or extend RichParallelSourceFunction.
- Step 1: Implement the ParallelSourceFunction interface in Scala
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}

class MyParalleSource extends ParallelSourceFunction[String] {
  // volatile so the cancel flag is visible across threads
  @volatile var isRunning: Boolean = true

  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    // Keep emitting until cancel() flips the flag
    while (isRunning) {
      sourceContext.collect("hello world")
    }
  }

  override def cancel(): Unit = {
    isRunning = false
  }
}
- Step 2: Use the custom source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object FlinkSource5 {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Each parallel instance of MyParalleSource runs its own copy of run()
    val sourceStream: DataStream[String] = environment.addSource(new MyParalleSource)
    val result: DataStream[(String, Int)] = sourceStream.flatMap(x => x.split(" ")).map(x => (x, 1))
      .keyBy(0)
      .sum(1)
    result.print().setParallelism(2)
    environment.execute("paralleSource")
  }
}
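- The section above also mentions RichParallelSourceFunction. Extending it gives each parallel instance the rich-function lifecycle hooks (open/close) and access to the runtime context, which is useful for managing per-instance resources such as connections. A minimal sketch (the class name MyRichParalleSource is illustrative):
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

class MyRichParalleSource extends RichParallelSourceFunction[String] {
  @volatile var isRunning: Boolean = true

  // Called once per parallel instance before run(); open connections or resources here
  override def open(parameters: Configuration): Unit = {}

  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      sourceContext.collect("hello world")
    }
  }

  override def cancel(): Unit = {
    isRunning = false
  }

  // Called on shutdown; release whatever open() acquired
  override def close(): Unit = {}
}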