Flink Learning Notes: Custom Data Sources

  • 1: Socket data source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.time.Time

object FlinkSource1 {
  def main(args: Array[String]): Unit = {
    //get the stream execution environment, the entry point of the program
    val streamExecution: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val socketText: DataStream[String] = streamExecution.socketTextStream("node01",8000)
    //Note: this implicit import is required; without it the flatMap call below will not compile
    import org.apache.flink.api.scala._
    val result: DataStream[(String, Int)] = socketText.flatMap(x => x.split(" "))
      .map(x => (x, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(5), Time.seconds(5)) //aggregate the data of the last 5 seconds, emitted every 5 seconds
      .sum(1)

    //print the result
    result.print().setParallelism(1)
    //run the job
    streamExecution.execute()
  }
}
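
Since the window size and the slide above are both 5 seconds, this sliding window behaves exactly like a tumbling window; the one-argument overload of timeWindow expresses that directly. A minimal equivalent sketch, reusing socketText and the imports from the example above:

//size == slide, so a single argument gives the same tumbling behavior
val tumblingResult: DataStream[(String, Int)] = socketText.flatMap(x => x.split(" "))
  .map(x => (x, 1))
  .keyBy(0)
  .timeWindow(Time.seconds(5)) //tumbling 5-second window
  .sum(1)

To feed either version, start a socket server on node01 first, e.g. with nc -lk 8000.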
  • 2: File data source
    • Step 1: add the Maven dependencies
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>
    
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0-mr1-cdh5.14.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.14.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0-cdh5.14.2</version>
    </dependency>
    
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.6.0-cdh5.14.2</version>
    </dependency>
    
    • Step 2: code implementation
    import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

    object FlinkSource2 {
      def main(args: Array[String]): Unit = {
        val executionEnvironment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        import org.apache.flink.api.scala._
        //read text data from HDFS
        val hdfsStream: DataStream[String] = executionEnvironment.readTextFile("hdfs://node01:8020/flink_input/")
        val result: DataStream[(String, Int)] = hdfsStream.flatMap(x => x.split(" ")).map(x => (x, 1)).keyBy(0).sum(1)
    
        result.print().setParallelism(1)
    
        executionEnvironment.execute("hdfsSource")
      }
    }
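
    readTextFile reads the files under the path once and then the source finishes. If the job should keep watching the directory for newly arriving files, readFile with FileProcessingMode.PROCESS_CONTINUOUSLY can be used instead. A minimal sketch, reusing executionEnvironment and the implicit import from the example above (the 10-second scan interval is an arbitrary choice):

    import org.apache.flink.api.java.io.TextInputFormat
    import org.apache.flink.core.fs.Path
    import org.apache.flink.streaming.api.functions.source.FileProcessingMode

    //re-scan the directory every 10 seconds and emit the contents of newly appearing files
    val monitoredStream: DataStream[String] = executionEnvironment.readFile(
      new TextInputFormat(new Path("hdfs://node01:8020/flink_input/")),
      "hdfs://node01:8020/flink_input/",
      FileProcessingMode.PROCESS_CONTINUOUSLY,
      10000L) //scan interval in milliseconds

    Note that under PROCESS_CONTINUOUSLY a modified file is re-processed in its entirety, so this mode suits directories where new files are added rather than existing files appended.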
    
  • 3: Getting data from an existing collection
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object FlinkSource3 {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val value: DataStream[String] = environment.fromElements[String]("hadoop hive","spark flink")
    val result2: DataStream[(String, Int)] = value.flatMap(x => x.split(" ")).map(x =>(x,1)).keyBy(0).sum(1)
    result2.print().setParallelism(1)
    environment.execute()
  }
}
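
The example above builds the stream from literal elements; to start from a pre-existing Scala collection, fromCollection works the same way. A minimal sketch, reusing environment and the implicit import from the example above:

//build the stream from an existing Scala collection instead of literal elements
val lines = List("hadoop hive", "spark flink")
val collectionStream: DataStream[String] = environment.fromCollection(lines)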
  • 4: Custom data sources
    • If the data sources that ship with Flink do not cover your needs, you can define your own.

    • Flink provides a large number of ready-made source implementations, and you can also define your own source:
      implement the SourceFunction interface for a source without parallelism (it always runs with a parallelism of 1),
      or implement the ParallelSourceFunction interface or extend RichParallelSourceFunction for a source with parallelism.

    • Step 1: define a class that implements the SourceFunction interface

    import org.apache.flink.streaming.api.functions.source.SourceFunction

    class MySource extends SourceFunction[String] {
      var isRunning: Boolean = true
      override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
        while (isRunning) {
          sourceContext.collect("hello world")
          Thread.sleep(1000) //throttle the loop so the demo does not flood downstream operators
        }
      }
      override def cancel(): Unit = {
          isRunning = false
      }
    }
    
    • Step 2: use the custom source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.time.Time

object FlinkSource4 {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val result: DataStream[String] = environment.addSource(new MySource)
    val tupleResult: DataStream[(String, Int)] = result.flatMap(x => x.split(" ")).map(x => (x, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(1))  //tumbling window: process the collected data once per second
      .sum(1)
    tupleResult.print().setParallelism(1)
    environment.execute()
  }
}
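
Note that a source implementing the plain SourceFunction interface is non-parallel: Flink only accepts a parallelism of 1 for it and rejects anything higher when the job graph is built. A small sketch, reusing environment and MySource from above:

//a plain SourceFunction source is non-parallel, so its parallelism must stay at 1
val singleStream: DataStream[String] = environment.addSource(new MySource).setParallelism(1) //OK
//environment.addSource(new MySource).setParallelism(2) //rejected: not a parallel source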
  • 5: A custom data source with parallelism
    • To implement a source with multiple parallel instances, implement the ParallelSourceFunction interface or extend RichParallelSourceFunction.
    • Step 1: implement the ParallelSourceFunction interface in Scala
    import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
    
    class MyParallelSource extends ParallelSourceFunction[String] {
      var isRunning:Boolean = true
    
      override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
        while (isRunning) { //check the flag so that cancel() can actually stop the source
          sourceContext.collect("hello world")
          Thread.sleep(1000) //throttle the demo output
        }
      }
      override def cancel(): Unit = {
        isRunning = false
      }
    }
    
    • Step 2: use the custom source
    import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

    object FlinkSource5 {
      def main(args: Array[String]): Unit = {
        val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        import org.apache.flink.api.scala._
        val sourceStream: DataStream[String] = environment.addSource(new MyParallelSource)
        val result: DataStream[(String, Int)] = sourceStream.flatMap(x => x.split(" ")).map(x => (x, 1))
          .keyBy(0)
          .sum(1)
        result.print().setParallelism(2)
        environment.execute("parallelSource")
      }
    }
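
    The steps above cover ParallelSourceFunction; the notes also mention RichParallelSourceFunction but do not show it. The rich variant adds the open/close lifecycle and access to the runtime context, which is useful for per-instance setup such as opening connections. A minimal sketch along the same hello-world lines:

    import org.apache.flink.configuration.Configuration
    import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

    class MyRichParallelSource extends RichParallelSourceFunction[String] {
      var isRunning: Boolean = true

      //open() runs once per parallel instance before run(); a good place for per-instance setup
      override def open(parameters: Configuration): Unit = {
        println(s"opening source subtask ${getRuntimeContext.getIndexOfThisSubtask}")
      }

      override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
        while (isRunning) {
          sourceContext.collect("hello world")
          Thread.sleep(1000) //throttle the demo output
        }
      }

      override def cancel(): Unit = {
        isRunning = false
      }

      //close() runs once per parallel instance when the task shuts down
      override def close(): Unit = {
        println("closing source subtask")
      }
    }

    It plugs in exactly like the other sources: environment.addSource(new MyRichParallelSource).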
    

Reposted from blog.csdn.net/qq_26719997/article/details/105042766