Flink: DataStream API Development

Getting Started Example

The general workflow of a Flink stream processing program

  1. Obtain the Flink stream processing execution environment

  2. Build the source

  3. Process the data

  4. Build the sink (a minimal skeleton covering these four steps is sketched below)
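
As a quick orientation, a minimal skeleton of these four steps might look like the sketch below; the socket host/port and the uppercase map are placeholders for illustration, not part of the course examples.

import org.apache.flink.streaming.api.scala._

object StreamSkeleton {
  def main(args: Array[String]): Unit = {
    // 1. Obtain the stream processing execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // 2. Build the source (a socket source is assumed here)
    val source: DataStream[String] = env.socketTextStream("localhost", 9999)
    // 3. Process the data
    val processed: DataStream[String] = source.map(_.toUpperCase)
    // 4. Build the sink, then launch the job
    processed.print()
    env.execute("StreamSkeleton")
  }
}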

Example

Write a Flink program that counts the number of words.

Steps

  1. Obtain the Flink stream processing execution environment

  2. Build a socket source

  3. Use Flink operators to count the words

  4. Print the result

Note: if the nc service is not installed on Linux, install it with yum:

yum install -y nc
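
Before running the job, start a listener on the port used in the code (7777 in the reference code below) and type words into it:

nc -lk 7777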

Reference code

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow

/**
 * A simple streaming word count.
 * The Flink program receives word data from a socket and counts the words.
 */
object StreamWordCount {

  def main(args: Array[String]): Unit = {
    /**
     * Implementation steps:
     * 1. Obtain the stream processing execution environment
     * 2. Build a socket stream source with an IP address and port
     * 3. Convert the received data into word tuples
     * 4. Use keyBy to partition (group) the stream
     * 5. Use timeWindow to set the window length (compute every 5 seconds)
     * 6. Use sum to accumulate the counts
     * 7. Print the result
     * 8. Launch the job
     * 9. On Linux, listen on the port with nc -lk <port> and send words
     */
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    //2. Build a socket stream source with an IP address and port
    val textDataStream: DataStream[String] = env.socketTextStream("node01", 7777)
    //3. Convert the received data into word tuples
    val wordDataStream: DataStream[(String, Int)] = textDataStream.flatMap(_.split(" ")).map(_ -> 1)
    //4. Use keyBy to partition (group) the stream
    //In batch processing, a DataSet is grouped with groupBy
    //In stream processing, a DataStream is grouped (partitioned) with keyBy
    val groupedDataStream: KeyedStream[(String, Int), Tuple] = wordDataStream.keyBy(0)
    //5. Use timeWindow to set the window length (compute every 5 seconds)
    //Spark equivalent: reduceByKeyAndWindow
    val windowDataStream: WindowedStream[(String, Int), Tuple, TimeWindow] = groupedDataStream.timeWindow(
      Time.seconds(5))
    //6. Use sum to accumulate the counts
    val sumDataStream = windowDataStream.sum(1)
    sumDataStream.print()
    env.execute()
  }
}

Data Sources (input datasets)

In Flink you can use StreamExecutionEnvironment.addSource(source) to add a data source to your program.
Flink already ships with a number of ready-made source functions. You can also write a custom non-parallel source by implementing SourceFunction, or a custom parallel source by implementing the ParallelSourceFunction interface or extending RichParallelSourceFunction.
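
For orientation, the three customization points can be sketched side by side as below; the class names and the emitted "tick" values are placeholders for this sketch, and full runnable examples follow in the Custom source section.

import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, RichParallelSourceFunction, SourceFunction}

// Non-parallel source: always runs with parallelism 1
class MyNonParallelSource extends SourceFunction[String] {
  @volatile private var running = true
  override def run(ctx: SourceFunction.SourceContext[String]): Unit =
    while (running) { ctx.collect("tick"); Thread.sleep(1000) }
  override def cancel(): Unit = running = false
}

// Parallel source: one run() instance per parallel subtask
class MyParallelSource extends ParallelSourceFunction[String] {
  @volatile private var running = true
  override def run(ctx: SourceFunction.SourceContext[String]): Unit =
    while (running) { ctx.collect("tick"); Thread.sleep(1000) }
  override def cancel(): Unit = running = false
}

// Rich parallel source: additionally provides open()/close() lifecycle hooks and the runtime context
class MyRichParallelSource extends RichParallelSourceFunction[String] {
  @volatile private var running = true
  override def run(ctx: SourceFunction.SourceContext[String]): Unit =
    while (running) { ctx.collect("tick"); Thread.sleep(1000) }
  override def cancel(): Unit = running = false
}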

Common Sources in Flink stream processing

The sources Flink uses in stream processing are largely the same as those in batch processing.
They fall roughly into four categories:


Collection-based source

File-based source: reads text files (files conforming to the TextInputFormat specification) and returns them as strings

Socket-based source: reads from a socket; elements can be split with a delimiter

Custom source


Collection-based source

env.fromCollection()

package com.czxy.flink.stream.source.collection

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamFromCollectionSource {
  def main(args: Array[String]): Unit = {
    //1. Create the stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    import org.apache.flink.api.scala._
    val collectionDataStream: DataStream[String] = env.fromCollection(List("hadoop spark kafka","hive flink"))
    //3. Print the result
    collectionDataStream.print()
    //4. Execute the program
    env.execute(this.getClass.getSimpleName)
  }
}

env.fromElements()

package com.czxy.flink.stream.source.collection

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamFromElementsSource {
  def main(args: Array[String]): Unit = {
    //1. Create the stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    import org.apache.flink.api.scala._
    val elementDataStream: DataStream[String] = env.fromElements("hadoop hadoop hive flink")
    //3. Print the result
    elementDataStream.print()
    //4. Execute the program
    env.execute("StreamFromElementsSource")
  }
}

env.generateSequence()

package com.czxy.flink.stream.source.collection

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamGenerateSequence {
  def main(args: Array[String]): Unit = {
    //1. Create the stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    val generateSequence: DataStream[Long] = env.generateSequence(0,100)
    //3. Print the result
    generateSequence.print()
    //4. Execute the program
    env.execute(this.getClass.getSimpleName)
  }
}

Comprehensive exercise

package cn.czxy.stream.source

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import scala.collection.immutable.{Queue, Stack}
import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import org.apache.flink.api.scala._

object StreamDataSourceDemo {
  def main(args: Array[String]): Unit = {
    val senv = StreamExecutionEnvironment.getExecutionEnvironment
    //0. Create a DataStream from elements (fromElements)
    val ds0: DataStream[String] = senv.fromElements("spark", "flink")
    ds0.print()
    //1. Create a DataStream from tuples (fromElements)
    val ds1: DataStream[(Int, String)] = senv.fromElements((1, "spark"), (2, "flink"))
    ds1.print()
    //2. Create a DataStream from an Array
    val ds2: DataStream[String] = senv.fromCollection(Array("spark", "flink"))
    ds2.print()
    //3. Create a DataStream from an ArrayBuffer
    val ds3: DataStream[String] = senv.fromCollection(ArrayBuffer("spark", "flink"))
    ds3.print()
    //4. Create a DataStream from a List
    val ds4: DataStream[String] = senv.fromCollection(List("spark", "flink"))
    ds4.print()
    //5. Create a DataStream from a ListBuffer
    val ds5: DataStream[String] = senv.fromCollection(ListBuffer("spark", "flink"))
    ds5.print()
    //6. Create a DataStream from a Vector
    val ds6: DataStream[String] = senv.fromCollection(Vector("spark", "flink"))
    ds6.print()
    //7. Create a DataStream from a Queue
    val ds7: DataStream[String] = senv.fromCollection(Queue("spark", "flink"))
    ds7.print()
    //8. Create a DataStream from a Stack
    val ds8: DataStream[String] = senv.fromCollection(Stack("spark", "flink"))
    ds8.print()
    //9. Create a DataStream from a Stream (a Stream is a lazy List that avoids building unnecessary intermediate collections)
    val ds9: DataStream[String] = senv.fromCollection(Stream("spark", "flink"))
    ds9.print()
    //10. Create a DataStream from a Seq
    val ds10: DataStream[String] = senv.fromCollection(Seq("spark", "flink"))
    ds10.print()
    //11. Create a DataStream from a Set (not supported)
    //val ds11: DataStream[String] = senv.fromCollection(Set("spark", "flink"))
    //ds11.print()
    //12. Create a DataStream from an Iterable (not supported)
    //val ds12: DataStream[String] = senv.fromCollection(Iterable("spark", "flink"))
    //ds12.print()
    //13. Create a DataStream from an ArraySeq
    val ds13: DataStream[String] = senv.fromCollection(mutable.ArraySeq("spark", "flink"))
    ds13.print()
    //14. Create a DataStream from an ArrayStack
    val ds14: DataStream[String] = senv.fromCollection(mutable.ArrayStack("spark", "flink"))
    ds14.print()
    //15. Create a DataStream from a Map (not supported)
    //val ds15: DataStream[(Int, String)] = senv.fromCollection(Map(1 -> "spark", 2 -> "flink"))
    //ds15.print()
    //16. Create a DataStream from a Range
    val ds16: DataStream[Int] = senv.fromCollection(Range(1, 9))
    ds16.print()
    //17. Create a DataStream with generateSequence
    val ds17: DataStream[Long] = senv.generateSequence(1, 9)
    ds17.print()
  }
}

File-based source

package com.czxy.flink.stream.source.file

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

//Build a data source from a file
object StreamFromFileSource {
  def main(args: Array[String]): Unit = {

    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    val fileDataStream: DataStream[String] = env.readTextFile("day03/data/input/wordcount.txt")
    //3. Print the result
    fileDataStream.print()
    //4. Execute the program
    env.execute("StreamFromFileSource")

  }
}

Socket-based source

val source = env.socketTextStream("IP", PORT)
Example:

package com.czxy.flink.stream

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment, WindowedStream}
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow

//Getting-started example: word count
object StreamWordCount {
  def main(args: Array[String]): Unit = {
    /**
     * Implementation steps:
     * 1. Obtain the stream processing execution environment
     * 2. Build a socket stream source with an IP address and port
     * 3. Convert the received data into word tuples
     * 4. Use keyBy to partition (group) the stream
     * 5. Use timeWindow to set the window length (compute every 5 seconds)
     * 6. Use sum to accumulate the counts
     * 7. Print the result
     * 8. Launch the job
     * 9. On Linux, listen on the port with nc -lk <port> and send words
     */
    //1. Create the stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source from a socket
    val socketDataStream: DataStream[String] = env.socketTextStream("node01",9999)
    //3. Process the data
    import org.apache.flink.api.scala._
    //4. Convert the received data into word tuples and use keyBy to partition (group) the stream
    val groupKeyedStream: KeyedStream[(String, Int), Tuple] = socketDataStream.flatMap(x=>x.split(",")).map((_,1)).keyBy(0)
    //5. Use timeWindow to set the window length (compute every 5 seconds)
    val windowedStream: WindowedStream[(String, Int), Tuple, TimeWindow] = groupKeyedStream.timeWindow(Time.seconds(5))
    //6. Use sum to accumulate the counts
    val resultDataStream: DataStream[(String, Int)] = windowedStream.sum(1)
    //7. Print the result
    resultDataStream.print()
    //8. Execute the program
    env.execute("StreamWordCount")
  }
}

Custom source

Besides the predefined sources, we can define a custom Source by implementing SourceFunction and then add it with StreamExecutionEnvironment.addSource(sourceFunction), for example a Source that reads data from Kafka:
addSource(new FlinkKafkaConsumer08<>);
We can implement any of the following three interfaces to customize a Source:

SourceFunction: creates a non-parallel data source.

package com.czxy.flink.stream.source.customer

import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

//Custom non-parallel data source
object StreamCustomerNoParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    import org.apache.flink.api.scala._
    val NoParallelDataStream: DataStream[Long] = env.addSource(new NoParallelSource()).setParallelism(1)
    //3. Print the result
    NoParallelDataStream.print()
    //4. Execute the program
    env.execute("StreamCustomerNoParallelSource")
  }
  //A single-threaded source that emits numbers increasing from 1
  class NoParallelSource extends  SourceFunction[Long]() {

    var number:Long=1L
    var isRunning:Boolean=true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning){
        ctx.collect(number)
        number+=1
        Thread.sleep(1)
        if (number>5){
          cancel()
        }
      }
    }
    override def cancel(): Unit = {
      isRunning=false
    }
  }
}

ParallelSourceFunction: creates a parallel data source.

package com.czxy.flink.stream.source.customer

import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamCustomerParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    import org.apache.flink.api.scala._
    val parallelSourceDataStream: DataStream[Long] = env.addSource(new ParallelSource()).setParallelism(2)
    //3. Print the result
    parallelSourceDataStream.print()
    //4. Execute the program
    env.execute("StreamCustomerParallelSource")
  }

  //A parallel data source (used here with parallelism 2)
  //Emits numbers increasing from 1
  class ParallelSource extends ParallelSourceFunction[Long]() {
    var number: Long = 1L
    var isRunning: Boolean = true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }

      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }
  }

}

RichParallelSourceFunction: creates a parallel data source with access to the runtime context and open()/close() lifecycle methods.

package com.czxy.stream.source

import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration

/**
 * Custom parallel data source (rich variant)
 */
object StreamCustomerRichParallelSourceDemo {
  def main(args: Array[String]): Unit = {
    //1. Create the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data stream from the RichParallelSource with parallelism 2
    val richParallelSource: DataStream[Long] = env.addSource(new MyRichParallelSource()).setParallelism(2)
    richParallelSource.map(line => {
      println("Received: " + line)
      line
    })
    env.execute("StreamRichParallelSourceDemo")
  }

  class MyRichParallelSource extends RichParallelSourceFunction[Long] {

    var count = 1L
    var isRunning = true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(count)
        count += 1
        Thread.sleep(1000)
      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }

    override def open(parameters: Configuration): Unit = {
      super.open(parameters)
    }
  }
}

Kafka-based source

Basic Kafka operations

  1. Starting and stopping the Kafka cluster
  2. Note: ZooKeeper must be running before Kafka is started.
  3. Run the following on node01 to start the Kafka process in the background
cd /export/servers/kafka_2.11-1.0.0
nohup bin/kafka-server-start.sh config/server.properties 2>&1 &
------------------------------------------------------------------
Run the following on node02 to start the Kafka process in the background
cd /export/servers/kafka_2.11-1.0.0
nohup bin/kafka-server-start.sh config/server.properties 2>&1 &
------------------------------------------------------------------
Run the following on node03 to start the Kafka process in the background
cd /export/servers/kafka_2.11-1.0.0
nohup bin/kafka-server-start.sh config/server.properties 2>&1 &
------------------------------------------------------------------
Run the following on all three machines to stop the Kafka cluster
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-server-stop.sh
------------------------------------------------------------------
1. Create a topic
Create a topic named test with three partitions and two replicas. Run the following on node01:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-topics.sh --create --zookeeper node01:2181 --replication-factor 2 --partitions 3 --topic test
------------------------------------------------------------------
2. List topics
View the topics that exist in Kafka. Run the following on node01:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-topics.sh --list --zookeeper node01:2181,node02:2181,node03:2181
------------------------------------------------------------------
3. Produce data
Run the following on node01 to simulate a producer writing data:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-console-producer.sh --broker-list node01:9092,node02:9092,node03:9092 --topic test
------------------------------------------------------------------
4. Consume data
Run the following on node02 to simulate a consumer reading data:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-console-consumer.sh --from-beginning --topic test --zookeeper node01:2181,node02:2181,node03:2181
------------------------------------------------------------------
5. Run the describe topics command
Run the following on node01 to view the details of a topic:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-topics.sh --describe --zookeeper node01:2181 --topic test
Explanation of the output:
The first line gives a summary of all partitions; each additional line describes one partition. Since the test topic was created with three partitions, there are three such lines.
"leader" is the node responsible for all reads and writes of the given partition; each node is the leader for a randomly selected share of the partitions. (With multiple replicas Kafka has a leader/follower relationship, and this field shows which broker holds the leader replica.)
"replicas" is the list of nodes that replicate the log of this partition, regardless of whether they are the leader or even currently alive. (the full replica list)
"isr" is the set of "in-sync" replicas, i.e. the subset of the replica list that is currently alive and caught up with the leader. (the list of usable replicas)
------------------------------------------------------------------
6. Increase the number of partitions of a topic
Run the following on any Kafka server to increase the partition count of a topic:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-topics.sh --zookeeper zkhost:port --alter --topic topicName --partitions 8
------------------------------------------------------------------
7. Add a configuration
Run the following on any Kafka server to dynamically modify a topic configuration:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-topics.sh --zookeeper node01:2181 --alter --topic test --config flush.messages=1
------------------------------------------------------------------
8. Delete a configuration
Dynamically delete a topic configuration:
cd /export/servers/kafka_2.11-1.0.0
bin/kafka-topics.sh --zookeeper node01:2181 --alter --topic test --delete-config flush.messages
------------------------------------------------------------------
9. Delete a topic
By default, deleting a topic only marks it for deletion; it is actually removed after Kafka restarts. To delete it immediately, set the following in server.properties:
delete.topic.enable=true
Then run the following command to delete the topic:
bin/kafka-topics.sh --zookeeper node01:2181 --delete --topic test

Flink Kafka source example:

package com.czxy.flink.stream.source.customer

import java.util.Properties

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import org.apache.flink.streaming.util.serialization.SimpleStringSchema

object StreamKafkaSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Configure Kafka
    val topic="test"
    val props = new Properties
    props.setProperty("bootstrap.servers", "node01:9092")
    props.setProperty("group.id", "test01")
    props.setProperty("key.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer: FlinkKafkaConsumer011[String] = new FlinkKafkaConsumer011[String](topic,new SimpleStringSchema(),props)
    //Start consuming from the latest data
    consumer.setStartFromLatest()
    import org.apache.flink.api.scala._
    //3. Add the data source
    val kafkaSource: DataStream[String] = env.addSource(consumer)
    //4. Print the result
    kafkaSource.print()
    //5. Execute the program
    env.execute("StreamKafkaSource")
  }
}

MySQL-based source

The above uses Flink's built-in Kafka source; next we model it to write a Source that reads data from MySQL.

package com.czxy.flink.stream.source.customer

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

//A custom source that reads data from MySQL
object StreamFromMysqlSource {

  case class Student(stuId: Int, stuName: String, stuAddr: String, stuSex: String)

  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Add the data source
    val mysqlSource: DataStream[Student] = env.addSource(new MysqlSource())
    //3. Print the result
    mysqlSource.print()
    //4. Execute the program
    env.execute("StreamFromMysqlSource")
  }

  class MysqlSource extends RichSourceFunction[Student]() {
    //Declare the connection objects
    var connection: Connection = null
    var ps: PreparedStatement = null

    //This method is executed once during initialization
    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test"
      val username = "root"
      val password = "root"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val sql =
        """
          |select id,name,addr,sex
          |from student
          |""".stripMargin
      ps = connection.prepareStatement(sql)
    }

    //run() is called once; it emits one record for each row in the result set
    override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
      val queryResultSet: ResultSet = ps.executeQuery()
      while (queryResultSet.next()) {
        val stuId: Int = queryResultSet.getInt("id")
        val stuName: String = queryResultSet.getString("name")
        val stuAddr: String = queryResultSet.getString("addr")
        val stuSex: String = queryResultSet.getString("sex")
        val student: Student = Student(stuId, stuName, stuAddr, stuSex)
        ctx.collect(student)
      }
    }
    override def cancel(): Unit = {
    }
  }
}

DataStream Transformations

KeyBy

Logically partitions a stream into disjoint partitions, each containing elements with the same key. Internally this is implemented with hash partitioning.

package com.czxy.flink.stream.transformation

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment}

//keyBy grouping operator
object StreamKeyBy {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the dataset
    import org.apache.flink.api.scala._
    val elementSource: DataStream[String] = env.fromElements("hadoop hadoop spark hive flink flink")
    //3. Combine the data into tuples
    val wordAndOne: DataStream[(String, Int)] = elementSource.flatMap(x=>x.split(" ")).map((_,1))
    //4. Group by key
    val KeyedStream: KeyedStream[(String, Int), Tuple] = wordAndOne.keyBy(0)
    //5. Aggregate
    val result: DataStream[(String, Int)] = KeyedStream.reduce((v1,v2)=>(v1._1,v1._2+v2._2))
    //6. Print the result
    result.print().setParallelism(1)
    //7. Execute the program
    env.execute("StreamKeyBy")
  }
}

package com.czxy.stream.transformation

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment}

object StreamKeyBy {
  def main(args: Array[String]): Unit = {
    //Obtain the stream processing execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //Obtain the data source
    val socketStream = env.socketTextStream("node01",9999)
    //Import the implicit conversions
    import org.apache.flink.api.scala._
    //Group with keyBy
    val groupStream: KeyedStream[(String, Int), Tuple] = socketStream.flatMap(x=>x.split(" ")).map((_,1)).keyBy(0)
    //Aggregate
    val result: DataStream[(String, Int)] = groupStream.sum(1)
    //Print the result
    result.print()
    //Execute the program
    env.execute("StreamKeyBy")
  }
}

Connect

Connect combines two DataStreams into one ConnectedStreams.
The connected stream keeps the original structure of each DataStream, so data of different types can be assembled into one structure.
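
The full example below connects two streams of the same element type (Long). As an extra illustration of the "different types in one structure" point, here is a minimal sketch (the object name and sample data are made up for this sketch) that connects a String stream with an Int stream:

import org.apache.flink.streaming.api.scala._

object ConnectTwoTypesSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Two streams with different element types
    val words: DataStream[String] = env.fromElements("spark", "flink")
    val numbers: DataStream[Int] = env.fromElements(1, 2, 3)
    // connect keeps both types inside one ConnectedStreams[String, Int]
    val connected: ConnectedStreams[String, Int] = words.connect(numbers)
    // CoMap: one function per input, both mapped to a common output type
    val unified: DataStream[String] = connected.map(
      word => "word: " + word,
      num => "number: " + num
    )
    unified.print()
    env.execute("ConnectTwoTypesSketch")
  }
}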

package com.czxy.flink.stream.transformation

import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{ConnectedStreams, DataStream, StreamExecutionEnvironment}

object StreamConnect {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val source1: DataStream[Long] = env.addSource(new NoParallelSource()).setParallelism(1)
    val source2: DataStream[Long] = env.addSource(new NoParallelSource()).setParallelism(1)
    val connectedStreams: ConnectedStreams[Long, Long] = source1.connect(source2)
    val result: DataStream[String] = connectedStreams.map(item1 => {
      "item1: " + item1
    },
      item2 => {
        "item2: " + item2
      })
    result.print()
    env.execute("StreamConnect")
  }

  //A single-threaded source that emits numbers increasing from 1
  class NoParallelSource extends SourceFunction[Long]() {
    var number: Long = 1L
    var isRunning: Boolean = true
    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }
      }
    }
    override def cancel(): Unit = {
      isRunning = false
    }
  }

}

Split and select

Split splits one DataStream into two or more DataStreams. Select retrieves the corresponding split streams afterwards.
Requirement: given the data 1, 2, 3, 4, 5, 6, 7,
use split and select to separate the odd and even numbers, and print the odd numbers.

package com.czxy.flink.stream.transformation

import org.apache.flink.streaming.api.scala.{DataStream, SplitStream, StreamExecutionEnvironment}

/**
 * Requirement:
 * Given the data 1, 2, 3, 4, 5, 6, 7,
 * use split and select to separate the odd and even numbers and print the odd numbers.
 */
object StreamSplit {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val source: DataStream[Int] = env.fromElements(1, 2, 3, 4, 5, 6, 7)
    val splitStream: SplitStream[Int] = source.split(x => {
      (x % 2) match {
        case 0 => List("偶数")
        case 1 => List("奇数")
      }
    })
    val result: DataStream[Int] = splitStream.select("奇数")
    result.print()
    env.execute("StreamSplit")
  }
}

Data Sinks (data output)

Sink data to a local file (same approach as batch processing)

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.streaming.api.datastream.DataStreamSource

object BatchSinkFile {
  def main(args: Array[String]): Unit = {
    val environment = StreamExecutionEnvironment.getExecutionEnvironment

    val textDataStreamSource: DataStreamSource[(Int, String, Double)] = environment.fromElements(
      (19, "zhangsan", 178.8),
      (17, "lisi", 168.8),
      (18, "wangwu", 184.8),
      (21, "zhaoliu", 164.8))
    textDataStreamSource.setParallelism(1).writeAsText("day02/data/output/BatchSinkCollection",WriteMode.OVERWRITE).setParallelism(1)
    environment.execute("BatchSinkFile")
  }
}

Sink to HDFS (same approach as batch processing)


import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.streaming.api.datastream.DataStreamSource

object BatchSinkToHdfsFile {
  def main(args: Array[String]): Unit = {
    val environment = StreamExecutionEnvironment.getExecutionEnvironment

    val textDataStreamSource: DataStreamSource[(Int, String, Double)] = environment.fromElements(
      (19, "zhangsan", 178.8),
      (17, "lisi", 168.8),
      (18, "wangwu", 184.8),
      (21, "zhaoliu", 164.8))
    textDataStreamSource.setParallelism(1).writeAsText("hdfs://node01:8020/filnk/out/bachsinkCollection",WriteMode.OVERWRITE).setParallelism(1)
    environment.execute("BatchSinkFile")
  }
}

Sink to Kafka

package com.czxy.flink.stream.sink

import java.util.Properties

import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011
import org.apache.flink.streaming.util.serialization.SimpleStringSchema

object StreamKafkaSink {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the dataset
    import org.apache.flink.api.scala._
    val source: DataStream[String] = env.fromElements("1,小丽,北京,女")
    //3. Set the Kafka configuration
    val topic="test"
    val properties: Properties = new Properties()
    properties.setProperty("bootstrap.servers","node01:9092")

    val flinkKafkaProducer: FlinkKafkaProducer011[String] = new FlinkKafkaProducer011[String](topic,new SimpleStringSchema(),properties)

    val result: DataStreamSink[String] = source.addSink(flinkKafkaProducer)

    env.execute("StreamKafkaSink")


  }
}

Sink to MySQL

package com.czxy.stream.sink

import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamMysqlSink {

  case class Student(id: Int, name: String, addr: String, sex: String)

  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val studentDStream: DataStream[Student] = env.fromElements(
      //Student(4, "小明", "上海", "男")
      Student(5, "小青", "广州", "女"),
      Student(6, "小月", "深圳", "女")
    )
    studentDStream.addSink(new StudentSinkToMysql)
    env.execute("StreamMysqlSink")
  }

  class StudentSinkToMysql extends RichSinkFunction[Student] {
    private var connection: Connection = null
    private var ps: PreparedStatement = null

    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test?characterEncoding=utf-8&useSSL=false"
      val username = "root"
      val password = "root"
      //1: load the driver
      Class.forName(driver)
      //2: create the connection
      connection = DriverManager.getConnection(url, username, password)
      val sql = "insert into student(id , name , addr , sex) values(?,?,?,?);"
      //3: prepare the statement
      ps = connection.prepareStatement(sql)
    }

    //Close the connection
    override def close(): Unit = {
      if (connection != null) {
        connection.close()
      }
      if (ps != null) {
        ps.close()
      }
    }

    //invoke is triggered once per element; the insert happens here
    override def invoke(stu: Student): Unit = {
      try {
        //4: bind the values and execute the insert
        ps.setInt(1, stu.id)
        ps.setString(2, stu.name)
        ps.setString(3, stu.addr)
        ps.setString(4, stu.sex)
        ps.executeUpdate()
      } catch {
        case e: Exception => println(e.getMessage)
      }
    }
  }
}

A second version of the same MySQL sink:

package com.czxy.flink.stream.sink

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamMysqlSink {

  case class Student(stuId: Int, stuName: String, stuAddr: String, stuSex: String)

  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val source: DataStream[Student] = env.fromElements(
      Student(9, "wangman", "beijing", "nv")
    )
    val result: DataStreamSink[Student] = source.addSink(new MysqlSink())
    env.execute("StreamMysqlSink")

  }

  class MysqlSink extends RichSinkFunction[Student]() {
    var connection: Connection = null
    var ps: PreparedStatement = null

    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test?characterEncoding=utf-8&useSSL=false"
      val username = "root"
      val password = "root"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val sql =
        """
          |insert into student(id , name , addr , sex)values(?,?,?,?);
          |""".stripMargin
      ps = connection.prepareStatement(sql)
    }

    //Executed once per element
    override def invoke(value: Student): Unit = {
      try{
        ps.setInt(1, value.stuId)
        ps.setString(2, value.stuName)
        ps.setString(3, value.stuAddr)
        ps.setString(4, value.stuSex)
        ps.executeUpdate()
      }catch{
        case e:Exception=>println(e.getMessage)
      }

    }

  }

}


Reposted from blog.csdn.net/weixin_43563705/article/details/107613183