Computing PV (page views) and UV (unique visitors) with Flink, using a Bloom filter for UV deduplication

PV (code):

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

/** Batch-style demo job: counts "pv" events from UserBehavior.csv in one-hour event-time windows. */
object PageView {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Locate the data source on the classpath instead of using an absolute path.
    val csvPath = getClass.getResource("/UserBehavior.csv").getPath

    // Parse every CSV line into a UserBehavior record; the timestamp column is
    // multiplied by 1000 before being used as the event time.
    val behaviors = env.readTextFile(csvPath)
      .map { line =>
        val fields = line.split(",")
        UserBehavior(
          fields(0).trim.toLong,
          fields(1).trim.toLong,
          fields(2).trim.toInt,
          fields(3).trim,
          fields(4).trim.toLong
        )
      }
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Keep only "pv" events, map each to ("pv", 1) and sum the ones per hourly window.
    val pvCounts = behaviors
      .filter(_.behavior == "pv")
      .map(_ => ("pv", 1))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .sum(1)

    pvCounts.print("pv count")

    env.execute("page view job")
  }
}

UV (code):

1: Deduplicate with a Set. A Set never holds duplicates, so every userId of the window is put into one. The whole Set lives in memory, so with a large amount of data the memory requirement becomes too high to be practical.

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**
  * Result record of a unique-visitor count for one window.
  *
  * @param windowEnd end timestamp of the window the count belongs to
  * @param count     number of unique visitors counted for that window
  */
case class UVCount(
  windowEnd: Long,
  count: Long
)

/** Demo job: counts distinct users with "pv" events per one-hour event-time window. */
object UniqueVisitor {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Locate the data source on the classpath instead of using an absolute path.
    val resource = getClass.getResource("/UserBehavior.csv")
    val uvStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      // Non-keyed window: all events of the hour are handed to one window function.
      .timeWindowAll(Time.hours(1))
      .apply(new UVcountByWindow())

    // The output prefix and job name previously said "pv count" / "page view job"
    // (copied from the PV job), which mislabelled the UV results.
    uvStream.print("uv count")

    env.execute("unique visitor job")
  }
}

/**
  * Window function that emits one [[UVCount]] per window, holding the number of
  * distinct `userId` values among the window's elements.
  *
  * All ids of the window are collected into an in-memory Set before counting.
  */
class UVcountByWindow() extends AllWindowFunction[UserBehavior, UVCount, TimeWindow] {

  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UVCount]): Unit = {
    // Collapsing the ids into a Set removes duplicates; its size is the UV count.
    val distinctUsers = input.iterator.map(_.userId).toSet
    out.collect(UVCount(window.getEnd, distinctUsers.size))
  }
}

2: UV with a Bloom filter (the bitmap and the running count are stored in Redis)

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{
    
    Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

/**
  * Demo job: counts unique visitors per one-hour event-time window, deduplicating
  * user ids through a bitmap kept in Redis (see `UVCoungWithBoolen`).
  */
object UniqeVistrBloom {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Locate the data source on the classpath instead of using an absolute path.
    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      // Use one constant key so every event lands in the same keyed window.
      .map(x => ("uv", x.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      // Custom window trigger.
      .trigger(new myTrigger())
      .process(new UVCoungWithBoolen())

    dataStream.print()

    // The pipeline above is only a declaration; without execute() the job never runs.
    env.execute("uv with bloom job")
  }
}

/**
  * Trigger that fires the window for every single incoming element and purges the
  * window contents right afterwards; timer callbacks never fire anything.
  */
class myTrigger() extends Trigger[(String, Long), TimeWindow] {

  // Each arriving element immediately triggers the window computation and
  // clears the window state.
  override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
    TriggerResult.FIRE_AND_PURGE
  }

  // Processing-time timers are ignored.
  override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
    TriggerResult.CONTINUE
  }

  // Event-time timers are ignored.
  override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
    TriggerResult.CONTINUE
  }

  // Nothing to clean up.
  override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = ()
}

/**
  * Hash helper for a Bloom-filter style bitmap.
  *
  * The class does not hold the bitmap itself; it only maps a string to a bit
  * offset in the range `[0, cap)`.
  *
  * @param size total number of bits in the bitmap; a non-positive value selects
  *             the default of 2^29 bits (64 MB)
  */
class Bloom(size: Long) extends Serializable {

  // Total size of the bitmap in bits. Default: 2^29 bits = 64 MB.
  private val cap: Long = if (size > 0) size else 1L << 29

  // A bit mask is only equivalent to "modulo cap" when cap is a power of two.
  private val capIsPowerOfTwo: Boolean = (cap & (cap - 1)) == 0L

  /**
    * Polynomial string hash reduced to a bit offset.
    *
    * @param value string to hash
    * @param seed  multiplier applied to the running hash for each character
    * @return an offset in `[0, cap)`; `0` for the empty string
    */
  def hash(value: String, seed: Int): Long = {
    // Long arithmetic may overflow and go negative; the reduction below still
    // yields a non-negative offset.
    val raw = value.foldLeft(0L)((acc, ch) => acc * seed + ch)
    if (capIsPowerOfTwo) raw & (cap - 1)
    else java.lang.Math.floorMod(raw, cap) // masking would make most offsets unreachable
  }
}

/**
  * Window function that keeps the per-window UV state in Redis instead of in memory.
  *
  * For each invocation it takes the last element's user id, hashes it to a bit
  * offset, and checks that bit in a Redis bitmap keyed by the window end timestamp.
  * The running count per window is stored in the Redis hash "count" under the same key.
  * An [[UVCount]] with the current count is emitted on every invocation.
  */
class UVCoungWithBoolen() extends ProcessWindowFunction[(String, Long), UVCount, String, TimeWindow] {

  // Redis connection, created on first use.
  lazy val jedis = new Jedis("node01", 6379)
  lazy val bloom = new Bloom(1 << 29)

  override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UVCount]): Unit = {
    // The window end timestamp identifies both the bitmap and the count entry.
    val storeKey = context.window.getEnd.toString

    // hget returns null when no count has been stored yet for this window.
    val count = Option(jedis.hget("count", storeKey)).map(_.toLong).getOrElse(0L)

    val userId = elements.last._2.toString
    val offset = bloom.hash(userId, 61)

    val alreadySeen: Boolean = jedis.getbit(storeKey, offset)

    if (alreadySeen) {
      // Known user: the count is unchanged.
      out.collect(UVCount(storeKey.toLong, count))
    } else {
      // New user: mark the bit, persist the incremented count and emit it.
      // Previously nothing was emitted here, so the new total only showed up
      // once a duplicate user arrived.
      jedis.setbit(storeKey, offset, true)
      jedis.hset("count", storeKey, (count + 1).toString)
      out.collect(UVCount(storeKey.toLong, count + 1))
    }
  }
}

Guess you like

Origin blog.csdn.net/weixin_44429965/article/details/108046527