Counting PV and UV with Flink, plus a Bloom filter

PV (code):

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

// The event type parsed from each CSV line; not shown in the original post, shared by all three jobs below
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)

object PageView {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Define the data source via a classpath-relative path
    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .map(data => ("pv", 1))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .sum(1)
    dataStream.print("pv count")

    env.execute("page view job")
  }
}
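For reference, the map above expects five comma-separated fields per line. A minimal sketch of the parse against one hypothetical CSV line (the values are made up for illustration):

// userId,itemId,categoryId,behavior,timestamp (seconds since epoch)
val line = "543462,1715,1464116,pv,1511658000"
val f = line.split(",")
val event = UserBehavior(f(0).trim.toLong, f(1).trim.toLong, f(2).trim.toInt, f(3).trim, f(4).trim.toLong)
// event.timestamp is in seconds, which is why the job multiplies by 1000L
// when assigning millisecond event-time timestamps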

UV (code):

1: A Set holds no duplicates, so store each userId in a Set. The data sits entirely in memory, though, so with a large data volume the memory demand becomes prohibitive and this approach is nearly infeasible at scale (a rough cost estimate follows the code below).

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

case class UVCount(windowEnd: Long, count: Long)

object UniqueVisitor {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Define the data source via a classpath-relative path
    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .timeWindowAll(Time.hours(1))
      .apply(new UVCountByWindow())

    dataStream.print("uv count")

    env.execute("unique visitor job")
  }
}

class UVCountByWindow() extends AllWindowFunction[UserBehavior, UVCount, TimeWindow] {

  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UVCount]): Unit = {
    // Collect every userId into a Set; duplicates are dropped automatically
    var idSet = Set[Long]()
    for (x <- input) {
      idSet += x.userId
    }
    out.collect(UVCount(window.getEnd, idSet.size))
  }
}
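A rough back-of-the-envelope estimate of the memory concern mentioned above (my own numbers, not from the original):

// 10^8 distinct visitors in one window, one 8-byte Long each:
val rawIdBytes = 100000000L * 8L   // = 800 MB for the raw IDs alone
// A Scala Set adds boxing and hash-table overhead on top (typically several
// times the payload), so one window can easily demand multiple GB of heap.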

2: UVWithBloom — deduplicate with a Bloom filter whose bitmap is kept in Redis, so the per-window state no longer has to fit in JVM memory.

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

object UniqueVisitorBloom {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Define the data source via a classpath-relative path
    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .map(x => ("uv", x.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      // Custom window trigger: fire and purge on every element
      .trigger(new MyTrigger())
      .process(new UVCountWithBloom())

    dataStream.print()

    env.execute("uv with bloom job")
  }
}

class MyTrigger() extends Trigger[(String, Long), TimeWindow] {

  override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE

  override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE

  override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}

  override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
    // Fire the window computation on every incoming element and purge all window
    // state, so the window never buffers elements in Flink's own state
    TriggerResult.FIRE_AND_PURGE
}

// Define a Bloom filter backed by an external bitmap; with a single hash
// function, as used here, it is effectively a hashed bitmap
class Bloom(size: Long) extends Serializable {

  // Total bitmap size in bits; defaults to 2^29 bits = 512 Mbit = 64 MB
  // (must be a power of two so the mask below yields a valid offset)
  private val cap = if (size > 0) size else 1 << 29

  // Simple polynomial-rolling hash, masked down to a bit offset within the bitmap
  def hash(value: String, seed: Int): Long = {
    var result = 0L
    for (i <- 0 until value.length) {
      result = result * seed + value.charAt(i)
    }
    result & (cap - 1)
  }
}
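A quick sanity-check sketch of the hash above (illustrative values, not from the original):

val bloom = new Bloom(1 << 29)
// The same input and seed always yield the same offset, somewhere in [0, 2^29)
val offset = bloom.hash("543462", 61)
// A set bit at this offset means "probably seen before"; distinct IDs can
// collide at the same offset, which is the Bloom filter's false-positive case.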

class UVCountWithBloom() extends ProcessWindowFunction[(String, Long), UVCount, String, TimeWindow] {

  // Redis connection and Bloom filter, created lazily on the task manager
  lazy val jedis = new Jedis("node01", 6379)
  lazy val bloom = new Bloom(1 << 29)

  override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UVCount]): Unit = {
    // Use the window end timestamp as the Redis key for this window's bitmap
    val storeKey = context.window.getEnd.toString
    var count = 0L
    // The running UV count per window is kept in a Redis hash named "count"
    if (jedis.hget("count", storeKey) != null) {
      count = jedis.hget("count", storeKey).toLong
    }

    // The trigger fires and purges per element, so elements holds exactly one record
    val userId = elements.last._2.toString
    val offset = bloom.hash(userId, 61)

    // Check the corresponding bit in the Redis bitmap
    val isExist = jedis.getbit(storeKey, offset)

    if (!isExist) {
      // Probably a new visitor: set the bit, bump the count, emit the updated count
      jedis.setbit(storeKey, offset, true)
      jedis.hset("count", storeKey, (count + 1).toString)
      out.collect(UVCount(storeKey.toLong, count + 1))
    } else {
      // Bit already set: treat as a repeat visitor, emit the unchanged count
      out.collect(UVCount(storeKey.toLong, count))
    }
  }
}
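Because the results accumulate in Redis rather than only in the printed stream, the final per-window counts can also be read back directly. A minimal sketch using the same Jedis client (the "count" hash name and the node01 host are the ones assumed in the code above):

import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

object ReadUVCounts {
  def main(args: Array[String]): Unit = {
    val jedis = new Jedis("node01", 6379)
    // hgetAll returns a java.util.Map[String, String] of windowEnd -> uv count
    for ((windowEnd, count) <- jedis.hgetAll("count").asScala) {
      println(s"window ending at $windowEnd: $count unique visitors")
    }
    jedis.close()
  }
}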

Reprinted from blog.csdn.net/weixin_44429965/article/details/108046527