一、需求分析

1.1、基本需求

从web服务器的日志中，统计实时的访问流量
统计最近1分钟内各URL的访问量，取出访问量最大的5个URL，每5秒更新一次

1.1.1、解决思路

将 apache 服务器日志中的时间，转换为时间戳，作为 Event Time
构建滑动窗口，窗口长度为1分钟，滑动距离为5秒

1.2、模块实现

我们现在要实现的模块是 “实时流量统计”。对于一个电商平台而言，用户登录的入口流量、不同页面的访问流量都是值得分析的重要数据，而这些数据，可以简单地从web服务器的日志中提取出来。我们在这里实现最基本的“页面浏览数”的统计，也就是读取服务器日志中的每一行log，统计在一段时间内用户访问url的次数。
具体步骤为：每隔5秒，输出最近1分钟内访问量最多的前N个URL。可以看出，这个需求与之前“实时热门商品统计”非常类似。

package com.chb.userbehavioranalysis.traffic


import java.sql.Timestamp
import java.text.SimpleDateFormat

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer


/**
 * Input record: one parsed line of the web-server access log.
 *
 * @param ip        client address field of the log line
 * @param userId    user identifier field of the log line
 * @param eventTime time of the request as epoch milliseconds
 * @param method    request method field
 * @param url       requested URL field
 */
case class ApacheLogEvent(
    ip: String,
    userId: String,
    eventTime: Long,
    method: String,
    url: String
)

/**
 * Output record: the number of hits a URL received within one window.
 *
 * @param url       the URL that was counted
 * @param windowEnd end timestamp of the window the count belongs to
 * @param count     number of hits counted for the URL in that window
 */
case class UrlViewCount(
    url: String,
    windowEnd: Long,
    count: Long
)

/**
 * Real-time traffic statistics.
 *
 * Reads `/apache.log` from the classpath, keeps only GET requests, counts the hits per URL
 * over a 1-minute sliding window that advances every 5 seconds, and prints the 5 most
 * requested URLs of each window.
 */
object NetworkTraffic {

    def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        val stream = env
            .readTextFile(getClass.getResource("/apache.log").getPath)
            .map(line => {
                val linearray = line.split(" ")
                // Parse the log time (field 3) into epoch milliseconds.
                // SimpleDateFormat is not thread-safe, so a fresh instance is used per record.
                val simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
                val timestamp = simpleDateFormat.parse(linearray(3)).getTime
                ApacheLogEvent(linearray(0), linearray(1), timestamp, linearray(5), linearray(6))
            })
            // Out-of-order handling: assign timestamps and watermarks with a 10-second bound.
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[ApacheLogEvent](Time.seconds(10)) {
                override def extractTimestamp(t: ApacheLogEvent): Long = {
                    t.eventTime
                }
            })
            .filter(_.method == "GET")
            .keyBy(_.url)
            .timeWindow(Time.minutes(1), Time.seconds(5))
            .aggregate(new CountAgg(), new WindowResultFunction())
            .keyBy(_.windowEnd)
            .process(new TopNHotUrls(5))
            .print()

        env.execute("Network Traffic Analysis Job")
    }

    /** Incremental aggregate that counts the events of a window: the accumulator is the running count. */
    class CountAgg extends AggregateFunction[ApacheLogEvent, Long, Long] {
        override def add(value: ApacheLogEvent, accumulator: Long): Long = accumulator + 1

        override def createAccumulator(): Long = 0L

        override def getResult(accumulator: Long): Long = accumulator

        override def merge(a: Long, b: Long): Long = a + b
    }

    /** Wraps the aggregated count of one URL (the key) together with the window end into a [[UrlViewCount]]. */
    class WindowResultFunction extends WindowFunction[Long, UrlViewCount, String, TimeWindow] {
        override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
            // The input holds the single result produced by CountAgg for this key and window.
            val count = input.iterator.next()
            out.collect(UrlViewCount(key, window.getEnd, count))
        }
    }

    /**
     * Collects all [[UrlViewCount]] records that share a window end and, once the timer for that
     * window fires, emits a formatted ranking of the `topSize` most visited URLs.
     *
     * @param topSize number of URLs to keep in the ranking
     */
    class TopNHotUrls(topSize: Int) extends KeyedProcessFunction[Long, UrlViewCount, String] {

        // Delay between the window end and the moment the ranking is emitted.
        private val timerDelayMs: Long = 10 * 1000L

        // State is declared directly and initialised lazily on first access.
        lazy val urlState: ListState[UrlViewCount] = getRuntimeContext.getListState(new ListStateDescriptor[UrlViewCount]("urlState", classOf[UrlViewCount]))

        override def processElement(i: UrlViewCount, context: KeyedProcessFunction[Long, UrlViewCount, String]#Context, collector: Collector[String]): Unit = {
            // Buffer every record in state until the timer fires.
            urlState.add(i)
            // Register an event-time timer that fires at windowEnd + 10 seconds.
            context.timerService().registerEventTimeTimer(i.windowEnd + timerDelayMs)
        }

        override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
            // Explicit converters instead of the deprecated implicit JavaConversions.
            import scala.collection.JavaConverters._

            // Take a snapshot of all buffered counts, then release the state.
            val allUrlViews = urlState.get().asScala.toList
            urlState.clear()

            // Sort by count, descending, and keep the first topSize entries.
            val sortedUrlViews = allUrlViews.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

            // Format the ranking as a String so it is easy to print.
            val result = new StringBuilder
            result.append("====================================\n")
            // The timer was registered timerDelayMs after the window end, so subtract it again.
            result.append("时间: ").append(new Timestamp(timestamp - timerDelayMs)).append("\n")

            sortedUrlViews.zipWithIndex.foreach { case (urlView, idx) =>
                // e.g.  No1:  URL=/blog/tags/firefox?flav=rss20  流量=55
                result.append("No").append(idx + 1).append(":")
                    .append("  URL=").append(urlView.url)
                    .append("  流量=").append(urlView.count).append("\n")
            }
            result.append("====================================\n\n")

            // No Thread.sleep here: sleeping inside onTimer would block the operator thread
            // and stall processing of all further records and timers.
            out.collect(result.toString())
        }
    }

}

二、PageView

package com.chb.userbehavioranalysis.traffic

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time


/**
 * Input record: one user-behavior event.
 *
 * @param userId     id of the acting user
 * @param itemId     id of the item acted upon
 * @param categoryId id of the item's category
 * @param behavior   behavior type, e.g. "pv"
 * @param timestamp  time of the event (presumably epoch seconds - TODO confirm against the data file)
 */
case class UserBehavior(
    userId: Long,
    itemId: Long,
    categoryId: Int,
    behavior: String,
    timestamp: Long
)

/**
 * Page-view (PV) statistics: counts the "pv" events of `/UserBehavior.csv` per 1-hour
 * tumbling event-time window and prints the result.
 */
object PageView {
    def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        // Locate the data source on the classpath.
        val resource = getClass.getResource("/UserBehavior.csv")
        val dataStream = env.readTextFile(resource.getPath)
            .map(data => {
                val dataArray = data.split(",")
                UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
            })
            // The timestamp column is scaled by 1000 to obtain the millisecond event time.
            .assignAscendingTimestamps(_.timestamp * 1000L)
            .filter(_.behavior == "pv") // only count pv events
            // Every event maps to the same key so a single keyed window sums all of them.
            .map(_ => ("pv", 1))
            .keyBy(_._1)
            .timeWindow(Time.hours(1))
            .sum(1)

        dataStream.print("pv count")

        // Job name fixed ("jpb" -> "job") to match the naming of the other jobs.
        env.execute("page view job")
    }
}

三、UniqueVisitor 独立访客记录数

package com.chb.userbehavioranalysis.traffic

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector


/**
 * Output record: number of unique visitors counted for one window.
 *
 * @param windowEnd end timestamp of the window
 * @param uvCount   number of distinct users seen in that window
 */
case class UvCount(
    windowEnd: Long,
    uvCount: Long
)

/**
 * Unique-visitor (UV) statistics: counts the distinct users that produced a "pv" event
 * in each 1-hour event-time window of `/UserBehavior.csv` and prints the result.
 */
object UniqueVisitor {
    def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        // Resolve the data source from the classpath.
        val csvPath = getClass.getResource("/UserBehavior.csv").getPath

        // Stage 1: parse every CSV line into a UserBehavior record.
        val behaviors = env.readTextFile(csvPath).map { line =>
            val fields = line.split(",")
            UserBehavior(
                fields(0).trim.toLong,
                fields(1).trim.toLong,
                fields(2).trim.toInt,
                fields(3).trim,
                fields(4).trim.toLong
            )
        }

        // Stage 2: assign event time, keep pv events, de-duplicate users per hourly window.
        val uvPerWindow = behaviors
            .assignAscendingTimestamps(_.timestamp * 1000L)
            .filter(_.behavior == "pv")
            .timeWindowAll(Time.hours(1))
            .apply(new UvCountByWindow())

        uvPerWindow.print()
        env.execute("uv job")
    }
}

/**
 * All-window function that emits one [[UvCount]] per window holding the number of
 * distinct user ids found among the window's elements.
 */
class UvCountByWindow() extends AllWindowFunction[UserBehavior, UvCount, TimeWindow] {
    override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UvCount]): Unit = {
        // Collecting the ids into a set removes duplicates; its size is the UV count.
        val distinctUserIds: Set[Long] = input.iterator.map(_.userId).toSet
        out.collect(UvCount(window.getEnd, distinctUserIds.size))
    }
}

四、海量数据实时去重方案

4.1、布隆过滤器实现UV统计

package com.chb.userbehavioranalysis.traffic

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

/**
 * Unique-visitor count that de-duplicates users with a bitmap instead of an in-memory set:
 * every "pv" event of `/UserBehavior.csv` is routed through [[MyTrigger]] and
 * [[UvCountWithBloom]] within a 1-hour event-time window.
 */
object UvWithBloom {
    def main(args: Array[String]): Unit = {
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        // Resolve the data source from the classpath.
        val csvPath = getClass.getResource("/UserBehavior.csv").getPath

        // Stage 1: parse every CSV line into a UserBehavior record.
        val behaviors = env.readTextFile(csvPath).map { line =>
            val fields = line.split(",")
            UserBehavior(
                fields(0).trim.toLong,
                fields(1).trim.toLong,
                fields(2).trim.toInt,
                fields(3).trim,
                fields(4).trim.toLong
            )
        }

        // Stage 2: keep pv events and put all user ids under one constant key.
        val keyedUserIds = behaviors
            .assignAscendingTimestamps(_.timestamp * 1000L)
            .filter(_.behavior == "pv")
            .map(behavior => ("dummyKey", behavior.userId))
            .keyBy(_._1)

        // Stage 3: hourly window, evaluated by the custom trigger and process function.
        val uvCounts = keyedUserIds
            .timeWindow(Time.hours(1))
            .trigger(new MyTrigger())
            .process(new UvCountWithBloom())

        uvCounts.print()

        env.execute("uv with bloom job")
    }
}

// Custom window trigger: the window is evaluated on every incoming element, never on timers.
class MyTrigger() extends Trigger[(String, Long), TimeWindow] {
    // Each arriving element fires the window function and then purges the window contents.
    override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
        TriggerResult.FIRE_AND_PURGE

    // Event-time timers do not cause any evaluation.
    override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
        TriggerResult.CONTINUE

    // Processing-time timers do not cause any evaluation.
    override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
        TriggerResult.CONTINUE

    // The trigger keeps no state of its own, so there is nothing to clean up.
    override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = ()
}

/**
 * Hash helper for a Bloom-filter style bitmap.
 *
 * @param size number of bits in the bitmap; any value <= 0 selects the default of
 *             2^27 bits (16 MiB)
 */
class Bloom(size: Long) extends Serializable {
    // Total number of addressable bits.
    private val cap: Long = if (size > 0) size else 1L << 27

    /**
     * Maps a string to a bit offset inside the bitmap.
     *
     * @param value string to hash
     * @param seed  multiplier of the polynomial hash
     * @return an offset in the range [0, cap)
     */
    def hash(value: String, seed: Int): Long = {
        // Polynomial rolling hash; Long overflow on long inputs is intentional wrap-around.
        val raw = value.foldLeft(0L)((acc, ch) => acc * seed + ch)
        // floorMod equals the former `raw & (cap - 1)` whenever cap is a power of two, and
        // still yields a correct, non-negative offset when it is not.
        java.lang.Math.floorMod(raw, cap)
    }
}

/**
 * Window function that keeps the unique-visitor count of each window in Redis.
 *
 * For every window a bitmap is stored under the key `windowEnd`, and the running count is
 * stored in the hash named "count" under the field `windowEnd`. Each invocation looks at the
 * last element it is given, tests its bit in the bitmap and emits the (possibly incremented)
 * count.
 */
class UvCountWithBloom() extends ProcessWindowFunction[(String, Long), UvCount, String, TimeWindow] {
    // Redis connection, created lazily on first use.
    lazy val jedis = new Jedis("localhost", 6379)
    lazy val bloom = new Bloom(1 << 29)

    override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UvCount]): Unit = {
        // Bitmap layout: the Redis key is the window end, the value is the bitmap.
        val windowEnd = context.window.getEnd
        val storeKey = windowEnd.toString

        // Read the current count with a single round trip; a missing field means 0.
        val count = Option(jedis.hget("count", storeKey)).map(_.toLong).getOrElse(0L)

        // Hash the user id of the last element to a bit offset.
        val userId = elements.last._2.toString
        val offset = bloom.hash(userId, 61)

        // Flag telling whether the bit for this user is already set in the Redis bitmap.
        val isExist = jedis.getbit(storeKey, offset)
        if (!isExist) {
            // First sighting: set the bit and persist the incremented count.
            jedis.setbit(storeKey, offset, true)
            jedis.hset("count", storeKey, (count + 1).toString)
            out.collect(UvCount(windowEnd, count + 1))
        } else {
            out.collect(UvCount(windowEnd, count))
        }
    }

    // Release the Redis connection when the operator shuts down.
    override def close(): Unit = jedis.close()
}

实时流量统计