Flink statistics for PV and UV, with a Bloom filter
PV (code):
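All three jobs below parse rows of UserBehavior.csv into a UserBehavior case class, whose definition is not shown in this section. A minimal sketch consistent with how the fields are used (the names itemId and categoryId are my assumption, inferred from the column types) would be:

// Assumed input record type; itemId/categoryId names are inferred, not from the original.
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)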
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object PageView {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)  // file timestamps are in seconds
      .filter(_.behavior == "pv")                      // keep only page-view events
      .map(data => ("pv", 1))                          // every event contributes 1 to the count
      .keyBy(_._1)
      .timeWindow(Time.hours(1))                       // 1-hour tumbling event-time windows
      .sum(1)                                          // PV = number of events per window

    dataStream.print("pv count")
    env.execute("page view job")
  }
}
UV (code):
1: Deduplicate with a Set. Every user ID in the window is stored in an in-memory Set, and the UV count is simply the Set's size. Because all IDs live in memory, this breaks down when the data volume is large; hundreds of millions of distinct IDs per window already cost gigabytes of state, so it is barely feasible at scale (a rough sizing sketch follows after the code).
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

case class UVCount(windowEnd: Long, count: Long)

object UniqueVisitor {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .timeWindowAll(Time.hours(1))          // non-keyed 1-hour tumbling window over the whole stream
      .apply(new UVCountByWindow())

    dataStream.print("uv count")
    env.execute("unique visitor job")
  }
}

// Collects all user ids of one window into a Set and emits the Set size as the UV count.
class UVCountByWindow() extends AllWindowFunction[UserBehavior, UVCount, TimeWindow] {
  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UVCount]): Unit = {
    var idSet = Set[Long]()
    for (x <- input) {
      idSet += x.userId                      // the Set deduplicates user ids
    }
    out.collect(UVCount(window.getEnd, idSet.size))
  }
}
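To make the memory concern above concrete, here is a rough back-of-envelope comparison (my own estimate, not part of the original): a Scala Set[Long] boxes every ID and carries hash-structure overhead, while the Bloom bitmap used in approach 2 has a fixed size no matter how many users appear.

object UvMemoryEstimate extends App {
  val distinctUsers = 100000000L            // assume 1e8 distinct user ids in one window
  val setBytes      = distinctUsers * 32L   // ~32 bytes per boxed Long entry (rough guess)
  val bloomBytes    = (1L << 29) / 8        // the 2^29-bit bitmap used below is a fixed 64 MB
  println(s"Set:   ~${setBytes / (1 << 20)} MB")    // ~3000 MB
  println(s"Bloom: ~${bloomBytes / (1 << 20)} MB")  // 64 MB
}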
2: UV with a Bloom filter. Instead of keeping every user ID in memory, hash each ID to a bit offset in a Redis bitmap (one bitmap per window, keyed by the window end time) and keep the running UV count in a Redis hash. A custom trigger fires and purges the window on every element, so Flink itself holds almost no window state.
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

object UniqueVisitorBloom {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv")
      .map(x => ("uv", x.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .trigger(new MyTrigger())              // fire and purge per element, so the window buffers nothing
      .process(new UVCountWithBloom())

    dataStream.print()
    env.execute("uv with bloom job")
  }
}

// Fires and purges the window for every incoming element; time-based callbacks do nothing.
class MyTrigger() extends Trigger[(String, Long), TimeWindow] {
  override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE
  override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE
  override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
  override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
    TriggerResult.FIRE_AND_PURGE
}

// Bloom filter helper: maps a string to a bit offset within a bitmap of `cap` bits (cap must be a power of two).
class Bloom(size: Long) extends Serializable {
  private val cap = if (size > 0) size else 1 << 29

  def hash(value: String, seed: Int): Long = {
    var result = 0L
    for (i <- 0 until value.length) {
      result = result * seed + value.charAt(i)
    }
    result & (cap - 1)                       // keep only the low bits, i.e. result mod cap
  }
}

// Because of the trigger, process() is called with exactly one element at a time.
// The dedup bitmap and the running count live in Redis, keyed by the window end time.
class UVCountWithBloom() extends ProcessWindowFunction[(String, Long), UVCount, String, TimeWindow] {
  lazy val jedis = new Jedis("node01", 6379)
  lazy val bloom = new Bloom(1 << 29)        // 2^29-bit bitmap per window

  override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UVCount]): Unit = {
    val storeKey = context.window.getEnd.toString
    var count = 0L
    if (jedis.hget("count", storeKey) != null) {
      count = jedis.hget("count", storeKey).toLong
    }

    val userId = elements.last._2.toString
    val offset = bloom.hash(userId, 61)
    val isExist = jedis.getbit(storeKey, offset)
    if (!isExist) {
      // unseen user id: set its bit and bump the UV count
      jedis.setbit(storeKey, offset, true)
      jedis.hset("count", storeKey, (count + 1).toString)
      out.collect(UVCount(storeKey.toLong, count + 1))
    } else {
      out.collect(UVCount(storeKey.toLong, count))
    }
  }
}
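One caveat worth adding (my own note, not part of the original): a Bloom filter can report false positives, which here means some genuinely new users are treated as already seen, so the UV count slightly undercounts. With an m-bit bitmap and k hash functions over n distinct users, the false-positive probability is roughly (1 - e^(-k*n/m))^k; the code above uses m = 2^29 bits and a single hash, which gives a quick estimate like this:

object BloomFpEstimate extends App {
  // Rough false-positive estimate for the bitmap used above (a sketch, not from the original).
  def fpRate(n: Long, mBits: Long = 1L << 29, k: Int = 1): Double =
    math.pow(1.0 - math.exp(-k.toDouble * n / mBits), k)

  // With ~50 million distinct users per window, roughly 9% of new users would be
  // mistaken for already-seen ones and left out of the count.
  println(f"fp rate at 5e7 users: ${fpRate(50000000L)}%.3f")
}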