udaf self define type
class HllcdistinctByte extends Aggregator[Row, HLLCounter, Array[Byte]] {
// A zero value for this aggregation. Should satisfy the property that any b + zero = b
def zero: HLLCounter = new HLLCounter(14)
// Combine two values to produce a new value. For performance, the function may modify `buffer`
// and return it instead of constructing a new object
def reduce(buffer: HLLCounter, employee: Row): HLLCounter = {
buffer.add(employee.getString(2))
buffer
}
// Merge two intermediate values
def merge(b1: HLLCounter, b2: HLLCounter): HLLCounter = {
b1.merge(b2)
b1
}
// Transform the output of the reduction
def finish(reduction: HLLCounter): Array[Byte] = {
val out1 = ByteBuffer.allocate(reduction.maxLength())
reduction.writeRegisters(out1)
out1.array()
}
// Specifies the Encoder for the intermediate value type
def bufferEncoder: Encoder[HLLCounter] = Encoders.javaSerialization
// Specifies the Encoder for the final output value type
def outputEncoder: Encoder[Array[Byte]] = Encoders.BINARY
}
val uvbytes = new HllcdistinctByte().toColumn
val uvb = wordsDataFrame.where("event_id = '2001'").groupByKey(_.getString(0)).agg(uvbytes)
uvb.show(5)
猜你喜欢
转载自lingzhi007.iteye.com/blog/2386783
今日推荐
周排行