Log analysis project: computing per-city click counts online

Events are consumed from Kafka, processed by Spark Streaming, and written to HBase.

Problem: the code uses table.incrementColumnValue, a method that atomically increments the value of a specified column.

What if I want to solve the same problem with put instead?

1. Use updateStateByKey and put the results into HBase; for the same column, the newest version holds the latest total.

   Limitation: every batch it emits the entire history of keys. If only 2 out of 5,000 cities changed, all 5,000 rows are rewritten, which wastes resources.

2. Use mapWithState, which emits only the keys updated in the current batch; this solves the problem (see the sketch below).
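A minimal sketch of approach 2, under a few assumptions: cityDateCounts is a DStream[(String, Long)] of (city_date, 1L) pairs built from idStream the same way the rowkey is built in the code below; writeWithMapWithState is an illustrative helper name, not part of the project; HBaseUtils.getHBaseTabel is the same project utility used below; and mapWithState additionally requires ssc.checkpoint(...) to be set, because it keeps its state in a checkpointed store.

import java.util.Properties

import com.atguigu.utils.HBaseUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.streaming.{State, StateSpec}
import org.apache.spark.streaming.dstream.DStream

// prerequisite: ssc.checkpoint("<checkpoint dir>") must be set for mapWithState
// illustrative helper; cityDateCounts carries (city_date, 1L) pairs
def writeWithMapWithState(cityDateCounts: DStream[(String, Long)]): Unit = {

  // keep a running total per city_date key; the state survives across batches
  val stateSpec = StateSpec.function(
    (key: String, one: Option[Long], state: State[Long]) => {
      val newCount = state.getOption().getOrElse(0L) + one.getOrElse(0L)
      state.update(newCount)
      (key, newCount) // emitted only for keys seen in the current batch
    }
  )

  // unlike updateStateByKey, only the keys updated in this batch flow out,
  // so only the changed cities are written back to HBase
  cityDateCounts.mapWithState(stateSpec).foreachRDD { rdd =>
    rdd.foreachPartition { iter =>
      val table = HBaseUtils.getHBaseTabel(new Properties())
      iter.foreach { case (rowkey, count) =>
        val put = new Put(Bytes.toBytes(rowkey))
        // overwrite the cell with the running total; with multiple versions
        // enabled, the newest version is always the current count
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("click_count"), Bytes.toBytes(count))
        table.put(put)
      }
      table.close()
    }
  }
}

Each put simply overwrites the click_count cell with the running total, so the newest version is always the latest count, which is the same idea as approach 1 but applied only to the keys that actually changed.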

package com.atguigu.online

import java.io.File
import java.text.SimpleDateFormat
import java.util.Properties

import com.atguigu.model.StartupReportLogs
import com.atguigu.utils.{HBaseUtils, JsonUtils, ZookeeperUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.client.Table
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaCluster, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * @author wade 
  * @create 2019-03-22 15:21 
  */
object KafkaToSparkToHbase {



  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf().setAppName("wawa").setMaster("local[*]")

    val ssc = new StreamingContext(conf,Seconds(3))
 
    val params = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop103:9092,hadoop104:9092,hadoop105:9092",
      "zookeeper.connect" -> "hadoop103:2181,hadoop104:2181,hadoop105:2181",
      ConsumerConfig.GROUP_ID_CONFIG -> "bb"
    )

    val kafkaCluster = new KafkaCluster (params)

    val fromOffsets: Map[TopicAndPartition, Long] = ZookeeperUtils.getOffsetFromZookeeper(kafkaCluster,"bb",Set("log-analysis"))

    val idStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder,String](
      ssc,
      params,
      fromOffsets,
      (message:MessageAndMetadata[String,String]) => message.message()
    )

    idStream.foreachRDD(rdd => {


      val sdf = new SimpleDateFormat("yyyyMMdd")
      //the HBase Table is not serializable,
      //so create it inside foreachPartition and iterate within each partition
      rdd.foreachPartition(t =>{

        val table: Table = HBaseUtils.getHBaseTabel(new Properties())
        //write to HBase: rowkey = city_date, column = count
        t.foreach(s =>{
          val startupReportLog: StartupReportLogs = JsonUtils.json2StartupLog(s)
          val date = sdf.format(startupReportLog.getActiveTimeInMs)

          val rowkey = startupReportLog.getCity + "_" + date

          table.incrementColumnValue(
            Bytes.toBytes(rowkey),
            Bytes.toBytes("info"),
            Bytes.toBytes("click_count"),
            1L
          )
        })
        table.close()
      })


    })
    //after each batch is processed, write the consumed offsets back to ZooKeeper

    ZookeeperUtils.offsetToZookeeper(idStream,kafkaCluster,"bb")
    ssc.start()
    ssc.awaitTermination()
  }
}
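The pom.xml of the data-processing module: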
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>log-analysis</artifactId>
        <groupId>com.atguigu</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu</groupId>
    <artifactId>data-processing</artifactId>

    <dependencies>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.3.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.3.2.1</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8 -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.1.1</version>

        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.1</version>

        </dependency>


        <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>0.8.2.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.8.2.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>

        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>

        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>

        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>

        </dependency>

    </dependencies>




</project>

Reposted from blog.csdn.net/qq_42506914/article/details/88747405