Spark的一个小Demo

一个简单的小 Demo:用 Spark 读取原始行为日志,并把每条行为记录拆分成结构化字段。

package com.liuyang.test.scala

import org.apache.spark.sql.SparkSession

/**
  * Splits raw behaviour-log lines into structured [[Test.Accessing]] rows and prints them.
  *
  * A log entry starts with a header line (beginning with a date such as `2019-XX-XX`) that
  * carries `&clienttype`, `&macline` and `&version` parameters, optionally followed by a
  * `&data=<package>,{1=...}` record on the same line. Subsequent lines that do not start
  * with a date are further `<package>,{1=...}` records belonging to the same header.
  *
  * @author liuyang
  * @date 2019-02-20 17:40
  */
object Test {

  /** One parsed behaviour record; every column holds the raw string taken from the log. */
  case class Accessing(client_type: String, mac_line: String, package_name: String
                       , start_time: String, end_time: String, program_name: String
                       , duration: String, origin: String, version: String, total_duration: String, dt: String)

  /**
    * Shared buffer holding client_type, mac_line, version and dt, in that order.
    *
    * Only [[getArray]] writes to it. It is kept for callers that still read it; `main`
    * no longer relies on it because shared mutable state is not safe across threads.
    */
  val record: Array[String] = new Array[String](4)

  // Package-name fragments identifying record kinds that are not part of the output.
  private val SkippedPackageMarkers: Seq[String] = Seq(
    "InputSourceBehavior", "systemRamCpuBehavior", "keyDownBehavior",
    "appRamCpuBehavior", "sdcardUsbBehavior", "multiscreeninteractiontvbehavior")

  // Fragments that exclude a record when they occur anywhere in the line.
  private val SkippedLineMarkers: Seq[String] = Seq(
    "deviceProp", "tvratings", "keydownBehavior", "mediaPlayBehavior")

  /**
    * Reads the raw log, parses every behaviour record and shows up to 10000 rows.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local[4]").appName("Test").getOrCreate()
    // Read the raw log file.
    val data = session.sparkContext.textFile("E:\\bigdataLog\\rawData\\raw_data.txt")
    import session.implicits._

    // Records inherit fields from the most recent header line, so lines must be processed
    // strictly in file order. Coalescing to one partition first makes that explicit, and
    // the header state lives in a partition-local variable instead of a global array.
    val all = data.coalesce(1).mapPartitions { lines =>
      // Records seen before any header line get empty header fields.
      var header: Array[String] = Array.fill(4)("")
      lines.flatMap { line =>
        val parsed: Option[Accessing] =
          if (line.startsWith("2")) {
            // Header lines start with a date of the form 2019-XX-XX.
            header = parseHeader(line)
            // A header only carries a record when "&data=" has a non-empty value.
            if (line.contains("&data=com")) parseAccessing(line, header) else None
          } else {
            // Continuation line: a bare record belonging to the current header.
            parseAccessing(line, header)
          }
        parsed.iterator
      }
    }

    all.toDF().show(10000)
  }

  /**
    * Extracts client_type, mac_line, version and dt from a header line into [[record]].
    *
    * Missing or empty parameters yield an empty string instead of throwing.
    * Not thread-safe: the result is written to the shared [[record]] buffer.
    *
    * @param line a header line of the raw log
    * @return the shared [[record]] array, updated in place
    */
  def getArray(line: String): Array[String] = {
    val parsed = parseHeader(line)
    Array.copy(parsed, 0, record, 0, record.length)
    record
  }

  /**
    * Parses one behaviour record.
    *
    * @param line   a log line containing `<package>,{1=...}` (optionally after `&data=`)
    * @param record header fields: client_type, mac_line, version, dt
    * @return the parsed row, or `null` when the line is excluded or cannot be parsed
    */
  def resolve(line: String, record: Array[String]): Accessing =
    parseAccessing(line, record).orNull

  /** Builds a fresh header array (client_type, mac_line, version, dt) without touching shared state. */
  private def parseHeader(line: String): Array[String] =
    Array(
      queryValue(line, "&clienttype"),
      queryValue(line, "&macline"),
      queryValue(line, "&version"),
      // dt is the date part of the header timestamp, i.e. everything before the first space.
      line.takeWhile(_ != ' ')
    )

  /** Returns the value of the first parameter starting with `marker`, or "" when absent or empty. */
  private def queryValue(line: String, marker: String): String = {
    val start = line.indexOf(marker)
    if (start < 0) ""
    else {
      val end = line.indexOf('&', start + 1)
      val pair = if (end < 0) line.substring(start + 1) else line.substring(start + 1, end)
      val eq = pair.indexOf('=')
      if (eq < 0) "" else pair.substring(eq + 1)
    }
  }

  /** Option-returning core of [[resolve]]: `None` for excluded or malformed lines. */
  private def parseAccessing(line: String, record: Array[String]): Option[Accessing] =
    packageName(line).filterNot(pkg => isSkipped(line, pkg)).flatMap { pkg =>
      val fields = numberedFields(line)
      if (fields.length >= 5) {
        // e.g. com.tcl.thirdAppPlayBehavior,{1=qiyi,2=214260401,3=<title>,4=2703,5=-1000}
        Some(Accessing(record(0), record(1), pkg, "", "", fields(2), fields(4), fields(0),
          record(2), fields(3), record(3)))
      } else if (fields.length >= 2) {
        // e.g. com.qiyi.video,{1=2019-01-31 05:45:45,2=2019-01-31 07:23:34,3=2}
        // or   com.tcl.TVBasicBehavior,{1=2019-01-31 09:17:19,2=2019-01-31 09:59:20}
        // NOTE(review): version and dt are left blank here, unlike the five-field case;
        // this mirrors the previous behaviour -- confirm whether it is intended.
        Some(Accessing(record(0), record(1), pkg, fields(0), fields(1), "", "", "", "", "", ""))
      } else {
        None
      }
    }

  /** Extracts the package name: the `&data=` value if present, else the text before the first comma. */
  private def packageName(line: String): Option[String] = {
    val dataAt = line.indexOf("&data")
    if (dataAt < 0) Some(line.takeWhile(_ != ','))
    else {
      val bodyAt = line.indexOf(",{1=", dataAt)
      if (bodyAt < 0) None
      else {
        val pair = line.substring(dataAt, bodyAt)
        val eq = pair.indexOf('=')
        if (eq < 0 || eq == pair.length - 1) None else Some(pair.substring(eq + 1))
      }
    }
  }

  /** True when the record kind is one of those excluded from the output. */
  private def isSkipped(line: String, pkg: String): Boolean =
    line.equals("") ||
      SkippedPackageMarkers.exists(marker => pkg.contains(marker)) ||
      SkippedLineMarkers.exists(marker => line.contains(marker))

  /**
    * Returns the values of the numbered fields inside `{1=...,2=...}` in order.
    *
    * Field k ends where `,<k+1>=` begins, so values may themselves contain commas.
    * Yields an empty vector when the line has no `{1=...}` body.
    */
  private def numberedFields(line: String): Vector[String] = {
    val open = line.indexOf("{1=")
    val close = if (open < 0) -1 else line.indexOf('}', open)
    if (close < 0) Vector.empty
    else {
      // body looks like "1=a,2=b,3=c"
      val body = line.substring(open + 1, close)

      @scala.annotation.tailrec
      def collect(from: Int, index: Int, acc: Vector[String]): Vector[String] = {
        val nextMarker = s",${index + 1}="
        val next = body.indexOf(nextMarker, from)
        if (next < 0) acc :+ body.substring(from)
        else collect(next + nextMarker.length, index + 1, acc :+ body.substring(from, next))
      }

      collect("1=".length, 1, Vector.empty)
    }
  }
}

猜你喜欢

转载自www.cnblogs.com/lanwu114/p/10410047.html