Contents:
1. Case analysis
2. Code examples
I. Case Analysis
Project: find the hottest products (the top-ranked items) on JD's (京东) search platform.
Metadata fields (one log record per line): date, userID, item, city, device
Overall approach: use Spark SQL and Spark Core together.
Step 1: raw ETL. Filter the raw data to produce the target data set; in real enterprise scenarios the filter conditions can be very complex (broadcast them), and the filtering is done with RDD operators such as filter;
Step 2: run queries with the specified conditions against the filtered target data; the query conditions can also be very complex (broadcast them), again using the RDD filter operator (a short sketch of steps 1 and 2 follows);
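A minimal sketch of steps 1 and 2, assuming the tab-separated log produced by the generator in section II and a hypothetical filter condition (device "iphone"); the names used here (sc, rawLogs, targetLogs) are illustrative and are reused in the later sketches:
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("TheHottest").setMaster("local[4]"))
// Step 1: raw ETL, load the log and keep only well-formed records
val rawLogs = sc.textFile("/home/hadoop/IdeaScala/SparkSQLUserlogsHottest.log")
val cleanLogs = rawLogs.filter(_.split("\t").length == 5)
// Step 2: broadcast the (possibly complex) query condition and filter again
val deviceFilter = sc.broadcast("iphone")
val targetLogs = cleanLogs.filter(_.contains(deviceFilter.value))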
Step 3: since items are divided into categories, before producing the final result we first compute the UV for each item (the PV of users' item visits could be computed the same way). To compute the UV per item we must build a key-value RDD, for example keyed as (date#item, userID), so that groupByKey can be applied; after calling groupByKey we de-duplicate the users and compute the UV of every item for every day, giving results of the form (date#item, UV) (see the sketch below);
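Continuing the sketch above, step 3 builds (date#item, userID) pairs, groups them by key and de-duplicates the users to obtain the daily UV per item (field positions follow the date, userID, item, city, device layout):
// Step 3: UV = number of distinct users per (date, item)
val dateItemUser = targetLogs.map { line =>
  val fields = line.split("\t")
  (fields(0) + "#" + fields(2), fields(1)) // key: date#item, value: userID
}
val uvPerDateItem = dateItemUser.groupByKey().map { case (dateItem, users) =>
  (dateItem, users.toSet.size) // de-duplicated user count = UV
}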
Step 4: use the window function row_number to pick the daily top-5 items by UV: row_number() OVER (PARTITION BY date ORDER BY UV DESC) rank. This produces a DataFrame whose rows hold date, item and UV (a Spark SQL sketch follows);
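A sketch of step 4, assuming an existing SparkSession named spark; the (date, item, uv) records are turned into a DataFrame and ranked per day with row_number (view and column names are illustrative):
import spark.implicits._

val uvDF = uvPerDateItem.map { case (dateItem, uv) =>
  val Array(date, item) = dateItem.split("#")
  (date, item, uv)
}.toDF("date", "item", "uv")
uvDF.createOrReplaceTempView("item_uv")

// rank the items of each day by UV, highest first
val rankedDF = spark.sql(
  """SELECT date, item, uv,
    |       row_number() OVER (PARTITION BY date ORDER BY uv DESC) rank
    |FROM item_uv""".stripMargin)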
Step 5: convert the DataFrame back to an RDD, group by date and extract each day's top-5 hottest search items;
Step 6: swap key and value, then call sortByKey to rank by click heat;
Step 7: swap key and value back again, giving target data in the (date#item, UV) format (a sketch of steps 5 to 7 follows);
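A sketch of steps 5 to 7, still using the names from the sketches above: keep the daily top 5 from the window result, swap key and value so that sortByKey orders by heat, then swap back to the (date#item, UV) target format:
// Step 5: keep the daily top-5 rows and go back to an RDD
val top5RDD = rankedDF.where("rank <= 5").rdd.map { row =>
  (row.getAs[String]("date") + "#" + row.getAs[String]("item"), row.getAs[Int]("uv"))
}
// Step 6: swap to (uv, date#item) and sort by heat, descending
val sortedByHeat = top5RDD.map(_.swap).sortByKey(ascending = false)
// Step 7: swap back to the (date#item, uv) target format
val result = sortedByHeat.map(_.swap)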
Step 8: write the results from the RDD directly into a production DB such as MySQL, and use Java EE or similar server-side technology to visualize them, so that marketing staff, warehouse scheduling systems, courier systems and warehouse decision makers can use the data to create value (a JDBC write sketch follows below);
The results can also be stored in Hive, with Java EE applications accessing Hive through JDBC;
They can also be stored in Spark SQL and accessed from Java EE applications through the Thrift server;
And for events such as Double 11 (双十一), Redis is usually the first choice, since it allows response times comparable to a flash-sale (seckill) system.
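For step 8, a minimal sketch of writing the result into MySQL with plain JDBC inside foreachPartition; the connection URL, the credentials and the hot_items table are assumptions, and the MySQL JDBC driver must be on the classpath (writing to Hive, the Thrift server or Redis would follow the same pattern):
import java.sql.DriverManager

result.foreachPartition { partition =>
  // one connection per partition, not per record (hypothetical database and table)
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "user", "password")
  val stmt = conn.prepareStatement("INSERT INTO hot_items (date_item, uv) VALUES (?, ?)")
  partition.foreach { case (dateItem, uv) =>
    stmt.setString(1, dateItem)
    stmt.setInt(2, uv)
    stmt.executeUpdate()
  }
  stmt.close()
  conn.close()
}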
II. Code Examples
1. Code that generates the log data
package SparkSQL
import java.io.{File, FileWriter, PrintWriter}
import java.text.SimpleDateFormat
import java.util.{Calendar, Random}
import java.text.ParseException
/**
* FileName: SparkSQLUserLogsManually
* Author: hadoop
* Email: [email protected]
* Date: 18-11-24 9:48 PM
* Description: generate simulated user search logs for the hottest-item case
*
*/
object SparkSQLUserLogsManually {
def main (args: Array[String]): Unit = {
// Generate 10,000 log records
val numberItems = 10000
// Directory where the log file is written
val pathPath = "/home/hadoop/IdeaScala/"
ganerateUserLogs(numberItems,pathPath)
}
/**
* Write the generated log data to the given file.
* @param pathPath directory where the data is stored
* @param fileName name of the log file
* @param strUserLog the log content to write
*/
def writeLog (pathPath: String, fileName: String, strUserLog: String) = {
var fw:FileWriter = null
var out:PrintWriter = null
try{
val writeFile = new File(pathPath+fileName)
if(!writeFile.exists()){
writeFile.createNewFile()
}else{
writeFile.delete()
}
fw = new FileWriter(writeFile,true)
out = new PrintWriter(fw)
out.print(strUserLog)
}catch{
case e:Exception=>e.printStackTrace()
}finally {
try{
if (out!=null)
out.close
if (fw != null)
fw.close
}catch{
case e:Exception=>e.printStackTrace()
}
}
}
/**
* Return a date string `step` days away from `value` (or from today when value is null).
* @param value base date string; may be null
* @param formate date format pattern, e.g. "yyyy-MM-dd"
* @param step number of days to add (a negative step goes back in time)
* @return the formatted date string
*/
def getCountDate (value: String, formate: String, step: Int) = {
val sdf = new SimpleDateFormat(formate)
val cal = Calendar.getInstance()
if(value != null){
try{
cal.setTime(sdf.parse(value))
}catch{
case e:ParseException=>e.printStackTrace()
}
}
cal.add(Calendar.DAY_OF_MONTH,step)
sdf.format(cal.getTime)
}
def ganerateUserID () = {
val random = new Random
val userID = Array[String](
"98415b9c-f3d4-45c3-bc7f-dce3126c6c0b", "7371b4bd-8535-461f-a5e2-c4814b2151e1",
"49852bfa-a662-4060-bf68-0dddde5feea1", "8768f089-f736-4346-a83d-e23fe05b0ecd",
"a76ff021-049c-4a1a-8372-02f9c51261d5", "8d5dc011-cbe2-4332-99cd-a1848ddfd65d",
"a2bccbdf-f0e9-489c-8513-011644cb5cf7", "89c79413-a7d1-462c-ab07-01f0835696f7",
"8d525daa-3697-455e-8f02-ab086cda7851", "c6f57c89-9871-4a92-9cbe-a2d76cd79cd0",
"19951134-97e1-4f62-8d5c-134077d1f955", "3202a063-4ebf-4f3f-a4b7-5e542307d726",
"40a0d872-45cc-46bc-b257-64ad898df281", "b891a528-4b5e-4ba7-949c-2a32cb5a75ec",
"0d46d52b-75a2-4df2-b363-43874c9503a2", "c1e4b8cf-0116-46bf-8dc9-55eb074ad315",
"6fd24ac6-1bb0-4ea6-a084-52cc22e9be42", "5f8780af-93e8-4907-9794-f8c960e87d34",
"692b1947-8b2e-45e4-8051-0319b7f0e438", "dde46f46-ff48-4763-9c50-377834ce7137")
userID(random.nextInt(20))
}
def ganerateItemID()={
val random = new Random
val itemID = Array("小米","休闲鞋","洗衣机","显示器","显卡","洗衣液","行车记录仪")
itemID(random.nextInt(7))
}
def ganerateCityID()={
val random = new Random
val CityNames = Array("上海", "北京", "深圳", "广州", "纽约", "伦敦", "东京", "首尔", "莫斯科", "巴黎")
CityNames(random.nextInt(10))
}
def ganerateDevice()={
val random = new Random
val Devices = Array("android","iphone","ipad","PC")
Devices(random.nextInt(4))
}
def ganerateUserLogs(numberItems: Int, pathPath: String): Unit = {
val userLogBuffer = new StringBuffer()
val fileName = "SparkSQLUserlogsHottest.log"
val formate = "yyyy-MM-dd"
for (i <- 0 until numberItems){
val date = getCountDate(null,formate,-1)
val userID = ganerateUserID()
val itemID = ganerateItemID()
val cityID = ganerateCityID()
val device = ganerateDevice()
userLogBuffer.append(date+"\t"
+userID +"\t"+itemID+"\t"+cityID+"\t"+device+"\n")
}
println(userLogBuffer.toString)
// write the whole buffer once, instead of deleting and rewriting the file on every iteration
writeLog(pathPath,fileName,userLogBuffer.toString)
}
}
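Each generated line is tab-separated in the order date, userID, item, city, device; a record looks roughly like the following (the values are random, so this line is only illustrative):
2018-11-23	98415b9c-f3d4-45c3-bc7f-dce3126c6c0b	洗衣机	上海	iphone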
2. Analyzing the log (ranking each user's most-clicked items)
package SparkSQL
import java.util
import scala.collection.JavaConverters._
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
object SparkSQLUserLogsHottest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("SparkSQLUserLogsHottest").setMaster("local[4]")
val sc = new SparkContext(conf)
val spark = SparkSession.builder().appName("SparkSQLUserLogsHottest").master("local[4]").config("spark.sql.warehouse.dir", "/home/hadoop/IdeaScala/LearningBigdata/spark-warehouse").getOrCreate()
val path = "/home/hadoop/IdeaScala/SparkSQLUserlogsHottest.log"
val file = sc.textFile(path)
val devicebd = "iphone"
val broadcastdevice = sc.broadcast(devicebd)
val lines = file.filter(line=>{
line.contains(broadcastdevice.value)
})
val listRow = lines.collect()
for(row <- listRow){
println(row)
}
val pairs = lines.map(line=>{
val splited = line.split("\t")
val one = 1
// key: date#item#userID, value: 1
val dateItemUserID = splited(0)+"#"+splited(2)+"#"+splited(1)
(dateItemUserID,one)
})
val pairRow = pairs.collect()
for(pair <- pairRow){
println(pair)
}
val reduceedPairs = pairs.reduceByKey(_+_)
val reduceedRow = reduceedPairs.collect()
val peopleInformation:util.ArrayList[String] = new util.ArrayList[String]()
for(eachRow <- reduceedRow) {
println(eachRow)
val rowSplited = eachRow._1.split("#")
val userID = rowSplited(2)
val itemID = rowSplited(1)
val dateID = rowSplited(0)
val jsonzip = "{\"Date\":\"" + dateID + "\", \"UserID\":\"" + userID + "\", \"Username\":\"" + userID + "\", \"Item\":\"" + itemID + "\", \"count\":" + eachRow._2 + "}"
peopleInformation.add(jsonzip)
}
for (row <- peopleInformation.toArray()){
println(row)
}
// build an RDD[String] of JSON records (convert the Java list to a Scala Seq first)
val peopleInformationRDD = sc.parallelize(peopleInformation.asScala)
// pass the RDD of JSON strings itself, not its toString, to spark.read.json
val peopleInformationDS = spark.read.json(peopleInformationRDD)
peopleInformationDS.createOrReplaceTempView("peopleInformations")
val sqlText =
"""SELECT UserID, Item, count
|FROM (
|  SELECT UserID, Item, count,
|         row_number() OVER (PARTITION BY UserID ORDER BY count DESC) rank
|  FROM peopleInformations
|) sub_peopleInformations
|WHERE rank <= 3""".stripMargin
val excellentNameAgeDS = spark.sql(sqlText)
excellentNameAgeDS.show()
excellentNameAgeDS.write.format("json").save("Result")
}
}