转:SparkStreaming--小案例2对于爬虫来的数据进行分析

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/qq_42721694/article/details/85267653
请注意本博客中代码头和尾是固定模式,而lines是需要根据你的数据特点进行切分和整理的,我会附上我的一部分数据供参考,附在文档末尾。

1.统计某一时间段内输入数据的出现次数(时间窗口不断更新)。这一例不属于爬虫数据分析,只是一个热身。

package Test1226
 
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
//统计某一时间段数据出现次数(时间不断更新)
object test01 {
  /**
   * Counts how often each space-separated token occurs over a sliding window.
   *
   * Lines are read from the socket `lion:2222` in 5-second batches. Every
   * 10 seconds the per-token totals of the most recent 20 seconds are printed.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("nwc").setMaster("local[2]")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    streamingContext.sparkContext.setLogLevel("ERROR")
    streamingContext.checkpoint("d://123//12262")

    // Reducer kept as an explicitly typed value so its parameter types are spelled out.
    val add: (Int, Int) => Int = (left, right) => left + right
    val windowLength = Seconds(20)
    val slideInterval = Seconds(10)

    val windowedCounts = streamingContext
      .socketTextStream("lion", 2222)
      .flatMap(line => line.split(" "))
      .map(token => (token, 1))
      .reduceByKeyAndWindow(add, windowLength, slideInterval)
    windowedCounts.print()

    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
2.统计哪个网站访问量最多

package Test1226
 
import Test1225.Spider01.updateFunction
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
 
object test02 {
  /**
   * Ranks request paths by the number of distinct client addresses that hit them.
   *
   * Each line read from the socket `lion:2222` is split on single spaces;
   * field 0 is taken as the client address and field 6 as the request target,
   * whose query string (everything from the first `?`) is dropped. Per
   * 5-second batch, every (path, client) pair is counted once, the pairs are
   * tallied per path, and the result is printed in descending order of that tally.
   *
   * Lines with fewer than 7 fields are skipped instead of failing the batch.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("nwc").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint("d://123//122602")
    val lines = ssc.socketTextStream("lion", 2222)

    lines
      .map(_.split(" "))
      // Guard the index access below: malformed or empty lines lack field 6.
      .filter(_.length > 6)
      .map(fields => (fields(6).takeWhile(_ != '?'), fields(0)))
      // Collapse repeated hits from the same client on the same path.
      .transform(rdd => rdd.distinct())
      .map { case (path, _) => path }
      .countByValue()
      .transform(rdd => rdd.sortBy(_._2, ascending = false))
      .print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
3.统计网站某模块访问量(代码中按第 10 个字段,即响应字节数累加)并降序排序

package Test1226
 
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
 
object test03 {
  import scala.util.Try

  /**
   * Extracts the request path and the numeric value of field 9 from one log line.
   *
   * The line is split on single spaces. Field 6 is the request target, cut at
   * the first `?`; field 9 is parsed as an `Int` (in the sample data shipped
   * with this post it is the response size).
   *
   * @param line one raw input line
   * @return `Some((path, value))`, or `None` when the line has fewer than
   *         10 fields or field 9 is not a valid `Int` (for example `-`)
   */
  private def pathAndSize(line: String): Option[(String, Int)] = {
    val fields = line.split(" ")
    if (fields.length > 9) {
      Try(fields(9).toInt).toOption.map(size => (fields(6).takeWhile(_ != '?'), size))
    } else {
      None
    }
  }

  /**
   * Sums field 9 per request path and prints the totals in descending order.
   *
   * Lines are read from the socket `lion:3333` in 5-second batches. Lines
   * that cannot be parsed are skipped rather than aborting the batch.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("nwc").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint("d://123//122602")
    val lines = ssc.socketTextStream("lion", 3333)

    lines
      .flatMap(line => pathAndSize(line))
      .reduceByKey(_ + _)
      .transform(rdd => rdd.sortBy(_._2, ascending = false))
      .print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
4.统计非200的报错访问量(200为正常访问)

package Test1226
 
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
 
object test04 {
  /**
   * Counts requests whose status field is not `200`, grouped by path and status.
   *
   * Lines are read from the socket `lion:3333` in 5-second batches and split
   * on single spaces. Field 6 is the request target (cut at the first `?`)
   * and field 8 is the status. The printed keys have the form `"<path> <status>"`.
   *
   * Lines with fewer than 9 fields are dropped before any field is read.
   * Previously the result of that filter was discarded, so short lines still
   * reached the indexing code and failed the batch.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("nwc").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint("d://123//122602")
    val lines = ssc.socketTextStream("lion", 3333)

    lines
      .map(_.split(" "))
      // DStream operations return new streams, so the guard must be part of the chain.
      .filter(_.length > 8)
      .filter(fields => fields(8) != "200")
      .map(fields => (s"${fields(6).takeWhile(_ != '?')} ${fields(8)}", 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
5.统计各 useragent 的数量(最后一对双引号中的内容)

package Test1226
 
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
 
object test05 {
  /**
   * Counts occurrences of each user-agent string per batch.
   *
   * Lines are read from the socket `lion:3333` in 5-second batches. Each line
   * is split on the double-quote character and the piece at index 5 is used
   * as the key (in the sample data shipped with this post that is the content
   * of the last quoted field).
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("nwc").setMaster("local[2]")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    streamingContext.sparkContext.setLogLevel("ERROR")
    streamingContext.checkpoint("d://123//122602")

    val rawLines = streamingContext.socketTextStream("lion", 3333)
    val userAgents = rawLines.map(line => line.split("\"")(5))
    val agentCounts = userAgents
      .map(agent => (agent, 1))
      .reduceByKey((left, right) => left + right)
    agentCounts.print()

    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
6.统计前一分钟的网站总访问量

package Test1226
 
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
 
object test06 {
  /**
   * Prints how many lines arrived during the most recent 60 seconds.
   *
   * Lines are read from the socket `lion:2222` in 5-second batches. A
   * 60-second window is laid over the stream and its element count is printed.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("nwc").setMaster("local[2]")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    streamingContext.sparkContext.setLogLevel("ERROR")
    streamingContext.checkpoint("d://123//12262")

    val incoming = streamingContext.socketTextStream("lion", 2222)
    val lastMinute = incoming.window(Seconds(60))
    val requestTotal = lastMinute.count()
    requestTotal.print()

    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
所用部分数据:

120.197.87.216 - - [04/Jan/2012:00:00:02 +0800] "GET /home.php?mod=space&uid=563413&mobile=yes HTTP/1.1" 200 3388 "-" "-"
123.126.50.73 - - [04/Jan/2012:00:00:02 +0800] "GET /thread-679411-1-1.html HTTP/1.1" 200 5251 "-" "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)"
203.208.60.187 - - [04/Jan/2012:00:00:02 +0800] "GET /archiver/tid-3003.html HTTP/1.1" 200 2056 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getgold HTTP/1.1" 200 13886 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getmedal HTTP/1.1" 200 13882 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.6.179.88 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=no HTTP/1.1" 200 172 "http://www.itpub.net/forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=yes" "Mozilla/5.0 (Linux; U; Android 2.2; zh-cn; ZTE-U V880 Build/FRF91) UC AppleWebKit/530+ (KHTML, like Gecko) Mobile Safari/530"
116.205.130.2 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=6 HTTP/1.1" 200 32 "http://www.itpub.net/forum-6-1.html?ts=28" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; QQDownload 702; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; AskTbPTV/5.11.3.15590; .NET4.0E)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=133 HTTP/1.1" 200 11 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?tid=1558574 HTTP/1.1" 200 5 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.75.173.35 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?goto=lastpost&mod=redirect&tid=1380214 HTTP/1.1" 302 5 "-" "Yahoo! Slurp China"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=133 HTTP/1.1" 200 11 "http://www.itpub.net/thread-1554759-4-10.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?tid=1554759 HTTP/1.1" 200 5 "http://www.itpub.net/thread-1554759-4-10.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
120.197.87.220 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?mod=viewthread&tid=692703&extra=&page=2&mobile=yes HTTP/1.1" 200 4903 "-" "-"
————————————————
版权声明:本文为CSDN博主「橙以」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_42721694/article/details/85267653

发布了12 篇原创文章 · 获赞 130 · 访问量 34万+

猜你喜欢

转载自blog.csdn.net/wdr2003/article/details/102599605