Learning the Spark Streaming real-time computing framework, part 01

Exploring Spark Streaming

Receive one or more lines of text from port 8888 on hadoop102 (for example, typed into nc -l -p 8888), split the received content on spaces, and count the number of occurrences of each word.

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._

object demo01 {
  def main(args: Array[String]): Unit = {
    // Set the log level
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo01").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create a StreamingContext with a 10s batch interval
    val ssc = new StreamingContext(sc, Seconds(10))
    // Connect to port 8888 on hadoop102 and create a DStream from the received data
    val lines = ssc.socketTextStream("hadoop102", 8888)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}


Monitor the HDFS directory /opt/datafiles on hadoop102. Whenever a new file appears in this directory (for example, uploaded with hdfs dfs -put), Spark Streaming computes the word counts for that batch interval.

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._

object demo02 {
  def main(args: Array[String]): Unit = {
    // Set the log level
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo02").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create a StreamingContext with a 10s batch interval
    val ssc = new StreamingContext(sc, Seconds(10))
    // Monitor the HDFS directory /opt/datafiles on hadoop102
    val lines = ssc.textFileStream("hdfs://hadoop102:8020/opt/datafiles")
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}


Mastering the DStream programming model

DStream transformation operations

Use transform to split each input line into words.

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._

object demo03 {
  def main(args: Array[String]): Unit = {
    // Set the log level
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo03").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create a StreamingContext with a 10s batch interval
    val ssc = new StreamingContext(sc, Seconds(10))
    // Connect to port 8888 on hadoop102
    val lines = ssc.socketTextStream("hadoop102", 8888)
    // transform exposes the RDD of each batch; split every line into words
    val words = lines.transform(rdd => rdd.flatMap(_.split(" ")))
    words.print()
    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
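The same splitting could be done with flatMap directly on the DStream; the point of transform is that it hands you the RDD of each batch, so any RDD operation can be applied per batch. Below is a minimal variant of demo03 (a sketch, not part of the original post) that also drops empty tokens inside transform:

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._

// Sketch only: demo03 with an extra RDD-level filter applied inside transform
object demo03Filtered {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo03Filtered").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(10))
    val lines = ssc.socketTextStream("hadoop102", 8888)
    // Any chain of RDD operations can go inside transform
    val words = lines.transform(rdd => rdd.flatMap(_.split(" ")).filter(_.nonEmpty))
    words.print()
    ssc.start()
    ssc.awaitTermination()
  }
}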


DStream window operations

A window operation computes over a sliding window of batches from a source DStream to produce a new DStream.
For example, with a window length of 3s and a sliding interval of 1s, every second the elements of the source DStream from the last 3 seconds are taken to form a new DStream.

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._

object demo04 {
  def main(args: Array[String]): Unit = {
    // Set the log level
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo04").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 1s batch interval
    val ssc = new StreamingContext(sc, Seconds(1))
    val lines = ssc.socketTextStream("localhost", 8888)
    val words = lines.flatMap(_.split(" "))
    // Window length 3s, sliding interval 1s
    val windowWords = words.window(Seconds(3), Seconds(1))
    windowWords.print()
    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}

Type roughly one letter per second. Each second, all elements that arrived within the last 3 seconds are printed. By the 4th second, a no longer appears, showing that it has slid out of the current window.
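Window operations compose with the other DStream transformations. Below is a minimal sketch (not from the original post) that counts words over the same 3s window, sliding every 1s, using reduceByKeyAndWindow:

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming._

// Sketch only: word counts over a 3s window that slides every 1s
object demo04Counts {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo04Counts").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(1))
    val lines = ssc.socketTextStream("localhost", 8888)
    val words = lines.flatMap(_.split(" "))
    // Window length and slide interval must both be multiples of the 1s batch interval
    val windowCounts = words.map(w => (w, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(3), Seconds(1))
    windowCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}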

DStream output operations

package Spark_Streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object demo06 {
  def main(args: Array[String]): Unit = {
    // Set the log level
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName("demo06").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 10s batch interval
    val ssc = new StreamingContext(sc, Seconds(10))
    val lines = ssc.socketTextStream("hadoop102", 8888)
    // Save each batch to HDFS under /opt/datafiles, one directory per batch
    lines.saveAsTextFiles("hdfs://hadoop102:8020/opt/datafiles/sahf", "txt")
    ssc.start()
    ssc.awaitTermination()
  }
}

If the job fails with a permission error when writing to HDFS, grant write access first:

hdfs dfs -chmod a+w /opt/datafiles

Then feed some test lines through the socket:

[xwk@hadoop102 ~]$ nc -l -p 8888
this is 1th line
this is 2th line
this is 3th line
this is 4 th line

The four lines of text are saved into the first four output directories, one per batch (saveAsTextFiles writes one directory per batch interval, named sahf-<time in ms>.txt).

Use foreachPartition to write the processing results to the MySQL database

mysql> create database spark;
Query OK, 1 row affected (0.07 sec)

mysql> use spark;
Database changed
mysql> create table searchKeyWord(insert_time date,keyword varchar(30),search_count integer);
Query OK, 0 rows affected (0.66 sec)

Set the window length to 60s and the sliding interval to 10s: every 10s, count the occurrences of each word over the last 60s and output the words in descending order of count (the code keeps the top 3).

package Spark_Streaming

import java.sql.DriverManager

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming._

object demo07 {
  def main(args: Array[String]): Unit = {
    // Set the log level
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setMaster("local[3]").setAppName("WriteDataToMySQL")
    // 5s batch interval
    val ssc = new StreamingContext(conf, Seconds(5))
    val ItemsStream = ssc.socketTextStream("hadoop102", 8888)
    // Use the first comma-separated field of each line as the key
    val ItemPairs = ItemsStream.map(line => (line.split(",")(0), 1))
    // Count keys over a 60s window, sliding every 10s
    val ItemCount = ItemPairs.reduceByKeyAndWindow((v1: Int, v2: Int) => v1 + v2, Seconds(60), Seconds(10))
    // Keep the 3 most frequent keys in each window
    val hottestWord = ItemCount.transform(itemRDD => {
      val top3 = itemRDD.map(pair => (pair._2, pair._1)).sortByKey(false)
        .map(pair => (pair._2, pair._1)).take(3)
      ssc.sparkContext.makeRDD(top3)
    })
    // Write each window's result into MySQL, opening one connection per partition
    hottestWord.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        val url = "jdbc:mysql://hadoop102:3306/spark"
        val user = "root"
        val password = "root"
        Class.forName("com.mysql.cj.jdbc.Driver")
        val conn = DriverManager.getConnection(url, user, password)
        // Clear the previous results before inserting the new ones
        conn.prepareStatement("delete from searchKeyWord where 1=1").executeUpdate()
        conn.setAutoCommit(false)
        val stmt = conn.createStatement()
        partitionOfRecords.foreach(record => {
          stmt.addBatch("insert into searchKeyWord (insert_time,keyword,search_count) values (now(),'" + record._1 + "','" + record._2 + "')")
        })
        stmt.executeBatch()
        conn.commit()
        stmt.close()
        conn.close()
      })
    })
    ItemsStream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
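This example needs the MySQL JDBC driver (and Spark Streaming itself) on the classpath. A build.sbt sketch follows; the version numbers are assumptions and should be matched to your Spark and MySQL installation:

// build.sbt sketch -- versions are assumptions, align them with your cluster
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"           % "3.1.3",
  "org.apache.spark" %% "spark-streaming"      % "3.1.3",
  "mysql"            %  "mysql-connector-java" % "8.0.28"
)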
Feed test input on hadoop102 with

nc -l -p 8888

then check the results in MySQL:

select * from searchKeyWord;

Source: blog.csdn.net/weixin_46322367/article/details/124925836