spark -- 练习题本地数据过滤->kafka->mysql

以下是RNG S8 8强赛失败后，官微发表道歉微博下一级评论

数据说明：

rng_comment.txt文件中的数据

字段	字段含义
index	数据id
child_comment	回复数量
comment_time	评论时间
content	评论内容
da_v	微博个人认证
like_status	赞
pic	图片评论url
user_id	微博用户id
user_name	微博用户名
vip_rank	微博会员等级
stamp	时间戳

1在kafka中创建rng_comment主题，设置2个分区2个副本

bin/kafka-topics.sh --create --zookeeper node01:2181,node02:2181,node03:2181  --replication-factor 2 --partitions 2 --topic rng_comment

2数据预处理，把空行过滤掉

import org.apache.spark.sql.SparkSession

/**
 * @author kismet
 * @date 2020-04-23 16:25  
 */
object tt02 {
  /**
   * Pre-processing step: reads the raw comment dump, drops blank lines and writes the
   * remaining lines back out as a single text part file.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tt02").master("local[*]").getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("warn")
    try {
      // Read the raw data and keep only lines that contain something besides whitespace.
      // (`trim` never returns null, so a null check cannot detect blank lines.)
      val data = sc.textFile("E:\\第四学期\\第一阶段网课\\文档\\spark\\练习题\\4.14号练习题\\4.14号练习题\\rng_comment.txt")
      val filterdata = data.filter(_.trim.nonEmpty)

      import spark.implicits._
      val dataFrame = filterdata.toDF()
      // repartition(1): collapse to a single partition so exactly one output file is written.
      dataFrame.repartition(1).write.text("E:\\第四学期\\第一阶段网课\\文档\\spark\\练习题\\4.14号练习题\\4.14号练习题\\rng")
    } finally {
      // Release the local Spark resources even when the read or write fails.
      spark.stop()
    }
  }
}

3请把给出的文件写入到kafka中，根据数据id进行分区，id为奇数的发送到一个分区中，偶数的发送到另一个分区

package com.czxy.demo1;

import org.apache.commons.io.FileUtils;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/**
 * @author kismet
 * @date 2020-04-15 20:35
 */
public class kafkaProducer {
    /**
     * Reads the comment dump from disk and publishes every well-formed record to the
     * {@code rng_comment} topic. The target partition is chosen by the partitioner class
     * configured under {@code partitioner.class}.
     *
     * <p>Blank lines and lines with fewer than 11 tab-separated fields are skipped.
     * If the input file cannot be read the error is printed and nothing is sent.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Read the input first so that an unreadable file never leaves a producer open.
        List<String> lines;
        try {
            lines = FileUtils.readLines(new File("E:\\第四学期\\第一阶段网课\\文档\\spark\\练习题\\4.14号练习题\\4.14号练习题\\rng_comment.txt"));
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        Properties props = new Properties();
        // Kafka broker addresses
        props.put("bootstrap.servers", "node01:9092,node02:9092,node03:9092");
        // Acknowledgement mode
        props.put("acks", "all");
        // Number of retries
        props.put("retries", 0);
        // Batch size in bytes
        props.put("batch.size", 16384);
        // Linger time in milliseconds before a batch is sent
        props.put("linger.ms", 1);
        // Total buffer memory in bytes
        props.put("buffer.memory", 33554432);
        props.put("key.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        // Custom partitioner
        props.put("partitioner.class", "com.czxy.demo1.MyPartition");

        // try-with-resources flushes and closes the producer even if a send throws.
        try (KafkaProducer<String, String> producer = new KafkaProducer<String, String>(props)) {
            for (String s : lines) {
                // Compare content, not references: `!= ""` is true for every runtime string.
                if (s.trim().isEmpty()) {
                    continue;
                }
                String[] split = s.split("\t");
                if (split.length >= 11) {
                    // No explicit partition or key: the configured partitioner decides.
                    // Alternative without a custom partitioner: pass the partition number
                    // (id % 2) explicitly to the ProducerRecord constructor.
                    producer.send(new ProducerRecord<String, String>("rng_comment", s));
                }
            }
        }
    }
}

package com.czxy.demo1;


import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;

import java.util.Map;

/**
 * @author kismet
 * @date 2020-04-23 17:00
 */
public class MyPartition  implements Partitioner {

    /**
     * Chooses the partition from the parity of the record id, which is the first
     * tab-separated field of the record value: even ids go to partition 0, odd ids
     * to partition 1.
     *
     * @param s       topic name (not used for the decision)
     * @param o       record key (unused, may be null)
     * @param bytes   serialized key (unused)
     * @param o1      record value; its first tab-separated field must be an integer id
     * @param bytes1  serialized value (unused)
     * @param cluster cluster metadata (unused)
     * @return 0 for an even id, 1 for an odd id
     * @throws NumberFormatException if the first field of the value is not an integer
     */
    @Override
    public int partition(String s, Object o, byte[] bytes, Object o1, byte[] bytes1, Cluster cluster) {
        // The id lives in the record VALUE. The first argument is the topic name, and
        // parsing it as a number would fail on every single send.
        String record = String.valueOf(o1);
        String[] split = record.split("\t");
        int id = Integer.parseInt(split[0].trim());
        return id % 2 == 0 ? 0 : 1;
    }

    /** Nothing to release. */
    @Override
    public void close() {

    }

    /** No configuration is needed. */
    @Override
    public void configure(Map<String, ?> map) {

    }
}

4使用Spark Streaming对接kafka

使用Spark Streaming对接kafka之后进行计算

在mysql中创建一个数据库rng_comment

在数据库rng_comment创建vip_rank表，字段为数据的所有字段

在数据库rng_comment创建like_status表，字段为数据的所有字段

在数据库rng_comment创建count_conmment表，字段为时间，条数

5查询出微博会员等级为5的用户，并把这些数据写入到mysql数据库中的vip_rank表中

6查询出评论赞的个数在10个以上的数据，并写入到mysql数据库中的like_status表中

7分别计算出2018/10/20 ，2018/10/21，2018/10/22，2018/10/23这四天每一天的评论数是多少，并写入到mysql数据库中的count_conmment表中

package com.czxy.demo01

import org.apache.kafka.clients.consumer.{ConsumerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author kismet
 * @date 2020-04-15 20:30  
 */
object tt01 {

  /** Number of tab-separated fields in a complete comment record. */
  private val FieldCount = 11

  /** Days for which comment counts are calculated. */
  private val TargetDays = Seq("2018/10/20", "2018/10/21", "2018/10/22", "2018/10/23")

  /**
   * Consumes the `rng_comment` topic with Spark Streaming and writes three results to MySQL:
   * records of users with vip rank 5 (`vip_rank`), records with more than 10 likes
   * (`like_status`) and the number of comments per target day (`count_conmment`).
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config: SparkConf = new SparkConf().setAppName("SparkStream").setMaster("local[*]")
    val sc = new SparkContext(config)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint("./kafka")

    // Kafka connection settings
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "SparkKafkaDemo",
      // earliest: resume from the committed offset if there is one, otherwise start from the beginning
      // latest:   resume from the committed offset if there is one, otherwise read only new records
      // none:     resume from the committed offsets; fail if any partition has no committed offset
      // earliest is used here so that records produced before this job started are processed too
      "auto.offset.reset" -> "earliest",
      // false disables auto-commit; offsets are left to Spark / manual handling
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Read the stream
    val kafkaDatas: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](ssc,
        LocationStrategies.PreferConsistent, // location strategy
        ConsumerStrategies.Subscribe[String, String](Array("rng_comment"), kafkaParams) // consumer strategy
      )

    // Split each record into its fields and drop incomplete records, so that the
    // positional accesses below cannot run past the end of the array.
    val line: DStream[Array[String]] =
      kafkaDatas.map(_.value()).map(_.split("\t")).filter(_.length >= FieldCount).cache()

    // Task 5: users whose vip rank (field 9) is 5 -> table vip_rank
    val filterdata: DStream[Array[String]] = line.filter(_ (9) == "5")
    filterdata.foreachRDD {
      rdd: RDD[Array[String]] => {
        rdd.foreachPartition(saveToMySQL)
      }
    }

    // Task 6: comments with more than 10 likes (field 5) -> table like_status.
    // A like count that is not a number cannot be "more than 10", so such rows are excluded
    // instead of failing the whole batch.
    val filterdata2: DStream[Array[String]] =
      line.filter(fields => scala.util.Try(fields(5).trim.toInt).toOption.exists(_ > 10))
    filterdata2.foreachRDD {
      rdd: RDD[Array[String]] => {
        rdd.foreachPartition(saveToMySQL2)
      }
    }

    // Task 7: number of comments on each target day -> table count_conmment.
    // Each record is mapped to the target day contained in its comment time (field 2),
    // records of other days are dropped, and the occurrences are summed per day.
    // The sums are per micro-batch: a day whose records span several batches gets one
    // row per batch, so the total for a day is the sum of its rows.
    val filterdata3: DStream[Array[String]] = line
      .flatMap(fields => TargetDays.find(day => fields(2).contains(day)).toList)
      .map(day => (day, 1))
      .reduceByKey(_ + _)
      .map { case (day, count) => Array(day, count.toString) }
    filterdata3.foreachRDD {
      rdd: RDD[Array[String]] => {
        rdd.foreachPartition(saveToMySQL3)
      }
    }

    ssc.start() // start the streaming job
    ssc.awaitTermination() // block until the job terminates
  }

  /**
   * Opens a new JDBC connection to the MySQL database used by all writers.
   * NOTE(review): the exercise text names the database `rng_comment`, the URL points
   * at `bigdata` — confirm which one is intended.
   *
   * @return a new connection; the caller is responsible for closing it
   */
  def getConn(): Connection = {
    DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "root")
  }

  /**
   * Inserts every record of a partition into `vip_rank`.
   *
   * @param partitionData records as arrays of at least 11 fields
   * @throws NumberFormatException if field 0, 1 or 9 of a record is not an integer
   */
  def saveToMySQL(partitionData: Iterator[Array[String]]): Unit =
    insertCommentRows("insert into vip_rank values (?,?,?,?,?,?,?,?,?,?,?)", partitionData)

  /**
   * Inserts every record of a partition into `like_status`.
   *
   * @param partitionData records as arrays of at least 11 fields
   * @throws NumberFormatException if field 0, 1 or 9 of a record is not an integer
   */
  def saveToMySQL2(partitionData: Iterator[Array[String]]): Unit =
    insertCommentRows("insert into like_status values (?,?,?,?,?,?,?,?,?,?,?)", partitionData)

  /**
   * Inserts (time, count) rows into `count_conmment`.
   *
   * @param partitionData rows as two-element arrays: element 0 is the day, element 1 the count
   * @throws NumberFormatException if element 1 of a row is not an integer
   */
  def saveToMySQL3(partitionData: Iterator[Array[String]]): Unit =
    if (partitionData.hasNext) {
      withStatement("insert into count_conmment values (?,?)") { ps =>
        partitionData.foreach { data =>
          ps.setString(1, data(0))
          ps.setInt(2, data(1).toInt)
          ps.executeUpdate()
        }
      }
    }

  /** Writes full 11-field comment records with the given insert statement. */
  private def insertCommentRows(sql: String, partitionData: Iterator[Array[String]]): Unit =
    // Skip empty partitions so no connection is opened for nothing.
    if (partitionData.hasNext) {
      withStatement(sql) { ps =>
        partitionData.foreach { data =>
          ps.setInt(1, data(0).toInt)
          ps.setInt(2, data(1).toInt)
          ps.setString(3, data(2))
          ps.setString(4, data(3))
          ps.setString(5, data(4))
          ps.setString(6, data(5))
          ps.setString(7, data(6))
          ps.setString(8, data(7))
          ps.setString(9, data(8))
          ps.setInt(10, data(9).toInt)
          ps.setString(11, data(10))
          ps.executeUpdate()
        }
      }
    }

  /** Runs `body` with one prepared statement and always closes statement and connection. */
  private def withStatement(sql: String)(body: PreparedStatement => Unit): Unit = {
    val conn: Connection = getConn()
    try {
      val ps: PreparedStatement = conn.prepareStatement(sql)
      try body(ps) finally ps.close()
    } finally {
      conn.close()
    }
  }
}

kismetG

发布了124 篇原创文章 · 获赞 214 · 访问量 19万+

私信关注

spark -- 练习题 本地数据过滤->kafka->mysql

1在kafka中创建rng_comment主题，设置2个分区2个副本

2数据预处理，把空行过滤掉

3请把给出的文件写入到kafka中，根据数据id进行分区，id为奇数的发送到一个分区中，偶数的发送到另一个分区

4使用Spark Streaming对接kafka

使用Spark Streaming对接kafka之后进行计算

在mysql中创建一个数据库rng_comment

在数据库rng_comment创建vip_rank表，字段为数据的所有字段

在数据库rng_comment创建like_status表，字段为数据的所有字段

在数据库rng_comment创建count_conmment表，字段为 时间，条数

5查询出微博会员等级为5的用户，并把这些数据写入到mysql数据库中的vip_rank表中

6查询出评论赞的个数在10个以上的数据，并写入到mysql数据库中的like_status表中

7分别计算出2018/10/20 ，2018/10/21，2018/10/22，2018/10/23这四天每一天的评论数是多少，并写入到mysql数据库中的count_conmment表中

猜你喜欢

spark -- 练习题本地数据过滤->kafka->mysql

在数据库rng_comment创建count_conmment表，字段为时间，条数