Reading and writing MySQL in a Spark RDD

1. Reading from MySQL. The lower and upper bound values passed into JdbcRDD must be of type Long, so using the primary key column as the bound column is generally recommended; if the bound column is a time, it has to be converted to a timestamp.

package com.home.spark


import java.sql.DriverManager
import java.time.{LocalDateTime, ZoneOffset}

import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}


object Ex_mysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")

    val sc = new SparkContext(conf)

    val driverClassName = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
    val user = "root"
    val password = "root"//mysql里时间类型为datetime,传入的条件为时间戳
    val sql = "select userId,userName,name from user where createTime > from_unixtime(?) and createTime < from_unixtime(?)"
    val connection = () => {
      Class.forName(driverClassName)
      DriverManager.getConnection(url, user, password)
    }
    val startTime = LocalDateTime.of(2018, 11, 3, 0, 0, 0)
    val endTime = LocalDateTime.of(2018, 11, 4, 0, 0, 0)

    // MySQL's from_unixtime() expects a 10-digit timestamp in seconds, while Java's
    // epoch timestamp is 13 digits (milliseconds), so divide by 1000 to drop the extra precision.
    val startTimeStamp = startTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli / 1000
    val endTimeStamp = endTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli / 1000

    println("startTime: " + startTime + ", endTime: " + endTime)
    println("startTime: " + startTimeStamp + ", endTime: " + endTimeStamp)

    // read from MySQL
    val result: JdbcRDD[(Int, String, String)] = new JdbcRDD[(Int, String, String)](
      sc,
      connection,
      sql,
      startTimeStamp,
      endTimeStamp,
      2,
      rs => {

        val userId = rs.getInt(1)
        val userName = rs.getString(2)
        val name = rs.getString(3)
        //        println(s"id:${userId},userName:${userName},name:${name}")
        (userId, userName, name)
      }
    )
    result.collect().foreach(println)

    sc.stop()
  }

}
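The three numeric arguments to JdbcRDD (startTimeStamp, endTimeStamp, 2) are the lower bound, upper bound, and number of partitions: JdbcRDD splits the closed range [lowerBound, upperBound] into numPartitions sub-ranges and binds each sub-range to the two ? placeholders, so the partitions fetch their slices in parallel. A minimal sketch of that splitting logic (an illustration of the idea, not the exact Spark source):

def partitionBounds(lower: Long, upper: Long, numPartitions: Int): Seq[(Long, Long)] = {
  // closed range [lower, upper], hence the +1
  val length = BigInt(upper) - BigInt(lower) + 1
  (0 until numPartitions).map { i =>
    val start = lower + ((BigInt(i) * length) / numPartitions).toLong
    val end = lower + ((BigInt(i + 1) * length) / numPartitions).toLong - 1
    (start, end)
  }
}

// partitionBounds(1541174400L, 1541260800L, 2)
// -> Vector((1541174400,1541217599), (1541217600,1541260800))

Each partition then runs the select with its own pair of bounds, which is why the query must contain exactly two ? placeholders.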

2. Writing to MySQL. To reduce the number of connections created, use foreachPartition instead of foreach.

 

package com.home.spark

import java.sql.{DriverManager, PreparedStatement}
import java.time.LocalDateTime

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object Ex_mysql2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")

    val sc = new SparkContext(conf)

    val driverClassName = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
    val user = "root"
    val password = "root"


    // write to MySQL
    val logBuffer = mutable.ListBuffer[(String, String, String, String, String, String)]()
    import java.time.format.DateTimeFormatter
    val ofPattern = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") // HH: 24-hour clock (hh would be 12-hour)
    for (i <- 1 to 100) {
      logBuffer.+=(("write" + i, "写入测试" + i, "localhost" + i, LocalDateTime.now().format(ofPattern), "spark", LocalDateTime.now().format(ofPattern)))
    }

    //    logBuffer.foreach(println)
    val logRDD: RDD[(String, String, String, String, String, String)] = sc.makeRDD(logBuffer)


    // To reduce the number of connections created, use foreachPartition instead of foreach.
    // Drawback: each partition is pushed through a single iterator, so a very large partition carries an OOM risk.
    logRDD.foreachPartition(logData => {
      Class.forName(driverClassName)
      val connection = DriverManager.getConnection(url, user, password)
      val sql = "insert into syslog(action,event,host,insertTime,userName,update_time) values(?,?,?,?,?,?)"
      val statement: PreparedStatement = connection.prepareStatement(sql)
      try {
        logData.foreach {
          case (action, event, host, insertTime, userName, updateTime) => {
            statement.setString(1, action)
            statement.setString(2, event)
            statement.setString(3, host)
            statement.setString(4, insertTime)
            statement.setString(5, userName)
            statement.setString(6, updateTime)
            statement.executeUpdate()
          }
        }
      }
      finally {
        if (statement != null) statement.close()
        if (connection != null) connection.close()
      }
    })

    sc.stop()
  }

}
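A common refinement of this write path (not from the original post; the sketch below reuses the driverClassName, url, user, password, and logRDD values defined above) is to queue rows with PreparedStatement.addBatch and flush them with executeBatch, turning one network round trip per row into one per batch:

    logRDD.foreachPartition(logData => {
      Class.forName(driverClassName)
      val connection = DriverManager.getConnection(url, user, password)
      val sql = "insert into syslog(action,event,host,insertTime,userName,update_time) values(?,?,?,?,?,?)"
      val statement = connection.prepareStatement(sql)
      try {
        logData.foreach { case (action, event, host, insertTime, userName, updateTime) =>
          statement.setString(1, action)
          statement.setString(2, event)
          statement.setString(3, host)
          statement.setString(4, insertTime)
          statement.setString(5, userName)
          statement.setString(6, updateTime)
          statement.addBatch() // queue the row locally instead of executing it immediately
        }
        statement.executeBatch() // send all queued rows in one round trip
      } finally {
        if (statement != null) statement.close()
        if (connection != null) connection.close()
      }
    })

For very large partitions, flushing with executeBatch() every few hundred rows (e.g. iterating with logData.grouped(500)) also bounds the batch's memory footprint, which addresses the OOM concern noted in the comments above. Note that with MySQL Connector/J the driver only combines batched inserts into multi-row statements when rewriteBatchedStatements=true is added to the JDBC URL.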
