1. Reading data from MySQL. When reading through JdbcRDD, the lower and upper bound values passed in must be of type Long. Using the primary key column as the bound column is generally recommended; if the bound column is a time column, its values must first be converted to timestamps.
package com.home.spark

import java.sql.DriverManager
import java.time.{LocalDateTime, ZoneOffset}

import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demo: read rows from a MySQL table into Spark through [[org.apache.spark.rdd.JdbcRDD]].
 *
 * The rows are selected by a time window on the `createTime` column. JdbcRDD
 * only accepts `Long` bounds, so the window is expressed as epoch seconds and
 * converted back to a DATETIME on the MySQL side with `from_unixtime`.
 */
object Ex_mysql {

  /**
   * Runs the demo on a local Spark master: queries `user` rows created within
   * the configured window and prints every `(userId, userName, name)` tuple.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")
    val sc = new SparkContext(conf)

    try {
      val driverClassName = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
      val user = "root"
      val password = "root"

      // createTime is a DATETIME column in MySQL, while the bound parameters are
      // epoch-second timestamps, hence from_unixtime(?).
      // JdbcRDD splits [lowerBound, upperBound] into inclusive sub-ranges, one per
      // partition, so the comparisons must be inclusive (>= / <=). With strict
      // > / < the rows sitting exactly on a partition edge would be silently
      // dropped. Note that this makes both ends of the window inclusive.
      val sql =
        "select userId,userName,name from user " +
          "where createTime >= from_unixtime(?) and createTime <= from_unixtime(?)"

      // Connection factory invoked once per partition; JdbcRDD closes the
      // connection itself when the partition has been fully read.
      val connection = () => {
        Class.forName(driverClassName)
        DriverManager.getConnection(url, user, password)
      }

      val startTime = LocalDateTime.of(2018, 11, 3, 0, 0, 0)
      val endTime = LocalDateTime.of(2018, 11, 4, 0, 0, 0)

      // MySQL unix timestamps have 10 digits (seconds), whereas Java epoch
      // milliseconds have 13, so work in epoch seconds directly.
      val zone = ZoneOffset.ofHours(8)
      val startTimeStamp = startTime.toEpochSecond(zone)
      val endTimeStamp = endTime.toEpochSecond(zone)

      println("startTime: " + startTime + ", endTime: " + endTime)
      println("startTime: " + startTimeStamp + ", endTime: " + endTimeStamp)

      // Read the rows using 2 partitions.
      val result: JdbcRDD[(Int, String, String)] = new JdbcRDD[(Int, String, String)](
        sc,
        connection,
        sql,
        startTimeStamp,
        endTimeStamp,
        2,
        rs => (rs.getInt(1), rs.getString(2), rs.getString(3))
      )

      result.collect().foreach(println)
    } finally {
      // Always release the SparkContext, even when the query fails.
      sc.stop()
    }
  }
}
2. Writing data to MySQL. To reduce the number of connections created, use foreachPartition instead of foreach.
package com.home.spark

import java.sql.{DriverManager, PreparedStatement}
import java.time.LocalDateTime

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Demo: write an RDD to a MySQL table with `foreachPartition`, so that one
 * JDBC connection is opened per partition instead of one per record.
 */
object Ex_mysql2 {

  /**
   * Runs the demo on a local Spark master: generates 100 sample log records
   * and inserts them into the `syslog` table.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import java.time.format.DateTimeFormatter

    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")
    val sc = new SparkContext(conf)

    try {
      val driverClassName = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
      val user = "root"
      val password = "root"

      // "HH" is the 24-hour clock. The 12-hour "hh" without an AM/PM marker
      // would store afternoon times as morning times.
      val ofPattern = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

      // Sample data to write.
      val logBuffer = (1 to 100).map { i =>
        val now = LocalDateTime.now().format(ofPattern)
        ("write" + i, "写入测试" + i, "localhost" + i, now, "spark", now)
      }

      val logRDD: RDD[(String, String, String, String, String, String)] = sc.makeRDD(logBuffer)

      // To reduce the number of connections created, use foreachPartition
      // instead of foreach.
      // Caveat: the records arrive as a per-partition iterator; materialising
      // that iterator in memory would risk an OOM, so it is consumed lazily.
      logRDD.foreachPartition { logData =>
        // Do not open a connection for an empty partition.
        if (logData.hasNext) {
          Class.forName(driverClassName)
          val connection = DriverManager.getConnection(url, user, password)
          try {
            val sql =
              "insert into syslog(action, event, host, insertTime, userName, update_time) " +
                "values (?, ?, ?, ?, ?, ?)"
            val statement: PreparedStatement = connection.prepareStatement(sql)
            try {
              logData.foreach { case (action, event, host, insertTime, userName, updateTime) =>
                statement.setString(1, action)
                statement.setString(2, event)
                statement.setString(3, host)
                statement.setString(4, insertTime)
                statement.setString(5, userName)
                statement.setString(6, updateTime)
                statement.executeUpdate()
              }
            } finally {
              statement.close()
            }
          } finally {
            // Closed exactly once, and also when prepareStatement itself fails.
            connection.close()
          }
        }
      }
    } finally {
      // Always release the SparkContext, even when the write fails.
      sc.stop()
    }
  }
}