RDD 直接存入MySQL,以及直接读取MySQL中数据


1、在spark中,直接把RDD文件存到数据库中。

object  IPLocation {

  //每个分区连接一次MySQL,并且存储数据。

  val data2MySQL =(iterator: Iterator[(String, Int)]) => {

    var conn: Connection =null

    var ps :PreparedStatement = null  //这个对象是用来把数据放到MySQL数据库中

    val sql = "INSERTINTO location_info (location, counts, accesse_date) VALUES (?, ?, ?)"

    try { 

      conn =

DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata","root",

"123456")

      iterator.foreach(line=> {

        ps =conn.prepareStatement(sql)

        ps.setString(1,line._1)

        ps.setInt(2,line._2)

        ps.setDate(3, newDate(System.currentTimeMillis()))

        ps.executeUpdate()

      })

    } catch {

      case e: Exception =>println("Mysql Exception")

    } finally {

      if (ps != null)

        ps.close()

      if (conn != null)

        conn.close()

    }

  }

  def ip2Long(ip: String): Long = {

    val fragments =ip.split("[.]")

    var ipNum = 0L

    for (i <- 0 untilfragments.length){

      ipNum =  fragments(i).toLong | ipNum << 8L

    }

    ipNum

  }

  def binarySearch(lines:Array[(String, String, String)], ip: Long) : Int = {

    var low = 0

    var high = lines.length - 1

    while (low <= high) {

      val middle = (low + high) / 2

      if ((ip >=lines(middle)._1.toLong) && (ip <= lines(middle)._2.toLong))

        return middle

      if (ip <lines(middle)._1.toLong)

        high = middle - 1

      else {

        low = middle + 1

      }

    }

    -1

  }

  def main(args: Array[String]) {

val conf =

new SparkConf().setMaster("local[2]").setAppName("IpLocation")

    val sc = new SparkContext(conf)

    val ipRulesRdd =sc.textFile("c://ip.txt").map(line =>{

      val fields =line.split("\\|")

      val start_num = fields(2)

      val end_num = fields(3)

      val province = fields(6)

      (start_num, end_num, province)

    })

    //全部的ip映射规则

    val ipRulesArrary =ipRulesRdd.collect()

    //广播规则

    val ipRulesBroadcast =sc.broadcast(ipRulesArrary)

    //加载要处理的数据

    val ipsRDD =sc.textFile("c://access_log").map(line => {

      val fields =line.split("\\|")

      fields(1)

    })

    val result = ipsRDD.map(ip =>{

      val ipNum = ip2Long(ip)

      val index =binarySearch(ipRulesBroadcast.value, ipNum)

      val info =ipRulesBroadcast.value(index)

      //(ip的起始Num, ip的结束Num,省份名)

      info

    }).map(t => (t._3, 1)).reduceByKey(_+_)

//foreachPartition操作单元是一个个的分区数据(Itorator),调用函数    data2MySQL 向MySQL写入数据

 

   result.foreachPartition(data2MySQL(_))

   //println(result.collect().toBuffer)

    sc.stop()

  }

2、RDD直接从MySQL中读取数据

代码样本:

/**
 * Demo: reads rows from MySQL directly into an RDD via JdbcRDD.
 *
 * FIX: the original read `val conf =newSparkConf()` and `valconnection`
 * (fused tokens) and did not compile; it also called collect() twice,
 * issuing the query against MySQL a second time for no reason.
 */
def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]")

    val sc = new SparkContext(conf)

    // Connection factory handed to JdbcRDD; invoked lazily on each executor.
    val connection = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
    }

    // JdbcRDD binds the two '?' placeholders from the bounds [1, 4] and
    // splits that id range across 2 partitions.
    val jdbcRDD = new JdbcRDD(
      sc,
      connection,
      "SELECT * FROM ta where id >= ? AND id <= ?",
      1, 4,
      2,
      // Row mapper: column 1 becomes id, column 2 becomes code.
      r => {
        val id = r.getInt(1)
        val code = r.getString(2)
        (id, code)
      }
    )

    // Collect once and print (the original collected twice, discarding the
    // first result).
    println(jdbcRDD.collect().toBuffer)

    sc.stop()

  }

}

 

JdbcRDD 类的一部分源码,注意它的主构造器的参数。


猜你喜欢

转载自blog.csdn.net/huangyinzhao/article/details/80357363
rdd