1. In Spark, writing RDD results directly into a database.
import java.sql.{Connection, Date, DriverManager, PreparedStatement}
import org.apache.spark.{SparkConf, SparkContext}

object IPLocation {
  // Open one MySQL connection per partition and write that partition's records.
  val data2MySQL = (iterator: Iterator[(String, Int)]) => {
    var conn: Connection = null
    var ps: PreparedStatement = null // used to write each record into MySQL
    val sql = "INSERT INTO location_info (location, counts, accesse_date) VALUES (?, ?, ?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
      // Prepare the statement once and reuse it for every record in the partition
      ps = conn.prepareStatement(sql)
      iterator.foreach(line => {
        ps.setString(1, line._1)
        ps.setInt(2, line._2)
        ps.setDate(3, new Date(System.currentTimeMillis()))
        ps.executeUpdate()
      })
    } catch {
      case e: Exception => println("MySQL Exception: " + e.getMessage)
    } finally {
      if (ps != null)
        ps.close()
      if (conn != null)
        conn.close()
    }
  }
  def ip2Long(ip: String): Long = {
    val fragments = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length) {
      ipNum = fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }
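  // Example: ip2Long("1.2.3.4") accumulates the octets from left to right:
  //   ((((0 << 8 | 1) << 8 | 2) << 8 | 3) << 8 | 4) = 16909060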
  // Binary search over the sorted rules; returns the index of the rule whose
  // [startNum, endNum] range contains ip, or -1 if no rule matches.
  def binarySearch(lines: Array[(String, String, String)], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      val middle = (low + high) / 2
      if ((ip >= lines(middle)._1.toLong) && (ip <= lines(middle)._2.toLong))
        return middle
      if (ip < lines(middle)._1.toLong)
        high = middle - 1
      else {
        low = middle + 1
      }
    }
    -1
  }
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[2]").setAppName("IpLocation")
    val sc = new SparkContext(conf)
    // Parse the IP rule file: fields(2) and fields(3) are the start/end of a numeric
    // IP range, fields(6) is the province that range maps to.
    val ipRulesRdd = sc.textFile("c://ip.txt").map(line => {
      val fields = line.split("\\|")
      val start_num = fields(2)
      val end_num = fields(3)
      val province = fields(6)
      (start_num, end_num, province)
    })
    // Collect all IP mapping rules to the driver
    val ipRulesArray = ipRulesRdd.collect()
    // Broadcast the rules to every executor
    val ipRulesBroadcast = sc.broadcast(ipRulesArray)
    // Load the access log to be processed; fields(1) is the client IP
    val ipsRDD = sc.textFile("c://access_log").map(line => {
      val fields = line.split("\\|")
      fields(1)
    })
    val result = ipsRDD.map(ip => {
      val ipNum = ip2Long(ip)
      // Note: a -1 return value (no matching rule) is not handled here
      val index = binarySearch(ipRulesBroadcast.value, ipNum)
      // (start IP num, end IP num, province name)
      val info = ipRulesBroadcast.value(index)
      info
    }).map(t => (t._3, 1)).reduceByKey(_ + _)
    // foreachPartition hands data2MySQL one partition (an Iterator) at a time to write to MySQL
    result.foreachPartition(data2MySQL(_))
    // println(result.collect().toBuffer)
    sc.stop()
  }
}
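Because data2MySQL issues one INSERT per record, every row costs a round trip to MySQL. The variant below is only an illustrative sketch, assuming the same location_info table and java.sql imports as above: it queues rows with addBatch() and flushes them once per partition with executeBatch().

  // Illustrative only: a batched per-partition writer for the same location_info table.
  val data2MySQLBatched = (iterator: Iterator[(String, Int)]) => {
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
    val ps = conn.prepareStatement(
      "INSERT INTO location_info (location, counts, accesse_date) VALUES (?, ?, ?)")
    try {
      iterator.foreach { case (location, counts) =>
        ps.setString(1, location)
        ps.setInt(2, counts)
        ps.setDate(3, new Date(System.currentTimeMillis()))
        ps.addBatch() // queue the row instead of executing it immediately
      }
      ps.executeBatch() // send all queued rows in one round trip
    } finally {
      ps.close()
      conn.close()
    }
  }

It would be called the same way: result.foreachPartition(data2MySQLBatched).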
2. Reading data into an RDD directly from MySQL
Sample code:
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object JdbcRDDDemo {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Connection factory: JdbcRDD calls it to open a connection per partition
    val connection = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
    }
    // Create the JdbcRDD
    val jdbcRDD = new JdbcRDD(
      sc,
      connection,
      "SELECT * FROM ta WHERE id >= ? AND id <= ?",
      1, 4, // lower and upper bounds bound to the two placeholders
      2,    // number of partitions; the id range [1, 4] is split evenly across them
      r => { // map each selected row: column 1 becomes id, column 2 becomes code
        val id = r.getInt(1)
        val code = r.getString(2)
        (id, code)
      }
    )
    val jrdd = jdbcRDD.collect()
    println(jrdd.toBuffer)
    sc.stop()
  }
}
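The listing assumes a table named ta already exists in the bigdata database, with an integer first column and a string second column (that is what r.getInt(1) and r.getString(2) read). The exact schema is not shown above, so the setup below is only an assumption that makes the demo runnable end to end:

  // Assumed schema for `ta` (not given above); adjust to match the real table.
  val setupConn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
  val stmt = setupConn.createStatement()
  stmt.executeUpdate("CREATE TABLE IF NOT EXISTS ta (id INT PRIMARY KEY, code VARCHAR(64))")
  // Hypothetical sample rows covering the id range 1 to 4 queried by the JdbcRDD
  (1 to 4).foreach(i => stmt.executeUpdate(s"INSERT INTO ta (id, code) VALUES ($i, 'code$i')"))
  stmt.close()
  setupConn.close()

With bounds 1 and 4 and numPartitions = 2, JdbcRDD runs the query once per partition (id 1 to 2, then id 3 to 4), and collect() returns the mapped (id, code) pairs.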