JDBC数据源
Spark SQL支持使用JDBC从关系型数据库(比如MySQL)中读取数据。读取的数据,依然由DataFrame表示,可以很方便地使用Spark Core提供的各种算子进行处理。
创建方式:
查询时连接MySQL:
用Spark SQL处理JDBC中的数据是非常有用的。比如说,你的MySQL业务数据库中有大量的数据,比如1000万条,而你现在需要编写一个程序,对线上的脏数据进行某种复杂业务逻辑的处理,甚至复杂到可能要用Spark SQL反复查询Hive中的数据来进行关联处理。
那么此时,用Spark SQL来通过JDBC数据源,加载MySQL中的数据,然后通过各种算子进行处理,是最好的选择。因为Spark是分布式的计算框架,对于1000万数据,肯定是分布式处理的。而如果你自己手工编写一个Java程序,那么不好意思,你只能分批次处理了,先处理2万条,再处理2万条,运行太耗时间。
案例:
package Spark_SQL.Hive_sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Example of the Spark SQL JDBC data source.
 *
 * <p>Loads the {@code student_infos} and {@code student_scores} MySQL tables as DataFrames,
 * joins them on the first column (student name), keeps the rows whose score is strictly greater
 * than {@value #SCORE_THRESHOLD}, prints them and writes them to the
 * {@code good_student_infos} table.
 *
 * @Date: 2019/3/16 17:11
 * @Author Angle
 */
public class JDBCDataSource {

    /** JDBC URL of the MySQL database that holds the source and target tables. */
    private static final String JDBC_URL = "jdbc:mysql://master:3306/testdb";

    /** A row is kept only when its score is strictly greater than this value. */
    private static final int SCORE_THRESHOLD = 80;

    /**
     * Entry point: starts a local Spark context, runs the example and always stops the context.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            run(new SQLContext(sc));
        } finally {
            // Release the Spark context even when loading or saving fails.
            sc.stop();
        }
    }

    /**
     * Builds a fresh option map for one table, so that the options used for one read or write
     * can never be affected by a later mutation made for another table.
     *
     * @param table name of the MySQL table to read from or write to
     * @return JDBC data source options containing the {@code url} and {@code dbtable} entries
     */
    private static Map<String, String> jdbcOptions(String table) {
        // NOTE(review): no "user"/"password" options are set here - confirm that the MySQL
        // server accepts such connections, or add the credentials.
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", JDBC_URL);
        options.put("dbtable", table);
        return options;
    }

    /**
     * Loads both student tables, joins and filters them, prints the result and saves it.
     *
     * @param sqlContext SQL context used to read from and write to the JDBC data source
     */
    private static void run(SQLContext sqlContext) {
        // Load the two MySQL tables as DataFrames.
        Dataset<Row> studentInfoDF =
                sqlContext.read().format("jdbc").options(jdbcOptions("student_infos")).load();
        // The table name was misspelled "studnet_scores" before.
        Dataset<Row> studentScoreDF =
                sqlContext.read().format("jdbc").options(jdbcOptions("student_scores")).load();

        // Turn both DataFrames into (name, value) pair RDDs and join them on the name.
        JavaPairRDD<String, Integer> infoPairs = studentInfoDF.javaRDD().mapToPair(
                new PairFunction<Row, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(
                                row.getString(0), Integer.valueOf(String.valueOf(row.get(1))));
                    }
                });
        JavaPairRDD<String, Integer> scorePairs = studentScoreDF.javaRDD().mapToPair(
                new PairFunction<Row, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(Row row) throws Exception {
                        // When the column type is not known for sure, read the value with
                        // get() and convert it afterwards.
                        return new Tuple2<String, Integer>(
                                String.valueOf(row.get(0)),
                                Integer.valueOf(String.valueOf(row.get(1))));
                    }
                });
        JavaPairRDD<String, Tuple2<Integer, Integer>> studentsRDD = infoPairs.join(scorePairs);

        // Flatten every joined pair into a three-column Row: key, info value, score value.
        JavaRDD<Row> studentsRowRDD = studentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
                    @Override
                    public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple)
                            throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }
                });

        // Keep only the rows whose score (third column) is above the threshold.
        JavaRDD<Row> filterStudentRowRDD = studentsRowRDD.filter(new Function<Row, Boolean>() {
            @Override
            public Boolean call(Row row) throws Exception {
                return row.getInt(2) > SCORE_THRESHOLD;
            }
        });

        // Convert the filtered rows back into a DataFrame with an explicit schema.
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);
        Dataset<Row> studentsDF = sqlContext.createDataFrame(filterStudentRowRDD, structType);

        // Print the result.
        for (Row row : studentsDF.collectAsList()) {
            System.out.println(row);
        }

        // Save the DataFrame to MySQL. No save mode is set, so the writer's default applies.
        // This is the only write path on purpose: a second, hand-written row-by-row JDBC
        // insert into the same table would store every row twice.
        studentsDF.write().format("jdbc").options(jdbcOptions("good_student_infos")).save();
    }
}