Spark SQL:在大数据查询时,使用 SQL 让我们方便了许多。
1. pom
<!-- Scala 2.11 standard library plus the Spark 1.5.1 core and SQL modules built for Scala 2.11. -->
<dependencies>
  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.11.7</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>1.5.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>1.5.1</version>
  </dependency>
</dependencies>
<!-- Runs the maven-scala-plugin compile and testCompile goals, passing -feature as a compiler argument. -->
<build>
  <plugins>
    <plugin>
      <groupId>org.scala-tools</groupId>
      <artifactId>maven-scala-plugin</artifactId>
      <version>2.15.2</version>
      <executions>
        <execution>
          <goals>
            <goal>compile</goal>
            <goal>testCompile</goal>
          </goals>
        </execution>
      </executions>
      <configuration>
        <args>
          <arg>-feature</arg>
        </args>
      </configuration>
    </plugin>
  </plugins>
</build>
2.sparkSQL.scala
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql.Row
import org.apache.spark.sql.{ DataFrame, SQLContext }
import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType }

/**
 * Minimal Spark SQL example: loads two comma-separated text files, exposes each
 * as a temporary table with an all-string schema, joins them on `age` and
 * prints one formatted line per matching pair.
 */
object sparkSQL {

  /**
   * Builds a schema of nullable string columns from a space-separated list of
   * column names.
   *
   * @param columns column names separated by single spaces, e.g. "name age"
   * @return a StructType with one nullable StringType field per name
   */
  private def stringSchema(columns: String): StructType =
    StructType(columns.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true)))

  /**
   * Reads a comma-separated text file and converts it to a DataFrame using the
   * first two fields of every line (the second one trimmed).
   *
   * Blank lines are skipped so that a trailing newline in the input file does
   * not fail the job; any other line with fewer than two fields still fails.
   *
   * @param sc         context used to read the file
   * @param sqlContext context used to create the DataFrame
   * @param path       location of the text file
   * @param columns    space-separated column names for the schema
   * @return the DataFrame backed by the parsed rows
   */
  private def loadCsv(sc: SparkContext, sqlContext: SQLContext, path: String, columns: String): DataFrame = {
    val rows = sc.textFile(path)
      .filter(_.trim.nonEmpty)
      .map(_.split(","))
      .map(p => Row(p(0), p(1).trim))
    sqlContext.createDataFrame(rows, stringSchema(columns))
  }

  /**
   * Entry point: runs the join query in local mode and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)

      // Both schemas are described as strings; every column is read as a string.
      val schemaPeople = "name age"
      val schemadog = "name age"

      // Register the DataFrames as temporary tables so SQL can refer to them by name.
      loadCsv(sc, sqlContext, "i:/4/people.txt", schemaPeople).registerTempTable("people")
      loadCsv(sc, sqlContext, "i:/4/dog.txt", schemadog).registerTempTable("dog")

      // SQL statements are run through the sql method provided by sqlContext.
      val results = sqlContext.sql("SELECT p.name,d.name,p.age,d.age FROM people p,dog d where p.age=d.age")

      // The query result is a DataFrame; map each row to a printable line.
      results
        .map(t => s"perName: ${t(0)} dogName: ${t(1)} perAge:${t(2)} dogAge:${t(3)}")
        .collect()
        .foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
people.txt
aaa,11
bbb,22
ccc,33
ddd,44
dog.txt
eee,11
fff,22
ggg,33
ddd,44
运行结果:
perName: aaa dogName: eee perAge:11 dogAge:11
perName: bbb dogName: fff perAge:22 dogAge:22
perName: ccc dogName: ggg perAge:33 dogAge:33
perName: ddd dogName: ddd perAge:44 dogAge:44