对于 Spark 中已存在的 DataFrame，我们可以用 .createOrReplaceTempView() 方法创建临时表。
临时表创建之后我们就可以用SQL语句对这个临时表进行查询统计:
# Build a small in-memory DataFrame and register it as the temporary view
# "swimmers" so it can be queried with SQL.
# NOTE(review): `sc` (SparkContext) and `spark` (SparkSession) are assumed to
# be provided by the surrounding environment (e.g. the pyspark shell) -- confirm.

# Explicit imports instead of the original wildcard `import *`.
from pyspark.sql.types import StructType, StructField, LongType, StringType

# Generate our own data in memory -- this way we don't have to access the
# file system yet.
stringCSVRDD = sc.parallelize([
    (123, 'Katie', 19, 'brown'),
    (234, 'Michael', 22, 'green'),
    (345, 'Simone', 23, 'blue'),
])

# NOTE(review): unused leftover from the inferred-schema ("schema encoded in a
# string") variant of this example; kept only in case a later chunk reads it.
schemaString = "id name age eyeColor"

# Define the schema explicitly with pyspark.sql.types rather than relying on
# schema inference. Each StructField is (name, type, nullable).
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True),
])

# Apply the schema to the RDD to create the DataFrame.
swimmers = spark.createDataFrame(stringCSVRDD, schema)

# Register a temporary view using the DataFrame; SQL queries can now refer
# to it by the name "swimmers".
swimmers.createOrReplaceTempView("swimmers")
# Query the "swimmers" temporary view, first with SQL, then with the
# DataFrame API, then with a SQL LIKE predicate.

# All rows of the view.
all_rows = spark.sql("select * from swimmers")
all_rows.show()

# The same kind of filter expressed through the DataFrame API:
# only id/age columns, restricted to swimmers aged 22.
twenty_two = swimmers.select("id", "age").filter("age = 22")
twenty_two.show()

# Names and eye colors of swimmers whose eye color starts with 'b'.
b_eyed = spark.sql("select name, eyeColor from swimmers where eyeColor like 'b%'")
b_eyed.show()