今儿难得清闲,看了一章书,觉得光看不过瘾,拿了项目中应用场景用Scala实现了一把,加深记忆,顺便练练手
使用场景:在Spark Streaming计算中,需要基于DF或RDD方便地用SQL进行join、filter、aggregate等操作(这些在Spark中属于transformation,而不是action)
依赖软件环境:
Java 1.8
Scala 2.12
scala-intellij-bin-2017.2.13.zip
Demo文件内容
heroine.txt
不婚族,张三丰
慕容雪,风清扬
任盈盈,令狐冲
瑛姑,周伯通
swordsman.txt
张三丰,99999
风清扬,88888
令狐冲,66666
周伯通,99999
代码:
package sparkRDD

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
 * Demo: reads two comma-separated text files as RDDs, attaches an all-string schema to
 * turn each into a DataFrame, registers both as temp views and joins them with Spark SQL.
 */
object DF2RDD {

  /**
   * Loads a comma-separated text file and exposes its leading fields as a DataFrame.
   *
   * @param spark   session used to read the file and build the DataFrame
   * @param path    location of the text file
   * @param columns comma-separated column names; every column is a nullable string
   * @return a DataFrame with one row per input line that has enough fields
   */
  private def loadDelimitedFile(spark: SparkSession, path: String, columns: String): DataFrame = {
    val schema = StructType(
      columns.split(",").map(fieldName => StructField(fieldName, StringType, nullable = true))
    )
    val fieldCount = schema.length

    val rows = spark.sparkContext
      .textFile(path)
      .map(_.split(","))
      // A blank or short line (e.g. a trailing newline in the file) has fewer fields than
      // the schema; indexing into it would throw, so such lines are skipped.
      .filter(_.length >= fieldCount)
      .map(parts => Row.fromSeq(parts.take(fieldCount).toSeq))

    spark.createDataFrame(rows, schema)
  }

  /**
   * Entry point: builds a local session, registers the `swordsman` and `heroine` views,
   * and prints the joined result ordered by force value, highest first.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Jetty's classes live under org.eclipse.jetty; the previous
    // "org.apache.eclipse.jetty.server" name matched no real package.
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("DF2RDD").setMaster("local[2]")
    // getOrCreate() builds the underlying SparkContext from conf, so no separate
    // SparkContext needs to be constructed.
    val spark = SparkSession.builder.config(conf).getOrCreate()

    try {
      loadDelimitedFile(spark, "D:\\testdata\\swordsman.txt", "name,force_value")
        .createOrReplaceTempView("swordsman")
      loadDelimitedFile(spark, "D:\\testdata\\heroine.txt", "name,lower_name")
        .createOrReplaceTempView("heroine")

      // force_value is stored as a string; cast it so the ordering is numeric rather
      // than lexicographic (as text, "99999" would sort above "100000").
      val resDF = spark.sql(
        "SELECT t1.name,t2.name as lower,t1.force_value " +
          "FROM swordsman t1 join heroine t2 on t1.name=t2.lower_name " +
          "order by cast(t1.force_value as int) desc"
      )
      resDF.show()
    } finally {
      // Release the session even when loading or the query fails.
      spark.stop()
    }
  }
}
运行结果:
+----+-----+-----------+
|name|lower|force_value|
+----+-----+-----------+
|张三丰|不婚族|99999|
|周伯通|瑛姑|99999|
|风清扬|慕容雪|88888|
|令狐冲|任盈盈|66666|
+----+-----+-----------+