1. Reading and writing Parquet files
object testSparkReadParquet {
  def main(args: Array[String]): Unit = {
    // val, not var: the session reference is never reassigned.
    val spark = SparkSession.builder()
      .appName("TestSparkSession")
      .master("local")
      .getOrCreate()

    // Read a Parquet file; the schema comes from the file's own metadata.
    val df = spark.read.parquet("D:\\tools\\testSparkFile\\users.parquet")
    df.printSchema()
    df.select("name", "favorite_color", "favorite_numbers").show()

    // Write a projection back out (default save format is parquet);
    // overwrite mode makes the program re-runnable.
    df.select("name", "favorite_color")
      .write.mode("overwrite")
      .save("D:\\tools\\testSparkFile\\namesAndFavColors.parquet")

    // Read the result back to confirm the round trip preserved the schema.
    val df2 = spark.read.parquet("D:\\tools\\testSparkFile\\namesAndFavColors.parquet")
    df2.printSchema()

    // Release the local Spark resources before exiting.
    spark.stop()
  }
}
object TestParquet {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Java Spark SQL basic example")
      .config("spark.some.config.option", "some-value")
      .master("local")
      .getOrCreate()
    import spark.implicits._

    // Write two partitions with different schemas under key=1 / key=2.
    val squaresDF = spark.sparkContext.makeRDD(1 to 5)
      .map(i => (i, i * i)).toDF("value", "square")
    squaresDF.write.mode("append").parquet("D:\\tools\\testSparkFile\\test\\key=1")

    val cubesDF = spark.sparkContext.makeRDD(6 to 10)
      .map(i => (i, i * i * i)).toDF("value", "cube")
    cubesDF.write.mode("append").parquet("D:\\tools\\testSparkFile\\test\\key=2")

    // Read the partitioned table with schema merging: the merged schema
    // contains value, square, cube, and the partition column key.
    val mergedDF = spark.read.option("mergeSchema", "true")
      .parquet("D:\\tools\\testSparkFile\\test\\")
    mergedDF.select("value", "square", "key").show()
    mergedDF.printSchema()

    // Release the local Spark resources before exiting (was missing).
    spark.stop()
  }
}
2. DataFrame operations (filter, sort, rename, join)
object DFExample {

  // One pipe-delimited record: id|name|phone|email|age
  case class Student(id: Int, name: String, phone: String, email: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DFExample").master("local").getOrCreate()
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Parse the pipe-delimited text file into a DataFrame of Students.
    // ("|" must be escaped because String.split takes a regex.)
    val studentDF = spark.sparkContext
      .textFile("D:\\tools\\testSparkFile\\dfTestFile.txt")
      .map(_.split("\\|"))
      .map(line => Student(
        line(0).trim.toInt,
        line(1).trim,
        line(2).trim,
        line(3).trim,
        line(4).trim.toInt))
      .toDF()
    val studentDF2 = studentDF

    // Filter out rows whose name is empty or the literal string 'NULL'.
    // Note: a single OR filter ("name != '' OR name != 'NULL'") is a
    // tautology and filters nothing, so the two conditions are chained.
    studentDF.filter("name != 'NULL'").filter("name != ''").show()

    // Names starting with the letter 'L'.
    studentDF.filter("substr(name, 0, 1) = 'L'").show()

    // To list the built-in SQL functions:
    // spark.sql("show functions").show(2000)

    // Sort by name, ascending then descending.
    studentDF.sort("name").show()
    studentDF.sort(studentDF.col("name").desc).show()

    // Rename a column in the projection.
    studentDF.select(studentDF.col("name").as("student_name")).show()

    // Self-join on id, ordered by id.
    studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id"))
      .sort(studentDF.col("id"))
      .show()

    spark.stop()
  }
}