SparkSQL Basic Demo Code

1. DataFrame

1.1 Wrapping data with a case class

wc.txt

hello spark
tom tom jim
hello tom spark

package com.wedoctor.sparksql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameWC {
  Logger.getLogger("org").setLevel(Level.ERROR) // show only ERROR-level Spark logs
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val sc: SparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val wcData: RDD[String] = sc.textFile("wc.txt")
    // split each line into words and wrap each one in the Words case class
    val words: RDD[Words] = wcData.flatMap(_.split(" ")).map(Words(_))
    val frame: DataFrame = words.toDF()

    // SQL style: register the DataFrame as a temp table and query it with SQL
    frame.registerTempTable("t_word")
    sqlContext.sql("select * from t_word").show()

    // DSL style: select columns through the DataFrame API
    frame.select("word").show(2)

    sc.stop()
  }
}
case class Words(word: String)
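
For reference, on Spark 2.x the same word count goes through SparkSession instead of SQLContext, since registerTempTable was deprecated in favor of createOrReplaceTempView. A minimal sketch of that variant (not part of the original post; assumes Spark 2.x on the classpath):

package com.wedoctor.sparksql

import org.apache.spark.sql.{DataFrame, SparkSession}

object DataFrameWC2 {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    import spark.implicits._

    // read lines, split into words, turn into a single-column DataFrame
    val frame: DataFrame = spark.read.textFile("wc.txt")
      .flatMap(_.split(" "))
      .toDF("word")

    // SQL style with the non-deprecated temp-view API
    frame.createOrReplaceTempView("t_word")
    spark.sql("select word, count(*) cnt from t_word group by word").show()

    spark.stop()
  }
}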

1.2 Wrapping data with Row

person.txt

zs,20,zs.163.com
ls,22,ls.qq.com
ww,22,ww.qq.com
zl,21,xxx.qq.com
zl,26,xxx.qq.com
ww,122,ww.qq.com

package com.wedoctor.sparksql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object DataFrameDemo {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val sc: SparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val personData: RDD[String] = sc.textFile("person.txt")
    // parse each CSV line into a Row(name, age, email)
    val rowRDD: RDD[Row] = personData.map(str => {
      val arr: Array[String] = str.split(",")
      Row(arr(0), arr(1).toInt, arr(2))
    })

    // explicit schema: the third StructField argument is the nullable flag
    val schema: StructType = StructType(
      List(
        StructField("name", StringType, true),
        StructField("age", IntegerType, false),
        StructField("email", StringType, true)
      )
    )

    val pdf: DataFrame = sqlContext.createDataFrame(rowRDD, schema)
    pdf.select("name").show()
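
    // (Hypothetical continuation, not in the original post: with the explicit
    // schema in place, the same DataFrame supports both query styles.)
    // DSL style: filter on a column, e.g. drop the implausible age 122
    pdf.filter(pdf("age") < 100).show()

    // SQL style: register a temp table (Spark 1.x API) and aggregate
    pdf.registerTempTable("t_person")
    sqlContext.sql("select name, max(age) max_age from t_person group by name").show()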

    sc.stop()
  }

}

2. Dataset

person.txt

zs,20,zs.163.com
ls,22,ls.qq.com
ww,22,ww.qq.com
zl,21,xxx.qq.com
zl,26,xxx.qq.com
ww,122,ww.qq.com

package com.wedoctor.sparksql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object DataSetDemo {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
        .master("local[*]")
        .appName(this.getClass.getSimpleName)
        .getOrCreate()
    // import implicit conversions (encoders needed by map and toDF)
    import spark.implicits._
    val ds: Dataset[String] = spark.read.textFile("person.txt")
    // parse each line into a (name, age, email) tuple
    val ds2: Dataset[(String, Int, String)] = ds.map(str => {
      val arr: Array[String] = str.split(",")
      (arr(0), arr(1).toInt, arr(2))
    })

    /**
      * ds2.printSchema() output:
      * root
      * |-- _1: string (nullable = true)
      * |-- _2: integer (nullable = false)
      * |-- _3: string (nullable = true)
      */
    // DSL style: name the tuple fields in their actual order
    val ds3: DataFrame = ds2.toDF("name", "age", "email")
    ds3.select("age").show(3)

    // SQL style: register a temp view and query it
    ds3.createTempView("t_person_test")
    spark.sql("select * from t_person_test").show()
    spark.close()
  }

}
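
The tuple version above only gets real column names once toDF renames _1/_2/_3. Mapping each line into a case class instead yields a typed Dataset with named, compile-time-checked fields. A minimal sketch (the Person case class and this object are assumptions, not in the original post):

package com.wedoctor.sparksql

import org.apache.spark.sql.{Dataset, SparkSession}

// case class must be defined outside main so Spark can derive its encoder
case class Person(name: String, age: Int, email: String)

object TypedDataSetDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    import spark.implicits._

    // each line becomes a Person, so columns are named name/age/email automatically
    val people: Dataset[Person] = spark.read.textFile("person.txt").map { str =>
      val arr = str.split(",")
      Person(arr(0), arr(1).toInt, arr(2))
    }
    people.filter(_.age < 100).show() // typed filter on a case-class field
    spark.close()
  }
}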
Reprinted from blog.csdn.net/zuochang_liu/article/details/97618591