SparkSQL reading and writing external data sources: CSV files

import org.apache.spark.sql.types._
import org.apache.spark.sql.{SaveMode, SparkSession}

object CSVFileTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("CSVFileTest")
      .master("local")
      .getOrCreate()

    import spark.implicits._

    // BASE_PATH is not defined in the original snippet; point it at the directory that
    // holds the sample data files (people.json, cars.csv, comments.csv, numbers.csv, ...).
    val BASE_PATH = "data"

    val df = spark.read.json(s"${BASE_PATH}/people.json")
    
    // convert the JSON data into CSV files
    df.write.mode(SaveMode.Overwrite).csv(s"${BASE_PATH}/csv")

    val csvDF = spark.read.csv(s"${BASE_PATH}/csv").toDF("age", "name")
    csvDF.show()

    // create a DataFrame from a Dataset[String]
    val csvDS = spark.createDataset(Seq("23,jeffy", "34,katy"))
    val ds = spark.read.csv(csvDS)
    ds.show()

    //1: sep and delimiter do the same thing: both set the CSV delimiter character (default is ",") (read and write parameter)
    spark.read.csv(Seq("23,jeffy", "34,katy").toDS()).show()
    spark.read.option("sep", " ").csv(Seq("23 jeffy", "34 katy").toDS()).show()
    spark.read.option("delimiter", " ").csv(Seq("23 jeffy", "34 katy").toDS()).show()
    ds.write.mode(SaveMode.Overwrite).option("sep", "|").csv(s"${BASE_PATH}/delimiter")

    //2: header (default is false) indicates whether to treat the first line of the CSV file as the schema (read and write parameter)
    spark.read.csv(s"${BASE_PATH}/cars.csv").show()
    /*
    +----+-----+-----+--------------------+-----+
    | _c0|  _c1|  _c2|                 _c3|  _c4|
    +----+-----+-----+--------------------+-----+
    |year| make|model|             comment|blank|
    |2012|Tesla|    S|          No comment| null|
    |1997| Ford| E350|Go get one now th...| null|
    |2015|Chevy| Volt|                null| null|
    +----+-----+-----+--------------------+-----+
     */
    val headerDF = spark.read.option("header", true).csv(s"${BASE_PATH}/cars.csv")
    headerDF.printSchema()
    headerDF.write.mode(SaveMode.Overwrite).option("header", true).csv(s"${BASE_PATH}/headerDF")
    /*
    root
     |-- year: string (nullable = true)
     |-- make: string (nullable = true)
     |-- model: string (nullable = true)
     |-- comment: string (nullable = true)
     |-- blank: string (nullable = true)
     */

    headerDF.show()
    /*
    +----+-----+-----+--------------------+-----+
    |year| make|model|             comment|blank|
    +----+-----+-----+--------------------+-----+
    |2012|Tesla|    S|          No comment| null|
    |1997| Ford| E350|Go get one now th...| null|
    |2015|Chevy| Volt|                null| null|
    +----+-----+-----+--------------------+-----+
     */

    //3: inferSchema indicates whether to infer the schema from the data (read-only parameter)
    val inferSchemaDF =
      spark.read.option("header", true).option("inferSchema", true).csv(s"${BASE_PATH}/cars.csv")
    inferSchemaDF.printSchema()
    /*
    root
     |-- year: integer (nullable = true)
     |-- make: string (nullable = true)
     |-- model: string (nullable = true)
     |-- comment: string (nullable = true)
     |-- blank: string (nullable = true)
     */
    inferSchemaDF.show()
    /*
    +----+-----+-----+--------------------+-----+
    |year| make|model|             comment|blank|
    +----+-----+-----+--------------------+-----+
    |2012|Tesla|    S|          No comment| null|
    |1997| Ford| E350|Go get one now th...| null|
    |2015|Chevy| Volt|                null| null|
    +----+-----+-----+--------------------+-----+
     */

    //4: charset and encoding (default is UTF-8): decode the CSV file with the specified charset (read-only parameter)
    spark.read.option("header", "true").option("encoding", "iso-8859-1").option("sep", "þ").csv(s"${BASE_PATH}/cars_iso-8859-1.csv").show()
    /*
    +----+-----+-----+--------------------+-----+
    |year| make|model|             comment|blank|
    +----+-----+-----+--------------------+-----+
    |2012|Tesla|    S|          No comment| null|
    |1997| Ford| E350|Go get one now th...| null|
    |2015|Chevy| Volt|                null| null|
    +----+-----+-----+--------------------+-----+
     */


    //5: quote (default is `"`) marks field values that should not be split on the delimiter (read and write parameter)
    var optMap = Map("quote" -> "\'", "delimiter" -> ",")
    spark.read.options(optMap).csv(Seq("23,'jeffy tang'", "34,katy").toDS()).show()
    /*
    +---+----------+
    |_c0|       _c1|
    +---+----------+
    | 23|jeffy tang|
    | 34|      katy|
    +---+----------+
     */

    //6: escape (default is `\`): if a quoted field value itself contains the quote character, it is escaped with the escape character (read and write parameter)
    optMap = Map("quote" -> "\'", "delimiter" -> ",", "escape" -> "\"")
    spark.read.options(optMap).csv(Seq("23,'jeffy \"'tang'", "34,katy").toDS()).show()
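    // Note added here (not output copied from the post): with quote=' and escape=",
    // the first row should parse into the two fields 23 and jeffy 'tang.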


    //7: comment (default is the empty string, which disables the feature): the comment marker in the CSV (read and write parameter)
    optMap = Map("comment" -> "~", "header" -> "false")
    spark.read.options(optMap).csv(s"${BASE_PATH}/comments.csv").show()
    /*
    +---+---+---+---+----+-------------------+
    |_c0|_c1|_c2|_c3| _c4|                _c5|
    +---+---+---+---+----+-------------------+
    |  1|  2|  3|  4|5.01|2015-08-20 15:57:00|
    |  6|  7|  8|  9|   0|2015-08-21 16:58:01|
    |  1|  2|  3|  4|   5|2015-08-23 18:00:42|
    +---+---+---+---+----+-------------------+
     */

    //8: (read and write parameters)
    //   ignoreLeadingWhiteSpace (default is false): whether to trim leading whitespace from field values
    //   ignoreTrailingWhiteSpace (default is false): whether to trim trailing whitespace from field values
    optMap = Map("ignoreLeadingWhiteSpace" -> "true", "ignoreTrailingWhiteSpace" -> "true")
    spark.read.options(optMap).csv(Seq("a, b, c").toDS()).show()

    //9: multiLine (default is false): whether to support a single CSV record that is split across multiple lines (read-only parameter)
    val primitiveFieldAndType = Seq(
      """"
        |string","integer
        |
        |
        |","long
        |
        |","bigInteger",double,boolean,null""".stripMargin,
      """"this is a
        |simple
        |string.","
        |
        |10","
        |21474836470","92233720368547758070","
        |
        |1.7976931348623157E308",true,""".stripMargin)

      primitiveFieldAndType.toDF("value").coalesce(1).write.mode(SaveMode.Overwrite).text(s"csv_multiLine_test")

      spark.read.option("header", true).option("multiLine", true).csv("csv_multiLine_test").show()


    //10: mode (default is PERMISSIVE) (read-only parameter)
    //    FAILFAST: throw an exception as soon as a malformed record is encountered
    //    DROPMALFORMED: ignore (drop) malformed records
    //    PERMISSIVE: set fields that cannot be parsed to null
    val schema = new StructType().add("a", IntegerType).add("b", TimestampType)
    val df1 = spark.read.option("mode", "PERMISSIVE").schema(schema).csv(Seq("0,2013-111-11 12:13:14", "1,1983-08-04").toDS())
    df1.show()
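    // For comparison (a sketch added here, not from the original post): the same malformed
    // input under the other two modes. DROPMALFORMED should silently drop the bad record,
    // while FAILFAST should throw once the data is actually read, hence the Try wrapper.
    spark.read.option("mode", "DROPMALFORMED").schema(schema)
      .csv(Seq("0,2013-111-11 12:13:14", "1,1983-08-04").toDS()).show()
    scala.util.Try {
      spark.read.option("mode", "FAILFAST").schema(schema)
        .csv(Seq("0,2013-111-11 12:13:14", "1,1983-08-04").toDS()).show()
    }.failed.foreach(e => println(s"FAILFAST threw: ${e.getMessage}"))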

    //11: nullValue (default is the empty string): the string that should be parsed as null (read and write parameter)
    spark.read.option("nullValue", "-").csv(Seq("0,2013-11-11,-", "1,1983-08-04,3").toDS()).show()

    //12: nanValue (default is "NaN") (read-only parameter)
    //    positiveInf
    //    negativeInf
    val numbers = spark.read.format("csv").schema(StructType(List(
        StructField("int", IntegerType, true),
        StructField("long", LongType, true),
        StructField("float", FloatType, true),
        StructField("double", DoubleType, true)
      ))).options(Map(
        "header" -> "true",
        "mode" -> "DROPMALFORMED",
        "nullValue" -> "--",
        "nanValue" -> "NAN",
        "negativeInf" -> "-INF",
        "positiveInf" -> "INF")).load(s"${BASE_PATH}/numbers.csv")
    numbers.show()
    /*
    +----+--------+---------+---------------+
    | int|    long|    float|         double|
    +----+--------+---------+---------------+
    |   8| 1000000|    1.042|2.38485450374E7|
    |null|34232323|   98.343|184721.23987223|
    |  34|    null|   98.343|184721.23987223|
    |  34|43323123|     null|184721.23987223|
    |  34|43323123|223823.95|           null|
    |  34|43323123| 223823.0|            NaN|
    |  34|43323123| 223823.0|       Infinity|
    |  34|43323123| 223823.0|      -Infinity|
    +----+--------+---------+---------------+
     */

    //13: codec and compression: the compression codec to use when writing; supported codecs are:
    //    none and uncompressed, meaning no compression
    //    bzip2, deflate, gzip, lz4, snappy (write-only parameter)
    inferSchemaDF.write.mode(SaveMode.Overwrite).option("compression", "gzip").csv(s"${BASE_PATH}/compression")
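    // Added note: the gzip-compressed output can be read back without any extra option,
    // since Spark picks the codec from the .gz file extension.
    spark.read.csv(s"${BASE_PATH}/compression").show()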

    //14: dateFormat (read and write parameter)
    val customSchema = new StructType(Array(StructField("date", DateType, true)))
    val date1 =
      spark.read.option("dateFormat", "dd/MM/yyyy HH:mm").schema(customSchema).csv(Seq("26/08/2015 18:00", "27/10/2014 18:30").toDS())
    date1.printSchema()
    /*
     |-- date: date (nullable = true)
     */
    date1.write.mode(SaveMode.Overwrite).option("dateFormat", "yyyy-MM-dd").csv(s"${BASE_PATH}/dateFormat")
    spark.read.csv(s"${BASE_PATH}/dateFormat").show()

    //15: timestampFormat (read and write parameter)
    val timeSchema = new StructType(Array(StructField("date", TimestampType, true)))
    val time =
      spark.read.option("timestampFormat", "dd/MM/yyyy HH:mm").schema(timeSchema).csv(Seq("26/08/2015 18:00", "27/10/2014 18:30").toDS())
    time.printSchema()
    /*
    root
      |-- date: timestamp (nullable = true)
     */
    time.write.mode(SaveMode.Overwrite).option("timestampFormat", "yyyy-MM-dd HH:mm").csv(s"${BASE_PATH}/timestampFormat")
    spark.read.csv(s"${BASE_PATH}/timestampFormat").show()

    //16: maxColumns (default is 20480): the maximum number of columns allowed in a single CSV record (read-only parameter)
    spark.read.option("maxColumns", "3").csv(Seq("test,as,g", "h,bm,s").toDS()).show() // throws an error

    spark.stop()

  } 
}

  

Origin www.cnblogs.com/tesla-turing/p/11489075.html