SparkSQL reading and writing external data sources: JSON files

import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.util.DateTimeUtils

object JsonFileTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("JsonFileTest")
      .getOrCreate()

    import spark.implicits._

    // BASE_PATH is the root directory of the sample data (an assumed placeholder; point it at your own data)
    val BASE_PATH = "file:///tmp/spark-json-test"

    // convert the parquet file data into json file data
    val sessionDf = spark.read.parquet(s"${BASE_PATH}/trackerSession")
    sessionDf.show()

    sessionDf.write.json(s"${BASE_PATH}/json")

    // read the json file data
    val jsonDF = spark.read.json(s"${BASE_PATH}/json")
    jsonDF.show()

    // create a DataFrame from a json Dataset[String]
    val jsonDataset = spark.createDataset(
      """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
    val otherJsonDF = spark.read.json(jsonDataset)
    otherJsonDF.show()

    // primitivesAsString (default false): infer every primitive value as string type;
    //   primitive types here are Boolean, Int, Long, Float and Double
    // prefersDecimal (default false): when primitivesAsString is false, infer Float/Double values as DecimalType
    val jsonDataset_1 = spark.createDataset(
      """{"name":"Yin","address":{"is_old":true,"area":23000.34}}""" :: Nil)
    var otherJsonDF_1 = spark.read.json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: double (nullable = true) 
     |    |-- is_old: boolean (nullable = true)
     |-- name: string (nullable = true)
     */

    var optsMap = Map("primitivesAsString" -> "true", "prefersDecimal" -> "true")
    otherJsonDF_1 = spark.read.options(optsMap).json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: string (nullable = true)
     |    |-- is_old: string (nullable = true)
     |-- name: string (nullable = true)
     */

    optsMap = Map("primitivesAsString" -> "false", "prefersDecimal" -> "true")
    otherJsonDF_1 = spark.read.options(optsMap).json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: decimal(7,2) (nullable = true)
     |    |-- is_old: boolean (nullable = true)
     |-- name: string (nullable = true)
     */


    // allowComments (default false): allow Java/C style comments inside the json
    spark.read.option("allowComments", "true").json(Seq("""{"name":/* hello */"Yin","address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowUnquotedFieldNames (default false): allow field names without quotes in the json
    spark.read.option("allowUnquotedFieldNames", "true").json(Seq("""{name:"Yin","address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowSingleQuotes (default true): allow field names or values wrapped in single quotes in the json
    spark.read.option("allowSingleQuotes", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowNumericLeadingZeros (default false): allow numeric values with leading zeros in the json
    spark.read.option("allowNumericLeadingZeros", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":0023000.34}}""").toDS()).show()

    // allowNonNumericNumbers (default false): allow non-numeric "numbers" such as NaN (Not a Number) in the json
    spark.read.option("allowNonNumericNumbers", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":NaN}}""").toDS()).show()

    // allowBackslashEscapingAnyCharacter (default false): allow the backslash quoting mechanism to escape any character in the json
    spark.read.option("allowBackslashEscapingAnyCharacter", "true").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":"\$23000"}}""").toDS()).show()

    // mode (default PERMISSIVE): how to handle records that fail to parse as json:
    // PERMISSIVE is the most tolerant: a malformed record gets an extra field (named by
    // columnNameOfCorruptRecord) that holds the malformed json string, and all other fields are null
    spark.read.option("mode", "PERMISSIVE").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
      """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    /*
    +--------------------+-----------+----+
    |     _corrupt_record|    address|name|
    +--------------------+-----------+----+
    |                null|[3000,true]| Yin|
    |{'name':'Yin',"ad...|       null|null|
    +--------------------+-----------+----+
     */

    // columnNameOfCorruptRecord changes the name of the corrupt-record column
    spark.read.option("mode", "PERMISSIVE").option("columnNameOfCorruptRecord", "customer_column").json(
      Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
        """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    /*
    +-----------+--------------------+----+
    |    address|     customer_column|name|
    +-----------+--------------------+----+
    |[3000,true]|                null| Yin|
    |       null|{'name':'Yin',"ad...|null|
    +-----------+--------------------+----+
     */

    // DROPMALFORMED drops the records that fail to parse
    spark.read.option("mode", "DROPMALFORMED").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
      """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    /*
    +-----------+----+
    |    address|name|
    +-----------+----+
    |[3000,true]| Yin|
    +-----------+----+
     */

    // FAILFAST throws an error as soon as a malformed record is encountered
    spark.read.option("mode", "FAILFAST").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""",
      """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show()
    // dateFormat (default yyyy-MM-dd): the string format of dates in the json (for fields of DateType)
    val customSchema = new StructType(Array(StructField("name", StringType, true),
      StructField("date", DateType, true)))
    val dataFormatDF =
      spark.read.schema(customSchema).option("dateFormat", "dd/MM/yyyy HH:mm").json(Seq(
        """{'name':'Yin',"date":"26/08/2015 18:00"}""").toDS())
    dataFormatDF.write.mode(SaveMode.Overwrite).option("dateFormat", "yyyy/MM/dd").json("testjson")
    spark.read.json("testjson").show()

    // timestampFormat (default yyyy-MM-dd'T'HH:mm:ss.SSSZZ): the string format of timestamps in the json (for fields of TimestampType)
    val customSchema_1 = new StructType(Array(StructField("name", StringType, true),
      StructField("date", TimestampType, true)))
    val timestampFormatDf =
      spark.read.schema(customSchema_1).option("timestampFormat", "dd/MM/yyyy HH:mm").json(Seq(
        """{'name':'Yin',"date":"26/08/2015 18:00"}""").toDS())

    val optMap = Map("timestampFormat" -> "yyyy/MM/dd HH:mm", DateTimeUtils.TIMEZONE_OPTION -> "GMT")
    timestampFormatDf.write.mode(SaveMode.Overwrite).format("json").options(optMap).save("test.json")
    spark.read.json("test.json").show()

    // compression: the compression codec used when writing; supported values are:
    // none and uncompressed (no compression)
    // bzip2, deflate, gzip, lz4, snappy
    timestampFormatDf.write.mode(SaveMode.Overwrite).option("compression", "gzip").json("test.json")
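
    // A sketch, not in the original post: any codec from the list above can be passed the same way,
    // and spark.read.json decompresses transparently based on the file extension.
    timestampFormatDf.write.mode(SaveMode.Overwrite).option("compression", "bzip2").json("test_bzip2.json")
    spark.read.json("test_bzip2.json").show()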

    // multiLine: whether a single json record is allowed to span multiple lines
    val primitiveFieldAndType: Dataset[String] = spark.createDataset(spark.sparkContext.parallelize(
      """{"string":"this is a simple string.",
          "integer":10,
          "long":21474836470,
          "bigInteger":92233720368547758070,
          "double":1.7976931348623157E308,
          "boolean":true,
          "null":null
      }""" ::
        """{"string":"this is a simple string.",
          "integer":10,
          "long":21474836470,
          "bigInteger":92233720368547758070,
          "double":1.7976931348623157E308,
          "boolean":true,
          "null":null
      }""" :: Nil))(Encoders.STRING)
    primitiveFieldAndType.toDF("value").write.mode(SaveMode.Overwrite).option("compression", "GzIp").text(s"${BASE_PATH}/primitiveFieldAndType")

    val multiLineDF = spark.read.option("multiLine", false).json(s"${BASE_PATH}/primitiveFieldAndType")
    multiLineDF.show()
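
    // A sketch, not in the original post: with multiLine=false (the default) every physical line must be
    // a complete json record, so the multi-line records above end up in _corrupt_record. Setting
    // multiLine to true lets a record span lines; each file is then parsed as a whole json document.
    val multiLineTrueDF = spark.read.option("multiLine", true).json(s"${BASE_PATH}/primitiveFieldAndType")
    multiLineTrueDF.show()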

    spark.stop()
  }
}

  

Origin www.cnblogs.com/tesla-turing/p/11489069.html