import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.util.DateTimeUtils

object JsonFileTest {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("JsonFileTest")
      .getOrCreate()

    import spark.implicits._

    // Assumed placeholder: the original references BASE_PATH without defining it
    // here; point it at your own data directory.
    val BASE_PATH = "file:///tmp/spark-json-test"

    // Convert parquet file data into json file data
    val sessionDf = spark.read.parquet(s"${BASE_PATH}/trackerSession")
    sessionDf.show()
    sessionDf.write.json(s"${BASE_PATH}/json")

    // Read the json file data back
    val jsonDF = spark.read.json(s"${BASE_PATH}/json")
    jsonDF.show()

    // Create a DataFrame from a JSON Dataset (of type String)
    val jsonDataset = spark.createDataset(
      """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
    val otherJsonDF = spark.read.json(jsonDataset)
    otherJsonDF.show()

    // primitivesAsString (default false): parse all primitive values as strings.
    // Primitive types here include: boolean, int, long, float, double.
    // prefersDecimal (default false): when primitivesAsString is false, parse
    // float/double values as DecimalType instead.
    val jsonDataset_1 = spark.createDataset(
      """{"name":"Yin","address":{"is_old":true,"area":23000.34}}""" :: Nil)
    var otherJsonDF_1 = spark.read.json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: double (nullable = true)
     |    |-- is_old: boolean (nullable = true)
     |-- name: string (nullable = true)
     */

    var optsMap = Map("primitivesAsString" -> "true", "prefersDecimal" -> "true")
    otherJsonDF_1 = spark.read.options(optsMap).json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: string (nullable = true)
     |    |-- is_old: string (nullable = true)
     |-- name: string (nullable = true)
     */

    optsMap = Map("primitivesAsString" -> "false", "prefersDecimal" -> "true")
    otherJsonDF_1 = spark.read.options(optsMap).json(jsonDataset_1)
    otherJsonDF_1.printSchema()
    /*
    root
     |-- address: struct (nullable = true)
     |    |-- area: decimal(7,2) (nullable = true)
     |    |-- is_old: boolean (nullable = true)
     |-- name: string (nullable = true)
     */

    // allowComments (default false): support java/c style comments inside json
    spark.read.option("allowComments", "true").json(Seq(
      """{"name":/* hello */"Yin","address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowUnquotedFieldNames (default false): support unquoted field names in json
    spark.read.option("allowUnquotedFieldNames", "true").json(Seq(
      """{name:"Yin","address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowSingleQuotes (default true): support single-quoted field names or values in json
    spark.read.option("allowSingleQuotes", "true").json(Seq(
      """{'name':'Yin',"address":{"is_old":true,"area":23000.34}}""").toDS()).show()

    // allowNumericLeadingZeros (default false): support numeric values with leading zeros in json
    spark.read.option("allowNumericLeadingZeros", "true").json(Seq(
      """{'name':'Yin',"address":{"is_old":true,"area":0023000.34}}""").toDS()).show()
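    // Aside (not in the original listing): with allowNumericLeadingZeros left at
    // its default of false, the same record is treated as malformed, so under the
    // default PERMISSIVE mode it surfaces as a _corrupt_record row instead.
    spark.read.json(Seq(
      """{'name':'Yin',"address":{"is_old":true,"area":0023000.34}}""").toDS()).show()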
json (Seq ( "" "{ 'name' : 'Yin', "address": { "is_old": to true, "Area": NaN3}..} "" ") ToDS ()) Show () // allowBackslashEscapingAnyCharacter (the default is false), indicating whether the support json It contains a backslash, backslash, and will ignore spark.read.option ( "allowBackslashEscapingAnyCharacter "," true ") json. (Seq (" "" { 'name': 'Yin', "address": { "is_old": true, "area": "\ $ 23000"}} "" "). . ToDS ()) Show () // the MODE (default is PERMISSIVE), table format is encountered parsing error handling behavior json is: // PERMISSIVE represents more tolerant. If a malformed strip, then add a field, the value field named columnNameOfCorruptRecord, the value field is a string json malformed, the others are null spark.read.option ( "MODE", "PERMISSIVE"). Json (Seq ( "" "{ 'name': 'Yin'," address ": {" is_old ": to true," Area ": 3000}}" "", "" "{ 'name': 'Yin'," address ": {" is_old ": to true," Area ": \ 3000}}.". "") ToDS ()) Show () / * + ---------------- + ----------- + ---- + ---- | _corrupt_record | address | name | + -------------------- + ---- + ----------- + | null|[3000,true]| Yin| |{'name':'Yin',"ad...| null|null| Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""", """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show() /* +-----------+--------------------+----+ | address| customer_column|name| +-----------+--------------------+----+ |[3000,true]| null| Yin| | null|{'name':'Yin',"ad...| null | : 3000}} "" "spark.read.option ( "mode", "DROPMALFORMED // DROPMALFORMED lose representation error piece of recording format * / + ----------- + -------------------- + ---- + """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).show() /* +-----------+----+ | address|name| +-----------+----+ |[3000,true]| Yin| +-----------+----+ */ //FAILFAST 碰到解析错误的记录直接报错 spark.read.option("mode", "FAILFAST").json(Seq("""{'name':'Yin',"address":{"is_old":true,"area":3000}}""", """{'name':'Yin',"address":{"is_old":true,"area":\3000}}""").toDS()).Show () Val dataFormatDF = StructField ( "DATE", DateType, to true))) Val customSchema new new StructType = (the Array (StructField ( "name", the StringType, to true), // the dateFormat (default is yyyy-MM-dd) represented in string format json time (corresponding to the DataType) spark.read.schema(customSchema).option("dateFormat", "dd/MM/yyyy HH:mm").json(Seq( """{'name':'Yin',"date":"26/08/2015 18:00"}""").toDS()) dataFormatDF.write.mode(SaveMode.Overwrite).option("dateFormat", "yyyy/MM/dd").json("testjson") spark.read.json("testjson").show() //timestampFormat(默认值为yyyy-MM-dd'T'HH:mm:ss.SSSZZ) 表示json中时间的字符串格式(对应着TimestampType) val customSchema_1 = new StructType(Array(StructField("name", StringType, true), StructField("date", TimestampType, true))) val timestampFormatDf = spark.read.schema(customSchema_1).option("timestampFormat", "dd/MM/yyyy HH:mm").json(Seq( """{'name':'Yin',"date":"26/08/2015 18:00"}""").toDS()) the Map optMap = Val ( "TimestampFormat" -> "YYYY / the MM / dd HH: mm", DateTimeUtils.TIMEZONE_OPTION -> "GMT") . timestampFormatDf.write.mode (SaveMode.Overwrite) .format ( "JSON") Options ( optMap) .save ( "test.json") spark.read.json ( "test.json") Show (). // compression compression format, compression formats are supported: // none and uncompressed no compression // bzip2 , the deflate, gzip, LZ4, Snappy timestampFormatDf.write.mode (SaveMode.Overwrite) .option ( "compression", "gzip"). 
json ( "test.json") // MULTILINE indicates whether the record supports split into a json a plurality of rows Val primitiveFieldAndType: a Dataset [String] = spark.createDataset (spark.sparkContext.parallelize ( "" "{" String ":" A Simple String the this IS.", "integer":10, "long":21474836470, " BigInteger ":92233720368547758070, "double":1.7976931348623157E308, "boolean":true, "null":null }""" :: """{"string":"this is a simple string.", | "integer":10, | "long":21474836470, | "bigInteger":92233720368547758070, | "double":1.7976931348623157E308, | "boolean":true, | "null":null | }""" :: Nil))(Encoders.STRING) primitiveFieldAndType.toDF("value").write.mode(SaveMode.Overwrite).option("compression", "GzIp").text(s"${BASE_PATH}/primitiveFieldAndType") val multiLineDF = spark.read.option("multiLine", false).json(s"${BASE_PATH}/primitiveFieldAndType") multiLineDF.show() spark.stop() } }