import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types._

object CSVFileTest {
  // BASE_PATH is referenced throughout but was not defined in the original
  // listing; a placeholder pointing at the directory that holds the sample
  // data (people.json, cars.csv, comments.csv, numbers.csv, ...):
  val BASE_PATH = "/tmp/spark-csv-test"

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("CSVFileTest")
      .master("local")
      .getOrCreate()

    import spark.implicits._

    val df = spark.read.json(s"${BASE_PATH}/people.json")
    // Convert the JSON data into CSV files
    df.write.mode(SaveMode.Overwrite).csv(s"${BASE_PATH}/csv")
    val csvDF = spark.read.csv(s"${BASE_PATH}/csv").toDF("age", "name")
    csvDF.show()

    // Create a DataFrame from a Dataset[String] holding raw CSV lines
    val csvDS = spark.createDataset(Seq("23,jeffy", "34,katy"))
    val ds = spark.read.csv(csvDS)
    ds.show()

    // 1: sep and delimiter are equivalent; both set the character on which CSV
    //    fields are split (default ",") (read and write parameter)
    spark.read.csv(Seq("23,jeffy", "34,katy").toDS()).show()
    spark.read.option("sep", " ").csv(Seq("23 jeffy", "34 katy").toDS()).show()
    spark.read.option("delimiter", " ").csv(Seq("23 jeffy", "34 katy").toDS()).show()
    ds.write.mode(SaveMode.Overwrite).option("sep", "|").csv(s"${BASE_PATH}/delimiter")

    // 2: header (default false) indicates whether the first line of the CSV
    //    file is treated as the schema (read and write parameter)
    spark.read.csv(s"${BASE_PATH}/cars.csv").show()
    /*
    +----+-----+-----+--------------------+-----+
    | _c0|  _c1|  _c2|                 _c3|  _c4|
    +----+-----+-----+--------------------+-----+
    |year| make|model|             comment|blank|
    |2012|Tesla|    S|          No comment| null|
    |1997| Ford| E350|Go get one now th...| null|
    |2015|Chevy| Volt|                null| null|
    +----+-----+-----+--------------------+-----+
     */
    val headerDF = spark.read.option("header", true).csv(s"${BASE_PATH}/cars.csv")
    headerDF.printSchema()
    headerDF.write.mode(SaveMode.Overwrite).option("header", true).csv(s"${BASE_PATH}/headerDF")
    /*
    root
     |-- year: string (nullable = true)
     |-- make: string (nullable = true)
     |-- model: string (nullable = true)
     |-- comment: string (nullable = true)
     |-- blank: string (nullable = true)
     */
    headerDF.show()
    /*
    +----+-----+-----+--------------------+-----+
    |year| make|model|             comment|blank|
    +----+-----+-----+--------------------+-----+
    |2012|Tesla|    S|          No comment| null|
    |1997| Ford| E350|Go get one now th...| null|
    |2015|Chevy| Volt|                null| null|
    +----+-----+-----+--------------------+-----+
     */

    // 3: inferSchema (default false) asks Spark to infer column types from the
    //    data itself (read-only parameter)
    val inferSchemaDF = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv(s"${BASE_PATH}/cars.csv")
    inferSchemaDF.printSchema()
    /*
    root
     |-- year: integer (nullable = true)
     |-- make: string (nullable = true)
     |-- model: string (nullable = true)
     |-- comment: string (nullable = true)
     |-- blank: string (nullable = true)
     */
    inferSchemaDF.show()
    /* same rows as the table above, with year now inferred as integer */

    // 4: charset and encoding (default UTF-8) decode the CSV file with the
    //    given character set (read-only parameter)
    spark.read
      .option("header", "true")
      .option("encoding", "iso-8859-1")
      .option("sep", "þ")
      .csv(s"${BASE_PATH}/cars_iso-8859-1.csv")
      .show()
    /* same rows as the header table above */
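    // A possible alternative to inferSchema from point 3: supply the schema
    // explicitly and skip the extra inference pass over the data. A minimal
    // sketch; the column names and types simply mirror cars.csv above.
    val carsSchema = new StructType()
      .add("year", IntegerType)
      .add("make", StringType)
      .add("model", StringType)
      .add("comment", StringType)
      .add("blank", StringType)
    spark.read.option("header", true).schema(carsSchema)
      .csv(s"${BASE_PATH}/cars.csv").printSchema()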
    // 5: quote (default '"') sets the character that encloses field values so
    //    that delimiters inside them are not treated as separators (read and
    //    write parameter)
    var optMap = Map("quote" -> "\'", "delimiter" -> " ")
    spark.read.options(optMap).csv(Seq("23 'jeffy tang'", "34 katy").toDS()).show()
    /*
    +---+----------+
    |_c0|       _c1|
    +---+----------+
    | 23|jeffy tang|
    | 34|      katy|
    +---+----------+
     */

    // 6: escape (default '\') sets the character used to escape an occurrence
    //    of the quote character inside a quoted field value (read and write
    //    parameter)
    optMap = Map("quote" -> "\'", "delimiter" -> " ", "escape" -> "\"")
    spark.read.options(optMap).csv(Seq("23 'jeffy \"'tang'", "34 katy").toDS()).show()

    // 7: comment (default empty string, i.e. disabled) sets the character that
    //    marks comment lines in the CSV (read and write parameter)
    optMap = Map("comment" -> "~", "header" -> "false")
    spark.read.options(optMap).csv(s"${BASE_PATH}/comments.csv").show()
    /*
    +---+---+---+---+----+-------------------+
    |_c0|_c1|_c2|_c3| _c4|                _c5|
    +---+---+---+---+----+-------------------+
    |  1|  2|  3|  4|5.01|2015-08-20 15:57:00|
    |  6|  7|  8|  9|   0|2015-08-21 16:58:01|
    |  1|  2|  3|  4|   5|2015-08-23 18:00:42|
    +---+---+---+---+----+-------------------+
     */

    // 8: ignoreLeadingWhiteSpace (default false) ignores spaces before a field
    //    value; ignoreTrailingWhiteSpace (default false) ignores spaces after
    //    it (read and write parameter)
    optMap = Map("ignoreLeadingWhiteSpace" -> "true", "ignoreTrailingWhiteSpace" -> "true")
    spark.read.options(optMap).csv(Seq(" a,b  , c ").toDS()).show()

    // 9: multiLine (default false) enables parsing a single record that spans
    //    multiple lines of the CSV file (read-only parameter)
    val primitiveFieldAndType = Seq(
      """"
        |string","integer
        |
        |
        |","long
        |
        |","bigInteger",double,boolean,null""".stripMargin,
      """"this is a
        |simple
        |string.","
        |
        |10","
        |21474836470","92233720368547758070","
        |
        |1.7976931348623157E308",true,""".stripMargin)
    primitiveFieldAndType.toDF("value").coalesce(1)
      .write.mode(SaveMode.Overwrite).text(s"csv_multiLine_test")
    spark.read.option("header", true).option("multiLine", true).csv("csv_multiLine_test").show()

    // 10: mode (default PERMISSIVE) controls how malformed records are handled
    //     (read-only parameter)
    //     FAILFAST      throw an exception as soon as a malformed record is met
    //     DROPMALFORMED drop malformed records
    //     PERMISSIVE    set fields that fail to parse to null
    val schema = new StructType().add("a", IntegerType).add("b", TimestampType)
    val df1 = spark.read.option("mode", "PERMISSIVE").schema(schema)
      .csv(Seq("0,2013-111-11 12:13:14", "1,1983-08-04").toDS())
    df1.show()

    // 11: nullValue (default empty string) sets the string that should be
    //     parsed as null (read and write parameter)
    spark.read.option("nullValue", "-")
      .csv(Seq("0,2013-11-11,-", "1,1983-08-04,3").toDS()).show()
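    // A minimal sketch of the other two modes from point 10 on the same
    // malformed input: DROPMALFORMED silently drops the unparsable first
    // record, while FAILFAST (commented out below) would throw instead of
    // returning rows.
    spark.read.option("mode", "DROPMALFORMED").schema(schema)
      .csv(Seq("0,2013-111-11 12:13:14", "1,1983-08-04").toDS()).show()
    // spark.read.option("mode", "FAILFAST").schema(schema)
    //   .csv(Seq("0,2013-111-11 12:13:14", "1,1983-08-04").toDS()).show()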
    // 12: nanValue (default "NaN"), plus positiveInf and negativeInf, set the
    //     strings parsed as NaN / positive infinity / negative infinity
    //     (read-only parameters)
    val numbers = spark.read.format("csv").schema(StructType(List(
        StructField("int", IntegerType, true),
        StructField("long", LongType, true),
        StructField("float", FloatType, true),
        StructField("double", DoubleType, true)
      ))).options(Map(
        "header" -> "true",
        "mode" -> "DROPMALFORMED",
        "nullValue" -> "--",
        "nanValue" -> "NAN",
        "negativeInf" -> "-INF",
        "positiveInf" -> "INF")).load(s"${BASE_PATH}/numbers.csv")
    numbers.show()
    /*
    +----+--------+---------+---------------+
    | int|    long|    float|         double|
    +----+--------+---------+---------------+
    |   8| 1000000|    1.042|2.38485450374E7|
    |null|34232323|   98.343|184721.23987223|
    |  34|    null|   98.343|184721.23987223|
    |  34|43323123|     null|184721.23987223|
    |  34|43323123|223823.95|           null|
    |  34|43323123| 223823.0|            NaN|
    |  34|43323123| 223823.0|       Infinity|
    |  34|43323123| 223823.0|      -Infinity|
    +----+--------+---------+---------------+
     */

    // 13: codec and compression set the compression format. Supported formats:
    //     none / uncompressed (no compression), bzip2, deflate, gzip, lz4,
    //     snappy (write-only parameter)
    inferSchemaDF.write.mode(SaveMode.Overwrite)
      .option("compression", "gzip").csv(s"${BASE_PATH}/compression")

    // 14: dateFormat (read and write parameter)
    val customSchema = new StructType(Array(StructField("date", DateType, true)))
    val date1 = spark.read.option("dateFormat", "dd/MM/yyyy HH:mm").schema(customSchema)
      .csv(Seq("26/08/2015 18:00", "27/10/2014 18:30").toDS())
    date1.printSchema()
    /*
    root
     |-- date: date (nullable = true)
     */
    date1.write.mode(SaveMode.Overwrite).option("dateFormat", "yyyy-MM-dd").csv(s"${BASE_PATH}/dateFormat")
    spark.read.csv(s"${BASE_PATH}/dateFormat").show()

    // 15: timestampFormat (read and write parameter)
    val timeSchema = new StructType(Array(StructField("date", TimestampType, true)))
    val time = spark.read.option("timestampFormat", "dd/MM/yyyy HH:mm").schema(timeSchema)
      .csv(Seq("26/08/2015 18:00", "27/10/2014 18:30").toDS())
    time.printSchema()
    /*
    root
     |-- date: timestamp (nullable = true)
     */
    time.write.mode(SaveMode.Overwrite)
      .option("timestampFormat", "yyyy-MM-dd HH:mm").csv(s"${BASE_PATH}/timestampFormat")
    spark.read.csv(s"${BASE_PATH}/timestampFormat").show()

    // 16: maxColumns (default 20480) caps the number of columns a single CSV
    //     record may contain (read-only parameter)
    spark.read.option("maxColumns", "3")
      .csv(Seq("test,as,g", "h,bm,s").toDS()).show() // throws an error

    spark.stop()
  }
}
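// The sample inputs are assumed to match the files shipped with the Spark
// distribution, e.g. examples/src/main/resources/people.json:
//   {"name":"Michael"}
//   {"name":"Andy", "age":30}
//   {"name":"Justin", "age":19}
// cars.csv, comments.csv and numbers.csv are assumed to follow the spark-csv
// test resources, with the layouts shown in the commented output above.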