1: File-based data sources
- readTextFile(path) / TextInputFormat: reads a file line by line and returns each line as a String.
- readTextFileWithValue(path) / TextValueInputFormat: reads a file line by line and returns each line as a StringValue. StringValue is Flink's mutable, serializable wrapper around String, which can improve performance to some extent.
- readCsvFile(path) / CsvInputFormat: parses a file whose fields are separated by commas (or another delimiter) and returns tuples or POJOs (see the sketch after this list).
- readFileOfPrimitives(path, Class) / PrimitiveInputFormat
- readFileOfPrimitives(path, delimiter, Class) / PrimitiveInputFormat: similar to readCsvFile, except that it returns primitive types rather than Tuples.
- readSequenceFile(Key, Value, path) / SequenceFileInputFormat: reads a SequenceFile and returns records as Tuple2<Key, Value>.
- Small example: see the word-count programs under "Small examples" below.
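As referenced in the readCsvFile item above, here is a minimal sketch showing both the tuple and the case-class return types. The file path D:\\people.csv and the Person case class are illustrative placeholders, not part of the original post:

// readCsvFile: the same two-column "name,age" file read as tuples and as a case class
import org.apache.flink.api.scala._

object CsvSourceExample {

  // Hypothetical case class matching a "name,age" CSV layout
  case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // Returned as Tuple2: fields are mapped by position
    val asTuples: DataSet[(String, Int)] = env.readCsvFile[(String, Int)]("D:\\people.csv")

    // Returned as a case class: fields are also mapped by position
    val asPersons: DataSet[Person] = env.readCsvFile[Person]("D:\\people.csv")

    asTuples.print()
    asPersons.print()
  }
}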
2: Collection-based data sources
- fromCollection(Collection): creates a DataSet from a collection; all elements must be of the same type.
- fromCollection(Iterator, Class): creates a DataSet from an iterator; the Class argument specifies the element type.
- fromElements(T ...): creates a DataSet from the given sequence of objects.
- fromParallelCollection(SplittableIterator, Class): creates a DataSet from an iterator, in parallel.
- generateSequence(from, to): generates the sequence of numbers in the given interval, in parallel (see the sketch after this list).
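A minimal sketch of two of the collection-style sources, fromElements and generateSequence; the object and variable names are my own, not from the original post:

// Collection-style sources: fromElements and generateSequence
import org.apache.flink.api.scala._

object CollectionSourcesExample {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // fromElements: build a DataSet directly from a handful of literal values
    val words: DataSet[String] = env.fromElements("hello world", "spark flink")

    // generateSequence: parallel source producing the numbers 1 to 10 as Longs
    val numbers: DataSet[Long] = env.generateSequence(1, 10)

    words.flatMap(_.split(" ")).map(w => (w, 1)).groupBy(0).sum(1).print()
    numbers.map(_ * 2).print()
  }
}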
3: Generic data sources
- readFile(inputFormat, path) / FileInputFormat: takes an explicit file input format plus the path to read.
- createInput(inputFormat) / InputFormat: takes any input format (see the sketch after this list).
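A minimal sketch of the two generic entry points, using TextInputFormat as the input format; the path D:\\count.txt is reused from the file example below purely as a placeholder:

// Generic sources: readFile with an explicit FileInputFormat, and createInput with any InputFormat
import org.apache.flink.api.java.io.TextInputFormat
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.Path

object GenericSourcesExample {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val inputPath = "D:\\count.txt"

    // readFile: takes the FileInputFormat plus the path to read
    val viaReadFile: DataSet[String] =
      env.readFile(new TextInputFormat(new Path(inputPath)), inputPath)

    // createInput: takes any InputFormat; here the TextInputFormat already carries the path
    val viaCreateInput: DataSet[String] =
      env.createInput(new TextInputFormat(new Path(inputPath)))

    viaReadFile.first(3).print()
    viaCreateInput.first(3).print()
  }
}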
Small examples:
// File data source
import org.apache.flink.api.scala.{AggregateDataSet, DataSet, ExecutionEnvironment}
import org.apache.flink.configuration.Configuration

object BatchOperate {
  def main(args: Array[String]): Unit = {
    val inputPath = "D:\\count.txt"
    val outPut = "D:\\data\\result2"

    // Recursively enumerate files in nested sub-directories
    val configuration: Configuration = new Configuration()
    configuration.setBoolean("recursive.file.enumeration", true)

    // Get the batch entry point: ExecutionEnvironment
    val env = ExecutionEnvironment.getExecutionEnvironment
    val text = env.readTextFile(inputPath).withParameters(configuration)

    // Bring the implicit conversions / TypeInformation into scope
    import org.apache.flink.api.scala._
    val value: AggregateDataSet[(String, Int)] =
      text.flatMap(x => x.split(" ")).map(x => (x, 1)).groupBy(0).sum(1)

    // Write the word counts to the output path declared above
    value.writeAsText(outPut).setParallelism(1)
    env.execute("batch word count")
  }
}
// Collection data source
import org.apache.flink.api.scala.{AggregateDataSet, DataSet, ExecutionEnvironment}

object DataSetSource {
  def main(args: Array[String]): Unit = {
    // Get the batch entry point: ExecutionEnvironment
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._

    // Create a DataSet from a local collection
    val myArray = Array("hello world", "spark flink")
    val collectionSet: DataSet[String] = environment.fromCollection(myArray)

    val result: AggregateDataSet[(String, Int)] =
      collectionSet.flatMap(x => x.split(" ")).map(x => (x, 1)).groupBy(0).sum(1)

    result.setParallelism(1).print()
    // result.writeAsText("c:\\HELLO.TXT")
    // print() already triggers execution, so no separate environment.execute() call is needed
  }
}
Reprinted from blog.csdn.net/qq_26719997/article/details/105098162