首先引入pom.xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.fuyun</groupId>
<artifactId>flinkLearning</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>aliyun</id>
<url>https://maven.aliyun.com/repository/public</url>
</repository>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<repository>
<id>jboss</id>
<url>https://repository.jboss.com/nexus/content/groups/public</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<flink.version>1.12.0</flink.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<!-- "provided" here means the dependency is only used at compile time, not at runtime or when packaging -->
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>${flink.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
</project>
flink离线批处理WordCount代码如下:
package com.fuyun.flink
// 因为很多算子Java和Scala名称都一样,必须要显示导入scala所有包告知使用Scala语言的算子,不然会报错
import org.apache.flink.api.scala._
object BatchWordCount {

  /**
   * Offline (batch) word count using the Flink DataSet API.
   *
   * Reads a text file, counts word occurrences case-insensitively,
   * and writes the (word, count) pairs to a CSV file.
   *
   * @param args unused; input/output paths are hard-coded below
   */
  def main(args: Array[String]): Unit = {
    // Input and output file paths (hard-coded; could be taken from args).
    val input = "E:\\IDEAworkspace\\flinkLearning\\datas\\WordCount"
    val output = "E:\\IDEAworkspace\\flinkLearning\\datas\\WordCountResult"

    // Create a batch execution environment.
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment

    // Read the input file line by line as a DataSet[String].
    val inputDataSet: DataSet[String] = env.readTextFile(input)

    // Count words, case-insensitively.
    val counts = inputDataSet
      .flatMap(_.toLowerCase.split("\\s+")) // lower-case, then split on runs of whitespace ("\\s+" avoids empty tokens between consecutive whitespace chars)
      .filter(_.nonEmpty)                   // drop empty strings (e.g. from leading whitespace)
      .map((_, 1))                          // pair each word with a count of 1
      .groupBy(0)                           // group by the word (tuple field 0)
      .sum(1)                               // sum the counts (tuple field 1)

    /**
     * writeAsCsv(filePath, rowDelimiter, fieldDelimiter, writeMode) writes a
     * tuple DataSet as CSV; it requires a tuple element type.
     * setParallelism(1) forces a single output file instead of a directory of
     * per-subtask files.
     */
    counts.writeAsCsv(output, "\n", " ").setParallelism(1)

    // execute() must be called to actually trigger the job; the argument is
    // the job name. (Fixed typo: "bath" -> "batch".)
    env.execute("batch word count")
  }
}