【Flink入门】Flink离线批处理WordCount

首先引入pom.xml文件:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.fuyun</groupId>
    <artifactId>flinkLearning</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <!-- Fixed: the "aliyun" id previously pointed at the Apache snapshots URL. -->
        <repository>
            <id>aliyun</id>
            <url>https://maven.aliyun.com/repository/public</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.12.0</flink.version>
        <!-- All Flink artifacts must share one Scala binary version; 2.11 is used throughout. -->
        <scala.binary.version>2.11</scala.binary.version>
    </properties>

    <dependencies>
        <!-- Deduplicated: flink-java was declared twice in the original. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <!-- "provided" means the dependency is used only at compile time,
                 not at run time or when packaging (the Flink cluster supplies it). -->
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <!-- Fixed: was flink-clients_2.12, which mixed Scala binary versions
             with the _2.11 artifacts above and breaks at runtime. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>

</project>

Flink离线批处理WordCount代码如下:

package com.fuyun.flink

// 因为很多算子Java和Scala名称都一样,必须要显示导入scala所有包告知使用Scala语言的算子,不然会报错
import org.apache.flink.api.scala._

/**
 * Offline (batch) WordCount using the Flink DataSet API.
 *
 * Reads a text file, lower-cases and tokenizes each line, and writes the
 * per-word counts to a CSV file (one "word count" pair per line).
 *
 * Usage: BatchWordCount [inputPath] [outputPath]
 * If no arguments are given, the original hard-coded local paths are used,
 * so existing invocations keep working unchanged.
 */
object BatchWordCount {

  def main(args: Array[String]): Unit = {

    // Input/output paths: taken from CLI args when provided, otherwise the defaults.
    val input = if (args.length > 0) args(0) else "E:\\IDEAworkspace\\flinkLearning\\datas\\WordCount"
    val output = if (args.length > 1) args(1) else "E:\\IDEAworkspace\\flinkLearning\\datas\\WordCountResult"

    // Create a batch execution environment.
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment

    // Read the input file as a DataSet of lines.
    val inputDataSet: DataSet[String] = env.readTextFile(input)

    // Count words, case-insensitively:
    val counts = inputDataSet
      .flatMap(_.toLowerCase.split("\\s")) // lower-case each line and split on whitespace
      .filter(_.nonEmpty)                  // drop empty tokens
      .map((_, 1))                         // pair each word with an initial count of 1
      .groupBy(0)                          // group by the word (tuple field 0)
      .sum(1)                              // sum the counts (tuple field 1)

    // Write the result as CSV: args are (path, row delimiter, field delimiter);
    // a fourth writeMode parameter is available for overwrite behavior.
    // setParallelism(1) forces a single output file instead of one per subtask.
    counts.writeAsCsv(output, "\n", " ").setParallelism(1)

    // execute() is required to actually trigger the job; the argument is the job name.
    // Fixed typo: was "bath word count".
    env.execute("batch word count")

  }
}

猜你喜欢

转载自blog.csdn.net/lz6363/article/details/112573754