Flink Java Programming: Word Count and Reading a Local File

I. The first program: WordCount (real-time stream processing)

1. pom.xml file

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <flink.version>1.5.1</flink.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>

<build>
    <!-- Do not package core.properties -->
    <!-- <resources> <resource> <directory>src/main/resources</directory> <excludes>
         <exclude>core.properties</exclude> </excludes> </resource> </resources> -->
    <plugins>
        <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>2.4</version>
            <configuration>
                <archive>
                    <manifest>
                        <addClasspath>true</addClasspath>
                        <classpathPrefix>lib/</classpathPrefix>
                        <mainClass>com.tydic.SocketWindowWordCount</mainClass>
                    </manifest>
                </archive>
            </configuration>
        </plugin>
        <!-- Copy the dependency jars into the lib folder -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-dependency-plugin</artifactId>
            <executions>
                <execution>
                    <id>copy</id>
                    <phase>package</phase>
                    <goals>
                        <goal>copy-dependencies</goal>
                    </goals>
                    <configuration>
                        <outputDirectory>${project.build.directory}/lib</outputDirectory>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

2. Code

package com.tydic;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

public class SocketWindowWordCount {

    public static void main(String[] args) throws Exception {
        // Read the port from the command-line arguments
        final int port;
        try {
            final ParameterTool params = ParameterTool.fromArgs(args);
            port = params.getInt("port");
        } catch (Exception e) {
            System.err.println("No port specified. Please run 'SocketWindowWordCount --port <port>'");
            return;
        }

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // local mode
        // final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();

        // Read lines of text from the socket source
        DataStream<String> text = env.socketTextStream("192.168.128.146", port, "\n");

        @SuppressWarnings("serial")
        DataStream<WordWithCount> windowCounts = text
                .flatMap(new FlatMapFunction<String, WordWithCount>() {
                    public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                        for (String word : value.split("\\s")) {
                            out.collect(new WordWithCount(word, 1L));
                        }
                    }
                })
                // 5-second sliding window, evaluated every 1 second
                .keyBy("word").timeWindow(Time.seconds(5), Time.seconds(1))
                .reduce(new ReduceFunction<WordWithCount>() {
                    public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {
                        return new WordWithCount(a.word, a.count + b.count);
                    }
                });

        windowCounts.print().setParallelism(1);

        env.execute("Socket Window WordCount(zyl_test)");
    }

    // POJO holding a word and its count
    public static class WordWithCount {

        public String word;
        public long count;

        public WordWithCount() {
        }

        public WordWithCount(String word, long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return word + " : " + count;
        }
    }
}

3. Build the jar and submit it to the Flink cluster

/opt/flink-1.5.1/bin/flink run FlinkMaven-0.0.1-SNAPSHOT.jar --port 9000
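To produce FlinkMaven-0.0.1-SNAPSHOT.jar, a standard Maven build is sufficient. A minimal sketch, assuming the artifactId is FlinkMaven (inferred from the jar name above) and the command is run from the project root:

    # Build the jar; maven-dependency-plugin also copies the dependency jars into target/lib
    mvn clean package
    # The jar to submit is then target/FlinkMaven-0.0.1-SNAPSHOT.jar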

4. Test

Start the socket service: nc -l 9000
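Once the job is running, words typed into the nc session are counted over the sliding window. A sample interaction (the typed words are purely illustrative):

    # On 192.168.128.146; lines after the command are typed input
    nc -l 9000
    hello tert
    hello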

5. Check the log output

[root@rhel6-147 log]# tail -f flink-root-taskexecutor-0-rhel6-147.out
hello : 1
tert : 1
...


II. Reading a local file (similar to batch processing)

package com.tydic;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.aggregation.Aggregations;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.util.Collector;

public class LocalFileWordCount {

    public static void main(String[] args) throws Exception {
        final ParameterTool params = ParameterTool.fromArgs(args);
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.getConfig().setGlobalJobParameters(params);

        // get input data
        DataSet<String> text = env.readTextFile(params.get("input"));

        DataSet<Tuple2<String, Integer>> counts = text
                .flatMap(new Splitter()) // split the lines into pairs (2-tuples) containing: (word, 1)
                .groupBy(0).aggregate(Aggregations.SUM, 1); // group by tuple field "0" and sum up tuple field "1"

        counts.writeAsText(params.get("output"));

        env.execute("WordCount Example");
    }
}

// The operations are defined by specialized classes, here the Splitter class.
@SuppressWarnings("serial")
class Splitter implements FlatMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        // normalize and split the line into words
        String[] tokens = value.split("\\W+");
        // emit the pairs
        for (String token : tokens) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}
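Before running, a sample input file can be prepared. A minimal sketch (only the path comes from the command below; the file contents are illustrative):

    mkdir -p /tmp/zyl
    echo "hello flink hello word count" > /tmp/zyl/input.txt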

Run:

/opt/flink-1.5.1/bin/flink run --class com.tydic.LocalFileWordCount FlinkMaven-0.0.1-SNAPSHOT.jar  --input file:///tmp/zyl/input.txt --output file:///tmp/zyl/output.txt

Problem:

org.apache.flink.client.program.ProgramInvocationException: java.io.IOException: Error opening the Input Split file:/tmp/zyl/input.txt [0,24]: /tmp/zyl/input.txt (No such file or directory)
        at org.apache.flink.client.program.rest.RestClusterClient.submitJob(RestClusterClient.java:264)
        at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:464)
        at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:452)
        at org.apache.flink.client.program.ContextEnvironment.execute(ContextEnvironment.java:62)
        at com.tydic.LocalFileWordCount.main(LocalFileWordCount.java:25)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.jav

Solution: the local file must exist on the machine(s) where the TaskManager runs!
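One simple workaround, as a sketch: copy the input file to the same path on each TaskManager host before submitting (rhel6-147 is taken from the log file name above; substitute your own TaskManager hosts), or put the file on a shared filesystem such as HDFS and pass an hdfs:// path instead of file://.

    # Copy the input to the same path on each TaskManager node (host name is an example)
    ssh root@rhel6-147 "mkdir -p /tmp/zyl"
    scp /tmp/zyl/input.txt root@rhel6-147:/tmp/zyl/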

Reprinted from blog.csdn.net/zyl651334919/article/details/88836019