Flink: A Programmer's Killer Move for Deflecting Blame (Counting Records with Accumulators)

Flink accumulators collect a value across all parallel subtasks and hand the aggregated result back to the client when the job finishes, so you can prove exactly how many records your program actually processed.

Counting records in Java

package counts;

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.configuration.Configuration;

import java.util.ArrayList;

public class JavaCounts {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // build a small test data set: ASCII codes 97..101 are the letters "a" through "e"
        ArrayList<String> words = new ArrayList<>();
        for (int i = 97; i < 102; i++) {
            words.add(String.valueOf((char) i));
        }
        DataSource<String> data = env.fromCollection(words);

        MapOperator<String, String> result = data.map(new RichMapFunction<String, String>() {

            //step 1: define the counter
            LongCounter counter = new LongCounter();

            @Override
            public void open(Configuration parameters) throws Exception {
                //step 2: register the counter with the runtime context
                getRuntimeContext().addAccumulator("counterName", counter);
            }

            @Override
            public String map(String value) throws Exception {
                //step 3: increment the counter for every record
                counter.add(1);
                return value;
            }
        }).setParallelism(4);

        //sink
        result.writeAsText("./out");

        JobExecutionResult counts = env.execute("Counts");
        //step 4: fetch the aggregated counter value from the job result
        Long total = counts.getAccumulatorResult("counterName");

        System.out.println("Total records: " + total);
        //Total records: 5

    }
}
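
The same pattern pays off once a job registers more than one counter: count records before and after a filter, and the job result proves exactly how many records your code dropped, which is where the blame deflection comes in. Below is a minimal sketch with the same DataSet API; the class name JavaFilterCounts, the accumulator names "seen" and "kept", the output path "./out-filter", and the choice to drop the record "c" are illustrative assumptions, not from the original post.

package counts;

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.Configuration;

public class JavaFilterCounts {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        env.fromElements("a", "b", "c", "d", "e")
                .filter(new RichFilterFunction<String>() {
                    // one counter for records seen, one for records kept
                    LongCounter seen = new LongCounter();
                    LongCounter kept = new LongCounter();

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        getRuntimeContext().addAccumulator("seen", seen);
                        getRuntimeContext().addAccumulator("kept", kept);
                    }

                    @Override
                    public boolean filter(String value) throws Exception {
                        seen.add(1);
                        boolean keep = !value.equals("c"); // drop one record for the demo
                        if (keep) {
                            kept.add(1);
                        }
                        return keep;
                    }
                }).setParallelism(4)
                .writeAsText("./out-filter");

        JobExecutionResult result = env.execute("FilterCounts");
        Long seenTotal = result.getAccumulatorResult("seen");
        Long keptTotal = result.getAccumulatorResult("kept");
        System.out.println("Seen: " + seenTotal + ", kept: " + keptTotal
                + ", dropped: " + (seenTotal - keptTotal));
        // Seen: 5, kept: 4, dropped: 1
    }
}

If a downstream team claims records went missing, the three numbers from the job result settle the question.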

Counting records in Scala

package counts

import org.apache.flink.api.common.JobExecutionResult
import org.apache.flink.api.common.accumulators.LongCounter
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration

/* Count the number of records */
object Counts {
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment

    val words: DataSet[String] = env.fromElements("a","b","c","d","e")

    /* The wrong way to count: a plain member variable */
    /*words.map(new RichMapFunction[String, Int] {
      var counter = 0
      override def map(value: String): Int = {
        counter = counter + 1
        println("counter: " + counter)
        counter
      }
    }).setParallelism(4).print()*/

    /**
      * counter: 1
      * counter: 1
      * counter: 1
      * counter: 1
      * counter: 2
      * 1
      * 2
      * 1
      * 1
      * 1
      *
      * Why doesn't this work? Because the parallelism is 4, each parallel
      * subtask keeps its own copy of the variable and counts from 0.
      * Only with a parallelism of 1 would a plain variable give the right
      * total. (A Java sketch of the same pitfall appears after this listing.)
      */


    /* The correct way to count: a registered accumulator */
    val out: DataSet[String] = words.map(new RichMapFunction[String, String]() {

      // step 1: define the counter
      val counter: LongCounter = new LongCounter()

      override def open(parameters: Configuration): Unit = {
        // step 2: register the counter with the runtime context
        getRuntimeContext.addAccumulator("counterName", counter)
      }

      override def map(value: String): String = {
        // step 3: increment the counter for every record
        counter.add(1)
        value
      }
    }).setParallelism(4)

    //sink
    out.writeAsText("./out2")

    val result: JobExecutionResult = env.execute("Counts")

    //step 4: fetch the aggregated counter value from the job result
    val total: Long = result.getAccumulatorResult[Long]("counterName")

    println("Total records: " + total)
    //Total records: 5. Changing the parallelism does not affect the correctness of the count.
  }
}
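
The commented-out block above is Scala, but the pitfall is language-independent. Here is a minimal, runnable Java sketch of the same mistake (the class name JavaBrokenCounts is illustrative): with parallelism 4, each parallel subtask gets its own copy of the counter field, so no subtask ever sees the full total.

package counts;

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;

public class JavaBrokenCounts {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        env.fromElements("a", "b", "c", "d", "e")
                .map(new RichMapFunction<String, Integer>() {
                    // plain field: every parallel subtask starts its own copy at 0
                    int counter = 0;

                    @Override
                    public Integer map(String value) throws Exception {
                        counter = counter + 1;
                        System.out.println("counter: " + counter);
                        return counter;
                    }
                }).setParallelism(4)
                .print(); // print() executes the DataSet program eagerly
        // With parallelism 4 and only 5 records, "counter" never reaches 5 anywhere.
    }
}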


Reposted from blog.csdn.net/IT_BULL/article/details/104157580