通过Apache Beam官方实例User Score实验流式join


不建议阅读,太乱了

使用UserScore例子中的Input.UnboundedGenerator()

不设置窗口(即两者都留在默认的GlobalWindow中且未指定trigger)时,无界数据不能和有界数据join,会抛出下文所示的IllegalStateException。

参与Join的两个PCollection必须使用相同的窗口策略,不能一个是GlobalWindows、另一个是FixedWindows。

Exception in thread "main" java.lang.IllegalStateException: GroupByKey cannot be applied to non-bounded PCollection in the GlobalWindow without a trigger. Use a Window.into or Window.triggering transform prior to GroupByKey.
	at org.apache.beam.sdk.transforms.GroupByKey.applicableTo(GroupByKey.java:173)
	at org.apache.beam.sdk.transforms.GroupByKey.expand(GroupByKey.java:204)
	at org.apache.beam.sdk.transforms.GroupByKey.expand(GroupByKey.java:120)
	at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:537)
	at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:472)
	at org.apache.beam.sdk.values.PCollection.apply(PCollection.java:286)
	at org.apache.beam.sdk.transforms.join.CoGroupByKey.expand(CoGroupByKey.java:126)
	at org.apache.beam.sdk.transforms.join.CoGroupByKey.expand(CoGroupByKey.java:74)
	at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:537)
	at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:472)
	at org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple.apply(KeyedPCollectionTuple.java:107)
	at org.apache.beam.sdk.extensions.joinlibrary.Join.innerJoin(Join.java:59)
	at mobileGame.solution.StreamJoin.main(StreamJoin.java:238)

两个流进行join,可行。

通过Input.UnboundedGenerator()和Input.BoundedGenerator()实现无界数据和有界数据的join,两者都使用FixedWindows.of(WINDOW_SIZE)。结果发现:有界数据只和无界源第一个窗口中的数据join上了。

代码:

package mobileGame.solution;

import mobileGame.GameActionInfo;
import mobileGame.utils.ExerciseOptions;
import mobileGame.utils.Input;
import mobileGame.utils.Output;
import myTest.JoinTest;
import org.apache.beam.runners.flink.FlinkPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.joinlibrary.Join;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.transforms.windowing.*;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.joda.time.Duration;

public class StreamJoin {

    // Public mutable static counter, initialised to 1; nothing inside this class reads or writes it.
    public static int num=1;
    /**
     * Computes a running per-user score total over the whole event stream.
     *
     * <p>All events are assigned to the single global window. A processing-time trigger fires
     * repeatedly, {@code updateFrequency} after the first element of each pane, and panes are
     * accumulated so that every firing carries the total so far rather than a delta.
     */
    public static class UserLeaderBoard
            extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {

        private final Duration allowedLateness;
        private final Duration updateFrequency;

        /**
         * @param allowedLateness lateness bound applied to the windowing strategy
         * @param updateFrequency processing-time delay between the first element of a pane and
         *                        the firing of that pane
         */
        public UserLeaderBoard(Duration allowedLateness, Duration updateFrequency) {
            this.allowedLateness = allowedLateness;
            this.updateFrequency = updateFrequency;
        }

        /**
         * Re-windows the events into the global window with a repeating processing-time trigger,
         * then extracts and sums the score per user.
         *
         * @param input game events
         * @return user name paired with that user's accumulated score
         */
        @Override
        public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> input) {
            return input
                    .apply(Window
                            // A globally increasing sum needs every event in the same window.
                            .<GameActionInfo>into(new GlobalWindows())
                            // Fire again and again, updateFrequency after the first element
                            // seen in each pane.
                            .triggering(Repeatedly.forever(
                                    AfterProcessingTime
                                            .pastFirstElementInPane()
                                            .plusDelayOf(updateFrequency)))
                            // Accumulate so each firing emits the running total, not a delta.
                            .accumulatingFiredPanes()
                            // Apply the configured lateness bound so the constructor argument
                            // actually takes effect.
                            .withAllowedLateness(allowedLateness))
                    // Named step: extract user/score pairs and sum them per user.
                    .apply("ExtractUserScore", new ExtractAndSumScore(GameActionInfo.KeyField.USER));
        }
    }

    /**
     * Sums scores per team inside fixed windows of {@code windowSize}.
     *
     * <p>The trigger fires when the watermark passes the end of the window, with additional
     * early (speculative) and late firings driven by processing time. Fired panes accumulate,
     * so each emission is the cumulative total for the window.
     */
    public static class TeamLeaderBoard
            extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {

        private Duration allowedLateness;
        private Duration earlyUpdateFrequency;
        private Duration lateUpdateFrequency;
        private Duration windowSize;

        /**
         * @param allowedLateness      lateness bound applied to the windowing strategy
         * @param earlyUpdateFrequency period the early firings are aligned to
         * @param lateUpdateFrequency  period the late firings are aligned to
         * @param windowSize           size of each fixed window
         */
        public TeamLeaderBoard(Duration allowedLateness, Duration earlyUpdateFrequency, Duration lateUpdateFrequency,
                               Duration windowSize) {
            this.allowedLateness = allowedLateness;
            this.earlyUpdateFrequency = earlyUpdateFrequency;
            this.lateUpdateFrequency = lateUpdateFrequency;
            this.windowSize = windowSize;
        }

        /**
         * Applies the fixed-window strategy and then sums scores per team.
         *
         * @param input game events
         * @return team name paired with that team's accumulated score for the window
         */
        @Override
        public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> input) {
            System.out.println("TeamLeaderBoard expand");

            // Watermark trigger decorated with processing-time early and late firings.
            Trigger windowTrigger = AfterWatermark.pastEndOfWindow()
                    .withEarlyFirings(
                            AfterProcessingTime.pastFirstElementInPane().alignedTo(earlyUpdateFrequency))
                    .withLateFirings(
                            AfterProcessingTime.pastFirstElementInPane().alignedTo(lateUpdateFrequency));

            // Fixed windows, bounded lateness, cumulative panes.
            Window<GameActionInfo> teamWindowing = Window.<GameActionInfo>into(FixedWindows.of(windowSize))
                    .triggering(windowTrigger)
                    .withAllowedLateness(allowedLateness)
                    .accumulatingFiredPanes();

            PCollection<GameActionInfo> windowedEvents = input.apply("FixedWindows", teamWindowing);

            // Extract team/score pairs and sum them per team.
            return windowedEvents.apply(
                    "ExtractTeamScore", new ExtractAndSumScore(GameActionInfo.KeyField.TEAM));
        }
    }

    /**
     * Maps each GameActionInfo to a key/score pair and sums the scores per key.
     * The key is whatever the {@code KeyField} passed to the constructor extracts
     * from the event.
     */
    private static class ExtractAndSumScore
            extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {

        private final GameActionInfo.KeyField field;

        /**
         * @param field selector used to pull the grouping key out of each event
         */
        ExtractAndSumScore(GameActionInfo.KeyField field) {
            this.field = field;
            // Debug trace emitted when the transform is constructed.
            System.out.println("ExtractAndSumScore field");
        }

        /**
         * @param gameInfo game events
         * @return one element per key (per window/pane) holding the summed score
         */
        @Override
        public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> gameInfo) {
            // Debug trace emitted when the pipeline graph is expanded.
            System.out.println("ExtractAndSumScore expand");

            // Step 1: project every event to KV<key, score>.
            PCollection<KV<String, Integer>> keyedScores = gameInfo.apply(
                    MapElements
                            .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                            .via((GameActionInfo event) -> KV.of(field.extract(event), event.getScore())));

            // Step 2: add up the scores that share a key.
            return keyedScores.apply(Sum.<String>integersPerKey());
        }
    }

    /**
     * Renders a key/score pair as a single text line, intended for writing results to a file.
     */
    public static class FormatAsTextFn extends SimpleFunction<KV<String,Integer>, String> {
        /**
         * @param input key/value pair to format
         * @return the text {@code key:<key>   value:<value>}
         */
        @Override
        public String apply(KV<String,Integer> input) {
            StringBuilder line = new StringBuilder("key:");
            line.append(input.getKey());
            line.append("   value:");
            line.append(input.getValue());
            return line.toString();
        }
    }

    /**
     * Pass-through DoFn that prints every KV&lt;String, Integer&gt; element to stdout
     * and then emits it unchanged.
     */
    static class printStringInteger extends DoFn<KV<String ,Integer> ,KV<String ,Integer>> {
        @ProcessElement
        public void processElement(ProcessContext context) {
            KV<String, Integer> element = context.element();
            System.out.println("c.element  StringInteger:" + element);
            context.output(element);
        }
    }

    /**
     * Pass-through DoFn that prints every KV&lt;String, String&gt; element to stdout
     * and then emits it unchanged.
     */
    static class printStringString extends DoFn<KV<String ,String> ,KV<String ,String>> {
        @ProcessElement
        public void processElement(ProcessContext context) {
            KV<String, String> element = context.element();
            System.out.println("c.element:" + element);
            context.output(element);
        }
    }
    /**
     * Pass-through DoFn that prints every joined KV&lt;String, KV&lt;Integer, String&gt;&gt;
     * element to stdout and then emits it unchanged.
     */
    static class printStringIntegerString extends DoFn<KV<String,KV<Integer,String>> ,KV<String,KV<Integer,String>>> {
        @ProcessElement
        public void processElement(ProcessContext context) {
            KV<String, KV<Integer, String>> joined = context.element();
            System.out.println("joined.element:" + joined);
            context.output(joined);
        }
    }
    /**
     * Pass-through DoFn that prints every joined KV&lt;String, KV&lt;Integer, Integer&gt;&gt;
     * element together with a running count, then emits the element unchanged.
     */
    static class printStringIntegerInteger extends DoFn<KV<String,KV<Integer,Integer>> ,KV<String,KV<Integer,Integer>>> {
        // Number of elements printed so far. Plain static int, incremented without
        // synchronisation; the value printed is the count BEFORE the current element.
        public static int sum=0;

        @ProcessElement
        public void processElement(ProcessContext context) {
            KV<String, KV<Integer, Integer>> joined = context.element();
            System.out.println("joined.StringIntegerInteger.element:" + joined + "sum:" + sum);
            sum += 1;
            context.output(joined);
        }
    }

    /**
     * Pre-processing step: turns each GameActionInfo into a KV of (user, score).
     */
    public static class Preprocess extends PTransform<PCollection<GameActionInfo>,PCollection<KV<String,Integer>>>{
        /**
         * @param gameActionInfoPCollection game events
         * @return one KV&lt;user, score&gt; per input event
         */
        @Override
        public PCollection<KV<String,Integer>> expand(PCollection<GameActionInfo> gameActionInfoPCollection){
            DoFn<GameActionInfo, KV<String, Integer>> toUserScore = new DoFn<GameActionInfo, KV<String, Integer>>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                    // Read the user first, then the score, and emit them as a pair.
                    String user = context.element().getUser();
                    Integer score = context.element().getScore();
                    context.output(KV.of(user, score));
                }
            };
            return gameActionInfoPCollection.apply(ParDo.of(toUserScore));
        }
    }
    // Pipeline tuning constants. Only WINDOW_SIZE is referenced by main() in this class;
    // the other three are declared but not used by any code shown here.

    // Lateness bound (30 minutes).
    private static final Duration ALLOWED_LATENESS = Duration.standardMinutes(30);
    // Early-firing period (10 seconds).
    private static final Duration EARLY_UPDATE_FREQUENCY = Duration.standardSeconds(10);
    // Late-firing period (20 seconds).
    private static final Duration LATE_UPDATE_FREQUENCY = Duration.standardSeconds(20);
    // Size of the fixed windows applied to both join inputs (20 seconds).
    private static final Duration WINDOW_SIZE = Duration.standardSeconds(20);

    /**
     * Builds and runs the join experiment: an unbounded generator and a bounded generator are
     * both placed into fixed windows of {@code WINDOW_SIZE}, mapped to KV&lt;user, score&gt; and
     * inner-joined on the user key. Joined elements are printed to stdout.
     *
     * @param args command-line pipeline options, parsed into {@code ExerciseOptions}
     * @throws Exception if pipeline construction or execution fails
     */
    public static void main(String[] args) throws Exception {

        ExerciseOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(ExerciseOptions.class);
        // NOTE(review): streaming is switched off even though an unbounded source is read
        // below -- confirm this is what the target runner expects.
        options.setStreaming(false);
        Pipeline pipeline = Pipeline.create(options);

        // Unbounded side of the join. It must be windowed: without a Window transform the
        // downstream join cannot be applied to an unbounded collection.
        PCollection<GameActionInfo> gameEvents = pipeline.apply(new Input.UnboundedGenerator())
                .apply(Window.<GameActionInfo>into(FixedWindows.of(WINDOW_SIZE)));

        // Unbounded input as KV<user, score>.
        PCollection<KV<String,Integer>> stream = gameEvents.apply(new Preprocess());

        // Bounded side of the join, placed in the same fixed windows as the unbounded side.
        PCollection<GameActionInfo> gameEvents2 = pipeline.apply(new Input.BoundedGenerator())
                .apply(Window.<GameActionInfo>into(FixedWindows.of(WINDOW_SIZE)));

        // Bounded input as KV<user, score>.
        PCollection<KV<String,Integer>> stream2 = gameEvents2.apply(new Preprocess());

        // Author's open question: does Beam offer a count-based window?
        // Text-file input from an earlier experiment. It is still part of the pipeline graph,
        // but "batch" is not consumed by any downstream step in this method.
        PCollection<String> lines2=pipeline.apply(TextIO.read().from("/home/maqy/桌面/output/streamJoin"));

        PCollection<KV<String,String>> batch=lines2.apply(new PTransform<PCollection<String>, PCollection<KV<String, String>>>() {
            @Override
            public PCollection<KV<String, String>> expand(PCollection<String> input) {
                PCollection<KV<String,String>> result = input.apply(ParDo.of(new DoFn<String, KV<String, String>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c){
                        // Split on runs of non-word characters; keep only lines that yield
                        // exactly two tokens and emit them as key/value.
                        String[] s =c.element().split("\\W+");
                        if(s.length==2){
                            KV<String,String> kv =KV.of(s[0],s[1]);
                            c.output(kv);
                        }
                    }
                }));
                return result;
            }
        });

        // Print the bounded side for inspection.
        stream2.apply(ParDo.of(new printStringInteger()));

        // Inner join of the unbounded and bounded sides on the user key. This works only
        // because both inputs use the same fixed-window strategy.
        PCollection<KV<String,KV<Integer,Integer>>> joinedPcollection = Join.innerJoin(stream,stream2);

        joinedPcollection.apply(ParDo.of(new printStringIntegerInteger()));

        // Block until the pipeline terminates so main does not return while the job may
        // still be running on a non-blocking runner.
        pipeline.run().waitUntilFinish();
    }
}

猜你喜欢

转载自blog.csdn.net/u013036495/article/details/85250495