上篇:第 11 节 DataStream之自定义source
1、DataStream之Filter算子:只保留偶数
filter:过滤函数,对传入的数据进行判断,符合条件的数据会被留下
具体代码实现:
package xuwei.custormSource;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
/**
 * Filter demo.
 *
 * Builds the pipeline: custom source -> map (log received) -> filter (keep even)
 * -> map (log filtered) -> 2-second tumbling window sum -> print.
 */
public class StreamingDemoFilter {
    public static void main(String[] args) throws Exception {
        // Obtain the Flink streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Custom source; this source is non-parallel, so parallelism must stay 1.
        DataStreamSource<Long> text = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Log every element as it arrives from the source.
        DataStream<Long> num = text.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("接收到的数据" + value);
                return value;
            }
        });

        // Keep only elements satisfying the predicate (drop all odd numbers).
        DataStream<Long> filterData = num.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long value) throws Exception {
                return value % 2 == 0;
            }
        });

        // Log every element that survived the filter.
        DataStream<Long> resultData = filterData.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("过滤之后的数据:" + value);
                return value;
            }
        });

        // BUG FIX: aggregate resultData (previously filterData), otherwise the
        // resultData branch has no sink, is never executed, and the
        // "过滤之后的数据" log above never prints.
        // Sum the filtered values every 2 seconds.
        DataStream<Long> sum = resultData.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the windowed sums with a single output task.
        sum.print().setParallelism(1);

        String jobname = StreamingDemoFilter.class.getSimpleName();
        env.execute(jobname);
    }
}
控制台打印信息,不断打印循环下去:
2、Union基本操作
Union:合并多个流,新的流会包含所有流中的数据。但是union有一个限制,就是所有被合并的流的数据类型必须一致。
具体代码实现:
package xuwei.streaming;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import xuwei.custormSource.MyNoParalleSource;
/**
 * Union demo.
 *
 * Union merges multiple streams into one containing all their elements;
 * the restriction is that every merged stream must have the same type.
 */
public class StreamingDemoUnion {
    public static void main(String[] args) throws Exception {
        // Obtain the Flink streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Two instances of the same non-parallel custom source (parallelism must be 1).
        DataStreamSource<Long> text1 = env.addSource(new MyNoParalleSource()).setParallelism(1);
        DataStreamSource<Long> text2 = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Merge text1 and text2 into a single stream.
        DataStream<Long> union = text1.union(text2);

        // BUG FIX: map over the merged `union` stream (previously text1),
        // otherwise text2's data is never consumed and union is demonstrated nowhere.
        DataStream<Long> num = union.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("接收到的数据" + value);
                return value;
            }
        });

        // Sum all merged values every 2 seconds.
        DataStream<Long> sum = num.timeWindowAll(Time.seconds(2)).sum(0);

        // Print the windowed sums with a single output task.
        sum.print().setParallelism(1);

        String jobname = StreamingDemoUnion.class.getSimpleName();
        env.execute(jobname);
    }
}
控制台打印信息,不断打印循环下去:
3、Connect基本操作
Connect:和union类似,但是只能连接两个流,两个流的数据类型可以不同,会对两个流中的数据应用不同的处理方法。
具体代码实现:
package xuwei.streaming;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
import xuwei.custormSource.MyNoParalleSource;
/**
 * Connect demo.
 *
 * Connect is similar to union but joins exactly two streams, which may have
 * different element types; each stream gets its own processing function
 * (map1 / map2) inside the CoMapFunction.
 */
public class StreamingDemoConnect {
    public static void main(String[] args) throws Exception {
        // Obtain the Flink streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Two instances of the same non-parallel custom source (parallelism must be 1).
        DataStreamSource<Long> text1 = env.addSource(new MyNoParalleSource()).setParallelism(1);
        DataStreamSource<Long> text2 = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Convert the second stream to String so the two connected streams
        // demonstrably carry different types.
        SingleOutputStreamOperator<String> text2_str = text2.map(new MapFunction<Long, String>() {
            @Override
            public String map(Long value) throws Exception {
                return "str_" + value; //转为字符串
            }
        });

        // Connect the Long stream with the String stream.
        ConnectedStreams<Long, String> connectStream = text1.connect(text2_str);

        // map1 handles elements from text1, map2 handles elements from text2_str.
        SingleOutputStreamOperator<Object> result = connectStream.map(new CoMapFunction<Long, String, Object>() {
            @Override
            public Object map1(Long value) throws Exception {
                return value;
            }

            @Override
            public Object map2(String value) throws Exception {
                return value;
            }
        });

        // Print the merged output with a single output task.
        result.print().setParallelism(1);

        // BUG FIX: job name previously referenced StreamingDemoUnion (copy-paste error).
        String jobname = StreamingDemoConnect.class.getSimpleName();
        env.execute(jobname);
    }
}
控制台打印信息,不断打印循环下去:
4、Split基本操作
Split:根据规则把一个数据流切分为多个流
具体代码实现:
扫描二维码关注公众号,回复:
9669589 查看本文章
package xuwei.streaming;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import xuwei.custormSource.MyNoParalleSource;
import java.lang.reflect.Array;
import java.util.ArrayList;
/**
 * Split demo.
 *
 * Split partitions a single stream into several named sub-streams according
 * to a selection rule; Select then picks one or more of those sub-streams.
 *
 * Typical use case: a source mixes several kinds of records that need
 * different processing logic, so the stream is split once and each part is
 * handled by its own pipeline.
 */
public class StreamingDemoSpilt {
    public static void main(String[] args) throws Exception {
        // Obtain the Flink streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Non-parallel custom source; parallelism must stay 1.
        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Tag each element as "even" or "odd" depending on its parity.
        SplitStream<Long> splits = source.split(new OutputSelector<Long>() {
            @Override
            public Iterable<String> select(Long value) {
                ArrayList<String> tags = new ArrayList<>();
                tags.add(value % 2 == 0 ? "even" : "odd");
                return tags;
            }
        });

        // Pick individual sub-streams by tag, or several tags at once.
        DataStream<Long> evenStream = splits.select("even");
        DataStream<Long> oddStream = splits.select("odd");
        DataStream<Long> moreStream = splits.select("odd", "even");

        // Print the combined odd+even selection with a single output task.
        moreStream.print().setParallelism(1);

        String jobname = StreamingDemoSpilt.class.getSimpleName();
        env.execute(jobname);
    }
}
控制台打印信息,不断打印循环下去:
5、DataStream API之Transformations
- Union:合并多个流,新的流会包含所有流中的数据。但是union有一个限制,就是所有被合并的流的数据类型必须一致。
- Connect:和union类似,但是只能连接两个流,两个流的数据类型可以不同,会对两个流中的数据应用不同的处理方法。
- CoMap, CoFlatMap:在ConnectedStreams中需要使用这种函数,类似于map和flatmap
Split:根据规则把一个数据流切分为多个流
Select:和split配合使用,选择切分后的流