需求描述:每隔5秒,计算最近10秒单词出现的次数。
TimeWindow实现
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();DataStreamSource<String> dataStream = env.socketTextStream("localhost", 8888);SingleOutputStreamOperator<Tuple2<String, Integer>> result = dataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {@Overridepublic void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {String[] fields = line.split(",");for (String word : fields) {out.collect(new Tuple2<>(word, 1));}}}).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).sum(1);result.print().setParallelism(1);env.execute("TimeWindowWordCount");}
}
ProcessWindowFunction
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();DataStreamSource<String> dataStream = env.socketTextStream("10.148.15.10", 8888);SingleOutputStreamOperator<Tuple2<String, Integer>> result = dataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {@Overridepublic void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {String[] fields = line.split(",");for (String word : fields) {out.collect(new Tuple2<>(word, 1));}}}).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).process(new SumProcessWindowFunction());result.print().setParallelism(1);env.execute("TimeWindowWordCount");}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Integer>,Tuple2<String,Integer>,Tuple,TimeWindow> {FastDateFormat dataFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Integer>> elements,Collector<Tuple2<String, Integer>> out) {System.out.println("当天系统的时间:"+dataFormat.format(System.currentTimeMillis()));System.out.println("Window的处理时间:"+dataFormat.format(context.currentProcessingTime()));System.out.println("Window的开始时间:"+dataFormat.format(context.window().getStart()));System.out.println("Window的结束时间:"+dataFormat.format(context.window().getEnd()));int sum = 0;for (Tuple2<String, Integer> ele : elements) {sum += 1;}// 输出单词出现的次数out.collect(Tuple2.of(tuple.getField(0), sum));}}
}
Time的种类
针对stream数据中的时间,可以分为以下三种:
Event Time:事件产生的时间,它通常由事件中的时间戳描述。
Ingestion time:事件进入Flink的时间
Processing Time:事件被处理时当前系统的时间
Processing Time Window
需求:每隔5秒计算最近10秒的单词出现的次数
自定义source,模拟:第 13 秒的时候连续发送 2 个事件,第 16 秒的时候再发送 1 个事件
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);DataStreamSource<String> dataStream = env.addSource(new TestSouce());SingleOutputStreamOperator<Tuple2<String, Integer>> result = dataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {@Overridepublic void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {String[] fields = line.split(",");for (String word : fields) {out.collect(new Tuple2<>(word, 1));}}}).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).process(new SumProcessWindowFunction());result.print().setParallelism(1);env.execute("TimeWindowWordCount");}public static class TestSouce implements SourceFunction<String>{FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");@Overridepublic void run(SourceContext<String> ctx) throws Exception {// 控制大约在 10 秒的倍数的时间点发送事件String currTime = String.valueOf(System.currentTimeMillis());while (Integer.valueOf(currTime.substring(currTime.length() - 4)) > 100) {currTime = String.valueOf(System.currentTimeMillis());continue;}System.out.println("开始发送事件的时间:" + dateFormat.format(System.currentTimeMillis()));// 第 13 秒发送两个事件TimeUnit.SECONDS.sleep(13);ctx.collect("hadoop," + System.currentTimeMillis());// 产生了一个事件,但是由于网络原因,事件没有发送ctx.collect("hadoop," + System.currentTimeMillis());// 第 16 秒发送一个事件TimeUnit.SECONDS.sleep(3);ctx.collect("hadoop," + System.currentTimeMillis());TimeUnit.SECONDS.sleep(300);}@Overridepublic void cancel() {}}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Integer>,Tuple2<String,Integer>,Tuple,TimeWindow> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param 
elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Integer>> elements,Collector<Tuple2<String, Integer>> out) {// System.out.println("当天系统的时间:"+dateFormat.format(System.currentTimeMillis()));
//
// System.out.println("Window的处理时间:"+dateFormat.format(context.currentProcessingTime()));
// System.out.println("Window的开始时间:"+dateFormat.format(context.window().getStart()));
// System.out.println("Window的结束时间:"+dateFormat.format(context.window().getEnd()));int sum = 0;for (Tuple2<String, Integer> ele : elements) {sum += 1;}// 输出单词出现的次数out.collect(Tuple2.of(tuple.getField(0), sum));}}
}
Processing Time Window
自定义source,模拟:第 13 秒的时候产生 2 个事件,其中一个事件确实在第 13 秒发送出去了,另外一个事件因为某种原因在第 19 秒才发送出去;第 16 秒的时候再发送 1 个事件
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);DataStreamSource<String> dataStream = env.addSource(new TestSouce());SingleOutputStreamOperator<Tuple2<String, Integer>> result = dataStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {@Overridepublic void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {String[] fields = line.split(",");for (String word : fields) {out.collect(new Tuple2<>(word, 1));}}}).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).process(new SumProcessWindowFunction());result.print().setParallelism(1);env.execute("TimeWindowWordCount");}/*** 模拟:第 13 秒的时候连续发送 2 个事件,第 16 秒的时候再发送 1 个事件*/public static class TestSouce implements SourceFunction<String>{FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");@Overridepublic void run(SourceContext<String> ctx) throws Exception {// 控制大约在 10 秒的倍数的时间点发送事件String currTime = String.valueOf(System.currentTimeMillis());while (Integer.valueOf(currTime.substring(currTime.length() - 4)) > 100) {currTime = String.valueOf(System.currentTimeMillis());continue;}System.out.println("开始发送事件的时间:" + dateFormat.format(System.currentTimeMillis()));// 第 13 秒发送两个事件TimeUnit.SECONDS.sleep(13);ctx.collect("hadoop," + System.currentTimeMillis());// 产生了一个事件,但是由于网络原因,事件没有发送String event = "hadoop," + System.currentTimeMillis();// 第 16 秒发送一个事件TimeUnit.SECONDS.sleep(3);ctx.collect("hadoop," + System.currentTimeMillis());// 第 19 秒的时候发送TimeUnit.SECONDS.sleep(3);ctx.collect(event);TimeUnit.SECONDS.sleep(300);}@Overridepublic void cancel() {}}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Integer>,Tuple2<String,Integer>,Tuple,TimeWindow> {FastDateFormat dateFormat = 
FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Integer>> elements,Collector<Tuple2<String, Integer>> out) {// System.out.println("当天系统的时间:"+dateFormat.format(System.currentTimeMillis()));
//
// System.out.println("Window的处理时间:"+dateFormat.format(context.currentProcessingTime()));
// System.out.println("Window的开始时间:"+dateFormat.format(context.window().getStart()));
// System.out.println("Window的结束时间:"+dateFormat.format(context.window().getEnd()));int sum = 0;for (Tuple2<String, Integer> ele : elements) {sum += 1;}// 输出单词出现的次数out.collect(Tuple2.of(tuple.getField(0), sum));}}
}
使用Event Time处理无序
使用Event Time处理
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);DataStreamSource<String> dataStream = env.addSource(new TestSouce());dataStream.map(new MapFunction<String, Tuple2<String,Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0],Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor() ).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).process(new SumProcessWindowFunction()).print().setParallelism(1);env.execute("TimeWindowWordCount");}public static class TestSouce implements SourceFunction<String>{FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");@Overridepublic void run(SourceContext<String> ctx) throws Exception {// 控制大约在 10 秒的倍数的时间点发送事件String currTime = String.valueOf(System.currentTimeMillis());while (Integer.valueOf(currTime.substring(currTime.length() - 4)) > 100) {currTime = String.valueOf(System.currentTimeMillis());continue;}System.out.println("开始发送事件的时间:" + dateFormat.format(System.currentTimeMillis()));// 第 13 秒发送两个事件TimeUnit.SECONDS.sleep(13);ctx.collect("hadoop," + System.currentTimeMillis());// 产生了一个事件,但是由于网络原因,事件没有发送String event = "hadoop," + System.currentTimeMillis();// 第 16 秒发送一个事件TimeUnit.SECONDS.sleep(3);ctx.collect("hadoop," + System.currentTimeMillis());// 第 19 秒的时候发送TimeUnit.SECONDS.sleep(3);ctx.collect(event);TimeUnit.SECONDS.sleep(300);}@Overridepublic void cancel() {}}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Long>,Tuple2<String,Integer>,Tuple,TimeWindow> {FastDateFormat dateFormat = 
FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<Tuple2<String, Integer>> out) {// System.out.println("当天系统的时间:"+dateFormat.format(System.currentTimeMillis()));
//
// System.out.println("Window的处理时间:"+dateFormat.format(context.currentProcessingTime()));
// System.out.println("Window的开始时间:"+dateFormat.format(context.window().getStart()));
// System.out.println("Window的结束时间:"+dateFormat.format(context.window().getEnd()));int sum = 0;for (Tuple2<String, Long> ele : elements) {sum += 1;}// 输出单词出现的次数out.collect(Tuple2.of(tuple.getField(0), sum));}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {return element.f1;}@Nullable@Overridepublic Watermark getCurrentWatermark() {return new Watermark(System.currentTimeMillis());}}
}
现在我们第三个window的结果已经计算准确了,但是我们还是没有彻底的解决问题。接下来就需要我们使用WaterMark机制来解决了。
使用WaterMark机制解决无序
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);DataStreamSource<String> dataStream = env.addSource(new TestSouce());dataStream.map(new MapFunction<String, Tuple2<String,Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0],Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor() ).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).process(new SumProcessWindowFunction()).print().setParallelism(1);env.execute("TimeWindowWordCount");}public static class TestSouce implements SourceFunction<String>{FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");@Overridepublic void run(SourceContext<String> ctx) throws Exception {// 控制大约在 10 秒的倍数的时间点发送事件String currTime = String.valueOf(System.currentTimeMillis());while (Integer.valueOf(currTime.substring(currTime.length() - 4)) > 100) {currTime = String.valueOf(System.currentTimeMillis());continue;}System.out.println("开始发送事件的时间:" + dateFormat.format(System.currentTimeMillis()));// 第 13 秒发送两个事件TimeUnit.SECONDS.sleep(13);ctx.collect("hadoop," + System.currentTimeMillis());// 产生了一个事件,但是由于网络原因,事件没有发送String event = "hadoop," + System.currentTimeMillis();// 第 16 秒发送一个事件TimeUnit.SECONDS.sleep(3);ctx.collect("hadoop," + System.currentTimeMillis());// 第 19 秒的时候发送TimeUnit.SECONDS.sleep(3);ctx.collect(event);TimeUnit.SECONDS.sleep(300);}@Overridepublic void cancel() {}}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Long>,Tuple2<String,Integer>,Tuple,TimeWindow> {FastDateFormat dateFormat = 
FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<Tuple2<String, Integer>> out) {// System.out.println("当天系统的时间:"+dateFormat.format(System.currentTimeMillis()));
//
// System.out.println("Window的处理时间:"+dateFormat.format(context.currentProcessingTime()));
// System.out.println("Window的开始时间:"+dateFormat.format(context.window().getStart()));
// System.out.println("Window的结束时间:"+dateFormat.format(context.window().getEnd()));int sum = 0;for (Tuple2<String, Long> ele : elements) {sum += 1;}// 输出单词出现的次数out.collect(Tuple2.of(tuple.getField(0), sum));}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {return element.f1;}@Nullable@Overridepublic Watermark getCurrentWatermark() {//window延迟5秒触发return new Watermark(System.currentTimeMillis() - 5000);}}
}
WaterMark的周期
/*** 每隔5秒计算最近10秒单词出现的次数*/
public class TimeWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);//设置waterMark产生的周期为1senv.getConfig().setAutoWatermarkInterval(1000);DataStreamSource<String> dataStream = env.addSource(new TestSouce());dataStream.map(new MapFunction<String, Tuple2<String,Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0],Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor() ).keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).process(new SumProcessWindowFunction()).print().setParallelism(1);env.execute("TimeWindowWordCount");}public static class TestSouce implements SourceFunction<String>{FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");@Overridepublic void run(SourceContext<String> ctx) throws Exception {// 控制大约在 10 秒的倍数的时间点发送事件String currTime = String.valueOf(System.currentTimeMillis());while (Integer.valueOf(currTime.substring(currTime.length() - 4)) > 100) {currTime = String.valueOf(System.currentTimeMillis());continue;}System.out.println("开始发送事件的时间:" + dateFormat.format(System.currentTimeMillis()));// 第 13 秒发送两个事件TimeUnit.SECONDS.sleep(13);ctx.collect("hadoop," + System.currentTimeMillis());// 产生了一个事件,但是由于网络原因,事件没有发送String event = "hadoop," + System.currentTimeMillis();// 第 16 秒发送一个事件TimeUnit.SECONDS.sleep(3);ctx.collect("hadoop," + System.currentTimeMillis());// 第 19 秒的时候发送TimeUnit.SECONDS.sleep(3);ctx.collect(event);TimeUnit.SECONDS.sleep(300);}@Overridepublic void cancel() {}}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction 
extendsProcessWindowFunction<Tuple2<String,Long>,Tuple2<String,Integer>,Tuple,TimeWindow> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<Tuple2<String, Integer>> out) {int sum = 0;for (Tuple2<String, Long> ele : elements) {sum += 1;}// 输出单词出现的次数out.collect(Tuple2.of(tuple.getField(0), sum));}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {//这个方法是每获取到一个数据就会被调用一次。return element.f1;}@Nullable@Overridepublic Watermark getCurrentWatermark() {/*** WasterMark会周期性的产生,默认就是每隔200毫秒产生一个** 设置 watermark 产生的周期为 1000ms* env.getConfig().setAutoWatermarkInterval(1000);*///window延迟5秒触发System.out.println("water mark...");return new Watermark(System.currentTimeMillis() - 5000);}}
}
得到并打印每隔 3 秒钟统计前 3 秒内的相同的 key 的所有的事件
/*** 得到并打印每隔 3 秒钟统计前 3 秒内的相同的 key 的所有的事件*/
public class WaterMarkWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);//设置waterMark产生的周期为1senv.getConfig().setAutoWatermarkInterval(1000);DataStreamSource<String> dataStream = env.socketTextStream("10.148.15.10", 8888);dataStream.map(new MapFunction<String, Tuple2<String,Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0],Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor() ).keyBy(0).timeWindow(Time.seconds(3)).process(new SumProcessWindowFunction()).print().setParallelism(1);env.execute("TimeWindowWordCount");}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Long>,String,Tuple,TimeWindow> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<String> out) {System.out.println("处理时间:" + dateFormat.format(context.currentProcessingTime()));System.out.println("window start time : " + dateFormat.format(context.window().getStart()));List<String> list = new ArrayList<>();for (Tuple2<String, Long> ele : elements) {list.add(ele.toString() + "|" + dateFormat.format(ele.f1));}out.collect(list.toString());System.out.println("window end time : " + dateFormat.format(context.window().getEnd()));}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = 
FastDateFormat.getInstance("HH:mm:ss");private long currentMaxEventTime = 0L;private long maxOutOfOrderness = 10000; // 最大允许的乱序时间 10 秒// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {long currentElementEventTime = element.f1;currentMaxEventTime = Math.max(currentMaxEventTime, currentElementEventTime);System.out.println("event = " + element+ "|" + dateFormat.format(element.f1) // Event Time+ "|" + dateFormat.format(currentMaxEventTime) // Max Event Time+ "|" + dateFormat.format(getCurrentWatermark().getTimestamp())); // Current Watermarkreturn currentElementEventTime;}@Nullable@Overridepublic Watermark getCurrentWatermark() {/*** WasterMark会周期性的产生,默认就是每隔200毫秒产生一个** 设置 watermark 产生的周期为 1000ms* env.getConfig().setAutoWatermarkInterval(1000);*///window延迟5秒触发System.out.println("water mark...");return new Watermark(currentMaxEventTime - maxOutOfOrderness);}}
}
收集迟到的数据
/*** 得到并打印每隔 3 秒钟统计前 3 秒内的相同的 key 的所有的事件* 收集迟到太多的数据*/
public class WaterMarkWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(1);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);//设置waterMark产生的周期为1senv.getConfig().setAutoWatermarkInterval(1000);// 保存迟到的,会被丢弃的数据OutputTag<Tuple2<String, Long>> outputTag =new OutputTag<Tuple2<String, Long>>("late-date"){};DataStreamSource<String> dataStream = env.socketTextStream("10.148.15.10", 8888);SingleOutputStreamOperator<String> result = dataStream.map(new MapFunction<String, Tuple2<String, Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0], Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor()).keyBy(0).timeWindow(Time.seconds(3))// .allowedLateness(Time.seconds(2)) // 允许事件迟到 2 秒.sideOutputLateData(outputTag) // 保存迟到太多的数据.process(new SumProcessWindowFunction());//打印正常的数据result.print();//获取迟到太多的数据DataStream<String> lateDataStream= result.getSideOutput(outputTag).map(new MapFunction<Tuple2<String, Long>, String>() {@Overridepublic String map(Tuple2<String, Long> stringLongTuple2) throws Exception {return "迟到的数据:" + stringLongTuple2.toString();}});lateDataStream.print();env.execute("TimeWindowWordCount");}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Long>,String,Tuple,TimeWindow> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<String> out) {System.out.println("处理时间:" + 
dateFormat.format(context.currentProcessingTime()));System.out.println("window start time : " + dateFormat.format(context.window().getStart()));List<String> list = new ArrayList<>();for (Tuple2<String, Long> ele : elements) {list.add(ele.toString() + "|" + dateFormat.format(ele.f1));}out.collect(list.toString());System.out.println("window end time : " + dateFormat.format(context.window().getEnd()));}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");private long currentMaxEventTime = 0L;private long maxOutOfOrderness = 10000; // 最大允许的乱序时间 10 秒// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {long currentElementEventTime = element.f1;currentMaxEventTime = Math.max(currentMaxEventTime, currentElementEventTime);System.out.println("event = " + element+ "|" + dateFormat.format(element.f1) // Event Time+ "|" + dateFormat.format(currentMaxEventTime) // Max Event Time+ "|" + dateFormat.format(getCurrentWatermark().getTimestamp())); // Current Watermarkreturn currentElementEventTime;}@Nullable@Overridepublic Watermark getCurrentWatermark() {/*** WasterMark会周期性的产生,默认就是每隔200毫秒产生一个** 设置 watermark 产生的周期为 1000ms* env.getConfig().setAutoWatermarkInterval(1000);*/System.out.println("water mark...");return new Watermark(currentMaxEventTime - maxOutOfOrderness);}}
}
多并行度下的WaterMark
一个window可能会接收到多个waterMark,我们以最小的为准。
/**
* 得到并打印每隔 3 秒钟统计前 3 秒内的相同的 key 的所有的事件
* 测试多并行度
*/
public class WaterMarkWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();//把并行度设置为2env.setParallelism(2);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);//设置waterMark产生的周期为1senv.getConfig().setAutoWatermarkInterval(1000);// 保存迟到的,会被丢弃的数据OutputTag<Tuple2<String, Long>> outputTag =new OutputTag<Tuple2<String, Long>>("late-date"){};DataStreamSource<String> dataStream = env.socketTextStream("10.148.15.10", 8888);SingleOutputStreamOperator<String> result = dataStream.map(new MapFunction<String, Tuple2<String, Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0], Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor()).keyBy(0).timeWindow(Time.seconds(3))// .allowedLateness(Time.seconds(2)) // 允许事件迟到 2 秒.sideOutputLateData(outputTag) // 保存迟到太多的数据.process(new SumProcessWindowFunction());//打印正常的数据result.print();//获取迟到太多的数据DataStream<String> lateDataStream= result.getSideOutput(outputTag).map(new MapFunction<Tuple2<String, Long>, String>() {@Overridepublic String map(Tuple2<String, Long> stringLongTuple2) throws Exception {return "迟到的数据:" + stringLongTuple2.toString();}});lateDataStream.print();env.execute("TimeWindowWordCount");}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Long>,String,Tuple,TimeWindow> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<String> out) {System.out.println("处理时间:" + 
dateFormat.format(context.currentProcessingTime()));System.out.println("window start time : " + dateFormat.format(context.window().getStart()));List<String> list = new ArrayList<>();for (Tuple2<String, Long> ele : elements) {list.add(ele.toString() + "|" + dateFormat.format(ele.f1));}out.collect(list.toString());System.out.println("window end time : " + dateFormat.format(context.window().getEnd()));}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");private long currentMaxEventTime = 0L;private long maxOutOfOrderness = 10000; // 最大允许的乱序时间 10 秒// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {long currentElementEventTime = element.f1;currentMaxEventTime = Math.max(currentMaxEventTime, currentElementEventTime);//打印线程long id = Thread.currentThread().getId();System.out.println("当前线程ID:"+id+"event = " + element+ "|" + dateFormat.format(element.f1) // Event Time+ "|" + dateFormat.format(currentMaxEventTime) // Max Event Time+ "|" + dateFormat.format(getCurrentWatermark().getTimestamp())); // Current Watermarkreturn currentElementEventTime;}@Nullable@Overridepublic Watermark getCurrentWatermark() {/*** WasterMark会周期性的产生,默认就是每隔200毫秒产生一个** 设置 watermark 产生的周期为 1000ms* env.getConfig().setAutoWatermarkInterval(1000);*/System.out.println("water mark...");return new Watermark(currentMaxEventTime - maxOutOfOrderness);}}
}
WaterMark生成机制
/*** 得到并打印每隔 3 秒钟统计前 3 秒内的相同的 key 的所有的事件* 有条件的产生watermark*/
public class WaterMarkWindowWordCount {public static void main(String[] args) throws Exception{StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();//把并行度设置为2env.setParallelism(2);//步骤一:设置时间类型env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);//设置waterMark产生的周期为1senv.getConfig().setAutoWatermarkInterval(1000);// 保存迟到的,会被丢弃的数据OutputTag<Tuple2<String, Long>> outputTag =new OutputTag<Tuple2<String, Long>>("late-date"){};DataStreamSource<String> dataStream = env.socketTextStream("10.148.15.10", 8888);SingleOutputStreamOperator<String> result = dataStream.map(new MapFunction<String, Tuple2<String, Long>>() {@Overridepublic Tuple2<String, Long> map(String line) throws Exception {String[] fields = line.split(",");return new Tuple2<>(fields[0], Long.valueOf(fields[1]));}//步骤二:获取数据里面的event Time}).assignTimestampsAndWatermarks(new EventTimeExtractor()).keyBy(0).timeWindow(Time.seconds(3))// .allowedLateness(Time.seconds(2)) // 允许事件迟到 2 秒.sideOutputLateData(outputTag) // 保存迟到太多的数据.process(new SumProcessWindowFunction());//打印正常的数据result.print();//获取迟到太多的数据DataStream<String> lateDataStream= result.getSideOutput(outputTag).map(new MapFunction<Tuple2<String, Long>, String>() {@Overridepublic String map(Tuple2<String, Long> stringLongTuple2) throws Exception {return "迟到的数据:" + stringLongTuple2.toString();}});lateDataStream.print();env.execute("TimeWindowWordCount");}/*** IN, OUT, KEY, W* IN:输入的数据类型* OUT:输出的数据类型* Key:key的数据类型(在Flink里面,String用Tuple表示)* W:Window的数据类型*/public static class SumProcessWindowFunction extendsProcessWindowFunction<Tuple2<String,Long>,String,Tuple,TimeWindow> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");/*** 当一个window触发计算的时候会调用这个方法* @param tuple key* @param context operator的上下文* @param elements 指定window的所有元素* @param out 用户输出*/@Overridepublic void process(Tuple tuple, Context context, Iterable<Tuple2<String, Long>> elements,Collector<String> out) {System.out.println("处理时间:" + 
dateFormat.format(context.currentProcessingTime()));System.out.println("window start time : " + dateFormat.format(context.window().getStart()));List<String> list = new ArrayList<>();for (Tuple2<String, Long> ele : elements) {list.add(ele.toString() + "|" + dateFormat.format(ele.f1));}out.collect(list.toString());System.out.println("window end time : " + dateFormat.format(context.window().getEnd()));}}/*** 按条件产生waterMark*/private static class EventTimeExtractor2implements AssignerWithPunctuatedWatermarks<Tuple2<String, Long>> {@Nullable@Overridepublic Watermark checkAndGetNextWatermark(Tuple2<String, Long> lastElement,long extractedTimestamp) {// 这个方法是每接收到一个事件就会调用// 根据条件产生 watermark ,并不是周期性的产生 watermarkif (lastElement.f0 == "000002") {// 才发送 watermarkreturn new Watermark(lastElement.f1 - 10000);}// 则表示不产生 watermarkreturn null;}@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {return element.f1;}}private static class EventTimeExtractorimplements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {FastDateFormat dateFormat = FastDateFormat.getInstance("HH:mm:ss");private long currentMaxEventTime = 0L;private long maxOutOfOrderness = 10000; // 最大允许的乱序时间 10 秒// 拿到每一个事件的 Event Time@Overridepublic long extractTimestamp(Tuple2<String, Long> element,long previousElementTimestamp) {long currentElementEventTime = element.f1;currentMaxEventTime = Math.max(currentMaxEventTime, currentElementEventTime);long id = Thread.currentThread().getId();System.out.println("当前线程ID:"+id+"event = " + element+ "|" + dateFormat.format(element.f1) // Event Time+ "|" + dateFormat.format(currentMaxEventTime) // Max Event Time+ "|" + dateFormat.format(getCurrentWatermark().getTimestamp())); // Current Watermarkreturn currentElementEventTime;}@Nullable@Overridepublic Watermark getCurrentWatermark() {/*** WasterMark会周期性的产生,默认就是每隔200毫秒产生一个** 设置 watermark 产生的周期为 1000ms* env.getConfig().setAutoWatermarkInterval(1000);*** 和事件关系不大* 1. 
watermark 值依赖处理时间的场景* 2. 当有一段时间没有接收到事件,但是仍然需要产生 watermark 的场景*/System.out.println("water mark...");return new Watermark(currentMaxEventTime - maxOutOfOrderness);}}
}