天天看点

基于 DataStream API 的 Flink 程序实现 TopN

背景介绍:

每隔五分钟,统计过去一小时内点击量(pv)最高的前 N 个热门商品。

数据集:

https://tianchi.aliyun.com/dataset/dataDetail?dataId=60747

代码:

数据结构:

package com.flink.topn;

// Plain data holder for one user-behavior record; all fields are public and
// there is no explicit constructor, so the implicit no-arg constructor applies.
public class UserDataBean {
    public long userId;         // user ID
    public long itemId;         // item (product) ID
    public int categoryId;      // item category ID
    public String behavior;     // user behavior, one of ("pv", "buy", "cart", "fav")
    public long timestamp;      // time the behavior occurred, in seconds
}
           

主函数

package com.flink.topn;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.io.PojoCsvInputFormat;
import org.apache.flink.api.java.typeutils.PojoTypeInfo;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

import java.io.File;
import java.net.URL;

/**
 * Job entry point: reads user-behavior records from {@code UserBehavior.csv} on the
 * classpath, keeps only "pv" (click) events, counts clicks per item over a sliding
 * event-time window (1 hour long, sliding every 5 minutes) and prints the top 3
 * items of every window.
 */
public class HotGoodTopN {
    /**
     * Builds the pipeline and executes it. Any failure is caught and its stack trace
     * is printed; the method itself does not propagate exceptions.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        try {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.setParallelism(1);
            env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

            URL fileUrl = HotGoodTopN.class.getClassLoader().getResource("UserBehavior.csv");
            if (fileUrl == null) {
                // getResource returns null when the file is missing; fail with a clear
                // message instead of a bare NullPointerException on fileUrl.toURI().
                throw new IllegalStateException("UserBehavior.csv was not found on the classpath");
            }
            System.out.println(fileUrl);
            Path path = Path.fromLocalFile(new File(fileUrl.toURI()));
            PojoTypeInfo<UserDataBean> typeInfo =
                    (PojoTypeInfo<UserDataBean>) TypeExtractor.createTypeInfo(UserDataBean.class);
            // Order of the CSV columns, mapped onto the POJO fields of the same name.
            String[] fieldInfo = new String[]{"userId", "itemId", "categoryId", "behavior", "timestamp"};
            PojoCsvInputFormat<UserDataBean> csvInputFormat = new PojoCsvInputFormat<>(path, typeInfo, fieldInfo);
            DataStream<UserDataBean> input = env.createInput(csvInputFormat, typeInfo);

            // Assign event timestamps and generate watermarks.
            DataStream<UserDataBean> operator = input.assignTimestampsAndWatermarks(new AscendingTimestampExtractor<UserDataBean>() {
                @Override
                public long extractAscendingTimestamp(UserDataBean userDataBean) {
                    // The record timestamp is in seconds; Flink expects milliseconds.
                    return userDataBean.timestamp * 1000;
                }
            });

            // Keep only the click ("pv") events.
            DataStream<UserDataBean> pv = operator.filter(new FilterFunction<UserDataBean>() {
                @Override
                public boolean filter(UserDataBean userDataBean) throws Exception {
                    // Constant-first comparison: a record without a behavior value is
                    // filtered out instead of failing the job with a NullPointerException.
                    return "pv".equals(userDataBean.behavior);
                }
            });

            // Group by item ID; sliding window of 1 hour that advances every 5 minutes.
            DataStream<CountResultWindowOutput> itemIdData = pv.keyBy("itemId").window(SlidingEventTimeWindows.of(Time.hours(1L), Time.minutes(5L)))
                    .aggregate(new CountGoodNum(), new CountResultWindow());

            // Group by window end time and compute the top-N hot items of each window.
            DataStream<String> process = itemIdData.keyBy("windowEnd").process(new TopNGoodsInfo(3));
            process.print();

            env.execute("HotGoodTopN");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
           

中间输出数据结构POJO:

package com.flink.topn;

/**
 * Intermediate result: the click count of one item within one window.
 */
public class CountResultWindowOutput {
    /** Item ID. */
    public long itemId;
    /** End timestamp of the window. */
    public long windowEnd;
    /** Number of clicks on the item. */
    public long viewCount;

    /**
     * Creates a fully populated result.
     *
     * @param itemId    item ID
     * @param windowEnd end timestamp of the window
     * @param viewCount number of clicks on the item
     */
    public CountResultWindowOutput(long itemId, long windowEnd, long viewCount) {
        this.itemId = itemId;
        this.windowEnd = windowEnd;
        this.viewCount = viewCount;
    }

    /**
     * No-argument constructor; all fields keep their default value of zero.
     *
     * <p>Per the original author's note, a Flink POJO must have a no-argument
     * constructor, otherwise it cannot be used for key grouping and the job fails with:
     * {@code org.apache.flink.api.common.InvalidProgramException: This type
     * (GenericType<com.suning.flink.topn.CountResultWindowOutput>) cannot be used as key.}
     * raised from {@code Keys$ExpressionKeys.<init>} via {@code DataStream.keyBy}.
     */
    public CountResultWindowOutput() {
    }
}
           

窗口函数:

1.增量聚合函数:

package com.flink.topn;

import org.apache.flink.api.common.functions.AggregateFunction;

/**
 * Incremental aggregate that counts records: the accumulator is the number of
 * records seen so far and is also the final result.
 */
public class CountGoodNum implements AggregateFunction<UserDataBean, Long, Long> {

    /** Starts a new count at zero. */
    @Override
    public Long createAccumulator() {
        return Long.valueOf(0L);
    }

    /** Adds one to the count; the content of the record itself is not inspected. */
    @Override
    public Long add(UserDataBean userDataBean, Long acc) {
        final long incremented = acc + 1L;
        return incremented;
    }

    /** The result is the accumulated count, unchanged. */
    @Override
    public Long getResult(Long acc) {
        return acc;
    }

    /** Combines two partial counts by summing them. */
    @Override
    public Long merge(Long acc1, Long acc2) {
        return Long.sum(acc1, acc2);
    }
}
           

2.输出窗口函数:

package com.flink.topn;

import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

/**
 * Window function that wraps the pre-aggregated count of one key and one window
 * into a {@link CountResultWindowOutput}.
 */
public class CountResultWindow implements WindowFunction<Long, CountResultWindowOutput, Tuple, TimeWindow> {
    /**
     * Emits a single record holding the item ID taken from the key, the window end
     * time and the first (aggregated) value of the input.
     *
     * @param key        grouping key; cast to a one-field tuple whose field is the item ID
     * @param timeWindow the window being evaluated
     * @param iterable   window values; only the first element is read
     * @param collector  receives the resulting record
     */
    @Override
    public void apply(Tuple key, TimeWindow timeWindow, Iterable<Long> iterable,
                      Collector<CountResultWindowOutput> collector) throws Exception {
        final Tuple1<?> singleFieldKey = (Tuple1<?>) key;
        final long itemId = (Long) singleFieldKey.f0;
        final long windowEnd = timeWindow.getEnd();
        final long clickCount = iterable.iterator().next();

        collector.collect(new CountResultWindowOutput(itemId, windowEnd, clickCount));
    }
}
           

3.最终输出处理函数(KeyedProcessFunction):

package com.flink.topn;

import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;

/**
 * Collects the per-item counts that share the same key, and when the event-time
 * timer registered at {@code windowEnd + 1} fires, emits a formatted report of the
 * {@code topSize} items with the highest view count.
 */
public class TopNGoodsInfo extends KeyedProcessFunction<Tuple, CountResultWindowOutput, String> {
    /** Maximum number of items listed in each report. */
    private final int topSize;
    // Keyed state buffering the item counts received for the current key.
    private transient ListState<CountResultWindowOutput> goodsState;

    /**
     * @param topSize maximum number of items to list in each report
     */
    public TopNGoodsInfo(int topSize) {
        this.topSize = topSize;
    }

    /** Obtains the list state handle used to buffer incoming counts. */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        ListStateDescriptor<CountResultWindowOutput> stateDescriptor = new ListStateDescriptor<>("goods_state", CountResultWindowOutput.class);
        goodsState = this.getRuntimeContext().getListState(stateDescriptor);
    }

    /**
     * Buffers the element and registers an event-time timer one millisecond after
     * the element's window end.
     */
    @Override
    public void processElement(CountResultWindowOutput countResultWindowOutput, Context context, Collector<String> collector) throws Exception {
        goodsState.add(countResultWindowOutput);
        context.timerService().registerEventTimeTimer(countResultWindowOutput.windowEnd + 1);
    }

    /**
     * Sorts the buffered counts by view count in descending order, emits the report
     * and releases the buffered state.
     *
     * @param timestamp firing time of the timer, i.e. window end + 1
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        super.onTimer(timestamp, ctx, out);
        ArrayList<CountResultWindowOutput> list = new ArrayList<>();
        for (CountResultWindowOutput goodsInfo : goodsState.get()) {
            list.add(goodsInfo);
        }
        // The buffered counts are copied out above; clear the state so it does not
        // keep growing with data of windows that have already been reported.
        goodsState.clear();

        // Long.compare avoids the truncation of a long difference cast to int, which
        // could flip the order or report unequal counts as equal.
        list.sort((o1, o2) -> Long.compare(o2.viewCount, o1.viewCount));

        // Format the report.
        StringBuilder builder = new StringBuilder();
        builder.append("====================================\n");
        // timestamp - 1 undoes the +1 added when the timer was registered.
        builder.append("时间: ").append(new Timestamp(timestamp - 1)).append("\n");
        for (int i = 0; i < topSize && i < list.size(); i++) {
            CountResultWindowOutput countResultWindowOutput = list.get(i);
            builder.append("No").append(i + 1).append(":")
                    .append("  商品ID=").append(countResultWindowOutput.itemId)
                    .append("  浏览量=").append(countResultWindowOutput.viewCount)
                    .append("\n");

        }
        builder.append("====================================\n");
        out.collect(builder.toString());

    }

    @Override
    public void close() throws Exception {
        super.close();
    }
}
           

参考:https://ververica.cn/developers/computing-real-time-hot-goods/

继续阅读