版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/someby/article/details/89017277
目录
本篇文章记录广告点击流量实时统计-计算每天各省各城市各广告的点击量。
代码
domain
AdStat.java
package graduation.java.domain; /** * FileName: AdStat * Author: hadoop * Email: [email protected] * Date: 19-4-4 上午10:26 * Description: * 广告点击实时状态类 */ public class AdStat { private String date; private String province; private String city; private long adid; private long clickCount; public String getDate() { return date; } public void setDate(String date) { this.date = date; } public String getProvince() { return province; } public void setProvince(String province) { this.province = province; } public String getCity() { return city; } public void setCity(String city) { this.city = city; } public long getAdid() { return adid; } public void setAdid(long adid) { this.adid = adid; } public long getClickCount() { return clickCount; } public void setClickCount(long clickCount) { this.clickCount = clickCount; } @Override public String toString() { return "AdStat{" + "date='" + date + '\'' + ", province='" + province + '\'' + ", city='" + city + '\'' + ", adid=" + adid + ", clickCount=" + clickCount + '}'; } }
dao
IAdStatDAO.java
package graduation.java.dao;

import graduation.java.domain.AdStat;

import java.util.List;

/**
 * FileName: IAdStatDAO
 * Author: hadoop
 * Email: [email protected]
 * Date: 19-4-4 上午10:28
 * Description:
 * DAO interface for persisting real-time ad click state.
 */
public interface IAdStatDAO {

    /**
     * Batch-writes ad click state rows: inserts rows not yet present and
     * updates the click count of rows that already exist.
     *
     * @param adStats the ad click state records to persist
     */
    void updateBatch(List<AdStat> adStats);
}
impl
AdStatDAOImpl.java
package graduation.java.impl;

import graduation.java.dao.IAdStatDAO;
import graduation.java.domain.AdStat;
import graduation.java.jdbc.JDBCHelper;
import graduation.java.model.AdStatQueryResult;

import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.List;

/**
 * FileName: AdStatDAOImpl
 * Author: hadoop
 * Email: [email protected]
 * Date: 19-4-4 上午10:31
 * Description:
 * DAO implementation for real-time ad click state.
 */
public class AdStatDAOImpl implements IAdStatDAO {

    /**
     * Upserts a batch of ad click state rows into {@code ad_stat}: for each record a
     * SELECT decides whether the (date, province, city, adid) row already exists, then
     * all inserts and all updates are executed as two JDBC batches.
     *
     * NOTE(review): this issues one SELECT per record (N+1 pattern). Fine for small
     * batches; consider a single INSERT ... ON DUPLICATE KEY UPDATE if volume grows.
     *
     * @param adStats records to persist; records with an existing row are updated,
     *                the rest are inserted
     */
    @Override
    public void updateBatch(List<AdStat> adStats) {
        JDBCHelper jdbcHelper = JDBCHelper.getInstance();

        List<AdStat> updateAdStatList = new ArrayList<AdStat>();
        List<AdStat> insertAdStatList = new ArrayList<AdStat>();

        // Mutable holder so the anonymous query callback can report the row count back.
        // count(*) always yields exactly one row, so the holder is overwritten each pass.
        AdStatQueryResult queryResult = new AdStatQueryResult();

        String selectSQL = "SELECT count(*) FROM ad_stat "
                + "WHERE date=? "
                + "AND province=? "
                + "AND city=? "
                + "AND adid=?";

        // Split the incoming batch into "already present" vs "first seen".
        for (AdStat adStat : adStats) {
            Object[] params = new Object[]{
                    adStat.getDate(),
                    adStat.getProvince(),
                    adStat.getCity(),
                    adStat.getAdid()};
            jdbcHelper.executeQuery(selectSQL, params, new JDBCHelper.QueryCallback() {
                @Override
                public void process(ResultSet rs) throws Exception {
                    while (rs.next()) {
                        queryResult.setCount(rs.getInt(1));
                    }
                }
            });
            if (queryResult.getCount() > 0) {
                updateAdStatList.add(adStat);
            } else {
                insertAdStatList.add(adStat);
            }
        }

        // Insert rows seen for the first time.
        String insertSQL = "INSERT INTO ad_stat VALUES(?,?,?,?,?)";
        List<Object[]> insertParamsList = new ArrayList<Object[]>();
        for (AdStat adStat : insertAdStatList) {
            insertParamsList.add(new Object[]{
                    adStat.getDate(),
                    adStat.getProvince(),
                    adStat.getCity(),
                    adStat.getAdid(),
                    adStat.getClickCount()});
        }
        jdbcHelper.executeBatch(insertSQL, insertParamsList);

        // Update rows that already exist.
        // fix: the original split this literal across a raw line break, which is not a
        // legal Java string literal; rebuilt as a proper concatenation.
        String updateSQL = "UPDATE ad_stat SET click_count=? "
                + "WHERE date=? "
                + "AND province=? "
                + "AND city=? "
                + "AND adid=?";
        List<Object[]> updateParamsList = new ArrayList<Object[]>();
        for (AdStat adStat : updateAdStatList) {
            updateParamsList.add(new Object[]{
                    adStat.getClickCount(),
                    adStat.getDate(),
                    adStat.getProvince(),
                    adStat.getCity(),
                    adStat.getAdid()});
        }
        jdbcHelper.executeBatch(updateSQL, updateParamsList);
    }
}
factory
DAOFactory.java
/**
 * Obtains the DAO used for real-time ad click statistics.
 *
 * @return a new AdStatDAOImpl instance (one per call; the implementation holds no state)
 */
public static AdStatDAOImpl getAdStatDAO() {
    AdStatDAOImpl adStatDAO = new AdStatDAOImpl();
    return adStatDAO;
}
model
AdStatQueryResult.java
package graduation.java.model; /** * FileName: AdStatQueryResult * Author: hadoop * Email: [email protected] * Date: 19-4-4 上午10:39 * Description: * 广告实时点击查询结果 */ public class AdStatQueryResult { private int count; public int getCount() { return count; } public void setCount(int count) { this.count = count; } @Override public String toString() { return "AdStatQueryResult{" + "count=" + count + '}'; } }
spark.ad
AdClickRealTimeStatSpark.java
/** *计算广告点击流量实时统计 * @param filteredAdRealTimeLogDStream * @return */ private static JavaPairDStream<String, Long> calculateRealTimeStat(JavaPairDStream<String, String> filteredAdRealTimeLogDStream) { // 业务逻辑一 // 广告点击流量实时统计 // 上面的黑名单实际上是广告类的实时系统中,比较常见的一种基础的应用 // 实际上,我们要实现的业务功能,不是黑名单 // 计算每天各省各城市各广告的点击量 // 这份数据,实时不断地更新到mysql中的,J2EE系统,是提供实时报表给用户查看的 // j2ee系统每隔几秒钟,就从mysql中搂一次最新数据,每次都可能不一样 // 设计出来几个维度:日期、省份、城市、广告 // j2ee系统就可以非常的灵活 // 用户可以看到,实时的数据,比如2015-11-01,历史数据 // 2015-12-01,当天,可以看到当天所有的实时数据(动态改变),比如江苏省南京市 // 广告可以进行选择(广告主、广告名称、广告类型来筛选一个出来) // 拿着date、province、city、adid,去mysql中查询最新的数据 // 等等,基于这几个维度,以及这份动态改变的数据,是可以实现比较灵活的广告点击流量查看的功能的 // date province city userid adid // date_province_city_adid,作为key;1作为value // 通过spark,直接统计出来全局的点击次数,在spark集群中保留一份;在mysql中,也保留一份 // 我们要对原始数据进行map,映射成<date_province_city_adid,1>格式 // 然后呢,对上述格式的数据,执行updateStateByKey算子 // spark streaming特有的一种算子,在spark集群内存中,维护一份key的全局状态 JavaPairDStream<String,Long> mappedDStream = filteredAdRealTimeLogDStream.mapToPair(new PairFunction<Tuple2<String, String>, String, Long>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, Long> call(Tuple2<String, String> tuple) throws Exception { String log = tuple._2; String[] logSplited = log.split(" "); String timestamp = logSplited[0]; Date date = new Date(Long.valueOf(timestamp)); String dateKey = DateUtils.formatDateKey(date); String province = logSplited[1]; String city = logSplited[2]; long adid = Long.valueOf(logSplited[4]); String key = dateKey +"_" + province + "_" + city + "_" + adid; return new Tuple2<String,Long>(key,1L); } }); // 在这个dstream中,就相当于,有每个batch rdd累加的各个key(各天各省份各城市各广告的点击次数) // 每次计算出最新的值,就在aggregatedDStream中的每个batch rdd中反应出来 JavaPairDStream<String,Long> aggregatedDStream = mappedDStream.updateStateByKey(new Function2<List<Long>, Optional<Long>, Optional<Long>>() { private static final long serialVersionUID = 1L; @Override public Optional<Long> call(List<Long> values, Optional<Long> optional) throws Exception { // 举例来说 
// 对于每个key,都会调用一次这个方法 // 比如key是<20151201_Jiangsu_Nanjing_10001,1>,就会来调用一次这个方法7 // 10个 // values,(1,1,1,1,1,1,1,1,1,1) // 首先根据optional判断,之前这个key,是否有对应的状态 long clickCount = 0L; // 如果说,之前是存在这个状态的,那么就以之前的状态作为起点,进行值的累加 if (optional.isPresent()){ clickCount = optional.get(); } // values,代表了,batch rdd中,每个key对应的所有的值 for (Long value : values){ clickCount += value; } return Optional.of(clickCount); } }); // 将计算出来的最新结果,同步一份到mysql中,以便于j2ee系统使用 aggregatedDStream.foreachRDD(new VoidFunction<JavaPairRDD<String, Long>>() { private static final long serialVersionUID = 1L; @Override public void call(JavaPairRDD<String, Long> rdd) throws Exception { rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Long>>>() { private static final long serialVerisonUID = 1L; @Override public void call(Iterator<Tuple2<String, Long>> iterator) throws Exception { List<AdStat> adStats = new ArrayList<AdStat>(); while (iterator.hasNext()){ Tuple2<String,Long> tuple = iterator.next(); String[] keySplited = tuple._1.split("_"); String date = keySplited[0]; String province = keySplited[1]; String city = keySplited[2]; long adid = Long.valueOf(keySplited[3]); long clickCount = tuple._2; AdStat adStat = new AdStat(); adStat.setDate(date); adStat.setProvince(province); adStat.setCity(city); adStat.setAdid(adid); adStat.setClickCount(clickCount); adStats.add(adStat); } IAdStatDAO adStatDAO = DAOFactory.getAdStatDAO(); adStatDAO.updateBatch(adStats); } }); } }); return aggregatedDStream; }