需求
從 Web 日誌中統計每日訪客平均停留時間
實作步驟
1、由於要從大量請求中分辨出使用者的各次訪問,邏輯相對複雜,直接通過 Hive 實作有困難,所以編寫一個 MR 程式來求出訪客的訪問資訊(詳見代碼)
啟動mr程式擷取結果:
[hadoop@hdp-node-01 ~]$ hadoop jar weblog.jar cn.itcast.bigdata.hive.mr.UserStayTime /weblog/input /weblog/stayout
2、將 MR 的處理結果導入 Hive 表
-- Staging table for the MR job's tab-separated output: client address,
-- first/last request time, and stay duration for each record.
-- NOTE(review): the unit of stay_long is not shown here -- presumably
-- milliseconds; confirm against the MR job.
-- The column name firt_req_time (sic) is kept as-is because the statements
-- that read this table reference it by that name.
drop table if exists t_display_access_info_tmp;
create table t_display_access_info_tmp (
    remote_addr   string,
    firt_req_time string,
    last_req_time string,
    stay_long     bigint
)
row format delimited fields terminated by '\t';
-- Load from the directory the MR job was told to write to (/weblog/stayout).
-- Without LOCAL, Hive moves the files out of that directory into the table,
-- so re-running this statement requires re-running the MR job first.
load data inpath '/weblog/stayout' into table t_display_access_info_tmp;
-- Cleaned copy of the staging data used for the final statistic.
-- Rows whose stay_long is exactly 0 are given a stay of 30000 instead;
-- every other value (including NULL) is passed through unchanged.
-- NOTE(review): 30000 is presumably a default of 30 seconds in milliseconds
-- for visits with no measurable duration -- confirm the unit and intent.
drop table if exists t_display_access_info;
create table t_display_access_info as
select
    remote_addr,
    firt_req_time,
    last_req_time,
    case
        when stay_long = 0 then 30000
        else stay_long
    end as stay_long
from t_display_access_info_tmp;
-- Average stay duration over every row of t_display_access_info.
-- NOTE(review): the stated requirement is a per-day average, but this query
-- has no date grouping -- presumably the input covers a single day; confirm.
select
    avg(stay_long)
from t_display_access_info;