1.分位数
percentile_approx(DOUBLE col, p [, B]) Returns an approximate pth percentile of a numeric column (including floating point types) in the group
含义: 在col列中返回p%的分位数
select percentile_approx( arr_id , 0.5 )from (selectarr_idfrom(selectarray(1, 2, 3, 4, 5, 6, 7, 8, 9, 1000) as arr) a lateral view explode(arr) tt as arr_id) a |
2.url解析
解码url: URL转码,encodeURI,encodeURIComponent — 在线工具
解析json: 在线JSON校验格式化工具(Be JSON)
上报: :path: /log/web?0000171612422098683https%3A%2F%2Fwww.bilibili.com%2Fbangumi%2Flist%2Fsl17662%3Ffrom_spmid%3Dpgc.cinema-tab.0.0%26intentFrom%3D15%26native.theme%3D1%26navhide%3D1%26share_medium%3Dandroid%26share_plat%3Dandroid%26share_source%3DCOPY%26share_tag%3Ds_i%26timestamp%3D1612343552%26unique_k%3DHKz3gy|666.49.selfDef.click_unfollow||1612422098000|0|0|980x1743|2|{%22event%22:%22click_unfollow%22,%22value%22:{%22sl_id%22:%2217662%22,%22season_id%22:35582,%22season_type%22:3,%22mid%22:36865977},%22bsource%22:%22share_source_copy_link%22}|{}|https%3A%2F%2Fm.bilibili.com%2F|FF7C614F-2539-A3E3-1AE8-E4265F23111B03042infoc|zh-CN|nullmsg: {%22event%22:%22click_season%22,%22value%22:{%22sl_id%22:%2218665%22,%22season_id%22:32429,%22season_type%22:2,%22mid%22:179753067},%22bsource%22:%22%22}解码url: https://www.sojson.com/encodeurl.html 解析json: https://www.bejson.com/ 解析msg 字段: select * ,get_json_object(msg, '$.event') as event1 ,get_json_object(msg, '$.value') as value1 ,get_json_object(get_json_object(msg, '$.value'), '$.sl_id') as sub_value_sl ,get_json_object(get_json_object(msg, '$.value'), '$.season_id') as sub_value_season ,get_json_object(get_json_object(msg, '$.value'), '$.season_type') as sub_value_season_type ,get_json_object(get_json_object(msg, '$.value'), '$.mid') as sub_value_mid from ( select '{"event":"click_season","value":{"sl_id":"18665","season_id":32429,"season_type":2,"mid":179753067},"bsource":""}' as msg union all select '{"event":"unload","value":{"enter":1612292714266,"leave":1612292745642},"bsource":""}' as msg union all select '{"event":"show_hover","value":{"sl_id":"20233","season_id":33987,"season_type":2,"mid":172278354},"bsource":""}' as msg union all select '{"event":"click_season","value":{"sl_id":"17834","season_id":34565,"season_type":2,"mid":271658710},"bsource":"search_baidu"}' as msg union all select '{"event":"show_hover","value":{"sl_id":"17834","season_id":28274,"season_type":2,"mid":281264988},"bsource":""}' as msg ) alimit 2000select *, reflect("java.net.URLDecoder", "decode", msg, "UTF-8") as url ,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.event') as event_type ,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value') as event_value ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.sl_id') as sub_value_sl ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_id') as sub_value_season ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_type') as sub_value_season_type ,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.mid') as sub_value_mid fromb_dwd.dwd_flow_web_click_l_d WHERElog_date between '<%= log_date %>' and '<%= log_date %>' and substr(spm_id,1,6)= '666.49'limit 2000 |
3.正则
滑动验证页面
select
*,
a.from_spmid rlike concat('^','dynamic') ,
a.from_spmid rlike concat('^','dt+') ,
a.from_spmid rlike '^dt'
from
(
select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
select 'out_open_deeplink_gdtlh-and-yf-31yaz-17776718' as from_spmid union ALL
select 'out_open_deeplink_gdtlh-and-yf-31yaz-18253662' as from_spmid union ALL
select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
select 'dynamic.video-tab.0.0' as from_spmid union ALL
select 'default-value' as from_spmid
) a
3.hsql 间隔日期补数
方法1:窗口函数
with tmp_table as (select dt ,city , val from (select '20220101' as dt ,'上海' as city , 100 val union all select '20220102' as dt ,'上海' as city , null val union all select '20220104' as dt ,'上海' as city , null val union all select '20220106' as dt ,'上海' as city , 200 val ) a ) ,tmp_a as ( select dt ,city , val , case when val is null then 0 else 1 end val1 -- 将空值设置为0,非空值设置为1 from tmp_table ), tmp_b as ( select dt ,city , val ,val1, sum(val1) over(partition by '1' order by dt) as cnt_val1 -- 区分 空和费控分空分组 from tmp_a ) select dt ,city , val ,val1,cnt_val1, max(val) over(partition by cnt_val1 order by dt) as fill_val -- 使用开窗函数按照分组求最大值 from tmp_b |
方法2:udf
4. 用户大会员在期的天数
with tmp_dt as (select id,sdt,edtfrom (select 1 id, '2022-02-02' as sdt, '2022-02-07' as edt union all select 1 id, '2022-02-07' as sdt, '2022-02-18' as edt union allselect 1 id, '2022-02-10' as sdt, '2022-02-23' as edt ) a ) select id,base_line,max(dt) dt ,datediff(max(dt),base_line)+1 as days from (select id ,dt,max(base_f) over (PARTITION BY id ORDER BY dt) base_line from (select id ,dt,flag,sum(flag) over(partition by id order by concat(dt,flag) desc) as sm ,case when sum(flag) over(partition by id order by concat(dt,flag) desc)=0 then dt else 0 end as base_ffrom (select id,sdt as dt,1 as flagfrom tmp_dtunion all select id,edt,-1 as flagfrom tmp_dt) a ) b ) c group by id ,base_line |
5.取每个mid同时播放的设备数分布。
已经整理好buvid、mid、开始播放时间、结束播放时间。主要背景是想看下现在“共享账号”的用户量级有多少
https://berserker.bilibili.co/?URL=http://berserker.bilibili.co/#/adhoc?sqlId=100670985
with ogv_play as (select mid,buvid,season_id,epid,log_date,arch_play_timestamp as stime,arch_play_timestamp+total_duration as etimefrom bili_ogv.dwd_flow_ogv_play_app_timed_i_dwhere log_date between '20230226' and '20230226' and season_id >0 and total_duration>0 union select mid,buvid,season_id,epid,log_date,arch_play_timestamp as stime,arch_play_timestamp+played_duration as etimefrom bili_ogv.dwd_flow_ogv_play_web_timed_i_dwhere log_date between '20230226' and '20230226' and season_id >0 and played_duration>0 ),vip_user as (select mid,log_datefrom bili_ogv.dim_user_full_d where log_date between '20230226' and '20230226' and is_vip_user = 1 ),vip_play_pay_ep as (select a.mid,a.buvid,a.epid,a.stime,a.etime ,a.log_datefrom ogv_play a join vip_user b on a.mid = b.mid and a.log_date = b.log_datejoin (select epid,log_datefrom bili_ogv.dim_ep_av_full_d where log_date between '20230226' and '20230226' and ep_status not in (2,14)) c on a.epid = c.epid and a.log_date = c.log_date)-- mid同时播放的设备数分布 select i.buvid_cnt,count(*) as mid_cnt from ( select -- mid 的设备数去重 buvidj.mid,count(distinct j.buvid ) as buvid_cnt from (select -- 删除重复进入播放记录, 保留同时进入播放的记录h.log_date,h.mid,h.buvid,h.epid,h.in_out_time,h.label ,h.num -- 每个mid 同时在线计数,h.in_out_time_rank -- 每个mid 播放开始时间排序 ,h.num1 -- 用户进入次数计数,h.diff ,h.max_num ,h.max_inc_rank -- 升序的最大值,h.max_inc_rank_for_row -- 升序的最大值存放在每一行,h.acc_by_mid_num1 -- 每个mid重复进入播放计数 from (selectg.log_date,g.mid,g.buvid,g.epid,g.in_out_time,g.label ,g.num -- 每个mid 同时在线计数,g.in_out_time_rank -- 每个mid 播放开始时间排序 ,g.num1 -- 用户进入次数计数,g.diff ,g.max_num ,g.max_inc_rank -- 升序的最大值,g.max_inc_rank_for_row -- 升序的最大值存放在每一行,sum(1) over(partition by mid,num1 ) as acc_by_mid_num1 -- 每个mid重复进入播放计数 from (select -- 得到同时在线num 的升序部分 f.log_date,f.mid,f.buvid,f.epid,f.in_out_time,f.label ,f.num -- 每个mid 同时在线计数,f.in_out_time_rank -- 每个mid 播放开始时间排序 ,f.num1 -- 用户进入次数计数,f.diff ,f.max_num ,f.max_inc_rank -- 升序的最大值,f.max_inc_rank_for_row -- 升序的最大值存放在每一行 from (select e.log_date,e.mid,e.buvid,e.epid,e.in_out_time,e.label ,e.num -- 每个mid 同时在线计数,e.in_out_time_rank,e.num1 -- 用户进入次数计数,e.diff ,e.max_num ,e.max_inc_rank -- 升序的最大值,max(e.max_inc_rank) over(partition by e.mid) max_inc_rank_for_row -- 升序的最大值存放在每一行 from (select d.log_date,d.mid,d.buvid,d.epid,d.in_out_time,d.label ,d.num -- 每个mid 同时在线计数,d.in_out_time_rank,d.num1 -- 用户进入次数计数,d.diff ,d.max_num ,if(d.max_num=d.num,d.in_out_time_rank,0) as max_inc_rank -- 升序的最大值 from (select c.log_date,c.mid,c.buvid,c.epid,c.in_out_time,c.label ,c.num -- 每个mid 同时在线计数,c.in_out_time_rank,c.num1 -- 用户进入次数计数,c.diff ,first_value(c.num) over(partition by c.mid order by c.num desc ) as max_num from (selectb.log_date,b.mid,b.buvid,b.epid,b.in_out_time,b.label ,b.num -- 每个mid 同时在线计数,b.in_out_time_rank,sum(b.label1) over(partition by b.mid order by b.in_out_time ) as num1 -- 用户进入次数计数,b.in_out_time_rank-b.num as diff from(select a.log_date,a.mid,a.buvid,a.epid,a.in_out_time,a.label,sum(a.label) over(partition by a.mid order by a.in_out_time ) as num -- 每个mid 同时在线计数,row_number() over(partition by a.mid order by a.in_out_time ) as in_out_time_rank,if(a.label=-1 ,0,a.label) as label1 -- 退出分组from (-- 开始时间select log_date,mid,buvid,epid,stime as in_out_time,1 as labelfrom vip_play_pay_ep-- where mid = 57706 -- 1645967203union all -- 结束时间select log_date,mid,buvid,epid,etime as in_out_time,-1 as labelfrom vip_play_pay_ep-- where mid = 57706 -- 1645967203) a) b ) c ) d ) e ) f where f.in_out_time_rank between 1 and f.max_inc_rank_for_row ) g ) h where h.acc_by_mid_num1 =1 -- 每个mid重复进入播放计数) j group by j.mid) i group by i.buvid_cnt |
思路:
1.每个mid 同时在线计数
2.同时播放数提取升序部分
3.删除重复进入播放部分
5. 提取一个字符串中重复的item_id 后缀数字
select regexp_extract(sub_str,'item_id\\":\\"([0-9]+)(\\")',1) FROM (select '{"item_infos":[{"item_id":"701969"},{"item_id":"701965"}]},{"item_infos":[{"item_id":"701964"},{"item_id":"701963"}]}' as string_test ) a lateral view explode(split(string_test,',')) t as sub_str |