友情提示:
抄本篇文章答案之前,请务必提前下载好 反诈APP。
多年情怀粉,今日粉转黑....
问题:
Q1: 将附件中 ip_china.csv.zip文件加载为 Hive 内部表,保持格式与 csv header一致,表需要开启压缩
Q2: 将附件中 login_data.csv.zip文件加载为 Hive 外部表,保持格式与csv header一致,表需要开启压缩,需要按日分区
Q3: 通过Q1,Q2加载的数据,将用户登录表中的ip转化为对应的国家地区并落表(避免笛卡尔积)
Q4: 请输出每个分区下,每个province的去重登录人数。输出结构为 pt,province,cnt_login
Q5: 请输出总量数据下,存在登录数据的各个province中,登录时间最早的前3人及对应的登录时间,若不满3人,需要留空。输出结构为 province,account_id_1, login_time_1, account_id_2, login_time_2, account_id_3, login_time_3
q1.sql
-- q1: load ip_china.csv into a compressed Hive managed (internal) table.
--
-- Step 1: staging table. A plain-text, comma-delimited mirror of the raw CSV;
-- every column is kept as string so the file is loaded verbatim (header row
-- included).
CREATE TABLE ip_txt (
    `ip_start`      string,
    `ip_end`        string,
    `long_ip_start` string,
    `long_ip_end`   string,
    `country`       string,
    `province`      string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Step 2: move the unzipped CSV from the local filesystem into the staging table.
LOAD DATA LOCAL INPATH '/root/ip_china.csv' INTO TABLE ip_txt;

-- Step 3: final managed table, same columns as the CSV header, stored as ORC
-- with Snappy compression. No ROW FORMAT DELIMITED clause here: a field
-- delimiter only describes text files and has no meaning for ORC storage.
CREATE TABLE IF NOT EXISTS `ip_orc` (
    `ip_start`      string,
    `ip_end`        string,
    `long_ip_start` string,
    `long_ip_end`   string,
    `country`       string,
    `province`      string
)
STORED AS ORC
TBLPROPERTIES ('orc.compress' = 'SNAPPY');

-- Step 4: copy the data rows into the ORC table. The CSV header line was loaded
-- as an ordinary row, so it is filtered out by matching its literal value.
INSERT INTO TABLE ip_orc
SELECT
    ip_start,
    ip_end,
    long_ip_start,
    long_ip_end,
    country,
    province
FROM ip_txt
WHERE ip_start != 'ip_start';
q2.sql
-- q2: load login_data.csv into a compressed, day-partitioned Hive external table.
--
-- Step 1: staging table. A plain-text, comma-delimited mirror of the raw CSV
-- (header row included).
CREATE TABLE login_txt (
    logtime    string,
    account_id string,
    ip         string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Step 2: move the unzipped CSV from the local filesystem into the staging table.
LOAD DATA LOCAL INPATH '/root/login_data.csv' INTO TABLE login_txt;

-- Step 3: final external table, same columns as the CSV header, partitioned by
-- day (`ds`), stored as ORC with Snappy compression. No ROW FORMAT DELIMITED
-- clause here: a field delimiter has no meaning for ORC storage.
CREATE EXTERNAL TABLE IF NOT EXISTS `login_orc` (
    logtime    string,
    account_id string,
    ip         string
)
PARTITIONED BY (`ds` string)
STORED AS ORC
TBLPROPERTIES ('orc.compress' = 'SNAPPY');

-- Step 4: every partition value below is derived from the data, so dynamic
-- partitioning must be enabled and the mode must be exactly 'nonstrict'
-- (strict mode requires at least one static partition column).
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

-- Step 5: copy the data rows, skipping the CSV header line that was loaded as
-- an ordinary row. The partition key is the first 10 characters of logtime.
-- NOTE(review): assumes logtime starts with a 'yyyy-MM-dd' date -- confirm
-- against the CSV.
INSERT INTO TABLE login_orc PARTITION (ds)
SELECT
    logtime,
    account_id,
    ip,
    substr(logtime, 1, 10) AS ds
FROM login_txt
WHERE logtime != 'logtime';
q3.sql
-- q3: resolve every login IP to its province and persist the result as
-- user_info (account_id, ip_long, province, logtime, ds).
--
-- A range lookup (ip BETWEEN start AND end) has no equality key, so joining
-- login_orc to ip_orc directly degenerates into a Cartesian product. To avoid
-- that, both sides are keyed by the /16 block (ip DIV 65536) the address falls
-- in: each IP range is expanded into one row per /16 block it covers, the join
-- is an equi-join on that block, and the exact range check is applied as a
-- residual filter.
--
-- Logins whose IP matches no range are dropped, which is what the original
-- LEFT JOIN ... WHERE <range filter on the right table> effectively did.
CREATE TABLE IF NOT EXISTS user_info AS
WITH login_long AS (
    -- Dotted-quad IPv4 string -> 32-bit value held in a bigint.
    SELECT
        l.account_id,
        l.logtime,
        l.ds,
        cast(split(l.ip, '\\.')[0] AS bigint) * 256 * 256 * 256
            + cast(split(l.ip, '\\.')[1] AS bigint) * 256 * 256
            + cast(split(l.ip, '\\.')[2] AS bigint) * 256
            + cast(split(l.ip, '\\.')[3] AS bigint) AS ip_long
    FROM login_orc l
),
login_block AS (
    -- Attach the /16 block used as the equi-join key.
    SELECT
        account_id,
        logtime,
        ds,
        ip_long,
        ip_long DIV 65536 AS ip_block
    FROM login_long
),
ip_range AS (
    -- The range bounds are stored as strings; cast once so comparisons are
    -- numeric rather than relying on implicit conversion.
    SELECT
        province,
        cast(long_ip_start AS bigint) AS range_start,
        cast(long_ip_end AS bigint)   AS range_end
    FROM ip_orc
),
ip_range_block AS (
    -- One row per /16 block covered by the range. space(n) yields n blanks and
    -- splitting it on ' ' yields n + 1 elements, so posexplode produces the
    -- offsets 0..n.
    SELECT
        r.province,
        r.range_start,
        r.range_end,
        (r.range_start DIV 65536) + blk.block_offset AS ip_block
    FROM ip_range r
    LATERAL VIEW posexplode(
        split(space(cast((r.range_end DIV 65536) - (r.range_start DIV 65536) AS int)), ' ')
    ) blk AS block_offset, pad
)
SELECT
    l.account_id,
    l.ip_long,
    r.province,
    l.logtime,
    l.ds
FROM login_block l
INNER JOIN ip_range_block r
    ON l.ip_block = r.ip_block
WHERE l.ip_long >= r.range_start
  AND l.ip_long <= r.range_end;

-- Quick sanity check of the persisted table.
SELECT
    account_id,
    ip_long,
    province,
    logtime,
    ds
FROM user_info
LIMIT 10;
q3.png
q4.sql
-- q4: number of distinct accounts that logged in, per day partition and
-- province. Output columns: pt, province, cnt_login.
-- Rows are sorted by partition, then by ascending login count.
SELECT
    daily.pt,
    daily.province,
    daily.cnt_login
FROM (
    SELECT
        ds                         AS pt,
        province,
        COUNT(DISTINCT account_id) AS cnt_login
    FROM user_info
    GROUP BY
        ds,
        province
) daily
ORDER BY
    daily.pt,
    daily.cnt_login;
q4.png
q5.sql
-- q5: for every province that has login data (over all partitions), the three
-- accounts with the earliest login and their login times.
-- Output columns: province, account_id_1, login_time_1, account_id_2,
-- login_time_2, account_id_3, login_time_3.
--
-- Slots are filled in login-time order. When a province has fewer than three
-- accounts, the accounts that do exist are still reported and only the missing
-- slots are left as empty strings.
WITH first_login AS (
    -- Collapse to one row per (province, account) so the same account cannot
    -- occupy more than one of the three slots.
    -- NOTE(review): logtime is a string; min() and the ordering below assume a
    -- lexicographically sortable format such as 'yyyy-MM-dd HH:mm:ss' -- confirm.
    SELECT
        province,
        account_id,
        min(logtime) AS first_logtime
    FROM user_info
    GROUP BY
        province,
        account_id
),
ranked AS (
    -- Rank accounts inside each province by first login; account_id breaks
    -- ties so the result is deterministic.
    SELECT
        province,
        account_id,
        first_logtime,
        row_number() OVER (
            PARTITION BY province
            ORDER BY first_logtime, account_id
        ) AS rn
    FROM first_login
)
-- Pivot ranks 1..3 into columns. Conditional aggregation is used instead of
-- collect_list + split because collect_list gives no ordering guarantee.
SELECT
    province,
    coalesce(max(CASE WHEN rn = 1 THEN account_id END), '')    AS account_id_1,
    coalesce(max(CASE WHEN rn = 1 THEN first_logtime END), '') AS login_time_1,
    coalesce(max(CASE WHEN rn = 2 THEN account_id END), '')    AS account_id_2,
    coalesce(max(CASE WHEN rn = 2 THEN first_logtime END), '') AS login_time_2,
    coalesce(max(CASE WHEN rn = 3 THEN account_id END), '')    AS account_id_3,
    coalesce(max(CASE WHEN rn = 3 THEN first_logtime END), '') AS login_time_3
FROM ranked
WHERE rn <= 3
GROUP BY province;
q5.png