1. Setting a constant in SQL (impala-shell variable)
set var:pi_sysdate = 20241114;
Variable PI_SYSDATE set to 20241114
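The constant can then be referenced anywhere in the script through ${var:...} substitution; a minimal sketch (the table name is hypothetical):
select * from ods.t_trade where data_date = ${var:pi_sysdate};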
2. Creating external and managed tables in Impala on CDP
set default_transactional_type=none;
create external table stg.hd_aml_mac_ip_ext (
    machinedate string,
    vc_fundacco string,
    ip string
)
stored as textfile
tblproperties ('objcapabilities'='extread,extwrite');

create external table stg.hd_aml_mac_ip (
    machinedate string,
    vc_fundacco string,
    ip string
)
stored as parquet
tblproperties ("parquet.compression"="snappy");
3. Exporting a comma-delimited file from Hive to the local filesystem
hive -e "SELECT * from student" | sed 's/\t/,/g' > /tmp/student.csv
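Alternatively, Hive can write the comma-delimited file directly and skip the sed step; a minimal sketch (the output directory is illustrative):
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/student'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT * FROM student;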
4. Hive on MR parameters: enabling dynamic partitioning
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=500000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set mapreduce.reduce.memory.mb=4096;
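With these settings, a dynamic-partition load looks like the sketch below (table and column names are hypothetical; the partition column must come last in the select list):
insert overwrite table ods.t_orders partition (data_date)
select order_id, amount, data_date
from stg.t_orders;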
5. Generating table DDL from the Hive metastore (MySQL)
set session group_concat_max_len = 20480;
select concat_ws('', a.create_body_str, CHAR(10), c.tbl_comment, CHAR(10), b.partition_str, CHAR(10), a.stored_format, CHAR(10), d.compress_str, ';') AS create_sql
FROM (
    select t.TBL_ID,
           t.TBL_NAME,
           case when k.INPUT_FORMAT like '%.parquet%' then 'STORED AS PARQUET'
                when k.INPUT_FORMAT like '%.SequenceFile%' then 'STORED AS SEQUENCEFILE'
                when k.INPUT_FORMAT like '%.Text%' then ''
                else 'STORED AS NULL'
           end AS stored_format,
           concat_ws('', 'CREATE',
                     CASE t.TBL_TYPE WHEN 'EXTERNAL_TABLE' THEN ' EXTERNAL' ELSE '' END,
                     ' TABLE IF NOT EXISTS ${schema}.', t.TBL_NAME, '(', CHAR(10),
                     group_concat(concat_ws('', g.COLUMN_NAME, ' ', g.TYPE_NAME, ' COMMENT ', '''',
                                            REPLACE(REPLACE(g.COMMENT, ';', ' '), '; ', ' '), '''', CHAR(10))
                                  ORDER BY g.INTEGER_IDX separator ','),
                     ')') AS create_body_str
    from hive.TBLS t, hive.SDS k, hive.COLUMNS_V2 g, hive.DBS s
    where t.SD_ID = k.SD_ID
      and k.CD_ID = g.CD_ID
      and s.DB_ID = t.DB_ID
      and k.INPUT_FORMAT not like '%.kudu%'
      and s.NAME = 'stg'
    group by t.TBL_ID
) a
left join (
    select t.TBL_ID, concat_ws('', 'COMMENT ', '''', t.param_value, '''') AS tbl_comment
    from hive.TABLE_PARAMS t
    where t.param_key = 'comment'
    group by t.TBL_ID
) c on c.tbl_id = a.tbl_id
left join (
    select t.TBL_ID,
           concat_ws('', 'PARTITIONED BY (',
                     group_concat(concat_ws('', t.pkey_name, ' ', t.pkey_type, ' ', 'COMMENT ', '''', t.pkey_comment, '''')
                                  order by t.integer_idx separator ','),
                     ')') AS partition_str
    from hive.PARTITION_KEYS t
    group by t.TBL_ID
) b ON b.tbl_id = a.tbl_id
left join (
    select t.TBL_ID,
           concat_ws('', 'TBLPROPERTIES (', '''', t.PARAM_KEY, '''', '=', '''', t.PARAM_VALUE, ''')') as compress_str
    from hive.TABLE_PARAMS t
    where t.param_key like '%compression%'
    group by t.TBL_ID, t.param_key, t.param_value
) d on d.tbl_id = a.tbl_id
order by a.tbl_name;
6. Repairing metadata
invalidate metadata ods.tablename;   -- Impala: reload metadata after files change outside Impala
msck repair table ods.tablename;     -- Hive: register partitions present on HDFS but missing from the metastore
7. Impala: converting a timestamp to a string in a given format (and back)
SELECT from_timestamp(now(), 'yyyyMMdd');
SELECT to_timestamp('20230710', 'yyyyMMdd');
SELECT from_timestamp(date_add(to_timestamp('20231201', 'yyyyMMdd'), 1), 'yyyyMMdd');
SELECT date_add(now(), interval -1 years);
8. Using UDFs
use default;
show functions;
show create function default.genseq;
hdfs dfs -put /home/app_adm/etl/udf/udf_0608.jar /user/hive/warehouse/udf_0608.jar
hdfs dfs -chown hive:hive /user/hive/warehouse/udf_0608.jar
hdfs dfs -chmod 777 /user/hive/warehouse/udf_0608.jar
DROP FUNCTION DEFAULT.udf10(STRING, STRING);  -- Impala: signature is required
drop function default.udf10;                  -- Hive: no signature
create function default.clnseq as 'cn.com.businessmatrix.udf.HLSequenceCleaner' using jar 'hdfs:///user/hive/warehouse/udf_0608.jar';
create function default.genseq as 'cn.com.businessmatrix.udf.HLSequenceGenerator' using jar 'hdfs:///user/hive/warehouse/udf_0608.jar';
hdfs dfs -put -f /home/file/ylb_trade_transfer_ext_out /tmp/hive/stg/ylb_trade_transfer_ext_out
sudo -u hdfs hadoop fs -chown -R hive:supergroup /tmp/hive/stg/ylb_trade_transfer_ext
9. Impala: upserting into a Kudu table by primary key
upsert into ${var:schema_ods}.mdm_ip_cust (sk_invpty_of_cust, gp_flag)
select t.sk_invpty_of_cust,
       0 as gp_flag
from ods.mdm_ip_cust t
where t.gp_flag is null;
10. Archiving small files with Hadoop archives (HAR)
set hive.archive.enabled=true;
set hive.archive.har.parentdir.settable=true;
set har.partfile.size=1099511627776;
ALTER TABLE A ARCHIVE PARTITION(dt='2020-12-24', hr='12');
ALTER TABLE A UNARCHIVE PARTITION(dt='2020-12-24', hr='12');
11. Basic HBase operations
hbase shell
create 'student','info'
put 'student','1001','info:sex','male'
scan 'student'
scan 'student',{STARTROW => '1001', STOPROW => '1001'}
scan 'student',{STARTROW => '1001'}
describe 'student'
put 'student','1001','info:name','Nick'
get 'student','1001'
get 'student','1001','info:name'
count 'student'
alter 'student',{NAME=>'info',VERSIONS=>3}
get 'student','1001',{COLUMN=>'info:name',VERSIONS=>3}
deleteall 'student','1001'
delete 'student','1002','info:sex'
truncate 'student'
drop 'student'
list_namespace
create_namespace 'bigdata'
create 'bigdata:student','info'
drop_namespace 'bigdata'
12. Hive data masking (Chinese characters are not masked)
select mask('不不不bbb123');
No.  Policy                      Description                                            Hive built-in function
1    Redact                      Masks letters with x and digits with n                 mask
2    Partial mask: show last 4   Shows only the last 4 characters; the rest become x    mask_show_last_n
3    Partial mask: show first 4  Shows only the first 4 characters; the rest become x   mask_show_first_n
4    Hash                        Replaces the value with its hash                       mask_hash
5    Nullify                     Replaces the value with NULL                           implemented by Ranger
6    Unmasked                    Shows the value unchanged                              implemented by Ranger
7    Date: show only year        Shows only the year of a date string                   mask
8    Custom                      User-defined policy via a Hive UDF                     Hive UDF
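Quick examples of the corresponding built-ins (literals are illustrative):
select mask('abcd-1234');                  -- xxxx-nnnn
select mask_show_last_n('1234567890', 4);  -- nnnnnn7890
select mask_show_first_n('1234567890', 4); -- 1234nnnnnn
select mask_hash('abcd');                  -- hash of the value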
13. Syntax incompatibilities introduced by upgrading from CDH5 to CDH6.3.x
1. Pass parameters in as STRING.
2. '''||value||''' -> "'||value||'"
3. "" -> ""
4. Rework collect_set() usage to run on Hive on MR; run everything else through Impala into temporary tables.
14. SQL scripts must include an explanatory comment header
[Example]
nvl(c.en_ratio,0) as plg_rati,
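A minimal header sketch (the fields shown are an assumed convention, not a mandated format):
-- =====================================================
-- Script:      dwd_plg_ratio_load.sql  (hypothetical name)
-- Description: loads pledge ratios; nvl(c.en_ratio,0) defaults missing ratios to 0
-- Author/Date: app_adm / 2024-11-14
-- =====================================================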
15. Hive: renaming a column and setting its position
ALTER TABLE test_change CHANGE a a1 STRING AFTER b;
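Hive also accepts FIRST to move a column to the front of the table:
ALTER TABLE test_change CHANGE b b STRING FIRST;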
16. Sorting in Hive
ORDER BY: global sort; runs with a single reducer.
SORT BY: sorts only within each reducer, so the overall output is partially sorted.
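SORT BY is usually paired with DISTRIBUTE BY so related rows land on the same reducer; a sketch with hypothetical names:
select cust_id, trade_amt
from ods.t_trade
distribute by cust_id
sort by cust_id, trade_amt desc;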
17. Exporting Hive data to Oracle with Sqoop
sqoop export \
--connect jdbc:oracle:thin:@localhost:1521/testdb \
--username test \
--password 123456 \
--table t_test \
--export-dir '/apps/hive/warehouse/dbi.db/t_test/pdt=20191229' \
--columns ID,data_date,data_type,c1,c2,c3 \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n' \
--input-null-string '\\N' \
--input-null-non-string '\\N' > sqoop_oracle.log 2>&1 &
18. Hive: converting columns to rows (explode)
select new_fundaccount, new_bk_tradeaccount, bk_product
from stg.tt0liquidateschema_tmp01
lateral view explode(split(fundaccount, ',')) fundaccount as new_fundaccount
lateral view explode(split(bk_tradeaccount, ',')) bk_tradeaccount as new_bk_tradeaccount;
-- Oracle equivalent using CONNECT BY:
create table tmp_dz as
select '000855' as bk_product,
       '372402834320,37345435345435,37345343434' as fundaccount,
       '982342242322342,9842423424,98345333' as tradeaccount
from dual;

insert into tmp_dz
select '000845' as bk_product,
       '37345343454' as fundaccount,
       '98345333433' as tradeaccount
from dual;

select nvl(new_fundaccount, fundaccount) as fundaccount,
       nvl(new_tradeaccount, tradeaccount) as tradeaccount,
       bk_product
from (
    SELECT REGEXP_SUBSTR(fundaccount, '[^,]+', 1, ROWNUM) as new_fundaccount,
           REGEXP_SUBSTR(tradeaccount, '[^,]+', 1, ROWNUM) as new_tradeaccount,
           t.*
    FROM tmp_dz t
    CONNECT BY ROWNUM <= regexp_count(fundaccount, ',') + 1
) t;
19. Hive: converting multiple comma-delimited columns to rows (posexplode)
with temp as (
    select '1,2,3' as id, 'a,b,c' as name
    union
    select '4,5,6' as id, 'd,e,f' as name
)
select id, name, s_id, s_name
from temp
lateral view posexplode(split(id, ',')) t1 as s_id_index, s_id
lateral view posexplode(split(name, ',')) t2 as s_name_index, s_name
where s_id_index = s_name_index;
20. CDP Hive supports transactions (insert, update, delete, select)
create table cdhadmin_table_hive (col1 int);
insert into table cdhadmin_table_hive values (1);
insert into table cdhadmin_table_hive values (51);
insert into table cdhadmin_table_hive values (2);
insert into table cdhadmin_table_hive values (3);
select * from cdhadmin_table_hive;
delete from cdhadmin_table_hive where col1 = 51;
select * from cdhadmin_table_hive;
update cdhadmin_table_hive set col1=300 where col1=3;
select * from cdhadmin_table_hive;
grant select on database default to user cdhadmin;
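In CDP, managed tables are created as ACID tables by default; the explicit equivalent of the create statement above would be the sketch below (full ACID requires ORC storage):
create table cdhadmin_table_hive (col1 int)
stored as orc
tblproperties ('transactional'='true');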