-- Expand the `category` collection of each movie: explode() emits one row per
-- element, exposed through the lateral view as `category_name`.
select
    movie,
    category_name
from movie_info
lateral view explode(category) table_tmp as category_name;
reflect调用Java自带函数
-- reflect() invokes a Java method through reflection.
-- Here: java.lang.Math.max(col1, col2), evaluated once per row of test_udf.
select reflect("java.lang.Math", "max", col1, col2) from test_udf;

-- The class name and method name can themselves be read from columns.
select reflect(class_name, method_name, col1, col2) from test_udf2;

-- Call with a literal argument and no source table.
-- The terminating semicolon was missing in the original statement.
select reflect("org.apache.commons.lang.math.NumberUtils", "isNumber", "123");
窗口函数
-- Demonstrates how the window frame changes the result of sum(sales) over
-- order_detail. Each sales_N column uses a different partition/order/frame.
select
    user_id,
    user_type,
    sales,

    -- All rows of the partition (no ordering, so the frame is the whole group).
    sum(sales) over (partition by user_type) as sales_1,

    -- No partition: ordered by user_type with the default frame.
    sum(sales) over (order by user_type) as sales_2,

    -- Default frame: from the partition start to the current row.
    -- Rows with the same sales value receive the same running total.
    sum(sales) over (
        partition by user_type
        order by sales asc
    ) as sales_3,

    -- Explicit row-based frame from the partition start to the current row.
    -- Differs from sales_3: tied sales values get different running totals,
    -- and the totals depend on which tied row happens to be sorted first.
    sum(sales) over (
        partition by user_type
        order by sales asc
        rows between unbounded preceding and current row
    ) as sales_4,

    -- Current row plus the 3 preceding rows.
    sum(sales) over (
        partition by user_type
        order by sales asc
        rows between 3 preceding and current row
    ) as sales_5,

    -- Current row plus the 3 preceding rows and the 1 following row.
    sum(sales) over (
        partition by user_type
        order by sales asc
        rows between 3 preceding and 1 following
    ) as sales_6,

    -- Current row plus every following row in the partition.
    sum(sales) over (
        partition by user_type
        order by sales asc
        rows between current row and unbounded following
    ) as sales_7
from order_detail
order by
    user_type,
    sales,
    user_id;
分析函数
-- Compare the three ranking functions within each user_type, highest sales first.
-- Fix: the original had `fromorder_detail` (keyword fused with the table name).
select
    user_id,
    user_type,
    sales,
    -- rank(): ties share a rank and the following rank is skipped.
    rank() over (partition by user_type order by sales desc) as r,
    -- row_number(): unique sequential number, even for ties.
    row_number() over (partition by user_type order by sales desc) as rn,
    -- dense_rank(): ties share a rank and no rank is skipped.
    dense_rank() over (partition by user_type order by sales desc) as dr
from order_detail;
cube聚合
-- Count user_id for every grouping combination of (user_type, sales) produced
-- by WITH CUBE; GROUPING__ID identifies which combination a row belongs to.
-- Fix: the original had `selectuser_type` (keyword fused with the first column).
select
    user_type,
    sales,
    count(user_id) as pv,
    GROUPING__ID
from order_detail
group by
    user_type,
    sales
with cube
order by GROUPING__ID;
hive的一些调优参数
-- Fix: every statement was missing the `=` between the property name and its
-- value, and the trailing notes were bare text after the semicolon.

-- Use dynamic partitioning: nonstrict mode allows all partition columns to be dynamic.
set hive.exec.dynamic.partition.mode=nonstrict;

-- Maximum number of dynamic partitions that may be created in total.
set hive.exec.max.dynamic.partitions=100000;

-- Maximum number of dynamic partitions that may be created per mapper/reducer node.
set hive.exec.max.dynamic.partitions.pernode=100000;

-- Enable splittable CombineInputFormat support (the original note is truncated in the source).
set hive.hadoop.supports.splittable.combineinputformat=true;
统计指标
select
substr(tb.begin_address_code , 1 ,4) as begin_address_code ,
count(distinct vehicle_license) as dayVehicleCount
from
(select
begin_address_code ,
vehicle_license
from
order
where
date_format(create_time , yyyy-MM-dd) 2020-02-15
) tb
grou…
假定样本集 $D = \{x_1, x_2, \ldots, x_m\}$ 包含 $m$ 个无标记样本,每个样本 $x_i = (x_{i1}; x_{i2}; \ldots; x_{in})$ 是一个 $n$ 维特征向量,则聚类算法将样本集 $D$ 划分为 $k$ 个不相交的簇 $\{G_l \mid l = 1, 2, \ldots, k\}$,其中…
k 近邻(k-Nearest Neighbor,简称 kNN)学习是一种常用的监督学习方法,其工作机制非常简单:给定测试样本,基于某种距离度量找出训练集中与其最靠近的 k 个训练样本,然后基于这 k 个"邻居"的信息来进行预测。通常,在分…