首页 > 代码库 > Hive综合案例分析之开窗函数使用
Hive综合案例分析之开窗函数使用
知识点:
1、Hive的窗口和分析函数进阶
CUME_DIST 小于等于当前行值的行数 / 总行数
PERCENT_RANK 当前rank值-1 / 总行数-1
NTILE 将窗口分成n片
LEAD(col, n, default) 窗口内下n行值
LAG(col, n , default) 窗口内上n行值
FIRST_VALUE 窗口内第一个值
LAST_VALUE 窗口内最后一个值
2、分析函数中包含三个分析子句
分组(Partition By)
排序(Order By)
窗口(Window)
3、ROWS BETWEEN ... AND ...
CURRENT ROW 当前行
N-rows PRECEDING 前面N行
N-rows FOLLOWING 后面N行
count() / sum() / max() / min()
CREATE TABLE rates ( app_name STRING , star_rates STRING)ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t‘;
load data local inpath ‘/home/spark/software/data/rates.txt‘ overwrite into table rates;
select * from rates;app0 1app1 2app2 2app3 3app4 3app5 3app6 5app7 5app8 5app9 5
create table app_ranks as select app_name , star_rates , NTILE(3) OVER (ORDER BY star_rates DESC) as nt , row_number() OVER (ORDER BY star_rates DESC) as rn , rank() OVER (ORDER BY star_rates DESC) as rk , dense_rank() OVER (ORDER BY star_rates DESC) as drk , CUME_DIST() OVER (ORDER BY star_rates) as cd , PERCENT_RANK() OVER (ORDER BY star_rates) as pr from rates order by star_rates desc;
select * from app_ranks;app_name star_rates nt rn rk drk cd prapp9 5 1 1 1 1 1.0 0.6666666666666666app8 5 1 2 1 1 1.0 0.6666666666666666app7 5 1 3 1 1 1.0 0.6666666666666666app6 5 1 4 1 1 1.0 0.6666666666666666app5 3 2 5 5 2 0.6 0.3333333333333333app4 3 2 6 5 2 0.6 0.3333333333333333app3 3 2 7 5 2 0.6 0.3333333333333333app2 2 3 8 8 3 0.3 0.1111111111111111app1 2 3 9 8 3 0.3 0.1111111111111111app0 1 3 10 10 4 0.1 0.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) from app_ranks;app0 1 0.1app2 2 0.3app1 2 0.6app5 3 0.6app4 3 1.2app3 3 1.7999999999999998app9 5 1.0app8 5 2.0app7 5 3.0app6 5 4.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) from app_ranks;app0 1 0.1app2 2 0.6app1 2 0.3app5 3 1.7999999999999998app4 3 1.2app3 3 0.6app9 5 4.0app8 5 3.0app7 5 2.0app6 5 1.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) from app_ranks;app0 1 0.1app2 2 0.6app1 2 0.3app5 3 1.2app4 3 1.2app3 3 0.6app9 5 2.0app8 5 2.0app7 5 2.0app6 5 1.0
select app_name, star_rates, rn, lead(rn) OVER (PARTITION BY star_rates ORDER BY rn), lag(rn) OVER (PARTITION BY star_rates ORDER BY rn) from app_ranks;app0 1 10 NULL NULLapp2 2 8 9 NULLapp1 2 9 NULL 8app5 3 5 6 NULLapp4 3 6 7 5app3 3 7 NULL 6app9 5 1 2 NULLapp8 5 2 3 1app7 5 3 4 2app6 5 4 NULL 3
CREATE EXTERNAL TABLE visitors ( domain STRING , month STRING , visitor STRING)ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t‘;
load data local inpath ‘/home/spark/software/data/visitors.txt‘ overwrite into table visitors;
select * from visitors where domain = ‘d001‘;d001 201301 101d001 201302 111d001 201303 121d001 201304 131d001 201305 141d001 201306 151d001 201307 201d001 201308 211d001 201309 221d001 201310 231d001 201311 241d001 201312 301d001 201401 301d001 201402 211d001 201403 271d001 201404 331d001 201405 351
select domain , month , visitor , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , lead(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , lag(visitor) OVER (PARTITION BY domain ORDER BY month DESC)from visitorswhere domain = ‘d001‘;d001 201405 351 351 351 331 NULLd001 201404 331 351 331 271 351d001 201403 271 351 271 211 331d001 201402 211 351 211 301 271d001 201401 301 351 301 301 211d001 201312 301 351 301 241 301d001 201311 241 351 241 231 301d001 201310 231 351 231 221 241d001 201309 221 351 221 211 231d001 201308 211 351 211 201 221d001 201307 201 351 201 151 211d001 201306 151 351 151 141 201d001 201305 141 351 141 131 151d001 201304 131 351 131 121 141d001 201303 121 351 121 111 131d001 201302 111 351 111 101 121d001 201301 101 351 101 NULL 111
select domain , month , visitor , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) , lag(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) , lag(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)from visitorswhere domain = ‘d001‘;d001 201405 351 351 351 331 0 141 0d001 201404 331 351 331 271 351 131 0d001 201403 271 351 271 211 331 121 0d001 201402 211 351 211 301 271 111 0d001 201401 301 351 301 301 211 101 0d001 201312 301 351 301 241 301 0 0d001 201311 241 351 241 231 301 0 0d001 201310 231 351 231 221 241 0 0d001 201309 221 351 221 211 231 0 0d001 201308 211 351 211 201 221 0 0d001 201307 201 351 201 151 211 0 0d001 201306 151 351 151 141 201 0 0d001 201305 141 351 141 131 151 0 351d001 201304 131 351 131 121 141 0 331d001 201303 121 351 121 111 131 0 271d001 201302 111 351 111 101 121 0 211d001 201301 101 351 101 0 111 0 301
create table visitors_report as select domain , month , visitor , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) as last_mon , visitor - lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_mon , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as last_year , visitor - lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_year from visitors;
select * from visitors_report where domain = ‘d001‘ and month > ‘2014‘;d001 201405 351 331 20.0 141 210.0d001 201404 331 271 60.0 131 200.0d001 201403 271 211 60.0 121 150.0d001 201402 211 301 -90.0 111 100.0d001 201401 301 301 0.0 101 200.0
select month , domain , visitor , last_mon , last_yearfrom visitors_reportwhere (domain = ‘d001‘ or domain = ‘d002‘) and month > ‘2014‘order by month desc, domain asclimit 100;201405 d001 351 331 141201405 d002 352 332 142201404 d001 331 271 131201404 d002 332 272 132201403 d001 271 211 121201403 d002 272 212 122201402 d001 211 301 111201402 d002 212 302 112201401 d001 301 301 101201401 d002 302 302 102
select month , domain , visitor , max(visitor) OVER (PARTITION BY month) as max_visitors , min(visitor) OVER (PARTITION BY month) as min_visitorsfrom visitorswhere month > ‘2014‘order by month desc, domain asc;201405 d001 351 353 351201405 d002 352 353 351201405 d003 353 353 351201404 d001 331 333 331201404 d002 332 333 331201404 d003 333 333 331201403 d001 271 273 271201403 d002 272 273 271201403 d003 273 273 271201402 d001 211 213 211201402 d002 212 213 211201402 d003 213 213 211201401 d001 301 303 301201401 d002 302 303 301201401 d003 303 303 301
select *from (select month , domain , visitor , max(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as max_visitors_last_12_mon , min(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as min_visitors_last_12_monfrom visitors) vwhere month > ‘20131‘order by month desc, domain asc;201405 d001 351 351 141201405 d002 352 352 142201405 d003 353 353 143201404 d001 331 331 131201404 d002 332 332 132201404 d003 333 333 133201403 d001 271 301 121201403 d002 272 302 122201403 d003 273 303 123201402 d001 211 301 111201402 d002 212 302 112201402 d003 213 303 113201401 d001 301 301 101201401 d002 302 302 102201401 d003 303 303 103201312 d001 301 301 101201312 d002 302 302 102201312 d003 303 303 103201311 d001 241 241 101201311 d002 242 242 102201311 d003 243 243 103201310 d001 231 231 101201310 d002 232 232 102201310 d003 233 233 103
Hive综合案例分析之开窗函数使用
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。