首页 > 代码库 > Hive综合案例分析之开窗函数使用

Hive综合案例分析之开窗函数使用

知识点:

1、Hive的窗口和分析函数进阶

  CUME_DIST 小于等于当前行值的行数 / 总行数

  PERCENT_RANK 当前rank值-1 / 总行数-1

  NTILE 将窗口分成n片

  LEAD(col, n, default) 窗口内下n行值

  LAG(col, n , default) 窗口内上n行值

  FIRST_VALUE 窗口内第一个值

  LAST_VALUE 窗口内最后一个值

2、分析函数中包含三个分析子句

  分组(Partition By)

  排序(Order By)

  窗口(Window)

3、ROWS BETWEEN ... AND ...

  CURRENT ROW 当前行

  N-rows PRECEDING 前面N行

  N-rows FOLLOWING 后面N行

  count() / sum() / max() / min()

 

CREATE TABLE rates (    app_name    STRING   , star_rates  STRING)ROW FORMAT DELIMITED FIELDS TERMINATED BY \t;
load data local inpath /home/spark/software/data/rates.txt overwrite into table rates;
select * from rates;app0    1app1    2app2    2app3    3app4    3app5    3app6    5app7    5app8    5app9    5
create table app_ranks as  select app_name       , star_rates       , NTILE(3) OVER (ORDER BY star_rates DESC) as nt       , row_number() OVER (ORDER BY star_rates DESC) as rn       , rank() OVER (ORDER BY star_rates DESC) as rk       , dense_rank() OVER (ORDER BY star_rates DESC) as drk       , CUME_DIST() OVER (ORDER BY star_rates) as cd       , PERCENT_RANK() OVER (ORDER BY star_rates) as pr  from rates  order by star_rates desc;
select * from app_ranks;app_name        star_rates      nt      rn      rk      drk     cd      prapp9    5       1       1       1       1       1.0     0.6666666666666666app8    5       1       2       1       1       1.0     0.6666666666666666app7    5       1       3       1       1       1.0     0.6666666666666666app6    5       1       4       1       1       1.0     0.6666666666666666app5    3       2       5       5       2       0.6     0.3333333333333333app4    3       2       6       5       2       0.6     0.3333333333333333app3    3       2       7       5       2       0.6     0.3333333333333333app2    2       3       8       8       3       0.3     0.1111111111111111app1    2       3       9       8       3       0.3     0.1111111111111111app0    1       3       10      10      4       0.1     0.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) from app_ranks;app0    1       0.1app2    2       0.3app1    2       0.6app5    3       0.6app4    3       1.2app3    3       1.7999999999999998app9    5       1.0app8    5       2.0app7    5       3.0app6    5       4.0

 

select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) from app_ranks;app0    1       0.1app2    2       0.6app1    2       0.3app5    3       1.7999999999999998app4    3       1.2app3    3       0.6app9    5       4.0app8    5       3.0app7    5       2.0app6    5       1.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) from app_ranks;app0    1       0.1app2    2       0.6app1    2       0.3app5    3       1.2app4    3       1.2app3    3       0.6app9    5       2.0app8    5       2.0app7    5       2.0app6    5       1.0
select app_name, star_rates, rn, lead(rn) OVER (PARTITION BY star_rates ORDER BY rn), lag(rn) OVER (PARTITION BY star_rates ORDER BY rn) from app_ranks;app0    1       10      NULL    NULLapp2    2       8       9       NULLapp1    2       9       NULL    8app5    3       5       6       NULLapp4    3       6       7       5app3    3       7       NULL    6app9    5       1       2       NULLapp8    5       2       3       1app7    5       3       4       2app6    5       4       NULL    3

 

 

CREATE EXTERNAL TABLE visitors (    domain  STRING   , month   STRING  , visitor STRING)ROW FORMAT DELIMITED FIELDS TERMINATED BY \t;
load data local inpath /home/spark/software/data/visitors.txt overwrite into table visitors;
select * from visitors where domain = d001;d001    201301  101d001    201302  111d001    201303  121d001    201304  131d001    201305  141d001    201306  151d001    201307  201d001    201308  211d001    201309  221d001    201310  231d001    201311  241d001    201312  301d001    201401  301d001    201402  211d001    201403  271d001    201404  331d001    201405  351
select domain     , month     , visitor     , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)     , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)     , lead(visitor) OVER (PARTITION BY domain ORDER BY month DESC)     , lag(visitor) OVER (PARTITION BY domain ORDER BY month DESC)from visitorswhere domain = d001;d001    201405  351     351     351     331     NULLd001    201404  331     351     331     271     351d001    201403  271     351     271     211     331d001    201402  211     351     211     301     271d001    201401  301     351     301     301     211d001    201312  301     351     301     241     301d001    201311  241     351     241     231     301d001    201310  231     351     231     221     241d001    201309  221     351     221     211     231d001    201308  211     351     211     201     221d001    201307  201     351     201     151     211d001    201306  151     351     151     141     201d001    201305  141     351     141     131     151d001    201304  131     351     131     121     141d001    201303  121     351     121     111     131d001    201302  111     351     111     101     121d001    201301  101     351     101     NULL    111
select domain     , month     , visitor     , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)     , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)     , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)     , lag(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)     , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)     , lag(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)from visitorswhere domain = d001;d001    201405  351     351     351     331     0       141     0d001    201404  331     351     331     271     351     131     0d001    201403  271     351     271     211     331     121     0d001    201402  211     351     211     301     271     111     0d001    201401  301     351     301     301     211     101     0d001    201312  301     351     301     241     301     0       0d001    201311  241     351     241     231     301     0       0d001    201310  231     351     231     221     241     0       0d001    201309  221     351     221     211     231     0       0d001    201308  211     351     211     201     221     0       0d001    201307  201     351     201     151     211     0       0d001    201306  151     351     151     141     201     0       0d001    201305  141     351     141     131     151     0       351d001    201304  131     351     131     121     141     0       331d001    201303  121     351     121     111     131     0       271d001    201302  111     351     111     101     121     0       211d001    201301  101     351     101     0       111     0       301
create table visitors_report as  select domain       , month       , visitor       , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)            as last_mon       , visitor - lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)  as delta_mon       , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)           as last_year       , visitor - lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_year  from visitors;
select * from visitors_report where domain = d001 and month > 2014;d001    201405  351     331     20.0    141     210.0d001    201404  331     271     60.0    131     200.0d001    201403  271     211     60.0    121     150.0d001    201402  211     301     -90.0   111     100.0d001    201401  301     301     0.0     101     200.0
select month     , domain     , visitor     , last_mon     , last_yearfrom visitors_reportwhere (domain = d001 or domain = d002) and month > 2014order by month desc, domain asclimit 100;201405  d001    351     331     141201405  d002    352     332     142201404  d001    331     271     131201404  d002    332     272     132201403  d001    271     211     121201403  d002    272     212     122201402  d001    211     301     111201402  d002    212     302     112201401  d001    301     301     101201401  d002    302     302     102
select month     , domain     , visitor     , max(visitor) OVER (PARTITION BY month) as max_visitors     , min(visitor) OVER (PARTITION BY month) as min_visitorsfrom visitorswhere month > 2014order by month desc, domain asc;201405  d001    351     353     351201405  d002    352     353     351201405  d003    353     353     351201404  d001    331     333     331201404  d002    332     333     331201404  d003    333     333     331201403  d001    271     273     271201403  d002    272     273     271201403  d003    273     273     271201402  d001    211     213     211201402  d002    212     213     211201402  d003    213     213     211201401  d001    301     303     301201401  d002    302     303     301201401  d003    303     303     301
select *from (select month     , domain     , visitor     , max(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as max_visitors_last_12_mon     , min(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as min_visitors_last_12_monfrom visitors) vwhere month > 20131order by month desc, domain asc;201405  d001    351     351     141201405  d002    352     352     142201405  d003    353     353     143201404  d001    331     331     131201404  d002    332     332     132201404  d003    333     333     133201403  d001    271     301     121201403  d002    272     302     122201403  d003    273     303     123201402  d001    211     301     111201402  d002    212     302     112201402  d003    213     303     113201401  d001    301     301     101201401  d002    302     302     102201401  d003    303     303     103201312  d001    301     301     101201312  d002    302     302     102201312  d003    303     303     103201311  d001    241     241     101201311  d002    242     242     102201311  d003    243     243     103201310  d001    231     231     101201310  d002    232     232     102201310  d003    233     233     103

 

Hive综合案例分析之开窗函数使用