首页 > 代码库 > Hive综合案例分析之用户上网行为分析
Hive综合案例分析之用户上网行为分析
知识点:
1、Hive复合数据类型:array
collect_set
collect_list
array_contains
sort_array
2、lateral view
explode(array)
lateral view out
需求:
click_log : cookie_id ad_id time
ad_list: ad_id ad_url catalog_list
统计:
cookie_catalog: cookie_id ad_catalog cat_weight
创建点击日志表:
CREATE TABLE click_log ( cookie_id STRING , ad_id STRING , ts STRING)ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t‘;
load data local inpath ‘/home/spark/software/data/click_log.txt‘ overwrite into table click_log;
select * from click_log;
11 ad_101 2014-05-01 06:01:12.334+0122 ad_102 2014-05-01 07:28:12.342+0133 ad_103 2014-05-01 07:50:12.33+0111 ad_104 2014-05-01 09:27:12.33+0122 ad_103 2014-05-01 09:03:12.324+0133 ad_102 2014-05-02 19:10:12.343+0111 ad_101 2014-05-02 09:07:12.344+0135 ad_105 2014-05-03 11:07:12.339+0122 ad_104 2014-05-03 12:59:12.743+0177 ad_103 2014-05-03 18:04:12.355+0199 ad_102 2014-05-04 00:36:39.713+0133 ad_101 2014-05-04 19:10:12.343+0111 ad_101 2014-05-05 09:07:12.344+0135 ad_102 2014-05-05 11:07:12.339+0122 ad_103 2014-05-05 12:59:12.743+0177 ad_104 2014-05-05 18:04:12.355+0199 ad_105 2014-05-05 20:36:39.713+01
collect_set功能:去除集合中重复的元素,结果是无序的、
select cookie_id, collect_set(ad_id) as orders from click_log group by cookie_id;
11 ["ad_101","ad_104"]22 ["ad_104","ad_102","ad_103"]33 ["ad_101","ad_102","ad_103"]35 ["ad_105","ad_102"]77 ["ad_104","ad_103"]99 ["ad_105","ad_102"]
select cookie_id, collect_set(ad_id) as orders from click_log where ts > ‘2014-05-02‘ group by cookie_id;
11 ["ad_101"]22 ["ad_104","ad_103"]33 ["ad_101","ad_102"]35 ["ad_105","ad_102"]77 ["ad_104","ad_103"]99 ["ad_105","ad_102"]
查询每个cookie_id访问过的ad_id的访问量
select cookie_id, ad_id, count(1) as amount from click_log group by cookie_id, ad_id;11 ad_101 311 ad_104 122 ad_102 122 ad_103 222 ad_104 133 ad_101 133 ad_102 133 ad_103 135 ad_102 135 ad_105 177 ad_103 177 ad_104 199 ad_102 199 ad_105 1
hive0.13+才支持collect_list:不去除集合中的重复元素
select cookie_id, collect_list(ad_id) as orders from click_log group by cookie_id;
11 ["ad_101","ad_104","ad_101","ad_101"]22 ["ad_102","ad_103","ad_104","ad_103"]33 ["ad_103","ad_102","ad_101"]35 ["ad_105","ad_102"]77 ["ad_103","ad_104"]99 ["ad_102","ad_105"]
创建广告类别表:
CREATE TABLE ad_list ( ad_id STRING , url STRING , catalogs array<STRING>)ROW FORMAT DELIMITEDFIELDS TERMINATED BY ‘\t‘COLLECTION ITEMS TERMINATED BY ‘|‘;
load data local inpath ‘/home/spark/software/data/ad_list.txt‘ overwrite into table ad_list;
select * from ad_list;ad_101 http://www.google.com ["catalog8","catalog1"]ad_102 http://www.sohu.com ["catalog6","catalog3"]ad_103 http://www.baidu.com ["catalog7"]ad_104 http://www.qq.com ["catalog5","catalog1","catalog4","catalog9"]ad_105 http://sina.com []
CREATE TABLE ad_list_string ( ad_id STRING , url STRING , catalogs STRING)ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t‘;
load data local inpath ‘/home/spark/software/data/ad_list.txt‘ overwrite into table ad_list_string;
select * from ad_list_string;ad_101 http://www.google.com catalog8|catalog1ad_102 http://www.sohu.com catalog6|catalog3ad_103 http://www.baidu.com catalog7ad_104 http://www.qq.com catalog5|catalog1|catalog4|catalog9ad_105 http://sina.com
查询每个cookie_id访问过的ad_id的访问量以及该广告所属的类别
select click.cookie_id, click.ad_id, click.amount, ad_list_string.catalogs as ordersfrom ( select cookie_id, ad_id, count(1) as amount from click_log group by cookie_id, ad_id) clickjoin ad_list_stringon (ad_list_string.ad_id = click.ad_id);11 ad_101 3 catalog8|catalog111 ad_104 1 catalog5|catalog1|catalog4|catalog922 ad_102 1 catalog6|catalog322 ad_103 2 catalog722 ad_104 1 catalog5|catalog1|catalog4|catalog933 ad_101 1 catalog8|catalog133 ad_102 1 catalog6|catalog333 ad_103 1 catalog735 ad_102 1 catalog6|catalog335 ad_105 177 ad_103 1 catalog777 ad_104 1 catalog5|catalog1|catalog4|catalog999 ad_102 1 catalog6|catalog399 ad_105 1
LATERAL VIEW OUTER explode(catalogs)将数组打开变成横向的视图形式,只有hive支持,impala等其他的是不支持的
select ad_id, catalog from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog;ad_101 catalog8ad_101 catalog1ad_102 catalog6ad_102 catalog3ad_103 catalog7ad_104 catalog5ad_104 catalog1ad_104 catalog4ad_104 catalog9ad_105 NULL
注意此处没有使用OUTER,那么就没有ad_105的值,带OUTER的保留空的数据行,类似于left outer join, 使用时一般都带上
select ad_id, catalog from ad_list LATERAL VIEW explode(catalogs) t AS catalog;
ad_101 catalog8ad_101 catalog1ad_102 catalog6ad_102 catalog3ad_103 catalog7ad_104 catalog5ad_104 catalog1ad_104 catalog4ad_104 catalog9
此时数组中的结果是无序的
select ad_id, collect_set(catalog) from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog group by ad_id;
ad_101 ["catalog8","catalog1"]ad_102 ["catalog3","catalog6"]ad_103 ["catalog7"]ad_104 ["catalog9","catalog1","catalog5","catalog4"]ad_105 []
按照catalogs内部的元素进行排序,sort_array在spark中暂不支持
select ad_id, sort_array(catalogs) from ad_list;
ad_101 ["catalog1","catalog8"]ad_102 ["catalog3","catalog6"]ad_103 ["catalog7"]ad_104 ["catalog1","catalog4","catalog5","catalog9"]ad_105 []
判断数组中是否包含指定的数值,常用在where条件中
select ad_id, catalogs from ad_list where array_contains(catalogs, ‘catalog1‘);
ad_101 ["catalog8","catalog1"]ad_104 ["catalog5","catalog1","catalog4","catalog9"]
统计每个cookie_id访问过哪些类别,此时统计输出结果是无序的
select click.cookie_id, ad.catalog from click_log clickleft outer join ( select ad_id, catalog from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog) adon (click.ad_id = ad.ad_id);
11 catalog811 catalog122 catalog622 catalog333 catalog711 catalog511 catalog111 catalog411 catalog922 catalog733 catalog633 catalog311 catalog811 catalog135 NULL22 catalog522 catalog122 catalog422 catalog977 catalog799 catalog699 catalog333 catalog833 catalog111 catalog811 catalog135 catalog635 catalog322 catalog777 catalog577 catalog177 catalog477 catalog999 NULL
将统计结果写入cookie_cats表中,按照cookie_id和访问次数的降序排列
create table cookie_cats asselect click.cookie_id, ad.catalog, count(1) as weight from click_log clickleft outer join ( select ad_id, catalog from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog) adon (click.ad_id = ad.ad_id)group by click.cookie_id, ad.catalogorder by cookie_id, weight desc;
select * from cookie_cats;
11 catalog1 411 catalog8 311 catalog5 111 catalog9 111 catalog4 122 catalog7 222 catalog9 122 catalog1 122 catalog3 122 catalog4 122 catalog5 122 catalog6 133 catalog3 133 catalog8 133 catalog7 133 catalog6 133 catalog1 135 NULL 135 catalog3 135 catalog6 177 catalog1 177 catalog4 177 catalog5 177 catalog7 177 catalog9 199 NULL 199 catalog3 199 catalog6 1
输出集合的元素是无序的
select cookie_id, collect_set(catalog) from cookie_cats group by cookie_id;
11 ["catalog8","catalog9","catalog1","catalog5","catalog4"]22 ["catalog9","catalog1","catalog3","catalog5","catalog4","catalog7","catalog6"]33 ["catalog8","catalog1","catalog3","catalog7","catalog6"]35 ["catalog3","catalog6"]77 ["catalog9","catalog1","catalog5","catalog4","catalog7"]99 ["catalog3","catalog6"]
对集合中的元素进行排序
select cookie_id, sort_array(collect_set(catalog)) from cookie_cats group by cookie_id;
11 ["catalog1","catalog4","catalog5","catalog8","catalog9"]22 ["catalog1","catalog3","catalog4","catalog5","catalog6","catalog7","catalog9"]33 ["catalog1","catalog3","catalog6","catalog7","catalog8"]35 ["catalog3","catalog6"]77 ["catalog1","catalog4","catalog5","catalog7","catalog9"]99 ["catalog3","catalog6"]
Hive综合案例分析之用户上网行为分析