1 HIVE SELECT 语法
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list] ]
[LIMIT number]
基于MapReduce引擎
Map阶段:
1.执行from加载,进行表的查找与加载
2.执行where过滤,进行条件过滤与筛选
3.执行select查询:进行输出项的筛选
4.执行group by分组:描述了分组后需要计算的函数
5.map端文件合并:map端本地溢出写文件的合并操作,每个map最终形成一个临时文件。
然后按列映射到对应的Reduce阶段:
Reduce阶段:
1.group by:对map端发送过来的数据进行分组并进行计算。
2.select:最后过滤列用于输出结果
3.limit排序后进行结果输出到HDFS文件
注意,以上顺序不是绝对的,会根据语句的不同,有所调整。
可以通过执行计划查看大概顺序。
explain sql语句
explain select count(*) from user_install_status_limit;
先执行stage-1,再执行stage-0
explain select count(*) from user_install_status_limit group by country;
2 Hive Join
hive只支持等连接,外连接。hive不支持非相等的join条件(通过其他方式实现,如left outer join),因为它很难在map/reduce job实现这样的条件。
hive可以join两个以上的表。
如果两个以上join,join的字段都一样,类型也一样,那就只生成一个mapreduce任务。
2.1 两表join
建表导入数据
-- 创建表test_a
CREATE TABLE test_a(
id int,
name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- 创建表test_b
CREATE TABLE test_b(
id int,
name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--创建表test_c
CREATE TABLE test_c(
id int,
name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--分别导入数据到三个表中
--test_a
1 a1
2 a2
4 a4
--test_b
1 b1
3 b3
4 b4
--test_c
1 c1
4 c4
5 c5
2.1.1 等值连接:inner join
-- 关闭mapjon
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select * from test_a a inner join test_b b on a.id=b.id;
1 a1 1 b1
4 a4 4 b4
2.1.2 外连接:left join 、right join
普通left Join
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select * from
test_a a left join test_b b on a.id=b.id;
1 a1 1 b1
2 a2 null null
4 a4 4 b4
普通right Join
--right join
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select a.*,b.* from test_a a
right join test_b b
on a.id=b.id;
1 a1 1 b1
null null 3 b3
4 a4 4 b4
2.1.3 实现非等值连接
查询test_a有,test_b没有,用 left join + is null
查询test_a没有,test_b有,用 right join + is null
--在得到join结果后,再根据where条件筛选
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select a.*,b.* from test_a a
left join test_b b
on a.id=b.id
where b.id is null;
2 a2 null null
-- 错误例子
select a.*,b.* from test_a a
left join test_b b
on a.id=b.id and b.id is null;
1 a1 NULL NULL
2 a2 NULL NULL
4 a4 NULL NULL
2.2 多表join
1)三表inner join
如果join的字段相同,只生成一个任务
测试:
先关闭map端的join,再执行
测试join字段相同,只生成一个任务
-- 关闭mapjon
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select * from test_a a
inner join test_b b on a.id=b.id
inner join test_c c on a.id=c.id;
-------------------------------------
join字段不同,不一定生成一个任务
-- 关闭mapjon
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select * from test_a a
inner join test_b b on a.id=b.id
inner join test_c c on a.name=c.name;
2)计算新增用户(非等值连接的应用)
CREATE TABLE user_ttt(
aid STRING
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
alter table user_ttt add IF NOT EXISTS partition(dt='20141117') location '20141117';
alter table user_ttt add IF NOT EXISTS partition(dt='20141228') location '20141228';
select aid from user_ttt where dt='20141117' group by aid
aid1
aid2
aid3
select aid from user_ttt where dt='20141228' group by aid
aid1
aid2
aid4
aid5
select count(*) from
(select aid from user_ttt where dt='20141117' group by aid) t1
right join
(select aid from user_ttt where dt='20141228' group by aid) t2
on t1.aid = t2.aid where t1.aid is null;
aid1 aid1
aid2 aid2
null aid4
null aid5
3)计算每个国家记录数的百分比
-- 统计所有记录数
select count(*) from user_install_status_limit;
100
-- 统计每个国家的记录数
select country, count(*) from user_install_status_limit group by country;
CN 50
US 20
RU 30
CN 100 50 50%
US 100 20 20%
RU 100 30 30%
-- 统计每个国家占总记录数的占比
select t1.total_num, t2.country, t2.num, concat(round(t2.num/t1.total_num * 100, 2),'%') from
(select count(*) as total_num, 'link' as link from user_install_status_limit) t1
inner join
(select country, count(*) as num, 'link' as link from user_install_status_limit group by country) t2
on t1.link=t2.link;
hive (c28pan)> select concat("aa","bb");
OK
aabb
round,默认取整,如果想精确,那就在后面加精确几位小数
--四舍五入取整
select round(5/3);
--精确6位小数
select round(5/3, 6);
2.3 要避免的查询操作
--笛卡尔积的SQL
select * from test_a
inner join test_b;
set hive.mapred.mode=strict;
设置这个参数,可以限制以下情况:
1)限制执行可能形成笛卡尔积的SQL;
2)partition表使用时不加分区;
3)order by全局排序的时候不加limit的情况;
partition表使用时不加分区
order by全局排序的时候不加limit的情况
取消限制
set hive.mapred.mode=nonstrict;
2.4 full outer join
包括两个表的join结果,(左边有,右边NULL union 右边有,左边NULL)
其结果等于left join union right join
1)做test_a 与 test_b 的full outer join
select a.*,b.* from test_a a
full outer join
test_b b
on a.id=b.id;
2.5 union的使用
union 是把两个表连接在一起,然后去重。
用 left join union right join 实现 full outer join
select a.id as aid, a.name as aname, b.id as bid, b.name as bname from test_a a left join test_b b on a.id = b.id
union
select a.id as aid, a.name as aname, b.id as bid, b.name as bname from test_a a right join test_b b on a.id = b.id;
NULL NULL 3 b3
1 a1 1 b1
2 a2 NULL NULL
4 a4 4 b4
当带有union 的时候, 多个结果集join,需要把字段写清楚,否则union 的时候,得到的数据超乎你的想象。
比如:
select a.id,a.name,b.id,b.name from test_a a left join test_b b on a.id=b.id
union
select a.id,a.name,b.id,b.name from test_a a right join test_b b on a.id=b.id;
为什么呢?
优化方法:
#增加reducer任务数量
set mapred.reduce.tasks=20;
#在同一个sql中的不同的job是否可以同时运行,默认为false
set hive.exec.parallel=true;
#增加同一个sql允许并行任务的最大线程数
set hive.exec.parallel.thread.number=8;
#设置reducer内存大小
set mapreduce.reduce.memory.mb=4096;
set mapreduce.reduce.java.opts=-Xmx3584m;
测试并行运算:
关闭并行运行,发现没有依赖的子查询不会同步执行
-- 关闭并行运行, 默认是false
set hive.exec.parallel=false;
select a.country,a.cn,round(a.cn/b.cn*100,6) from
(select country,count(1) cn,'nn' as joinc from user_install_status_other where dt='20141228' group by country) a
inner join
(select count(1) cn,'nn' as joinc from user_install_status_other where dt='20141228') b on a.joinc=b.joinc;
开启并行运行,发现没有依赖的子查询会同步执行
-- 开启并行运行
set hive.exec.parallel=true;
select a.country,a.cn,round(a.cn/b.cn*100,6) from
(select country,count(1) cn,'nn' as joinc from user_install_status_other where dt='20141228' group by country) a
inner join
(select count(1) cn,'nn' as joinc from user_install_status_other where dt='20141228') b on a.joinc=b.joinc;
2.6 新建字典表用于join
制作字典文件country_dict.dat
http://www.jctrans.com/tool/gjym.htm
--创建表
create table country_dict(
code string,
name string,
region string
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--加载数据
LOAD DATA LOCAL INPATH '/home/panniu/hive_test/country.txt' OVERWRITE INTO TABLE country_dict;
2.7 map端的join
类似于mapreduce的mapjoin,在map端join,无reduce。
小表放内存,与大表的数据在map端进行join,hive默认开启map端join。
--大表join 小表
select * from panniu.user_install_status u
inner join country_dict c
on u.country=c.code
where u.dt='20141228'
limit 10;
开启mapjoin 设置:
-- 将小表刷入内存中,默认是true
set hive.auto.convert.join=true;
set hive.ignore.mapjoin.hint=true;
-- 刷入内存表的大小(字节),根据自己的数据集加大
set hive.mapjoin.smalltable.filesize=2500000;
--设置太大也不会校验,所以要根据实际情况来设置
set hive.mapjoin.smalltable.filesize=2500000000000000;
不开启mapjoin 设置:
set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
在设置成false 或 true时,可以手动的mapjoin设置:
select /*+ MAPJOIN(c) */ * from panniu.user_install_status u
inner join country_dict c
on u.country=c.code
where u.dt='20141228'
limit 10;
其中:/+ MAPJOIN(c) / 这里面的c,就代表那个表的数据放入内存
当用mapjoin实现左连接时,必须右表数据放内存;
select /*+ MAPJOIN(u) */ * from panniu.user_install_status u
left join country_dict c
on u.country=c.code
where u.dt='20141228'
limit 10;
当用mapjoin实现右连接时,必须左表数据放内存;
3 select中的条件判断
3.1 if条件
--类似三目运算符 条件表达式 ? true的结果 : false的结果
IF(条件表达式, true的结果, false的结果)
示例:if(a is null,'all',a)
3.2 COALESCE
返回参数中的第一个非空值;
语法:COALESCE( value1,value2,… )
示例:
select coalesce(null,null,5,null,6) ;
作用: 常用于给字段赋默认值;
3.3 case when
第一种:
--用于判断expression是固定值
CASE [ expression ]
WHEN condition1 THEN result1
WHEN condition2 THEN result2
...
WHEN conditionn THEN resultn
ELSE result
END
示例:
select
country,
case country
when 'CN' then '中国'
when 'RU' then '老毛子'
else '其他国家'
end as country_info
from user_install_status_limit limit 10;
第二种:
--类似java 的if else if else 写法
CASE WHEN condition1 THEN result1
WHEN condition2 THEN result2
...
WHEN conditionn THEN resultn
ELSE result
END
示例:
select
country,
case when country='CN' then '中国'
when country='RU' then '老毛子'
else '其他国家'
end as country_info
from user_install_status_limit limit 10;
select
country,
if(country = 'CN' , "中国", if(country = 'RU' , "俄罗斯", if(country = 'US' , "美国", "其他国家")))
from user_install_status_limit limit 10;
4 GROUP BY
已知student_grouping表
--创建student_grouping表
CREATE TABLE student_grouping(
id int,
name string,
age int,
sex string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--给表导入数据
1 name1 12 boy
2 name2 12 boy
3 name3 13 girl
4 name4 13 boy
5 name5 14 boy
6 name6 14 boy
7 name7 15 girl
8 name8 15 girl
写统计SQL
1)查询总记录数;
2)按照年龄分组,统计记录数;
3)按照性别分组,统计记录数;
4)按照年龄、性别分组,统计记录数;
--写统计SQL
--1)查询总记录数;
select count(*) from student_grouping;
--2)按照年龄分组,统计记录数;
select age, count(*) from student_grouping group by age;
--3)按照性别分组,统计记录数;
select sex, count(*) from student_grouping group by sex;
--4)按照年龄、性别分组,统计记录数;
select age, sex, count(*) from student_grouping group by age, sex;
-- 需求, 一个SQL查询出上面4种查询结果
--在同一个sql中的不同的job是否可以同时运行,默认为false
set hive.exec.parallel=true;
--增加同一个sql允许并行任务的最大线程数
set hive.exec.parallel.thread.number=8;
select null as age, null as sex, count(*) as num from student_grouping
union
select age, null as sex, count(*) as num from student_grouping group by age
union
select null as age, sex, count(*) as num from student_grouping group by sex
union
select age, sex, count(*) as num from student_grouping group by age, sex;
NULL NULL 8
NULL boy 5
NULL girl 3
12 NULL 2
12 boy 2
13 NULL 2
13 boy 1
13 girl 1
14 NULL 2
14 boy 2
15 NULL 2
15 girl 2
-- 用 grouping sets 优化后的
select age, sex, count(*) as num from student_grouping group by age, sex
grouping sets((),age, sex, (age, sex));
NULL NULL 8
NULL boy 5
NULL girl 3
12 NULL 2
12 boy 2
13 NULL 2
13 boy 1
13 girl 1
14 NULL 2
14 boy 2
15 NULL 2
15 girl 2
-- 将查询的多个维度的数据导入到hive表里(模拟MySQL)
create table student_count as
select age, sex, count(*) as num from student_grouping group by age, sex
grouping sets((),age, sex, (age, sex));
-- 模拟MySQL出指标
-- 统计总记录数;
select * from student_count where age is null and sex is null;
--统计每个年龄记录数;
select * from student_count where age is not null and sex is null;
--统计性别记录数;
select * from student_count where age is null and sex is not null;
--统计年龄、性别组合的记录数;
select * from student_count where age is not null and sex is not null;
4.1 GROUPING SETS使用
grouping sets是一种将多个group by 逻辑写在一个sql语句中的便利写法。
--GROUP BY a, b GROUPING SETS ((a,b))
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a,b))
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
--GROUP BY a, b GROUPING SETS ((a,b), a)
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a,b), a)
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
UNION ALL
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
--GROUP BY a, b GROUPING SETS (a,b)
SELECT a,b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS (a,b)
等于
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM(c) FROM tab1 GROUP BY b
--GROUP BY a, b GROUPING SETS ((a, b), a, b, ())
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ((a, b), a, b, ())
等于
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
UNION ALL
SELECT a, null, SUM(c) FROM tab1 GROUP BY a
UNION ALL
SELECT null, b, SUM(c) FROM tab1 GROUP BY b
UNION ALL
SELECT null, null, SUM(c) FROM tab1
常用于计算各种组合的报表数据。
查询分组数据
SELECT age, sex, count(id) FROM student_grouping GROUP BY age, sex GROUPING SETS ((age,sex),age,sex,());
通过判断分组字段的is null 和 is not null 来统计不同维度的数据
一般应用方式步骤:【重点,项目能用到】
1)分析表按照什么字段进行分组统计;
2)利用grouping sets 做多个字段的统计,然后把统计结果导出到mysql关系型数据库或者hive表;
3)根据 分组字段的is null 和 is not null 来统计不同维度的数据;
注意:上面的例子也是遵循这个步骤;
给null 赋个默认值
select coalesce(age, 'ALL'), coalesce(sex, 'ALL'), count(*) as num from student_grouping group by age, sex
grouping sets((),age, sex, (age, sex));
ALL ALL 8
ALL boy 5
ALL girl 3
12 ALL 2
12 boy 2
13 ALL 2
13 boy 1
13 girl 1
14 ALL 2
14 boy 2
15 ALL 2
15 girl 2
4.2 with cube是group by中所有key的组合
select
coalesce(age,'ALL'),
if(sex is null, 'ALL', sex),
count(*)
from student_grouping group by age, sex
grouping sets((), age, sex, (age,sex));
-- 等效
select
coalesce(age,'ALL'),
if(sex is null, 'ALL', sex),
count(*)
from student_grouping group by age, sex with cube;
group by a,b,c with cube
等效
group by a,b,c grouping sets((a,b,c), (a,b), (a,c), (b,c), a, b, c, ())
4.3 with rollup是按右侧递减的顺序组合
-- GROUP BY age, sex with rollup 等效于 GROUP BY age, sex GROUPING SETS ( (age,sex),age,() )
-- 相当于按右侧递减的顺序group by
SELECT if(age is not null, age, 'ALL'),
case when sex is not null then sex
else 'ALL'
end as age,
count(id) FROM student_grouping GROUP BY age, sex
with rollup;
--等于
SELECT if(age is not null, age, 'ALL'),
case when sex is not null then sex
else 'ALL'
end as age,
count(id) FROM student_grouping GROUP BY age, sex
GROUPING SETS ( (age,sex),age,() );
例如:group by a,b,c with rollup
group by a,b,c grouping sets((a,b,c), (a,b), (a), ())