Data preparation
- Create the tables
create table dept (deptid int, deptname string, address string);
create table emp (empid int, empname string, salary decimal, ruzhidate date, mgr int, deptid int);
- Insert the data
insert into dept values
(1,'研发部','北京'),(2,'人事部','上海'),(3,'销售部','深圳'),(4,'公关部','东莞');

insert into emp values
(1001,'tom',18000,'2013-10-11',1005,1),
(1002,'jerry',13000,'2021-11-11',1005,1),
(1003,'jack',10000,'2020-09-11',1001,1),
(1004,'rose',5000,'2020-10-11',1001,2),
(1005,'bob',20000,'2018-08-11',null,2);
- Interview question 1: find the three highest-paid employees in the company
select * from emp order by salary desc limit 3;
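Hive's sort by only orders rows within each reducer, hence the use of order by above for a global top 3. If ties at the third salary should all be kept, a window-function variant works too (a sketch on the same emp table):

select empname, salary
from (
  select empname, salary,
         dense_rank() over (order by salary desc) as rk
  from emp
) t
where rk <= 3;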
- Interview question 2: list each department's name and its employee count
select d.deptname, count(*) as emp_cnt from dept d join emp e on d.deptid = e.deptid group by d.deptname;
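Note the inner join drops departments with no employees (公关部 never appears). If those should be reported with a count of 0, a left join sketch:

select d.deptname, count(e.empid) as emp_cnt
from dept d
left join emp e on d.deptid = e.deptid
group by d.deptname;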
- Interview question 3: find the highest salary in the company, and who earns it
- Approach 1:
select e.empname,e.salary from emp e join (select max(salary) as maxsalary from emp) t1 on e.salary=t1.maxsalary;
- Approach 2:
select * from emp where salary =(select max(salary) from emp);
- Interview question 4: the highest salary in each department, and who earns it
select e.empname, e.salary, e.deptid
from emp e
join (select deptid, max(salary) as maxsalary from emp group by deptid) t1
on e.deptid = t1.deptid and e.salary = t1.maxsalary;
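The same answer in one pass with a window function (a sketch; rank() keeps ties within a department):

select deptid, empname, salary
from (
  select deptid, empname, salary,
         rank() over (partition by deptid order by salary desc) as rk
  from emp
) t
where rk = 1;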
- Interview question 5: which departments have an average salary above the company-wide average?
select t1.avg, t1.deptid
from (select avg(salary) as avg, deptid, 'link' as link from emp group by deptid) t1
join (select avg(salary) as totalavg, 'link' as link from emp) t2
on t1.link = t2.link
where t1.avg > t2.totalavg;
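The constant 'link' column exists only to give the join a key. Hive also supports an explicit cross join, which reads more directly (a sketch with renamed aliases):

select t1.deptid, t1.avgsalary
from (select deptid, avg(salary) as avgsalary from emp group by deptid) t1
cross join (select avg(salary) as totalavg from emp) t2
where t1.avgsalary > t2.totalavg;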
- Interview question 6: which employees earn more than their own department's average salary?
select *
from (select avg(salary) as avg, deptid from emp group by deptid) t1
join emp t2 on t1.deptid = t2.deptid
where t2.salary > t1.avg;
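An equivalent single-pass version using a per-department window average (a sketch):

select empid, empname, salary, deptid
from (
  select e.*, avg(salary) over (partition by deptid) as deptavg
  from emp e
) t
where salary > deptavg;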
Data preparation
The user request data has these fields:
user id (uid) string
request time (req_time) string
request url (req_url) string
processing date (day) string
Processing date: 20220919
| uid | req_time | req_url |
|---|---|---|
| u0001 | 20200918102030 | /index.jsp |
| u0001 | 20200918102035 | /goods.action |
| u0002 | 20200918102030 | /brand.action |
| u0001 | 20200918102130 | /index.jsp |
| u0002 | 20200918102230 | /index.jsp |
Processing date: 20220920
| uid | req_time | req_url |
|---|---|---|
| u0001 | 20200919102030 | /index.jsp |
| u0003 | 20200919102035 | /goods.action |
| u0004 | 20200919102030 | /brand.action |
| u0001 | 20200919102130 | /index.jsp |
| u0003 | 20200919102230 | /index.jsp |
The user data has these fields:
user id (uid)
user name (uname)
| uid | uname |
|---|---|
| u0001 | 用户1 |
| u0002 | 用户2 |
| u0003 | 用户3 |
- Interview question 7: from the request data above, create a textfile partitioned table hainiu.request_text with one partition per processing date, and load the data
create table request_text (uid string, req_time string, req_url string)
partitioned by (day string)
row format delimited fields terminated by '\t';

load data local inpath '/home/hadoop/19' into table request_text partition(day='20220919');
load data local inpath '/home/hadoop/20' into table request_text partition(day='20220920');
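A quick sanity check after loading (assuming the two load statements above succeeded and /home/hadoop/19 and /home/hadoop/20 hold the tab-separated data):

show partitions request_text;
select day, count(*) as cnt from request_text group by day;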
- Interview question 8: create a parquet partitioned table hainiu.request_p with one partition per processing date, and copy each daily partition of hainiu.request_text into the matching partition of request_p
create table request_p (uid string, req_time string, req_url string) partitioned by (day string) stored as parquet;
- Static partition approach (one insert per partition)
insert into table request_p partition(day='20220919') select uid, req_time, req_url from request_text where day='20220919';
insert into table request_p partition(day='20220920') select uid, req_time, req_url from request_text where day='20220920';
- Dynamic partition approach
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table request_p partition(day) select * from request_text;
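select * works in the dynamic case because Hive appends partition columns last, so day lands in the dynamic-partition slot; with a static partition (above) the select must list only the non-partition columns. Spelled out explicitly (a sketch):

insert into table request_p partition(day)
select uid, req_time, req_url, day from request_text;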
Data preparation
Create the user table user_text with the two fields shown below and load the provided data (a DDL sketch, assuming textfile storage, follows the sample rows).
u0001 用户1
u0002 用户2
u0003 用户3
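A minimal DDL sketch for user_text, assuming tab-separated textfile storage; the local path of the data file is hypothetical:

create table user_text (uid string, uname string)
row format delimited fields terminated by '\t'
stored as textfile;

-- path below is hypothetical
load data local inpath '/home/hadoop/user' into table user_text;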
- Interview question 9: create a parquet partitioned table request_p2 and populate it by joining the user table with request_p, matching the layout below
Table layout:
user id
user name
request date, formatted down to year-month-day only, e.g. 20200919
request url
processing date (partition column)

create table request_p2 (uid string, uname string, req_day string, req_url string) partitioned by (day string) stored as parquet;
Load via a dynamic-partition insert:
insert into table request_p2 partition(day)
select t1.uid, t2.uname, t1.req_day, t1.req_url, t1.day
from (select uid, substring(req_time,1,8) as req_day, req_url, day from request_p) t1
join user_text t2 on t1.uid = t2.uid;
- Interview question 10: using request_p2, count the distinct users with request date 20200918 and processing date 20220919, and export the result to /home/hadoop/export/output1.txt on Linux
- Approach 1 (note: this writes files named like 000000_0 under the target directory rather than a file called output1.txt; Approach 2 controls the exact file name):
insert overwrite local directory '/home/hadoop/export/' select count(distinct uid) from request_p2 where req_day='20200918' and day='20220919';
- Approach 2:
hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select count(distinct(uid)) from request_p2 where req_day='20200918' and day='20220919';" > /home/hadoop/export/output1.txt
- Interview question 11: using request_p2, count each user's requests per request date, and export to /home/hadoop/export/output2.txt on Linux
hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select req_day,uid,count(*) as cnt from request_p2 group by req_day,uid;" > /home/hadoop/export/output2.txt
- Interview question 12: using request_p2, find the incremental users on the 20th relative to the 19th (users present on 20220920 but not on 20220919), and export to /home/hadoop/export/output3.txt on Linux
hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select t2.uid from (select uid from request_p2 where day='20220919' group by uid) t1 right join (select uid from request_p2 where day='20220920' group by uid) t2 on t1.uid=t2.uid where t1.uid is null;" > /home/hadoop/export/output3.txt
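The same incremental set via a left join from the 20th side, using distinct instead of group by (a sketch):

select t2.uid
from (select distinct uid from request_p2 where day='20220920') t2
left join (select distinct uid from request_p2 where day='20220919') t1
on t2.uid = t1.uid
where t1.uid is null;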
- Interview question 13: from request_p, for processing dates 20220919-20220920, find each user's last request record and export to /home/hadoop/export/output4.txt on Linux
hive -e "use hainiu;set mapreduce.job.queuename=hainiu;select * from (select uid,req_time,req_url,row_number() over(partition by uid order by req_time desc) as lastreq from request_p where day between '20220919' and '20220920') t1 where lastreq=1;" > /home/hadoop/export/output4.txt
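An alternative without window functions, joining back on each user's latest req_time (a sketch; it returns several rows for a user whose max req_time is duplicated, whereas row_number() picks exactly one):

select r.uid, r.req_time, r.req_url
from request_p r
join (select uid, max(req_time) as max_t
      from request_p
      where day between '20220919' and '20220920'
      group by uid) m
on r.uid = m.uid and r.req_time = m.max_t
where r.day between '20220919' and '20220920';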