SQL 与 Hive 技术总结

1、

Hive 安装(w3cschool):所有 Hadoop 的子项目,如 Hive、Pig 和 HBase,都需要 Linux 操作系统,因此需要先安装 Linux OS。以下是安装 Hive 的简单步骤。第 1 步:验证 Java 安装——在安装 Hive 之前,系统上必须已经安装 Java,可使用下面的命令来验证是否已经安装:$ java -version(来自 Hive 教程,w3cschool 编程狮)。https://www.w3cschool.cn/hive_manual/hive_install.html

2、

HIVE:窗口函数,用 SQL 语句查询 MySQL 安装路径和版本(Jack_2085,CSDN 博客)。文章开头摘录:SET NAMES utf8mb4; SET FOREIGN_KEY_CHECKS = 0; -- Table structure for test_table; DROP TABLE IF EXISTS `test_table`; CREATE TABLE `test_table` ( `id` bigint(6) NULL DEFAULT NULL, `prov... 链接:https://blog.csdn.net/weixin_54217632/article/details/120243482

3、

    3.1、根据条件查询连续登录N天的用户。

    3.2、查询连续登录天数最大的用户及天数。

表结构及数据:
-- Fixture for section 3 (consecutive-login queries): one row per login event.
-- dt stores the login day as a 'YYYY-MM-DD' string. The data contains exact
-- duplicate rows, so queries over it have to de-duplicate (user_id, dt) first.
DROP TABLE IF EXISTS `test5`;
CREATE TABLE `test5`  (
  `dt` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `user_id` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `age` int(11) NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Records of test5
-- ----------------------------
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `test5` (`dt`, `user_id`, `age`) VALUES
  ('2021-03-11', 'test_1', 23),
  ('2021-03-11', 'test_1', 23),
  ('2021-03-11', 'test_1', 23),
  ('2021-03-13', 'test_1', 23),
  ('2021-03-11', 'test_2', 19),
  ('2021-04-11', 'test_3', 39),
  ('2021-03-11', 'test_3', 39),
  ('2021-03-12', 'test_2', 19),
  ('2021-03-15', 'test_2', 19),
  ('2021-03-16', 'test_2', 19),
  ('2021-03-13', 'test_2', 19),
  ('2021-03-14', 'test_2', 19);

-- Turns foreign-key checks on; no matching "SET FOREIGN_KEY_CHECKS = 0"
-- statement is visible in this file.
SET FOREIGN_KEY_CHECKS = 1;



-- ---------------------------------------------- 下面是sql的实现语句



-- Method 1: users who logged in on at least N consecutive calendar days
-- (N = 2 here, matching the original "tt = 2" filter).
--
-- Gaps-and-islands: inside one user, numbering the distinct login days and
-- subtracting that number from the date yields the same value for every day
-- of one unbroken streak, so each streak becomes one GROUP BY group.
--
-- This replaces a CASE expression driven by @old / @size / @u_id. Those
-- variables were never initialised and the result depended on the order in
-- which MySQL evaluated rows and select-list expressions.
-- Requires MySQL 8.0+ (CTEs and window functions).
WITH login_day AS (
    -- One row per user per calendar day; test5 contains repeated rows.
    SELECT DISTINCT
        user_id,
        STR_TO_DATE(dt, '%Y-%m-%d') AS login_date
    FROM test5
),
numbered AS (
    SELECT
        user_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS day_seq
    FROM login_day
)
SELECT DISTINCT user_id
FROM numbered
GROUP BY
    user_id,
    DATE_SUB(login_date, INTERVAL day_seq DAY)   -- constant within one streak
HAVING COUNT(*) >= 2;                            -- N: minimum streak length in days


-- ---------------------------------------------------------------



-- Longest run of consecutive login days across all users.
-- Returns a single column named MAX(tt); it is NULL when test5 is empty.
--
-- Streaks are found with gaps-and-islands (login_date minus its per-user
-- sequence number is constant inside one streak) instead of the previous
-- uninitialised @old / @size / @u_id user variables, whose result depended
-- on row evaluation order. Requires MySQL 8.0+.
WITH login_day AS (
    -- One row per user per calendar day; test5 contains repeated rows.
    SELECT DISTINCT
        user_id,
        STR_TO_DATE(dt, '%Y-%m-%d') AS login_date
    FROM test5
),
numbered AS (
    SELECT
        user_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS day_seq
    FROM login_day
)
SELECT MAX(tt)
FROM (
    -- tt = number of days in one streak of one user
    SELECT COUNT(*) AS tt
    FROM numbered
    GROUP BY
        user_id,
        DATE_SUB(login_date, INTERVAL day_seq DAY)
) AS streak_length;


-- ----------------------------------------------------------------

-- User(s) with the longest run of consecutive login days, and its length.
-- Output: user_id, tt (streak length in days). Ties return one row per streak.
--
-- The earlier version ran the same user-variable counter twice, once in the
-- outer query and once in the WHERE subquery, both mutating the same
-- uninitialised @size / @old / @u_id. Here the streak lengths are computed
-- once in a CTE and compared with their own maximum. Requires MySQL 8.0+.
WITH login_day AS (
    -- One row per user per calendar day; test5 contains repeated rows.
    SELECT DISTINCT
        user_id,
        STR_TO_DATE(dt, '%Y-%m-%d') AS login_date
    FROM test5
),
numbered AS (
    SELECT
        user_id,
        login_date,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_date) AS day_seq
    FROM login_day
),
streak_length AS (
    SELECT
        user_id,
        COUNT(*) AS tt
    FROM numbered
    GROUP BY
        user_id,
        DATE_SUB(login_date, INTERVAL day_seq DAY)   -- constant within one streak
)
SELECT
    user_id,
    tt
FROM streak_length
WHERE tt = (SELECT MAX(tt) FROM streak_length);


4、查询用户连续登录系统


-- Fixture for section 4: one row per login, keyed by id.
-- The rows are not in chronological order (ids 7 and 8 predate id 1) and
-- 'mayun' has three logins with an identical timestamp, so queries must sort
-- explicitly and cope with same-day repeats.
DROP TABLE IF EXISTS `test_login_time`;
CREATE TABLE `test_login_time`  (
  `id` int(0) NOT NULL,
  `user_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_ci NULL DEFAULT NULL,
  `login_time` datetime(0) NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_as_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Records of test_login_time
-- ----------------------------
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `test_login_time` (`id`, `user_name`, `login_time`) VALUES
  (1, 'jack', '2021-11-17 10:15:37'),
  (2, 'jack', '2021-11-18 10:15:37'),
  (3, 'jack', '2021-11-19 10:15:37'),
  (4, 'jack', '2021-11-20 10:15:37'),
  (5, 'jack', '2021-11-21 10:15:37'),
  (6, 'jack', '2021-11-22 10:15:37'),
  (7, 'jack', '2021-11-16 10:15:37'),
  (8, 'jack', '2021-11-15 10:15:37'),
  (9, 'lucy', '2021-12-01 10:15:37'),
  (10, 'lucy', '2021-12-02 10:15:37'),
  (11, 'lucy', '2021-12-03 10:15:37'),
  (12, 'lucy', '2021-12-04 10:15:37'),
  (13, 'lucy', '2021-12-06 10:15:37'),
  (14, 'lucy', '2021-12-08 10:15:37'),
  (15, 'lucy', '2021-12-09 10:15:37'),
  (16, 'lucy', '2021-12-10 10:15:37'),
  (17, 'mark', '2022-02-01 10:15:37'),
  (18, 'mark', '2022-02-02 10:15:37'),
  (19, 'mark', '2022-02-03 10:15:37'),
  (20, 'mark', '2022-02-04 10:15:37'),
  (21, 'mark', '2022-03-06 10:15:37'),
  (22, 'mark', '2022-06-08 10:15:37'),
  (23, 'mark', '2022-05-09 10:15:37'),
  (24, 'mark', '2022-05-10 10:15:37'),
  (25, 'mark', '2023-08-10 10:15:37'),
  (26, 'mayun', '2099-09-15 10:15:37'),
  (27, 'mayun', '2099-09-15 10:15:37'),
  (28, 'mayun', '2099-09-15 10:15:37'),
  (30, 'mayun', '2099-09-16 10:15:37'),
  (31, 'mayun', '5999-08-18 10:15:37');

-- Turns foreign-key checks on; no matching "SET FOREIGN_KEY_CHECKS = 0"
-- statement is visible in this file.
SET FOREIGN_KEY_CHECKS = 1;

--   ---------------------------------------------------

--  方法1、


-- Verbose version: for every login row, the length (in days) of the
-- consecutive-day streak the row belongs to, counted up to that row.
--
-- Output columns:
--   id, user_name, login_time  - the login row
--   lead_t1                    - the same user's previous login_time (NULL on the first)
--   lag_t2                     - calendar days between the previous login and this one
--   tt, tt_2                   - running streak length; tt_2 repeats tt and is kept
--                                only so the result keeps both original columns
--
-- Changes from the user-variable version:
--   * DATEDIFF compares calendar dates. TIMESTAMPDIFF(DAY, ...) counted whole
--     24-hour periods, so a login at 23:00 followed by one at 01:00 the next
--     day was not seen as consecutive.
--   * A second login on the same day (lag_t2 = 0) keeps the streak going
--     instead of resetting it.
--   * @size / @size_2 are gone; the counter no longer depends on the order in
--     which the server happens to evaluate rows.
WITH with_prev AS (
    SELECT
        t.id,
        t.user_name,
        t.login_time,
        -- id breaks ties between logins with an identical timestamp
        LAG(t.login_time, 1) OVER (
            PARTITION BY t.user_name
            ORDER BY t.login_time, t.id
        ) AS lead_t1
    FROM test_login_time AS t
),
with_gap AS (
    SELECT
        p.id,
        p.user_name,
        p.login_time,
        p.lead_t1,
        DATEDIFF(p.login_time, p.lead_t1) AS lag_t2
    FROM with_prev AS p
),
with_streak AS (
    SELECT
        g.id,
        g.user_name,
        g.login_time,
        g.lead_t1,
        g.lag_t2,
        -- A new streak starts on a user's first login or after a gap of more
        -- than one day; the running total numbers the streaks per user.
        SUM(CASE WHEN g.lag_t2 IN (0, 1) THEN 0 ELSE 1 END) OVER (
            PARTITION BY g.user_name
            ORDER BY g.login_time, g.id
            ROWS UNBOUNDED PRECEDING
        ) AS streak_no
    FROM with_gap AS g
),
counted AS (
    SELECT
        s.id,
        s.user_name,
        s.login_time,
        s.lead_t1,
        s.lag_t2,
        -- DENSE_RANK over the date gives same-day logins the same day count
        DENSE_RANK() OVER (
            PARTITION BY s.user_name, s.streak_no
            ORDER BY DATE(s.login_time)
        ) AS tt
    FROM with_streak AS s
)
SELECT
    id,
    user_name,
    login_time,
    lead_t1,
    lag_t2,
    tt,
    tt AS tt_2
FROM counted
ORDER BY user_name, login_time, id;








-- ******************************************************************
-- Simple version.
-- Helps find users who never logged in on consecutive days (every t_num is 1)
-- and users with n consecutive logins (some row has t_num = n).
--
-- Output columns:
--   id, user_name, login_time
--   tg      - 1 when the next row (ordered by user_name, login_time, id) is the
--             following calendar day, 0 when it is not, NULL on the last row
--   t_name  - 1 when the next row belongs to the same user, else 0 / NULL
--   t_num   - 1 on a row that does not continue into the next day; otherwise
--             the previous row's t_num + 1, so the highest t_num inside a
--             streak equals the streak length in days
--
-- Changes from the user-variable version:
--   * LEAD() ran OVER () with no ordering, so "next row" was whatever order
--     the server produced; the window is now explicitly ordered.
--   * @size was read before it was ever set; the counter is now derived with
--     window functions and starts at 1.
WITH ordered_login AS (
    SELECT
        t.id,
        t.user_name,
        t.login_time,
        LEAD(DATE(t.login_time), 1) OVER w
            = DATE_ADD(DATE(t.login_time), INTERVAL 1 DAY) AS tg,
        LEAD(t.user_name, 1) OVER w = t.user_name AS t_name
    FROM test_login_time AS t
    WINDOW w AS (ORDER BY t.user_name, t.login_time, t.id)
),
run_marked AS (
    SELECT
        o.id,
        o.user_name,
        o.login_time,
        o.tg,
        o.t_name,
        -- Number of "reset" rows strictly before this one. Rows that continue
        -- into the next day share a run_id with the reset row that ends them.
        COALESCE(
            SUM(CASE WHEN o.tg = 1 AND o.t_name = 1 THEN 0 ELSE 1 END) OVER (
                ORDER BY o.user_name, o.login_time, o.id
                ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
            ),
            0
        ) AS run_id
    FROM ordered_login AS o
)
SELECT
    r.id,
    r.user_name,
    r.login_time,
    r.tg,
    r.t_name,
    CASE
        WHEN r.tg = 1 AND r.t_name = 1
            THEN ROW_NUMBER() OVER (
                     PARTITION BY r.run_id
                     ORDER BY r.user_name, r.login_time, r.id
                 ) + 1
        ELSE 1
    END AS t_num
FROM run_marked AS r
ORDER BY r.user_name, r.login_time, r.id;

-- ---------------------------------------------------------- 上面sql的升级版本

-- Same streak counter as the simple version, plus the counter value carried
-- in from the previous row.
--
-- Output columns:
--   id, user_name, login_time
--   tg      - 1 when the next row (ordered by user_name, login_time, id) is the
--             following calendar day, 0 when it is not, NULL on the last row
--   t_name  - 1 when the next row belongs to the same user, else 0 / NULL
--   @size   - t_num of the previous row (NULL on the first row)
--   t_num   - 1 on a row that does not continue into the next day; otherwise
--             the previous row's t_num + 1
--
-- LEAD() previously ran OVER () with no ordering and the counter lived in the
-- uninitialised user variable @size; both are replaced by ordered window
-- functions so the result is deterministic.
WITH ordered_login AS (
    SELECT
        t.id,
        t.user_name,
        t.login_time,
        LEAD(DATE(t.login_time), 1) OVER w
            = DATE_ADD(DATE(t.login_time), INTERVAL 1 DAY) AS tg,
        LEAD(t.user_name, 1) OVER w = t.user_name AS t_name
    FROM test_login_time AS t
    WINDOW w AS (ORDER BY t.user_name, t.login_time, t.id)
),
run_marked AS (
    SELECT
        o.id,
        o.user_name,
        o.login_time,
        o.tg,
        o.t_name,
        -- Number of "reset" rows strictly before this one.
        COALESCE(
            SUM(CASE WHEN o.tg = 1 AND o.t_name = 1 THEN 0 ELSE 1 END) OVER (
                ORDER BY o.user_name, o.login_time, o.id
                ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
            ),
            0
        ) AS run_id
    FROM ordered_login AS o
),
counted AS (
    SELECT
        r.id,
        r.user_name,
        r.login_time,
        r.tg,
        r.t_name,
        CASE
            WHEN r.tg = 1 AND r.t_name = 1
                THEN ROW_NUMBER() OVER (
                         PARTITION BY r.run_id
                         ORDER BY r.user_name, r.login_time, r.id
                     ) + 1
            ELSE 1
        END AS t_num
    FROM run_marked AS r
)
SELECT
    c.id,
    c.user_name,
    c.login_time,
    c.tg,
    c.t_name,
    LAG(c.t_num, 1) OVER (ORDER BY c.user_name, c.login_time, c.id) AS `@size`,
    c.t_num
FROM counted AS c
ORDER BY c.user_name, c.login_time, c.id;

-- ----------------------------------------- 下面是个要解决的问题。

-- Open question from the notes: why did @size not get recomputed when the
-- window comparison and the @size assignment sat in one IF() expression?
-- NOTE(review): MySQL documents the evaluation order of expressions that
-- assign user variables as undefined. Here the assignment was nested inside
-- an expression that depends on a window function, so it was presumably not
-- evaluated once per row in output order - confirm against the server version
-- in use. The query below avoids the problem by not using user variables.
--
-- Output: id, user_name, login_time, t_num, where t_num is 1 on a row whose
-- next login (same user) is not the following calendar day, and otherwise the
-- previous row's t_num + 1.
-- Unlike the earlier attempt, "next login" is taken per user and in
-- login_time order rather than from an unordered OVER ().
WITH flagged AS (
    SELECT
        t.id,
        t.user_name,
        t.login_time,
        LEAD(DATE(t.login_time), 1) OVER (
            PARTITION BY t.user_name
            ORDER BY t.login_time, t.id
        ) = DATE_ADD(DATE(t.login_time), INTERVAL 1 DAY) AS continues_next
    FROM test_login_time AS t
),
run_marked AS (
    SELECT
        f.id,
        f.user_name,
        f.login_time,
        f.continues_next,
        -- Number of "reset" rows of this user strictly before this one.
        COALESCE(
            SUM(CASE WHEN f.continues_next = 1 THEN 0 ELSE 1 END) OVER (
                PARTITION BY f.user_name
                ORDER BY f.login_time, f.id
                ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
            ),
            0
        ) AS run_id
    FROM flagged AS f
)
SELECT
    r.id,
    r.user_name,
    r.login_time,
    CASE
        WHEN r.continues_next = 1
            THEN ROW_NUMBER() OVER (
                     PARTITION BY r.user_name, r.run_id
                     ORDER BY r.login_time, r.id
                 ) + 1
        ELSE 1
    END AS t_num
FROM run_marked AS r
ORDER BY r.user_name, r.login_time, r.id;


5、求各科学生成绩第二名的学生

    

   


-- Fixture for section 5 (second-highest score per subject).
-- km is the grouping key used by the queries below, user_id identifies the
-- student and cj is the score. Several rows share the same cj inside one km,
-- so ranking queries must handle ties.
DROP TABLE IF EXISTS `km_cj`;
CREATE TABLE `km_cj`  (
  `km` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_ci NULL DEFAULT NULL,
  `user_id` int(0) NOT NULL,
  `cj` bigint(0) NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_as_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Records of km_cj
-- ----------------------------
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `km_cj` (`km`, `user_id`, `cj`) VALUES
  ('jack', 8, 1),
  ('jack', 7, 2),
  ('jack', 1, 3),
  ('jack', 2, 4),
  ('jack', 3, 5),
  ('jack', 4, 6),
  ('jack', 5, 7),
  ('jack', 6, 8),
  ('lucy', 9, 1),
  ('lucy', 10, 2),
  ('lucy', 11, 3),
  ('lucy', 12, 4),
  ('lucy', 13, 1),
  ('lucy', 14, 1),
  ('lucy', 15, 2),
  ('lucy', 16, 3),
  ('mark', 17, 1),
  ('mark', 18, 2),
  ('mark', 19, 3),
  ('mark', 20, 4),
  ('mark', 21, 1),
  ('mark', 23, 1),
  ('mark', 24, 2),
  ('mark', 22, 1),
  ('mark', 25, 1),
  ('mayun', 26, 1),
  ('mayun', 27, 1),
  ('mayun', 28, 2),
  ('mayun', 30, 1),
  ('mayun', 31, 1);

-- Turns foreign-key checks on; no matching "SET FOREIGN_KEY_CHECKS = 0"
-- statement is visible in this file.
SET FOREIGN_KEY_CHECKS = 1;



--   -----------------------------------------------------------------




-- Students holding the second-highest score in each subject, using a window
-- function. DENSE_RANK leaves no gaps after ties, so d_r_num = 2 is the
-- second distinct score even when several students share first place; every
-- student tied on that score is returned.
SELECT
    ranked.km,
    ranked.user_id,
    ranked.cj,
    ranked.d_r_num
FROM (
    SELECT
        t.km,
        t.user_id,
        t.cj,
        DENSE_RANK() OVER (PARTITION BY t.km ORDER BY t.cj DESC) AS d_r_num
    FROM km_cj AS t
) AS ranked
WHERE ranked.d_r_num = 2;



-- Students holding the second-highest score in each subject, without window
-- functions. Output: km, user_id, cj.
--
-- Steps: (1) top score per subject, (2) highest score strictly below it per
-- subject, (3) every student whose score equals that second value.
--
-- The previous version removed top scorers with
-- "user_id NOT IN (<top scorers of any subject>)". That filter ignored the
-- subject, so a student who was first in one subject was also dropped from
-- every other subject, and NOT IN returns nothing if the list contains NULL.
-- Comparing scores inside the same subject avoids both problems.
-- A subject with only one distinct score has no second place and yields no row.
SELECT
    student.km,
    student.user_id,
    student.cj
FROM km_cj AS student
INNER JOIN (
    SELECT
        below_top.km,
        MAX(below_top.cj) AS second_cj
    FROM km_cj AS below_top
    INNER JOIN (
        SELECT
            km,
            MAX(cj) AS top_cj
        FROM km_cj
        GROUP BY km
    ) AS top_score
        ON top_score.km = below_top.km
    WHERE below_top.cj < top_score.top_cj
    GROUP BY below_top.km
) AS second_score
    ON second_score.km = student.km
    AND second_score.second_cj = student.cj;







6、求连续3个相同的数。


-- Recreates km_cj with subject names as km values (km = subject,
-- user_id = student, cj = score).
-- The CREATE TABLE statement previously had no terminating semicolon, so it
-- ran straight into the first INSERT and failed to parse.
DROP TABLE IF EXISTS `km_cj`;
CREATE TABLE `km_cj`  (
  `km` varchar(255) ,
  `user_id` int(0),
  `cj` bigint(0)
);

-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `km_cj` (`km`, `user_id`, `cj`) VALUES
  ('语文', 8, 1),
  ('语文', 7, 2),
  ('语文', 1, 3),
  ('语文', 2, 4),
  ('语文', 3, 5),
  ('语文', 4, 6),
  ('语文', 5, 7),
  ('语文', 6, 8),
  ('数学', 9, 1),
  ('数学', 10, 2),
  ('数学', 11, 3),
  ('数学', 12, 4),
  ('数学', 13, 1),
  ('数学', 14, 1),
  ('数学', 15, 2),
  ('数学', 16, 3),
  ('英语', 17, 1),
  ('英语', 18, 2),
  ('英语', 19, 3),
  ('英语', 20, 4),
  ('英语', 21, 1),
  ('英语', 23, 1),
  ('英语', 24, 2),
  ('英语', 22, 1),
  ('英语', 25, 1),
  ('体育', 26, 1),
  ('体育', 27, 1),
  ('体育', 28, 2),
  ('体育', 30, 1),
  ('体育', 31, 1);





--  -----------------------------------------------------------


-- Values of num that appear in at least three consecutive rows.
-- Each row is compared with the next two rows via LEAD(num, 1) and LEAD(num, 2).
--
-- NOTE(review): test_num (id, num) is not created anywhere in the visible
-- file; the table above this query is km_cj. Confirm the intended table.
-- NOTE(review): "consecutive" is taken to mean consecutive by id. The window
-- previously had no ORDER BY, which left the row sequence undefined.
SELECT DISTINCT ta.t_num
FROM (
    SELECT
        id,
        num AS t_num,
        LEAD(num, 1) OVER (ORDER BY id) AS t1,
        LEAD(num, 2) OVER (ORDER BY id) AS t2
    FROM test_num
) AS ta
WHERE ta.t_num = ta.t1
  AND ta.t_num = ta.t2;

7、关于学生成绩排名的所有问题查询总结:


-- Fixture for section 7 (ranking emulation).
-- s_id = student id, s_g = subject, s_score = score. Subjects s1 and s3 each
-- contain a tied score, which is what separates ROW_NUMBER, RANK and
-- DENSE_RANK in the queries below.
DROP TABLE IF EXISTS `sc`;
CREATE TABLE `sc`  (
  `s_id` int(255) ,
  `s_g` varchar(10) ,
  `s_score` int(4)
) ;

-- ----------------------------
-- Records of sc
-- ----------------------------
-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `sc` (`s_id`, `s_g`, `s_score`) VALUES
  (1, 's1', 79),
  (2, 's1', 79),
  (3, 's1', 59),
  (4, 's2', 81),
  (5, 's2', 73),
  (6, 's3', 82),
  (7, 's3', 82),
  (8, 's3', 91);




--  ------------------------------------




-- Show the whole fixture.
--   s_id    : student id
--   s_g     : subject
--   s_score : score
SELECT
    s_id,
    s_g,
    s_score
FROM sc;


-- Emulates ROW_NUMBER() OVER (PARTITION BY s_g ORDER BY s_score DESC)
-- without window functions.
-- row_num = how many rows of the same subject sort at or before this row.
-- Rows tied on score are ordered by s_id so every row gets a distinct number.
--
-- This replaces the @num / @s_g user-variable counter. @s_g was never
-- initialised in the statement and the counter depended on the order in
-- which the server evaluated rows.
-- Assumes s_g and s_score are not NULL in the rows being ranked.
-- On MySQL 8.0+ the built-in ROW_NUMBER() gives the same result directly.
SELECT
    cur.s_id,
    cur.s_g,
    cur.s_score,
    (
        SELECT COUNT(*)
        FROM (SELECT DISTINCT s_id, s_g, s_score FROM sc) AS ahead
        WHERE ahead.s_g = cur.s_g
          AND (
                ahead.s_score > cur.s_score
                OR (ahead.s_score = cur.s_score AND ahead.s_id <= cur.s_id)
              )
    ) AS row_num
FROM (SELECT DISTINCT s_id, s_g, s_score FROM sc) AS cur   -- exact duplicate rows collapse, as before
ORDER BY cur.s_g, cur.s_score DESC, cur.s_id;



-- Emulates DENSE_RANK() OVER (PARTITION BY s_g ORDER BY s_score DESC)
-- without window functions.
-- row_num = number of distinct scores in the same subject that are greater
-- than or equal to this row's score, so ties share a rank and no rank is
-- skipped. (The column keeps the name row_num used by the earlier version.)
--
-- This replaces the @num / @s_g / @s_score user-variable counter, which read
-- variables it never initialised and depended on row evaluation order.
-- Assumes s_g and s_score are not NULL in the rows being ranked.
SELECT
    cur.s_id,
    cur.s_g,
    cur.s_score,
    (
        SELECT COUNT(DISTINCT ahead.s_score)
        FROM sc AS ahead
        WHERE ahead.s_g = cur.s_g
          AND ahead.s_score >= cur.s_score
    ) AS row_num
FROM (SELECT DISTINCT s_id, s_g, s_score FROM sc) AS cur   -- exact duplicate rows collapse, as before
ORDER BY cur.s_g, cur.s_score DESC, cur.s_id;



--  -------------------问题总结:

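-- -- 第一次执行的时候,为什么 row_num 都等于 1 呢?是 Navicat 的问题,还是 SQL 语句的问题?
-- 答:是 SQL 语句(用户变量写法)的问题,与 Navicat 无关。很可能的原因是:如果语句引用了 @s_g、@s_score 这类尚未在当前会话中初始化的用户变量,首次执行时对它们的读取始终得到 NULL,s_g=@s_g 恒不成立,于是每一行都走 @num:=1 分支;第二次执行时变量已经存在于会话中,结果才看起来正常。应在语句内先初始化所有用到的变量,或改用不依赖用户变量的写法。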



-- One statement that emulates ROW_NUMBER(), DENSE_RANK() and RANK(), each
-- partitioned by s_g and ordered by s_score DESC, without window functions.
--   ROW_NUMBER : rows of the subject sorting at or before this row (ties by s_id)
--   DENSE_RANK : distinct scores of the subject >= this score
--   rank       : 1 + rows of the subject with a strictly higher score
--
-- Fixes over the user-variable version:
--   * ROW_NUMBER, DENSE_RANK and RANK are reserved words in MySQL 8.0, so the
--     unquoted aliases did not parse; they are now backtick-quoted.
--   * The old RANK formula returned the dense rank whenever the previous row
--     had the same dense rank. For scores 90, 80, 80, 70, 70 that gives
--     1, 2, 2, 4, 3 instead of 1, 2, 2, 4, 4.
--   * No uninitialised user variables and no dependence on evaluation order.
-- Assumes s_g and s_score are not NULL in the rows being ranked.
SELECT
    cur.s_id,
    cur.s_g,
    cur.s_score,
    (
        SELECT COUNT(*)
        FROM (SELECT DISTINCT s_id, s_g, s_score FROM sc) AS ahead
        WHERE ahead.s_g = cur.s_g
          AND (
                ahead.s_score > cur.s_score
                OR (ahead.s_score = cur.s_score AND ahead.s_id <= cur.s_id)
              )
    ) AS `ROW_NUMBER`,
    (
        SELECT COUNT(DISTINCT ahead.s_score)
        FROM sc AS ahead
        WHERE ahead.s_g = cur.s_g
          AND ahead.s_score >= cur.s_score
    ) AS `DENSE_RANK`,
    (
        SELECT COUNT(*) + 1
        FROM (SELECT DISTINCT s_id, s_g, s_score FROM sc) AS ahead
        WHERE ahead.s_g = cur.s_g
          AND ahead.s_score > cur.s_score
    ) AS `rank`
FROM (SELECT DISTINCT s_id, s_g, s_score FROM sc) AS cur   -- exact duplicate rows collapse, as before
ORDER BY cur.s_g, cur.s_score DESC, cur.s_id;







8、在没有分组的情况下,实现rank()函数


-- Fixture for section 8 (RANK without partitioning).
-- Scores 99 and 95 each occur twice, so the expected ranks are
-- 1, 2, 2, 4, 5, 5, 7.
DROP TABLE IF EXISTS `score`;
CREATE TABLE `score`  (
  `id` int(11),
  `name` varchar(255),
  `score` int(11) 
);

-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `score` (`id`, `name`, `score`) VALUES
  (1, '001', 100),
  (2, '002', 99),
  (3, '003', 99),
  (4, '004', 96),
  (5, '005', 95),
  (6, '006', 95),
  (7, '007', 65);




--  --------------------------------------- 上面是表及数据


-- Show the whole fixture.
SELECT
    id,
    name,
    score
FROM score;


-- RANK() over the whole table, written with CASE WHEN ... THEN ... ELSE ... END.
-- Every row is compared with every row (CROSS JOIN) and conditional sums count
-- how many rows sort ahead of it.
--
-- Output columns:
--   id, name, score
--   j          - sequential position by score DESC (ties ordered by id)
--   rank       - 1 + number of rows with a strictly higher score, so equal
--                scores share a rank and the following rank is skipped
--   pre_score  - the row's own score, kept for output compatibility
--
-- Fixes over the @j / @k / @pre_score version:
--   * RANK is a reserved word in MySQL 8.0; the alias is now backtick-quoted.
--   * The ranking no longer relies on an ORDER BY inside a derived table
--     reaching the outer rows, nor on user-variable evaluation order.
--   * @pre_score started at 0, so a highest score of 0 received rank 0.
-- Assumes id is unique and score is not NULL.
SELECT
    cur.id,
    cur.name,
    cur.score,
    SUM(
        CASE
            WHEN other.score > cur.score THEN 1
            WHEN other.score = cur.score AND other.id <= cur.id THEN 1
            ELSE 0
        END
    ) AS j,
    1 + SUM(CASE WHEN other.score > cur.score THEN 1 ELSE 0 END) AS `rank`,
    cur.score AS pre_score
FROM score AS cur
CROSS JOIN score AS other
GROUP BY cur.id, cur.name, cur.score
ORDER BY cur.score DESC, cur.id;



-- RANK() over the whole table, written with MySQL's IF() function.
-- Every row is compared with every row (CROSS JOIN) and conditional sums count
-- how many rows sort ahead of it.
--
-- Output columns:
--   id, name, score
--   j             - sequential position by score DESC (ties ordered by id)
--   rank          - 1 + number of rows with a strictly higher score, so equal
--                   scores share a rank and the following rank is skipped
--   pre_score_ed  - the row's own score, kept for output compatibility
--
-- Fixes over the @j / @k / @pre_score version:
--   * RANK is a reserved word in MySQL 8.0; the alias is now backtick-quoted.
--   * The ranking no longer relies on an ORDER BY inside a derived table
--     reaching the outer rows, nor on user-variable evaluation order.
--   * @pre_score started at 0, so a highest score of 0 received rank 0.
-- Assumes id is unique and score is not NULL.
SELECT
    cur.id,
    cur.name,
    cur.score,
    SUM(
        IF(
            other.score > cur.score
            OR (other.score = cur.score AND other.id <= cur.id),
            1,
            0
        )
    ) AS j,
    1 + SUM(IF(other.score > cur.score, 1, 0)) AS `rank`,
    cur.score AS pre_score_ed
FROM score AS cur
CROSS JOIN score AS other
GROUP BY cur.id, cur.name, cur.score
ORDER BY cur.score DESC, cur.id;





9、简单版本:留存率的问题。

-- --- ----数据库表


-- Fixture for section 9 (retention): one row per login.
-- login_time is stored as text in the form 'YYYY/M/D H:MM' (no zero padding);
-- uid 3 logs in twice on 2019/1/9, so per-day counts must de-duplicate.
DROP TABLE IF EXISTS `15.17_user_login`;
CREATE TABLE `15.17_user_login`  (
  `uid` varchar(255) ,
  `login_time` varchar(255)
) ;

-- Columns are named explicitly so the load does not depend on column order.
INSERT INTO `15.17_user_login` (`uid`, `login_time`) VALUES
  ('1', '2019/1/1 6:00'),
  ('2', '2019/1/1 10:00'),
  ('3', '2019/1/1 19:00'),
  ('1', '2019/1/2 10:00'),
  ('2', '2019/1/2 9:00'),
  ('3', '2019/1/2 14:00'),
  ('1', '2019/1/3 8:00'),
  ('2', '2019/1/9 14:00'),
  ('3', '2019/1/9 10:00'),
  ('3', '2019/1/9 15:00');




--  -------------------



-- Number of users seen on each labelled day.
-- Output: len_u (label), id_num (count of uid, each counted once per day).
-- id_num is a head count, not a ratio; a retention rate would divide it by
-- the number of users in the base-day cohort.
--
-- Fixes:
--   * The day is taken as everything before the first space. LEFT(login_time, 8)
--     only worked for one-digit months and days: '2019/1/10 8:00' was cut to
--     '2019/1/1'.
--   * "SELECT * ... GROUP BY uid, LEFT(...)" selected non-grouped columns;
--     SELECT DISTINCT now performs the per-user, per-day de-duplication.
--   * The CASE expression is written once and grouped by its alias.
SELECT
    CASE r.login_day
        WHEN '2019/1/2' THEN '次日留存率'
        WHEN '2019/1/3' THEN '3日留存率'
        WHEN '2019/1/9' THEN '9日留存率'
        ELSE '其他时间留存率'
    END AS len_u,
    COUNT(r.uid) AS id_num
FROM (
    SELECT DISTINCT
        uid,
        SUBSTRING_INDEX(login_time, ' ', 1) AS login_day
    FROM `15.17_user_login`
) AS r
GROUP BY len_u;

10、

11、

12、

13、

14、

15、

16、

17、

18、

19、

20、

21、

22、

23、

24、

25、

26、

27、

28、

29、

30、

31、

32、

33、

34、

35、

36、

37、

38、

39、

40、

41、

42、

43、

44、

45、

46、

47、

48、

49、

50.

猜你喜欢

转载自blog.csdn.net/weixin_54217632/article/details/120819965