hive学习之路

#变量

#引入变量
given_dayno="'20190601'"
……
dayno=${given_dayno}
#退出
exit ${v_job_stat}

#参数

#map数过多 跑不动:number of mappers: 230691; number of reducers: 1099
-- Raise the max split size (~1 GB) so far fewer mappers are launched for huge inputs
set mapred.max.split.size=1000000000;
#并行执行(union all 多的情况)
-- Run independent stages concurrently (helps queries with many UNION ALLs)
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;
#跟大家分享个hive调优的方法
今天有个任务,数据量不大(几个字段),但是记录数3kw左右,做join有个stage跑了一个多小时都没跑完
--时间长原因 
map数太少,运行时该stage只被分配了11个map
--解决方法:适当加大map
-- Shrink the max split size (~20 MB) so the input is split into more mappers
set mapreduce.input.fileinputformat.split.maxsize=20000000;
-- Minimum split size per node / per rack (~10 MB) to keep splits from being re-merged
set mapreduce.input.fileinputformat.split.minsize.per.node=10000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=10000000;
优化后,脚本大概20分钟就跑完了
#常见的设置参数
-- Merge small output files of map-reduce and map-only jobs
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
-- Trigger the merge while the average output file is below 512 MB
set hive.merge.smallfiles.avgsize=536870912;
-- Max input split size: 128 MB
set mapred.max.split.size=134217728;
-- Container memory for map / reduce tasks (MB)
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=6144;
-- Let the optimizer auto-convert joins against small tables into map joins
set hive.auto.convert.join=true;
-- Execute independent stages in parallel, up to 8 at a time
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;
-- Compress intermediate map output with Snappy
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- Target ~128 MB of input per reducer
set hive.exec.reducers.bytes.per.reducer=134217728;

#hql

#count把0也算一条记录(count只跳过NULL,不跳过0)
错:count(if(act_code in ('20090031','20180031','20090080'),1,0))
对:count(if(act_code in ('20090031','20180031','20090080'),1,null))
    sum(if(act_code in ('20090031','20180031','20090080'),1,0))
#增加列
-- Add a new column to an existing table
alter table table_name add columns (col_name bigint);
#修改名
-- Generic ALTER TABLE forms:
ALTER TABLE name RENAME TO new_name
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
-- NOTE(review): Hive has no direct DROP COLUMN; columns are removed via REPLACE COLUMNS -- confirm for your Hive version
ALTER TABLE name DROP [COLUMN] column_name
ALTER TABLE name CHANGE column_name new_name new_type
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
#根据分区字段删除分区:
-- Drop a partition by its partition-key value; IF EXISTS avoids an error when it is absent
ALTER TABLE my_partition_test_table DROP IF EXISTS PARTITION (p_loctype='MHA');
#substring,注意,从1开始,不是0;
#截取 从字符串的第 4 个字符位置开始取,只取 2 个字符。
-- substring(str, pos, len): pos is 1-based, so this returns 'mp'
select substring('example.com', 4, 2);
#substring 取最后两位
-- Last two characters: start at length-1, take 2 (Hive's substr also accepts a negative start, e.g. substring('abc', -2))
select substring('abc',length('abc')-1,2)
#建表
-- Partitioned table template: partitioned by dayno, stored as ORC
CREATE TABLE tableName
(
     col1       string,
     col2       bigint
)
partitioned by (dayno string)
-- NOTE(review): the DELIMITED field/collection/map delimiters below only apply to text serdes;
-- ORC ignores them -- confirm whether text or ORC storage was intended
row format delimited
fields terminated by '\t' 
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
stored as orcfile; 
#分享一个自己遇到的MAPJOIN代码运行效率调优的实例:调优前运行5小时,
MAPJOIN调优后运行1小时左右,大表小表join时使用MAPJOIN能有效提升运行效率。
优化后代码截取段:
    -- Map-join the two small derived tables (latest partition + model dimension)
    -- against the large user-tag fact table; GROUP BY dedups the final rows.
    select /*+ MAPJOIN(latest, dim) */
           u.imei,
           u.gender,
           u.age,
           u.city_grade_name,
           u.model,
           dim.series
    from dw.f_agt_user_tag u
    inner join
    (
        -- single-row table: newest available partition
        select max(dayno) as dayno
        from dw.f_agt_user_tag
    ) latest on u.dayno = latest.dayno
    inner join
    (
        -- distinct (model, series) pairs from the dimension table
        select model_level_1 as model, series
        from dw.f_dim_model_info
        group by model_level_1, series
    ) dim on u.model = dim.model
    where u.imei is not null
    group by u.imei, u.gender, u.age, u.city_grade_name, u.model, dim.series

参考阅读资料:
https://blog.csdn.net/kwu_ganymede/article/details/51365002
https://blog.csdn.net/liuxiao723846/article/details/78739097
https://help.aliyun.com/knowledge_detail/40270.html
#PARTITION BY      order by
计算累计
-- Running / windowed sums of pv per cookieid, ordered by createtime
SELECT cookieid,
createtime,
pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- default frame: partition start to current row
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, -- explicit frame, same result as pv1
SUM(pv) OVER(PARTITION BY cookieid) AS pv3,                                -- all rows in the partition
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,   -- current row + 3 preceding rows
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,    -- current row + 3 preceding + 1 following
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6   -- current row + all following rows
FROM lxw1234;
 
cookieid createtime     pv      pv1     pv2     pv3     pv4     pv5      pv6 
-----------------------------------------------------------------------------
cookie1  2015-04-10      1       1       1       26      1       6       26
cookie1  2015-04-11      5       6       6       26      6       13      25
cookie1  2015-04-12      7       13      13      26      13      16      20
cookie1  2015-04-13      3       16      16      26      16      18      13
cookie1  2015-04-14      2       18      18      26      17      21      10
cookie1  2015-04-15      4       22      22      26      16      20      8
cookie1  2015-04-16      4       26      26      26      13      13      4
pv1: 分组内从起点到当前行的pv累积,如,11号的pv1=10号的pv+11号的pv, 12号=10号+11号+12号
pv2: 同pv1
pv3: 分组内(cookie1)所有的pv累加
pv4: 分组内当前行+往前3行,如,11号=10号+11号, 12号=10号+11号+12号, 13号=10号+11号+12号+13号, 14号=11号+12号+13号+14号
pv5: 分组内当前行+往前3行+往后1行,如,14号=11号+12号+13号+14号+15号=5+7+3+2+4=21
pv6: 分组内当前行+往后所有行,如,13号=13号+14号+15号+16号=3+2+4+4=13,14号=14号+15号+16号=2+4+4=10
#索引失效
看看有没有索引,没有建,有,看是否失效;
比如dim是int类型,查询写成dim=1可以走索引;写成dim='1'会因隐式类型转换导致索引失效
#rlike(一列中包含另一列,如标题,包含低俗词)
a.title rlike b.word
#笛卡尔积
cross join(匹配一个表,如含低俗词)
#开窗函数 分析函数
OVER(PARTITION BY)函数用法
分析函数用于计算基于组的某种聚合值,它和聚合函数的不同之处是:对于每个组返回多行,而聚合函数对于每个组只返回一行。
1:over后的写法:    
   over(order by salary)    按照salary排序进行累计;只写order by不指定窗口时,默认窗口范围是从分区起点到当前行
   over(partition by deptno)按照部门分区
2、开窗的窗口范围:
   over(order by salary range between 5 preceding and 5 following):窗口范围为当前行数据幅度减5加5后的范围内的。
3、与over函数结合的几个函数介绍
   a)rank()over() 、dense_rank()over()可以把并列第一查出来,row_number()over()只返回一个结果
   b)rank()是跳跃排序,有两个第二名时接下来就是第四名;dense_rank()是连续排序,有两个第二名时仍然跟着第三名
   -- Top scorer(s) per class; rank() keeps ties, so mm = 1 can match several rows per class
   select *
   from
   (
       select name,
              class,
              s,
              rank() over (partition by class order by s desc) as mm
       from t2
   ) ranked  -- fix: Hive requires an alias on every derived table in FROM
   where ranked.mm = 1;
    
   c)sum()over()
   d)first_value() over()和last_value() over()
#concat_ws  用|串联 
concat_ws('|',collect_set(host)) as host
#筛选条件
select * from t1 left outer join t2 on t1.id=t2.id and t2.cnt>=40  表2先筛选大于等于40,再关联
select * from t1 left outer join t2 on t1.id=t2.id where t2.cnt>=40  最后筛选,如果有空,也会过滤掉
#多层聚类cube
-- WITH CUBE aggregates over every combination of the GROUP BY columns:
-- (), (month), (day), (month, day); GROUPING__ID labels each grouping set
SELECT  month, day,
COUNT(DISTINCT cookieid) AS uv,
GROUPING__ID 
FROM cookie5 
GROUP BY month,day 
WITH CUBE 
ORDER BY GROUPING__ID;


等价于



-- The CUBE above expands to these four grouping sets unioned together.
-- NOTE(review): the hard-coded GROUPING__ID values (0..3) follow the old pre-2.3 Hive
-- numbering; Hive 2.3+ assigns them per the SQL standard (bits inverted) -- confirm
-- against your Hive version before relying on these constants.
SELECT NULL,NULL,COUNT(DISTINCT cookieid) AS uv,0 AS GROUPING__ID FROM cookie5
UNION ALL 
SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM cookie5 GROUP BY month 
UNION ALL 
SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM cookie5 GROUP BY day
UNION ALL 
SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS GROUPING__ID FROM cookie5 GROUP BY month,day

参考:https://www.cnblogs.com/qingyunzong/p/8798987.html

猜你喜欢

转载自www.cnblogs.com/syj-love-dy/p/11130194.html