#variable
# Variable: define given_dayno='20190601' and reference it later as dayno=${given_dayno}
# Exit
exit ${v_job_stat}
#parameter
# Map input too large, job would not run — number of mappers: 230691; number of reducers: 1099. Fix: SET mapred.max.split.size=1000000000;
# Parallel execution (useful for queries with multiple UNION ALL branches): SET hive.exec.parallel=true; SET hive.exec.parallel.thread.number=8;
# Hive tuning example: a task with little data per row (a few fields) but ~30M records had a join stage running over an hour without finishing. Cause: too few maps — the stage was allocated only 11 mappers. Solution: increase the number of maps by shrinking the input splits: SET mapreduce.input.fileinputformat.split.maxsize=20000000; SET mapreduce.input.fileinputformat.split.minsize.per.node=10000000; SET mapreduce.input.fileinputformat.split.minsize.per.rack=10000000; — after tuning, the script finished in about 20 minutes.
# Common configuration parameters: set hive.merge.mapredfiles=true; set hive.merge.mapfiles=true; set hive.merge.smallfiles.avgsize=536870912; set mapred.max.split.size=134217728; set mapreduce.map.memory.mb=4096; set mapreduce.reduce.memory.mb=6144; set hive.auto.convert.join=true; set hive.exec.parallel=true; set hive.exec.parallel.thread.number=8; set hive.exec.compress.intermediate=true; set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec; set hive.exec.reducers.bytes.per.reducer=134217728;
#hql
# COUNT counts every non-null value, so COUNT(IF(cond, 1, 0)) counts ALL rows (0 is not null) — a common mistake. Wrong: COUNT(IF(act_code in ('20090031','20180031','20090080'), 1, 0)). Correct: COUNT(IF(act_code in ('20090031','20180031','20090080'), 1, null)) or SUM(IF(act_code in ('20090031','20180031','20090080'), 1, 0))
# Add a column
-- Add a new bigint column to an existing table (metadata-only change in Hive).
ALTER TABLE table_name ADD COLUMNS (col_name BIGINT);
# Rename a table / change a column
# Rename a table:
ALTER TABLE name RENAME TO new_name
# Add one or more columns (col_spec = name type [COMMENT ...]):
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
# Drop a column:
ALTER TABLE name DROP [COLUMN] column_name
# Rename a column and/or change its type:
ALTER TABLE name CHANGE column_name new_name new_type
# Replace the entire column list with a new one:
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
# Drop a partition: ALTER TABLE my_partition_test_table DROP IF EXISTS PARTITION (p_loctype='MHA');
# substring: note the index starts at 1, not 0. Take 2 characters starting at position 4: SELECT substring('example.com', 4, 2); -- returns 'mp'. Take the last two characters: SELECT substring('abc', length('abc') - 1, 2)
# Create a table: CREATE TABLE tableName ( col1 string, col2 bigint ) partitioned by (dayno string) row format delimited fields terminated by '\t' COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY ':' stored as orcfile;
# MAPJOIN tuning example from experience: the job ran ~5 hours before tuning and ~1 hour after. Use MAPJOIN when joining a large table against small tables — the small tables are loaded into memory on the map side, avoiding a reduce-side join. Optimized code: SELECT /*+ MAPJOIN(t2, t3) */ t1.imei, t1.gender, t1.age, t1.city_grade_name, t1.model, t3.series FROM dw.f_agt_user_tag t1 INNER JOIN (SELECT max(dayno) AS dayno FROM dw.f_agt_user_tag) t2 ON t1.dayno = t2.dayno INNER JOIN (SELECT model_level_1 AS model, series FROM dw.f_dim_model_info GROUP BY model_level_1, series) t3 ON t1.model = t3.model WHERE t1.imei IS NOT NULL GROUP BY t1.imei, t1.gender, t1.age, t1.city_grade_name, t1.model, t3.series — reference reading: https://blog.csdn.net/kwu_ganymede/article/details/51365002 https://blog.csdn.net/liuxiao723846/article/details/78739097 https://help.aliyun.com/knowledge_detail/40270.html
# PARTITION BY + ORDER BY computes cumulative sums:
# SELECT cookieid, createtime, pv,
#   SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime) AS pv1,                                                    -- default frame: partition start to current row
#   SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2,  -- same result as pv1
#   SUM(pv) OVER (PARTITION BY cookieid) AS pv3,                                                                       -- all rows in the partition
#   SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,          -- current row + previous 3 rows
#   SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,          -- previous 3 rows + current row + next 1 row
#   SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6   -- current row + all following rows
# FROM lxw1234;
# Sample output (cookieid createtime pv pv1 pv2 pv3 pv4 pv5 pv6):
# cookie1 2015-04-10 1  1  1 26  1  6 26
# cookie1 2015-04-11 5  6  6 26  6 13 25
# cookie1 2015-04-12 7 13 13 26 13 16 20
# cookie1 2015-04-13 3 16 16 26 16 18 13
# cookie1 2015-04-14 2 18 18 26 17 21 10
# cookie1 2015-04-15 4 22 22 26 16 20  8
# cookie1 2015-04-16 4 26 26 26 13 13  4
# pv1: running total from the partition start, e.g. pv1(04-11) = pv(04-10) + pv(04-11); pv1(04-12) = pv(04-10) + pv(04-11) + pv(04-12)
# pv2: identical to pv1
# pv3: total pv over the whole partition (cookie1)
# pv4: current row + previous 3 rows, e.g. pv4(04-14) = pv(04-11) + pv(04-12) + pv(04-13) + pv(04-14)
# pv5: previous 3 rows + current row + next 1 row, e.g. pv5(04-14) = 5 + 7 + 3 + 2 + 4 = 21
# pv6: current row + all following rows, e.g. pv6(04-13) = 3 + 2 + 4 + 4 = 13; pv6(04-14) = 2 + 4 + 4 = 10
# Invalid index: check whether an index exists and, if so, whether it is actually used. E.g. if column dim is int, writing dim = 1 uses the index, but dim = '1' forces an implicit type conversion and the index is not used — NOTE(review): the garbled original shows the same literal twice; presumably the failing case is the string-typed comparison — confirm against the source.
# rlike (one column matched as a regex against another — e.g. does title contain a vulgar word)
a.title RLIKE b.word
# Cartesian product: CROSS JOIN (pairs every row of one table with every row of the other — e.g. matching text against a table of vulgar words)
# Window (analytic) functions — OVER (PARTITION BY ...): an analytic function computes an aggregate over a group of rows but, unlike an aggregate function, returns a value for EVERY row of the group rather than one row per group. 1) over(order by salary): running accumulation in salary order; over(partition by deptno): partitioned by department. 2) Window range: over(order by salary range between 5 preceding and 5 following) — rows whose salary is within +/-5 of the current row's value. 3) Functions combined with over(): a) rank() over() and dense_rank() over() can return ties (e.g. a shared first place); row_number() over() always returns a unique number per row. b) rank() skips ranks after ties (two seconds -> next is fourth); dense_rank() is continuous (two seconds -> next is third). Example: SELECT * FROM ( SELECT name, class, s, rank() over(partition by class order by s desc) mm FROM t2 ) t WHERE mm = 1; c) sum() over() d) first_value() over() and last_value() over()
#concat_ws with a | series
concat_ws('|',collect_set(host)) as host
# Filter placement with outer joins: SELECT * FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.id AND t2.cnt >= 40 — t2 is filtered to cnt >= 40 BEFORE the join, so every t1 row is kept. SELECT * FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.id WHERE t2.cnt >= 40 — the filter runs AFTER the join, so t1 rows without a matching t2 row (t2.cnt is NULL) are filtered out, silently turning the left join into an inner join.
# Multi-level aggregation with CUBE: SELECT month, day, COUNT(DISTINCT cookieid) AS uv, GROUPING__ID FROM cookie5 GROUP BY month,day WITH CUBE ORDER BY GROUPING__ID; is equivalent to: SELECT NULL,NULL,COUNT(DISTINCT cookieid) AS uv,0 AS GROUPING__ID FROM cookie5 UNION ALL SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM cookie5 GROUP BY month UNION ALL SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM cookie5 GROUP BY day UNION ALL SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS GROUPING__ID FROM cookie5 GROUP BY month,day
Reference: https://www.cnblogs.com/qingyunzong/p/8798987.html