4. HiveSQL query statement methods

4. Query statement

4.1 Grammar rules and data preparation

Official URL
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Select

Basic query statement syntax:

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
 FROM table_reference
 [WHERE where_condition]
 [GROUP BY col_list]
 [ORDER BY col_list]
 [CLUSTER BY col_list
  | [DISTRIBUTE BY col_list] [SORT BY col_list]
 ]
 [LIMIT [offset,] rows]

select [all | distinct] select_expr, select_expr, ...
 from table_reference
 [where where_condition]
 [group by col_list]
 [order by col_list]
 [cluster by col_list
  | [distribute by col_list] [sort by col_list]
 ]
 [limit [offset,] rows]

data preparation:

Raw data
department table: dept

10  accounting  1700
20  research    1800
30  sales   1900
40  operations  1700

Employee table emp:

7369    smith   clerk   7902    1980-12-17  800.00      20
7499    allen   salesman    7698    1981-2-20   1600.00 300.00  30
7521    ward    salesman    7698    1981-2-22   1250.00 500.00  30
7566    jones   manager 7839    1981-4-2    2975.00     20
7654    martin  salesman    7698    1981-9-28   1250.00 1400.00 30
7698    blake   manager 7839    1981-5-1    2850.00     30
7782    clark   manager 7839    1981-6-9    2450.00     10
7788    scott   analyst 7566    1987-4-19   3000.00     20
7839    king    president       1981-11-17  5000.00     10
7844    turner  salesman    7698    1981-9-8    1500.00 0.00    30
7876    adams   clerk   7788    1987-5-23   1100.00     20
7900    james   clerk   7698    1981-12-3   950.00      30
7902    ford    analyst 7566    1981-12-3   3000.00     20
7934    miller  clerk   7782    1982-1-23   1300.00     10

Insert data
Create department table

-- Department table; loaded from the tab-separated file dept.txt.
create table if not exists dept(
deptno int,   -- department number
dname string, -- department name
loc int       -- location code
)
row format delimited fields terminated by '\t';

Create employee table

-- Employee table; loaded from the tab-separated file emp.txt.
create table if not exists emp(
empno int,       -- employee number
ename string,    -- employee name
job string,      -- job title (clerk, salesman, manager, analyst, president)
mgr int,         -- manager's employee number (NULL for the president row)
hiredate string, -- hire date, kept as a string in the source data
sal double,      -- monthly salary
comm double,     -- commission/bonus; NULL where the data file has no value
deptno int)      -- department number, matches dept.deptno (not enforced by Hive)
row format delimited fields terminated by '\t';

Write data to two files under /root/data/ in node4

[root@node4 data]# vim dept.txt
[root@node4 data]# vim emp.txt
[root@node4 data]# cat dept.txt
10	ccounting	1700
20	research	1800
30 	sales		1900
40 	operations  	1700

[root@node4 data]# cat emp.txt 
7369	mith   clerk   7902    1980-12-17  800.00      20
7499    allen   salesman    7698    1981-2-20   1600.00 300.00  30
7521    ward    salesman    7698    1981-2-22   1250.00 500.00  30
7566    jones   manager 7839    1981-4-2    2975.00     20
7654    martin  salesman    7698    1981-9-28   1250.00 1400.00 30
7698    blake   manager 7839    1981-5-1    2850.00     30
7782    clark   manager 7839    1981-6-9    2450.00     10
7788    scott   analyst 7566    1987-4-19   3000.00     20
7839    king    president       1981-11-17  5000.00     10
7844    turner  salesman    7698    1981-9-8    1500.00 0.00    30
7876    adams   clerk   7788    1987-5-23   1100.00     20
7900    james   clerk   7698    1981-12-3   950.00      30
7902    ford    analyst 7566    1981-12-3   3000.00     20
7934    miller  clerk   7782    1982-1-23   1300.00     10

[root@node4 data]# pwd
/root/data

Operation on hive: create table + import
table structure in hive, import txt into table in hive

hive> create table if not exists dept(
    > deptno int,
    > dname string,
    > loc int
    > )
    > row format delimited fields terminated by '\t';
OK
Time taken: 0.829 seconds
hive> load data local inpath '/root/data/dept.txt' into table dept;
Loading data to table default.dept
OK
Time taken: 0.916 seconds
hive> create table if not exists emp(
    > empno int,
    > ename string,
    > job string,
    > mgr int,
    > hiredate string, 
    > sal double, 
    > comm double,
    > deptno int)
    > row format delimited fields terminated by '\t';
OK
Time taken: 0.226 seconds
hive> load data local inpath '/root/data/emp.txt' into table emp;
Loading data to table default.emp
OK
Time taken: 0.599 seconds

4.2 Basic query

4.2.1 Full table and specified column query

Full table query: use * to represent all columns

hive> select * from dept;
OK
10	ccounting  	1700
20	research	1800
30	sales	1900
40	operations	1700
Time taken: 0.424 seconds, Fetched: 4 row(s)

hive> select * from emp;
OK
7369	smith	clerk	7902	1980-12-17	800.0	NULL	20
7499	allen   salesman	7698	NULL	1600.00	300.0	30.0	NULL
7521	ward    salesman    	7698	NULL	1250.00 500.00	30.0	NULL	NULL
Time taken: 0.401 seconds, Fetched: 15 row(s)

Query the specified column in hive: add the specific column name after select

hive> select  deptno,dname from dept;
OK
10	ccounting  
20	research
30	sales
40	operations
Time taken: 0.442 seconds, Fetched: 4 row(s)

The SQL language is not case-sensitive
and can be written in one line or multiple lines.
Keywords cannot be abbreviated and cannot be divided into lines.
Complicated SQL is generally written in lines.
Indentation is used to improve the readability of the statement.

4.2.2 Column aliases

--查询雇员的名称和部门
hive> select ename name, deptno as no from emp;
OK
smith	20
allen   salesman	NULL
ward    salesman    	NULL
jones   manager 7839	NULL
martin  salesman    	NULL
blake   manager 7839   	NULL
clark   manager 7839   	NULL
scott   analyst 7566   	NULL
king    president       	NULL
turner  salesman    	NULL
adams   clerk   7788    1987-5-23   	NULL
james   clerk   7698    1981-12-3   	NULL
ford    analyst 7566    1981-12-3   	NULL
miller  clerk   7782    1982-1-23   	NULL
NULL	NULL
Time taken: 0.443 seconds, Fetched: 15 row(s)
......

How to use hive column aliases: directly follow the column name or use the as keyword to follow the column alias.
The benefits of using column aliases: Simplify the use.

4.2.3 Arithmetic operators

insert image description here
demo

hive> select ename,sal*12 from emp;
OK
smith 9600.0
allen 19200.0
ward 15000.0
jones 35700.0
martin 15000.0
blake 34200.0
clark 29400.0
scott 36000.0
king 60000.0
turner 18000.0
adams 13200.0
james 11400.0
ford 36000.0
miller 15600.0
Time taken: 0.993 seconds, Fetched: 14 row(s)
hive> select ename,sal*12
year_money,sal*12+2000 year_all_money from emp;
OK
smith 9600.0 11600.0
allen 19200.0 21200.0
ward 15000.0 17000.0
jones 35700.0 37700.0
martin 15000.0 17000.0
blake 34200.0 36200.0
clark 29400.0 31400.0
scott 36000.0 38000.0
king 60000.0 62000.0
turner 18000.0 20000.0
adams 13200.0 15200.0
james 11400.0 13400.0
ford 36000.0 38000.0
miller 15600.0 17600.0
Time taken: 0.322 seconds, Fetched: 14 row(s)

4.2.4 Commonly used aggregate functions

# 查询总共多少行
hive> select count(*) from emp;
hive> select count(empno) from emp;
# 查询最低工资
hive> select min(sal) from emp;
# 查询最高工资
hive> select max(sal) from emp;
# 查询平均工资
hive> select avg(sal) from emp;
# 查询和
hive> select sum(sal) from emp;

4.2.5 where statement

Use the where statement, function: filter out the data that does not meet the conditions.

#查询工资大于2500的所有雇员。
hive> select * from emp where sal>2500;
OK
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
hive> select * from emp where sal>2500 and
deptno=20;
OK
128
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
Time taken: 0.504 seconds, Fetched: 3 row(s)
hive> select * from emp where sal>2500 and
deptno!=20;
OK
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7839 king president NULL 1981-11-17 
5000.0 NULL 10
Time taken: 0.391 seconds, Fetched: 2 row(s)

4.2.6 limit statement

Use limit to query the specified number of rows.

hive> select * from emp limit 8;
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
129
130
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
Time taken: 0.434 seconds, Fetched: 8 row(s)
hive> select * from emp limit 0,5;
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
Time taken: 0.308 seconds, Fetched: 5 row(s)
hive> select * from emp limit 5,5;
OK
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30
Time taken: 0.293 seconds, Fetched: 5 row(s)
# limit语句和where语句可以一起使用,一起使用时,需要放在where语句的后面
hive> select * from emp where sal>1000 limit 5;
OK
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
Time taken: 0.307 seconds, Fetched: 5 row(s)

4.2.7 Comparison operators

Usually used in where statement, having statement, join...on
insert image description here
insert image description here

# 查询工资等于1600的
hive> select * from emp where sal = 1600;
OK
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
Time taken: 0.313 seconds, Fetched: 1 row(s)
# 查询工资 [1000,2000]所有雇员
hive> select * from emp where sal between 1000
and 2000;
OK
133
134
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
Time taken: 0.31 seconds, Fetched: 6 row(s)
# 查询工资不在 [1000,2000]区间的所有雇员 <1000 和
>2000的
hive> select * from emp where sal not between
1000 and 2000;
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
7900 james clerk 7698 1981-12-3 
950.0 NULL 30
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
Time taken: 0.305 seconds, Fetched: 8 row(s)
#查询 comm列的值为null的
hive> select * from emp where comm is null;
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7900 james clerk 7698 1981-12-3 
950.0 NULL 30
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
Time taken: 0.335 seconds, Fetched: 10 row(s)
#查询 comm列的值不为null的
hive> select * from emp where comm is not null;
OK
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
136
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30
# 查询工资1500或1600的所有雇员
hive> select * from emp where sal in
(1500,1600);
OK
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30

like and rlike: fuzzy query, query the keywords in the specified column.
Wildcard:
% represents 0 or more arbitrary characters
_ represents 1 arbitrary character
rlike: is an extended function in hive, which can specify matching conditions through regular expressions.
Case combat:

#查询名称以字符m开头的所有雇员
hive> select * from emp where ename like 'm%';
OK
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
137
Time taken: 0.282 seconds, Fetched: 2 row(s)
#查询名称中第二字符是m的所有雇员的信息
hive> select * from emp where ename like '_m%';
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
Time taken: 0.335 seconds, Fetched: 1 row(s)
# like查询名称中包含m的所有雇员的信息
hive> select * from emp where ename like '%m%';
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7900 james clerk 7698 1981-12-3 
950.0 NULL 30
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
Time taken: 0.293 seconds, Fetched: 5 row(s)
# rlike查询名称中包含m的所有雇员的信息
hive> select * from emp where ename rlike
'[m]';
OK
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7900 james clerk 7698 1981-12-3 
950.0 NULL 30
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
Time taken: 0.262 seconds, Fetched: 5 row(s)

4.2.8 Logical operators

insert image description here
Actual case:
query salary is greater than 1000, and bonus is greater than or equal to 500

hive> select * from emp where sal>1000 and comm >=500;
OK
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30

Query salary greater than 1000, or bonus greater than or equal to 500

hive> select * from emp where sal>1000 or comm>=500;
OK
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10

Query all employees except 10 and 30 departments

4.3 Grouping

4.3.1 group by statement

group by is usually used together with aggregate functions to group by one or more columns, and then aggregate each group of data.
Case combat:
Query the average salary, maximum salary, and minimum salary of each department

hive> select deptno,avg(sal) avg_sal,max(sal)
max_sal,min(sal) min_sal from emp group by
deptno;
Query ID = root_20211115150801_985c1ac7-9260-
423b-a9b0-72eae86b3b92
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated
from input data size: 1
In order to change the average load for a
reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=
<number>
In order to limit the maximum number of
reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1636950452604_0003, Tracking
URL =
http://node4:8088/proxy/application_16369504526
04_0003/
Kill Command = /opt/hadoop-3.1.3/bin/mapred job
-kill job_1636950452604_0003
Hadoop job information for Stage-1: number of
mappers: 1; number of reducers: 1
10 2916.6666666666665 5000.0 1300.0
20 2175.0 3000.0 800.0
30 1566.6666666666667 2850.0 950.0

After using the group by statement, only the columns listed in the group by clause and aggregate functions may appear in the select list:

hive> select deptno,avg(sal) avg_sal,max(sal) max_sal,min(sal) min_sal,ename from emp group by deptno;
FAILED: SemanticException [Error 10025]: Line
1:65 Expression not in GROUP BY key 'ename'

4.3.2 having statement

Aggregate functions or the results calculated by aggregate functions cannot appear in the where statement.
The having statement can only be used after the group by statement, and the having statement cannot appear in a sql without a group by statement.
Case combat:
Query the average salary, maximum salary, and minimum salary of departments whose average salary is greater than 1800 in each department.

hive> select deptno,avg(sal) avg_sal,max(sal)
max_sal,min(sal) min_sal from emp group by
deptno where avg_sal>1800;
FAILED: ParseException line 1:90 missing EOF at
'where' near 'deptno'
hive> select deptno,avg(sal) avg_sal,max(sal)
max_sal,min(sal) min_sal from emp group by
deptno having avg_sal>1800;
Query ID = root_20211115151258_0b1728f2-27f6-
4b33-84aa-15a93421212b
Total jobs = 1
Launching Job 1 out of 1

4.4 Association query

4.4.1 Table aliases

Benefits: table aliases shorten the query text and make multi-table queries easier to read and write.
Case combat:

hive> select e.empno,e.ename,d.deptno,d.dname
    > from emp e,dept as d
    > where e.deptno = d.deptno;
Query ID = root_20211115153044_53c8363f-fe06-
40ec-95ab-4d85e4c0d521
Total jobs = 1
Execution completed successfully
7369 smith 20 research
7499 allen 30 sales
7521 ward 30 sales
7566 jones 20 research
7654 martin 30 sales
7698 blake 30 sales
7782 clark 10 accounting
7788 scott 20 research
7839 king 10 accounting
7844 turner 30 sales
7876 adams 20 research
7900 james 30 sales
7902 ford 20 research
7934 miller 10 accounting

4.4.2 Cartesian product

hive> select e.empno,e.ename,d.deptno,d.dname
    > from emp e,dept as d;
Warning: Map Join MAPJOIN[9][bigTable=?] in
task 'Stage-3:MAPRED' is a cross product
Query ID = root_20211115153651_bd271024-3bd7-
4355-acad-2c74042c5938
Total jobs = 1
Execution completed successfully
7900 james 10 accounting
7900 james 20 research
7900 james 30 sales
7900 james 40 operations
7902 ford 10 accounting
7902 ford 20 research
7902 ford 30 sales
7902 ford 40 operations
7934 miller 10 accounting
7934 miller 20 research
7934 miller 30 sales
7934 miller 40 operations
Time taken: 43.827 seconds, Fetched: 56 row(s)

Cartesian product: Cartesian product will appear when the connection condition is omitted or the connection condition is invalid.
Avoid Cartesian products when writing SQL. Connect each piece of data in the first table with the data in the second table in turn. There are m rows of data in the first table and n rows of data in the second table. The number of rows in the Cartesian product is m*n rows.

4.4.3 join statement

  1. Inner joins
    will be queried only when the data of the two tables to be joined matches the join conditions.
hive> select e.empno,e.ename,d.dname from
emp e join dept d on e.deptno=d.deptno;
Query ID = root_20211118102413_dda90f63-
c584-4d88-997c-318ddfd18b6b
Total jobs = 1
7369 smith research
7499 allen sales
7521 ward sales
7566 jones research
7654 martin sales
7698 blake sales
7782 clark accounting
7788 scott research
7839 king accounting
7844 turner sales
7876 adams research
7900 james sales
7902 ford research
7934 miller accounting
  2. Right outer join
    Right outer join: All records in the table to the right of the join keyword will be returned.
hive>select e.empno,e.ename,d.dname from
emp e right join dept d on
e.deptno=d.deptno;
7782 clark accounting
7839 king accounting
7934 miller accounting
7369 smith research
7566 jones research
7788 scott research
7876 adams research
7902 ford research
7499 allen sales
7521 ward sales
7654 martin sales
7698 blake sales
7844 turner sales
7900 james sales
NULL NULL operations #使用null替代
  3. Left outer join
    Left outer join: All records in the table to the left of the join keyword will be returned.
hive>select e.empno,e.ename,d.dname from
emp e left join dept d on
e.deptno=d.deptno;
  4. Full outer join
hive>select e.empno,e.ename,d.dname from
emp e full join dept d on
e.deptno=d.deptno;

The first part: rows where the two tables have matching data.
The second part: rows in the left table with no matching data in the right table (right-side columns are NULL).
The third part: rows in the right table with no matching data in the left table (left-side columns are NULL).

4.5 Sorting

4.5.1 order by

For global sorting, there is only one reduce task in mr after hql conversion. When the amount of data is relatively large, order by should be used with caution, which may cause reduce to take a long time to complete or fail to complete.
Format: order by field name [asc|desc]
The default is asc ascending order, desc means descending order
Position: the order by statement is usually placed at the end of the hql statement.

hive> select * from emp order by sal;
Query ID = root_20211118103903_c519efbd-9615-
431b-9a56-730616a097ba
Total jobs = 1
......
Hadoop job information for Stage-1: number of
mappers: 1; number of reducers: 1
......
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20
7900 james clerk 7698 1981-12-3 
950.0 NULL 30
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
148
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
Time taken: 33.587 seconds, Fetched: 14 row(s)
# 可以使用列的别名进行排序
hive> select empno,ename,sal*12 year_sal from
emp order by year_sal;
7369 smith 9600.0
7900 james 11400.0
7876 adams 13200.0
7521 ward 15000.0
7654 martin 15000.0
......
# 多列排序
hive> select empno,ename,deptno,sal from emp
order by deptno,sal;
7934 miller 10 1300.0
7782 clark 10 2450.0
7839 king 10 5000.0
7369 smith 20 800.0
7876 adams 20 1100.0
7566 jones 20 2975.0
7788 scott 20 3000.0
7902 ford 20 3000.0
7900 james 30 950.0
7654 martin 30 1250.0
7521 ward 30 1250.0
7844 turner 30 1500.0
7499 allen 30 1600
7698 blake 30 2850.0
#先按照部门编号从小到大排序,部门相同时,在按照sal从小到大排序
hive> select empno,ename,deptno,sal from emp
order by sal,deptno;
7369 smith 20 800.0
7900 james 30 950.0
7876 adams 20 1100.0
7521 ward 30 1250.0
7654 martin 30 1250.0
7934 miller 10 1300.0
7844 turner 30 1500.0
7499 allen 30 1600.0
7782 clark 10 2450.0
7698 blake 30 2850.0
7566 jones 20 2975.0
7788 scott 20 3000.0
7902 ford 20 3000.0
7839 king 10 5000.0
#先按照sal从小到大排序,sal相同时,在按照deptno从小到大排序

4.5.2 sort by

**sort by:** sorts within each reduce task. order by can be inefficient on large data sets because it forces a single reducer, and in many scenarios a global sort is not required.
Each reduce task will correspond to the result file part-r-xxxxxx, which is ordered in each result file and disordered globally.
Set the number of reduce tasks through the set command, and the validity period is until the value of this parameter is modified next time or the hive connection is closed:

# set 参数=value; 设置参数的值
hive> set mapreduce.job.reduces=3;
# set 参数; 查看reduce产生的值
hive> set mapreduce.job.reduces;
mapreduce.job.reduces=3
hive> select * from emp sort by deptno desc;
Query ID = root_20211118105647_6964a47b-fc4b-4907-8f35-ea3c9465ed59
Total jobs = 1
Launching Job 1 out of 1
......
Hadoop job information for Stage-1: number of
mappers: 1; number of reducers: 3
7844 turner salesman 7698 1981-9-8 
1500.0 0.0 30
7698 blake manager 7839 1981-5-1 
2850.0 NULL 30
7654 martin salesman 7698 1981-9-28 
1250.0 1400.0 30
7788 scott analyst 7566 1987-4-19 
3000.0 NULL 20
7839 king president NULL 1981-11-17 
5000.0 NULL 10
7782 clark manager 7839 1981-6-9 
2450.0 NULL 10
7521 ward salesman 7698 1981-2-22 
1250.0 500.0 30
7499 allen salesman 7698 1981-2-20 
1600.0 300.0 30
7900 james clerk 7698 1981-12-3 
950.0 NULL 30
7876 adams clerk 7788 1987-5-23 
1100.0 NULL 20
7566 jones manager 7839 1981-4-2 
2975.0 NULL 20
7934 miller clerk 7782 1982-1-23 
1300.0 NULL 10
7902 ford analyst 7566 1981-12-3 
3000.0 NULL 20
7369 smith clerk 7902 1980-12-17 
800.0 NULL 20

The result is not intuitive enough, download the subsequent result file to the local.

hive>insert overwrite local directory 
'/opt/sortbyresult'
select * from emp sort by deptno desc;

node4 view file:

[root@node4 ~]# cd /opt/sortbyresult/
[root@node4 sortbyresult]# pwd
/opt/sortbyresult
[root@node4 sortbyresult]# ls
000000_0 000001_0 000002_0
[root@node4 sortbyresult]# ll
总用量 12
-rw-r--r-- 1 root root 288 11月 18 11:01
000000_0
-rw-r--r-- 1 root root 282 11月 18 11:01
000001_0
-rw-r--r-- 1 root root  91 11月 18 11:01
000002_0
[root@node4 sortbyresult]# cat -A 000000_0
7844^Aturner^Asalesman^A7698^A1981-9-
8^A1500.0^A0.0^A30$
7698^Ablake^Amanager^A7839^A1981-5-
1^A2850.0^A\N^A30$
7654^Amartin^Asalesman^A7698^A1981-9-
28^A1250.0^A1400.0^A30$
7788^Ascott^Aanalyst^A7566^A1987-4-
19^A3000.0^A\N^A20$
7839^Aking^Apresident^A\N^A1981-11-
17^A5000.0^A\N^A10$
7782^Aclark^Amanager^A7839^A1981-6-
9^A2450.0^A\N^A10$
[root@node4 sortbyresult]# cat -A 000001_0
7521^Award^Asalesman^A7698^A1981-2-
22^A1250.0^A500.0^A30$
7499^Aallen^Asalesman^A7698^A1981-2-
20^A1600.0^A300.0^A30$
7900^Ajames^Aclerk^A7698^A1981-12-
3^A950.0^A\N^A30$
7876^Aadams^Aclerk^A7788^A1987-5-
23^A1100.0^A\N^A20$
7566^Ajones^Amanager^A7839^A1981-4-
2^A2975.0^A\N^A20$
7934^Amiller^Aclerk^A7782^A1982-1-
23^A1300.0^A\N^A10$
[root@node4 sortbyresult]# cat -A 000002_0
7902^Aford^Aanalyst^A7566^A1981-12-3^A3000.0^A\N^A20$
7369^Asmith^Aclerk^A7902^A1980-12-17^A800.0^A\N^A20$

4.5.3 distribute by partition

**distribute by:** corresponds to the partition (custom partition) of the MR job, and is usually used in combination with sort by. In some cases it is necessary to control which reduce task a particular row should go to for subsequent aggregation operations. Partitions have corresponding reduce tasks, and there are several reduce tasks for several partitions; otherwise, the effect of distribute by will not be seen.
Practical demonstration:
first partition by department number, and then sort in descending order of employee number

hive> set mapreduce.job.reduces=4;
hive> insert overwrite local directory
'/opt/distributebyresult'
select * from emp distribute by deptno sort by
empno desc;
Query ID = root_20211118111723_4afc790e-a7e8-
4d5c-ba7a-8d790a379ea5
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified.
Defaulting to jobconf value of: 4
......
Hadoop job information for Stage-1: number of
mappers: 1; number of reducers: 4

View the resulting file in node4:

[root@node4 distributebyresult]# pwd
/opt/distributebyresult
[root@node4 distributebyresult]# ll
总用量 8
-rw-r--r-- 1 root root 229 11月 18 11:18
000000_0
-rw-r--r-- 1 root root   0 11月 18 11:18
000001_0
-rw-r--r-- 1 root root 432 11月 18 11:18
000002_0
-rw-r--r-- 1 root root   0 11月 18 11:18
000003_0
[root@node4 distributebyresult]# cat -A
000000_0
7902^Aford^Aanalyst^A7566^A1981-12-
3^A3000.0^A\N^A20$
7876^Aadams^Aclerk^A7788^A1987-5-
23^A1100.0^A\N^A20$
7788^Ascott^Aanalyst^A7566^A1987-4-
19^A3000.0^A\N^A20$
7566^Ajones^Amanager^A7839^A1981-4-
2^A2975.0^A\N^A20$
7369^Asmith^Aclerk^A7902^A1980-12-
17^A800.0^A\N^A20$
[root@node4 distributebyresult]# cat -A
000001_0
[root@node4 distributebyresult]# cat -A
000002_0
7934^Amiller^Aclerk^A7782^A1982-1-
23^A1300.0^A\N^A10$
7900^Ajames^Aclerk^A7698^A1981-12-
3^A950.0^A\N^A30$
7844^Aturner^Asalesman^A7698^A1981-9-
8^A1500.0^A0.0^A30$
7839^Aking^Apresident^A\N^A1981-11-
17^A5000.0^A\N^A10$
7782^Aclark^Amanager^A7839^A1981-6-
9^A2450.0^A\N^A10$
7698^Ablake^Amanager^A7839^A1981-5-
1^A2850.0^A\N^A30$
7654^Amartin^Asalesman^A7698^A1981-9-
28^A1250.0^A1400.0^A30$
7521^Award^Asalesman^A7698^A1981-2-
22^A1250.0^A500.0^A30$
7499^Aallen^Asalesman^A7698^A1981-2-
20^A1600.0^A300.0^A30$
[root@node4 distributebyresult]# cat -A
000003_0

The distribute by partition rule: take the hash value of the partition field modulo the number of partitions (the total number of reduce tasks); rows with the same remainder are assigned to the same partition.
Requirement: The distribute by statement is written before the sort by statement.

4.5.4 cluster by

When the fields after distribute by and sort by are the same, cluster by can be used as a simplification; the two forms are equivalent. However, cluster by only sorts in ascending order — the sort direction cannot be specified as asc or desc.
distribute by is partition; sort by is sort

hive>select * from emp distribute by deptno sort by deptno;
#可以简化为
hive>select * from emp cluster by deptno;
hive>insert overwrite local directory
'/opt/clusterbyresult'
select * from emp cluster by deptno;

Partition by department number, then sort in ascending order by department number within each reduce task. The partition is chosen by taking the hash value of the department number modulo the number of partitions; rows with the same remainder go to the same partition.
Check the result on node4:

[root@node4 distributebyresult]# cd
/opt/clusterbyresult/
[root@node4 clusterbyresult]# ll
总用量 8
-rw-r--r-- 1 root root 229 11月 18 11:28
000000_0
-rw-r--r-- 1 root root   0 11月 18 11:28
000001_0
-rw-r--r-- 1 root root 432 11月 18 11:28
000002_0
-rw-r--r-- 1 root root   0 11月 18 11:28
000003_0
[root@node4 clusterbyresult]# cat -A 000000_0
7902^Aford^Aanalyst^A7566^A1981-12-
3^A3000.0^A\N^A20$
7788^Ascott^Aanalyst^A7566^A1987-4-
19^A3000.0^A\N^A20$
7566^Ajones^Amanager^A7839^A1981-4-
2^A2975.0^A\N^A20$
7876^Aadams^Aclerk^A7788^A1987-5-
23^A1100.0^A\N^A20$
7369^Asmith^Aclerk^A7902^A1980-12-
17^A800.0^A\N^A20$
[root@node4 clusterbyresult]# cat -A 000002_0
7934^Amiller^Aclerk^A7782^A1982-1-
23^A1300.0^A\N^A10$
7839^Aking^Apresident^A\N^A1981-11-
17^A5000.0^A\N^A10$
7782^Aclark^Amanager^A7839^A1981-6-
9^A2450.0^A\N^A10$
7698^Ablake^Amanager^A7839^A1981-5-
1^A2850.0^A\N^A30$
7654^Amartin^Asalesman^A7698^A1981-9-
28^A1250.0^A1400.0^A30$
7900^Ajames^Aclerk^A7698^A1981-12-
3^A950.0^A\N^A30$
7521^Award^Asalesman^A7698^A1981-2-
22^A1250.0^A500.0^A30$
7499^Aallen^Asalesman^A7698^A1981-2-
20^A1600.0^A300.0^A30$
7844^Aturner^Asalesman^A7698^A1981-9-
8^A1500.0^A0.0^A30$

4.6 Actual Analysis of Base Station Call Drop Rate

Requirement: Find out the top 10 base stations with the highest call drop rate
and create the original data table:

-- Raw call-detail records for the base-station drop-rate analysis.
-- Loaded from the comma-separated file cdr_summ_imei_cell_info.csv
-- (the first-line header is removed before loading).
create table jizhan(
record_time string, -- call time
imei int,           -- base station number per the field list below;
                    -- NOTE(review): the name suggests a handset IMEI -- confirm with the data source
cell string,        -- handset number per the field list below
ph_num int,
call_num int,
drop_num int,       -- seconds of dropped calls
duration int,       -- total call duration in seconds
drop_rate double,
net_type string,
erl int)
row format delimited fields terminated by ',';

field description

record_time:通话时间
imei:基站编号
cell:手机编号
drop_num:掉话的秒数
duration:通话持续总秒数

Create a result table:

-- Aggregated result table: one row per base station, holding the summed
-- dropped seconds, summed duration, and the computed drop rate.
create table jizhan_result(
imei string,      -- base station id (string here, while jizhan.imei is int)
drop_num int,     -- sum of dropped-call seconds
duration int,     -- sum of call-duration seconds
drop_rate double  -- drop_num / duration
);

First upload the data file /data/cdr_summ_imei_cell_info.csv to the /root/data directory on node4, delete the first-line header row,
and load it into the jizhan table of hive

hive> load data local inpath
'/root/data/cdr_summ_imei_cell_info.csv' into
table jizhan;
Loading data to table default.jizhan
OK
Time taken: 2.734 seconds
hive> select * from jizhan limit 10;
OK
2011-07-13 00:00:00+08 356966 29448-37062 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 352024 29448-51331 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 353736 29448-51331 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 353736 29448-51333 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 351545 29448-51333 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 353736 29448-51343 1 
0 0 8 0.0 0
2011-07-13 00:00:00+08 359681 29448-51462 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 354707 29448-51462 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 356137 29448-51470 0 
0 0 0 0.0 0
2011-07-13 00:00:00+08 352739 29448-51971 0 
0 0 0 0.0 0
hive> select count(*) from jizhan;
976305

Write the sql statement for analysis

-- Drop-rate analysis: for each base station (imei), total dropped seconds,
-- total call seconds, and drop rate = dropped / total, highest rate first.
-- NOTE(review): groups whose sum(duration) is 0 yield a NULL drop_rate
-- (division by zero returns NULL in Hive) -- confirm whether such rows
-- should be filtered out before ranking.
select imei,sum(drop_num) sdrop,sum(duration) sdura,sum(drop_num)/sum(duration) drop_rate
from jizhan
group by imei
order by drop_rate desc;

Write the analysis results to the jizhan_result table:

-- Persist the analysis into jizhan_result, using Hive's FROM-first insert
-- syntax. The four select expressions map POSITIONALLY onto the result
-- table's columns (imei, drop_num, duration, drop_rate) -- the sdrop/sdura
-- aliases are not matched by name.
from jizhan
insert into jizhan_result
select imei,sum(drop_num) sdrop,sum(duration) sdura,sum(drop_num)/sum(duration) drop_rate
group by imei
order by drop_rate desc;

Query the result table to obtain the top 10 data, that is, the top 10 base stations with the highest call drop rate:

hive> select * from jizhan_result limit 10;
OK
639876 1 734    0.0013623978201634877
356436 1 1028 9.727626459143969E-4
351760 1 1232 8.116883116883117E-4
368883 1 1448 6.906077348066298E-4
358849 1 1469 6.807351940095302E-4
358231 1 1613 6.199628022318661E-4
863738 2 3343 5.982650314089142E-4
865011 1 1864 5.36480686695279E-4
862242 1 1913 5.227391531625719E-4
350301 2 3998 5.002501250625312E-4

Guess you like

Origin blog.csdn.net/m0_63953077/article/details/130350907