优化大纲:https://blog.csdn.net/u010003835/article/details/105334641
测试表以及测试数据
+----------------------------------------------------+
| createtab_stmt |
+----------------------------------------------------+
| CREATE TABLE `datacube_salary_org`( |
| `company_name` string COMMENT '????', |
| `dep_name` string COMMENT '????', |
| `user_id` bigint COMMENT '??id', |
| `user_name` string COMMENT '????', |
| `salary` decimal(10,2) COMMENT '??', |
| `create_time` date COMMENT '????', |
| `update_time` date COMMENT '????') |
| PARTITIONED BY ( |
| `pt` string COMMENT '????') |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |
| WITH SERDEPROPERTIES ( |
| 'field.delim'=',', |
| 'serialization.format'=',') |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://cdh-manager:8020/user/hive/warehouse/data_warehouse_test.db/datacube_salary_org' |
| TBLPROPERTIES ( |
| 'transient_lastDdlTime'='1586310488') |
+----------------------------------------------------+
+-----------------------------------+-------------------------------+------------------------------+--------------------------------+-----------------------------+----------------------------------+----------------------------------+-------------------------+
| datacube_salary_org.company_name | datacube_salary_org.dep_name | datacube_salary_org.user_id | datacube_salary_org.user_name | datacube_salary_org.salary | datacube_salary_org.create_time | datacube_salary_org.update_time | datacube_salary_org.pt |
+-----------------------------------+-------------------------------+------------------------------+--------------------------------+-----------------------------+----------------------------------+----------------------------------+-------------------------+
| s.zh | engineer | 1 | szh | 28000.00 | 2020-04-07 | 2020-04-07 | 20200405 |
| s.zh | engineer | 2 | zyq | 26000.00 | 2020-04-03 | 2020-04-03 | 20200405 |
| s.zh | tester | 3 | gkm | 20000.00 | 2020-04-07 | 2020-04-07 | 20200405 |
| x.qx | finance | 4 | pip | 13400.00 | 2020-04-07 | 2020-04-07 | 20200405 |
| x.qx | finance | 5 | kip | 24500.00 | 2020-04-07 | 2020-04-07 | 20200405 |
| x.qx | finance | 6 | zxxc | 13000.00 | 2020-04-07 | 2020-04-07 | 20200405 |
| x.qx | kiccp | 7 | xsz | 8600.00 | 2020-04-07 | 2020-04-07 | 20200405 |
| s.zh | engineer | 1 | szh | 28000.00 | 2020-04-07 | 2020-04-07 | 20200406 |
| s.zh | engineer | 2 | zyq | 26000.00 | 2020-04-03 | 2020-04-03 | 20200406 |
| s.zh | tester | 3 | gkm | 20000.00 | 2020-04-07 | 2020-04-07 | 20200406 |
| x.qx | finance | 4 | pip | 13400.00 | 2020-04-07 | 2020-04-07 | 20200406 |
| x.qx | finance | 5 | kip | 24500.00 | 2020-04-07 | 2020-04-07 | 20200406 |
| x.qx | finance | 6 | zxxc | 13000.00 | 2020-04-07 | 2020-04-07 | 20200406 |
| x.qx | kiccp | 7 | xsz | 8600.00 | 2020-04-07 | 2020-04-07 | 20200406 |
| s.zh | enginer | 1 | szh | 28000.00 | 2020-04-07 | 2020-04-07 | 20200407 |
| s.zh | enginer | 2 | zyq | 26000.00 | 2020-04-03 | 2020-04-03 | 20200407 |
| s.zh | tester | 3 | gkm | 20000.00 | 2020-04-07 | 2020-04-07 | 20200407 |
| x.qx | finance | 4 | pip | 13400.00 | 2020-04-07 | 2020-04-07 | 20200407 |
| x.qx | finance | 5 | kip | 24500.00 | 2020-04-07 | 2020-04-07 | 20200407 |
| x.qx | finance | 6 | zxxc | 13000.00 | 2020-04-07 | 2020-04-07 | 20200407 |
| x.qx | kiccp | 7 | xsz | 8600.00 | 2020-04-07 | 2020-04-07 | 20200407 |
+-----------------------------------+-------------------------------+------------------------------+--------------------------------+-----------------------------+----------------------------------+----------------------------------+-------------------------+
场景五.排序问题
1) 合理使用 ORDER BY 与 SORT BY , 在两者之间做取舍
2) 通过使用 LIMIT 限制排序的输出
1) 合理使用 ORDER BY 与 SORT BY , 在两者之间做取舍
应该尽量避免使用 ORDER BY 。ORDER BY 会在全局进行排序
SORT BY 只是在一个 reducer 中,进行局部的刨析
2) 通过使用 LIMIT 限制排序的输出
通过 使用 LMIT N , 我们只需要每个 Reduce 控制前N 条记录即可。然后我们在总的 M个 Reduce 中再去选取其中的 前N 个数据就可以找到前 N 排名的数据了。
据说每个Reduce 取前N 条记录,使用的是冒泡排序。
这样会多出一个JOB