Oracle 分析函数详解

------------------------------------------------------------------------------------------------------------------------------------
/*
分析函数的作用总结
1.可以减少表或索引的访问次数。
2.实现复杂的行间计算,聚合等,使sql的结构清晰。
3.缺点是很多分析函数要求排序,需要对排序的字段建立索引以达到优化的效果。

章节如下
1.分析函数简单示例
2.排序分析函数(RANK,DENSE_RANK,ROW_NUMBER)
3.分析函数(Top/Bottom N,First/Last,NTILE)
4.分析函数之窗口函数
5.分析函数之报表函数
*/
---------------------------------------------------------------------------------------------------------------------------------------
--1.分析函数简单示例

--生成测试数据
DROP TABLE ORDERS_TMP;
CREATE TABLE ORDERS_TMP
(
CUST_NBR NUMBER(5) NOT NULL,
REGION_ID NUMBER(5) NOT NULL,
SALESPERSON_ID NUMBER(5) NOT NULL,
YEAR NUMBER(4) NOT NULL,
MONTH NUMBER(2) NOT NULL,
TOT_ORDERS NUMBER(7) NOT NULL,
TOT_SALES NUMBER(11,2) NOT NULL
);

INSERT INTO ORDERS_TMP VALUES(11,7,11,2001,7,2,12204);
INSERT INTO ORDERS_TMP VALUES(4,5,4,2001,10,2,37802);
INSERT INTO ORDERS_TMP VALUES(7,6,7,2001,2,3,3705);
INSERT INTO ORDERS_TMP VALUES(10,6,8,2001,1,2,21691);
INSERT INTO ORDERS_TMP VALUES(10,6,7,2001,2,3,42624);

COMMIT;

SELECT * FROM ORDERS_TMP A;

 CUST_NBR  REGION_ID SALESPERSON_ID       YEAR      MONTH TOT_ORDERS  TOT_SALES
--------- ---------- -------------- ---------- ---------- ---------- ----------
       11          7             11       2001          7          2      12204
        4          5              4       2001         10          2      37802
        7          6              7       2001          2          3       3705
       10          6              8       2001          1          2      21691
       10          6              7       2001          2          3      42624

--统计销售额大于%20以及所在区域的销售额,及各个客户销售比例
SELECT ALL_SALES.*, 100*ROUND(CUST_SALES/REGION_SALES,2)||'%' PERCENT
  FROM (SELECT A.CUST_NBR CUSTOMER,
               A.REGION_ID REGION,
               SUM(A.TOT_SALES) CUST_SALES,
               SUM(SUM(A.TOT_SALES)) OVER(PARTITION BY A.REGION_ID) REGION_SALES,
               SUM(SUM(A.TOT_SALES)) OVER() TOTAL_SALES
          FROM ORDERS_TMP A
         GROUP BY A.REGION_ID, A.CUST_NBR) ALL_SALES
 WHERE ALL_SALES.CUST_SALES > ALL_SALES.REGION_SALES * 0.2;

  CUSTOMER     REGION CUST_SALES REGION_SALES TOTAL_SALES PERCENT
---------- ---------- ---------- ------------ ----------- --------
         4          5      37802        37802      118026 100%
        10          6      64315        68020      118026 95%
        11          7      12204        12204      118026 100%
--Over函数的作用是告诉SQL引擎:按照区域对数据进行分组,然后累积每个区域下所有客户的订单总额
--PARTITION 子句,将相同的行聚合在一起,若没有partition子句,标示统计所有的行
----------------------------------------------------------------------------------------------------------------------------------------
--二.分析函数(RANK,DENSE_RANK,ROW_NUMBER)

--生成测试数据
CREATE TABLE USER_ORDER
(
REGION_ID NUMBER(2),
CUSTOMER_ID NUMBER(2),
CUSTOMER_SALES NUMBER
);


INSERT INTO USER_ORDER VALUES(7,11,1590421);
INSERT INTO USER_ORDER VALUES(6,10,1696748);
INSERT INTO USER_ORDER VALUES(6,9,1108959);
INSERT INTO USER_ORDER VALUES(5,2,1224992);
INSERT INTO USER_ORDER VALUES(9,24,1224992);
INSERT INTO USER_ORDER VALUES(9,23,1224992);

COMMIT;

SELECT * FROM USER_ORDER;

 REGION_ID CUSTOMER_ID CUSTOMER_SALES
---------- ----------- --------------
         7          11        1590421
         6          10        1696748
         6           9        1108959
         5           2        1224992
         9          24        1224992
         9          23        1224992

--不使用分析函数,查找出订单总额前三名的客户,请注意有金额相同的记录
SELECT ROWNUM, T.*
  FROM (SELECT * FROM USER_ORDER ORDER BY CUSTOMER_SALES DESC) T
 WHERE ROWNUM <= 3
 ORDER BY CUSTOMER_SALES DESC;

    ROWNUM  REGION_ID CUSTOMER_ID CUSTOMER_SALES
---------- ---------- ----------- --------------
         1          6          10        1696748
         2          7          11        1590421
         3          5           2        1224992
--很明显,若是简单的按照rownum排序的话,我们漏掉了两条

--采用分析函数
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) RANK,
       DENSE_RANK() OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) DENSE_RANK,
       ROW_NUMBER() OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) ROWNUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID

 REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK  ROWNUMBER
---------- ----------- ---------- ---------- ---------- ----------
         6          10    1696748          1          1          1
         7          11    1590421          2          2          2
         9          23    1224992          3          3          3
         5           2    1224992          3          3          4
         9          24    1224992          3          3          5
         6           9    1108959          6          4          6

--从上面的结果可以看出三种排序策略的不同
--1.RANK 函数对于相同的结果,排名是一样的,但会在最后一条相同的记录和下一条记录之间空出排名
--2.DENSE_RANK策略与RANK差不多,但不会空出排名,排名是连续的。
--3.ROW_NUMBER返回一个唯一值,若遇到相同的数据,排名依次递增

--使用分析函数对各个地区内的订单总额排序
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) RANK,
       DENSE_RANK() OVER(PARTITION BY REGION_ID ORDER BY SUM(CUSTOMER_SALES) DESC) DENSE_RANK,
       ROW_NUMBER() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) ROW_NUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;

REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK ROW_NUMBER
--------- ----------- ---------- ---------- ---------- ----------
        5           2    1224992          1          1          1
        6          10    1696748          1          1          1
        6           9    1108959          2          2          2
        7          11    1590421          1          1          1
        9          23    1224992          1          1          1
        9          24    1224992          1          1          2

--Partition by 子句在排列函数中的作用是将一个结果集划分成几个部分,这样排列函数就能够应用于这各个子集。

UPDATE USER_ORDER SET CUSTOMER_SALES = NULL WHERE CUSTOMER_ID = 23;
COMMIT;
--修改用户订单表其中一条为空值,然后再来看排序结果

SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) RANK,
       DENSE_RANK() OVER(PARTITION BY REGION_ID ORDER BY SUM(CUSTOMER_SALES) DESC) DENSE_RANK,
       ROW_NUMBER() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC) ROW_NUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;
 
REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK ROW_NUMBER
--------- ----------- ---------- ---------- ---------- ----------
        5           2    1224992          1          1          1
        6          10    1696748          1          1          1
        6           9    1108959          2          2          2
        7          11    1590421          1          1          1
        9          23                     1          1          1
        9          24    1224992          2          2          2

--我们会发现空值会排第一位,需要调整下,在分析函数的结尾加上NULLS LAST(NULLS LAST/FIRST告诉Oracle空值排名)
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) TOTAL,
       RANK() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC NULLS LAST) RANK,
       DENSE_RANK() OVER(PARTITION BY REGION_ID ORDER BY SUM(CUSTOMER_SALES) DESC NULLS LAST) DENSE_RANK,
       ROW_NUMBER() OVER(PARTITION BY REGION_ID  ORDER BY SUM(CUSTOMER_SALES) DESC NULLS LAST) ROW_NUMBER
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;

 REGION_ID CUSTOMER_ID      TOTAL       RANK DENSE_RANK ROW_NUMBER
---------- ----------- ---------- ---------- ---------- ----------
         5           2    1224992          1          1          1
         6          10    1696748          1          1          1
         6           9    1108959          2          2          2
         7          11    1590421          1          1          1
         9          24    1224992          1          1          1
         9          23                     2          2          2
-------------------------------------------------------------------------------------------------------------------------------------
--三 分析函数(Top/Bottom N,First/Last,NTILE)
--删除掉空值及重复值
DELETE USER_ORDER WHERE CUSTOMER_ID IN (23,24);
COMMIT;
--查找出订单总额最多和最少的用户,若按照签名的排名分析函数,至少需要两个查询,下面使用First/last来解决这个问题
admin@ORCL> SELECT * FROM USER_ORDER;

 REGION_ID CUSTOMER_ID CUSTOMER_SALES
---------- ----------- --------------
         7          11        1590421
         6          10        1696748
         6           9        1108959
         5           2        1224992


SELECT MIN(CUSTOMER_ID) KEEP(DENSE_RANK FIRST ORDER BY SUM(CUSTOMER_SALES) DESC) FIRST,
       MIN(CUSTOMER_ID) KEEP(DENSE_RANK LAST ORDER BY SUM(CUSTOMER_SALES) DESC) LAST
  FROM USER_ORDER
 GROUP BY CUSTOMER_ID;

    FIRST       LAST
--------- ----------
       10          9

--MIN中的函数用于保证当存在多个Frist或Last的情况,返回唯一条记录(注意)
--DENSE_RANK是必须的,RANK与ROW_NUMBER无法代替




--利用分析函数找出排名1/2的客户
SELECT REGION_ID,
       CUSTOMER_ID,
       SUM(CUSTOMER_SALES) AS CUSTOMER_SALES,
       NTILE(2) OVER(ORDER BY SUM(CUSTOMER_SALES) DESC) TIL
  FROM USER_ORDER
 GROUP BY REGION_ID, CUSTOMER_ID;
 
--NTIL为各个记录在记录集中的排名计算比例,我们看到所有的记录被分为2个等级,那么我们只要取tile=1的记录值即可。

------------------------------------------------------------------------------------------------------------------------------------
--四 分析函数之窗口函数

CREATE TABLE ORDERS(
MONTH NUMBER(2),
TOT_SALES NUMBER
);

INSERT INTO ORDERS VALUES(1,610697);
INSERT INTO ORDERS VALUES(2,428676);
INSERT INTO ORDERS VALUES(3,637031);
INSERT INTO ORDERS VALUES(4,541146);
INSERT INTO ORDERS VALUES(5,592935);
INSERT INTO ORDERS VALUES(6,501485);
INSERT INTO ORDERS VALUES(7,606914);
INSERT INTO ORDERS VALUES(8,460520);
INSERT INTO ORDERS VALUES(9,392898);
INSERT INTO ORDERS VALUES(10,510117);
INSERT INTO ORDERS VALUES(11,532889);
INSERT INTO ORDERS VALUES(12,492458);
COMMIT;

SELECT * FROM ORDERS;

     MONTH  TOT_SALES
---------- ----------
         1     610697
         2     428676
         3     637031
         4     541146
         5     592935
         6     501485
         7     606914
         8     460520
         9     392898
        10     510117
        11     532889
        12     492458

--ROWS BETWEEN ..PRECEDING AND ...FOLLOWING 意思是:在xxx之前和xxx之后
SELECT MONTH,
SUM(TOT_SALES) MONTH_SALES,
SUM(SUM(TOT_SALES)) OVER (ORDER BY MONTH ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING)TOTAL_SALES1,
SUM(SUM(TOT_SALES)) OVER (ORDER BY MONTH ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) TOTAL_SALES2,
SUM(SUM(TOT_SALES)) OVER (ORDER BY MONTH ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)TOTAL_SALES3
FROM ORDERS
GROUP BY MONTH;

     MONTH MONTH_SALES TOTAL_SALES1 TOTAL_SALES2 TOTAL_SALES3
---------- ----------- ------------ ------------ ------------
         1      610697      6307766      1039373      6307766
         2      428676      6307766      1676404      6307766
         3      637031      5697069      1606853      6307766
         4      541146      5268393      1771112      6307766
         5      592935      4631362      1635566      6307766
         6      501485      4090216      1701334      6307766
         7      606914      3497281      1568919      6307766
         8      460520      2995796      1460332      6307766
         9      392898      2388882      1363535      6307766
        10      510117      1928362      1435904      6307766
        11      532889      1535464      1535464      6307766
        12      492458      1025347      1025347      6307766

--从结果中可以看出。
--TOTAL_SALES1逻辑是统计当前行+前一条+随后的所有记录。
--TOTAL_SALES2逻辑是统计当前行+当前行的前一条+当前行的后一条。
--TOTAL_SALES3逻辑是统计所有行


--统计每月订单总额及截止到当前月的订单总额
SELECT MONTH,
       SUM(TOT_SALES) MONTH_SALES,
       SUM(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 FOLLOWING) TOTAL_SALES1,
       SUM(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) TOTAL_SALE2,
       SUM(SUM(TOT_SALES)) OVER(ORDER BY MONTH) TOTAL_SALES3,
       ROUND(AVG(SUM(TOT_SALES)) OVER(ORDER BY MONTH),2) AVG_SALES
  FROM ORDERS
 GROUP BY MONTH;

     MONTH MONTH_SALES TOTAL_SALES1 TOTAL_SALE2 TOTAL_SALES3  AVG_SALES
---------- ----------- ------------ ----------- ------------ ----------
         1      610697       610697      610697       610697     610697
         2      428676      1039373     1039373      1039373   519686.5
         3      637031      1676404     1676404      1676404  558801.33
         4      541146      2217550     2217550      2217550   554387.5
         5      592935      2810485     2810485      2810485     562097
         6      501485      3311970     3311970      3311970     551995
         7      606914      3918884     3918884      3918884  559840.57
         8      460520      4379404     4379404      4379404   547425.5
         9      392898      4772302     4772302      4772302  530255.78
        10      510117      5282419     5282419      5282419   528241.9
        11      532889      5815308     5815308      5815308  528664.36
        12      492458      6307766     6307766      6307766  525647.17
--从结果来看,前三条分析函数语句结果是相同的。
--order by 默认数据范围是当前所需的partition的第一行到当前行
--AVG_SALES用于统计滚动的月平均销量

--我们还可以用First_value和Last_value用于在窗口记录集中查找第一条和最后一条记录
SELECT MONTH,
       SUM(TOT_SALES) MONTH_SALES,
       FIRST_VALUE(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) PRE_VALUE,
       LAST_VALUE(SUM(TOT_SALES)) OVER(ORDER BY MONTH ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) NEXT_VALUE
  FROM ORDERS
 GROUP BY MONTH;
 
     MONTH MONTH_SALES  PRE_VALUE NEXT_VALUE
---------- ----------- ---------- ----------
         1      610697     610697     428676
         2      428676     610697     637031
         3      637031     428676     541146
         4      541146     637031     592935
         5      592935     541146     501485
         6      501485     592935     606914
         7      606914     501485     460520
         8      460520     606914     392898
         9      392898     460520     510117
        10      510117     392898     532889
        11      532889     510117     492458
        12      492458     532889     492458


--生成测试数据,测试窗口函数中RANGE的使用

CREATE TABLE ORDERS_TIME
(
ORDER_DATE DATE,
AMOUNT NUMBER
);

INSERT INTO ORDERS_TIME VALUES(TO_DATE('1999-01-01','YYYY-MM-DD'),100);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('2000-01-01','YYYY-MM-DD'),50);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('1999-01-01','YYYY-MM-DD'),30);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('2000-01-03','YYYY-MM-DD'),20);
INSERT INTO ORDERS_TIME VALUES(TO_DATE('2000-01-02','YYYY-MM-DD'),200);
COMMIT;

SELECT * FROM ORDERS_TIME;
ORDER_DATE         AMOUNT
-------------- ----------
      1999/1/1	100
      2000/1/1	50
      1999/1/1	30
      2000/1/3	20
      2000/1/2	200
      2000/2/2	200

SELECT ORDER_DATE,SUM(AMOUNT),
SUM(SUM(AMOUNT)) OVER(ORDER BY ORDER_DATE RANGE BETWEEN INTERVAL '1' DAY PRECEDING AND INTERVAL '1' DAY FOLLOWING) SALES
 FROM ORDERS_TIME
GROUP BY ORDER_DATE;

ORDER_DATE	SUM(AMOUNT)	SALES
----------   --------   -----
1999/1/1     	130	     130
2000/1/1	    50	     250
2000/1/2	    200      270
2000/1/3	    20	     220
2000/2/2	    200	     200
--RANGE及interval关键字用于统计当前日期的销售额+前一天+后一天


---窗口函数进阶--比较相邻

SELECT ORDER_DATE,SUM(AMOUNT),
LAG(SUM(AMOUNT),1,NULL) OVER(ORDER BY ORDER_DATE) PRE_SALES,
LEAD(SUM(AMOUNT),1,NULL) OVER(ORDER BY ORDER_DATE) NEXT_SALES
 FROM ORDERS_TIME
GROUP BY ORDER_DATE;
--,数据的偏移量默认是1,若没有下一个值,用null来代替
--也可以加partition by进行分组,然后当寻找上一个/下一个值时,只在组内寻找
----------------------------------------------------------------------------------------------------------------------------------------
--五 分析函数之报表函数


--统计每个客户的销量占区域销量的比例,采用比较常规方法

SELECT CUST_NBR,
       REGION_ID,
       SUM(TOT_SALES) CUST_SALES,
       100*ROUND(SUM(TOT_SALES)/SUM(SUM(TOT_SALES)) OVER(PARTITION BY REGION_ID),2)||'%' PERCENT
  FROM ORDERS_TMP O
 GROUP BY REGION_ID, CUST_NBR;

  CUST_NBR  REGION_ID CUST_SALES PERCENT
---------- ---------- ---------- -------------------------
         4          5      37802 100%
         7          6       3705 5%
        10          6      64315 95%
        11          7      12204 100%

--下面使用报表函数RATIO_TO_REPORT
SELECT CUST_NBR,
       REGION_ID,
       SUM(TOT_SALES) CUST_SALES,
       100*ROUND(RATIO_TO_REPORT((SUM(TOT_SALES))) OVER(PARTITION BY REGION_ID),2)||'%' PERCENT
  FROM ORDERS_TMP O
 GROUP BY REGION_ID, CUST_NBR;

  CUST_NBR  REGION_ID CUST_SALES PERCENT
---------- ---------- ---------- ---------------------
         4          5      37802 100%
         7          6       3705 5%
        10          6      64315 95%
        11          7      12204 100%
--结果是一样的,Oracle提供的Ratio_to_report函数统计每条记录在其子集中所占的比例



--本文章参考
--http://wenku.baidu.com/view/6694a7225901020207409c5d.html

猜你喜欢

转载自yangeoo.iteye.com/blog/1848142