sql 求中位数

1.问题

1.求一个页面用户曝光次数的中位数
2.求多个页面用户曝光次数的中位数

1.2数据准备

-- Fixture table for the median examples.
-- Presumably one row per (page, uid) carrying that user's exposure count on
-- the page -- TODO confirm uniqueness against the real data source.
-- NOTE(review): c holds the count as text, so any numeric ordering or
-- comparison on it needs an explicit cast.
CREATE TABLE IF NOT EXISTS test_exposure_count (
    page STRING,    -- page / element identifier
    uid  STRING,    -- user identifier
    c    STRING     -- exposure count, stored as a string
);

-- Load the sample data, replacing whatever the table currently holds
-- (INSERT OVERWRITE). Values are positional: (page, uid, c).
-- The list contains 14 rows for page 'ElementId-8GG5' followed by 13 rows
-- for page 'ElementId-G5BH'; both pages carry duplicate counts (e.g. '50'),
-- which is what makes tie handling matter in the median queries.
INSERT OVERWRITE TABLE test_exposure_count VALUES 
('ElementId-8GG5','211861079118587230','44'),
('ElementId-8GG5','211830851265539182','31'),
('ElementId-8GG5','211840774078481572','64'),
('ElementId-8GG5','211830852525499241','88'),
('ElementId-8GG5','211830855055469232','11'),
('ElementId-8GG5','211840770048501629','50'),
('ElementId-8GG5','211830856145479147','12'),
('ElementId-8GG5','211830870375599205','86'),
('ElementId-8GG5','211861081548657320','43'),
('ElementId-8GG5','211830877435499343','50'),
('ElementId-8GG5','211861081098677442','56'),
('ElementId-8GG5','211861080558657317','3'),
('ElementId-8GG5','211840762428551551','21'),
('ElementId-8GG5','211830878365629243','13'),
('ElementId-G5BH','211840901008611824','44'),
('ElementId-G5BH','211840747348401546','31'),
('ElementId-G5BH','211840772058511551','64'),
('ElementId-G5BH','211840786458531560','88'),
('ElementId-G5BH','211840796068591649','11'),
('ElementId-G5BH','211840806518401669','50'),
('ElementId-G5BH','211840815448461734','12'),
('ElementId-G5BH','211840826188621683','86'),
('ElementId-G5BH','211840840418571714','43'),
('ElementId-G5BH','211840862028491888','50'),
('ElementId-G5BH','211840880248391869','56'),
('ElementId-G5BH','211840739268691553','3'),
('ElementId-G5BH','211840903258401893','21');
-- Sanity check: number of rows loaded for each page.
-- page is selected alongside the count so every figure can be attributed to
-- its group; a bare COUNT(*) with GROUP BY returns unlabeled numbers.
SELECT  page
        ,COUNT(*) AS row_cnt
FROM    test_exposure_count
GROUP BY page
;
ElementId-8GG5	14
ElementId-G5BH	13

2.问题解决思路

2.1问题1

2.1.1 方法一 求中间值序号

先求出中间这条数据的序号:14条数据的中间位置是第7和第8条,15条数据则是第8条。这里对平均序号用ceiling向上取整(14条数据时平均序号为7.5,取8),因为一次不方便同时取两个序号;因此当数据条数为偶数时,这种方法得到的是两个中间值里靠后的那一个,而不是两者的平均值

-- Method 1 (whole table): return the row that sits at the middle ordinal.
--   ranked : every row numbered by ascending exposure count.
--   mid    : the average of all ordinals, i.e. (n + 1) / 2, rounded up to a
--            single row number. For an even row count this selects the upper
--            of the two middle rows, not their mean.
-- c is a STRING column, so it is cast before sorting; sorted as text, '3'
-- would land after '21'. uid is added as a tie-break so the selected row is
-- deterministic when several rows share the same count.
SELECT  ranked.page
        ,ranked.uid
        ,ranked.c
        ,ranked.r
        ,mid.num
FROM    (
            SELECT  page
                    ,uid
                    ,c
                    ,ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT), uid) AS r
            FROM    test_exposure_count
        ) ranked
JOIN    (
            SELECT  CEILING(AVG(r)) AS num
            FROM    (
                        SELECT  ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT), uid) AS r
                        FROM    test_exposure_count
                    ) ordinals
        ) mid
ON      ranked.r = mid.num
;
ElementId-G5BH	211840901008611824	44	14	14

2.1.2 方法二 正反排序

就比较简单取巧了,正反两个排序,正反序号相等或者刚好差一位就是中间那条数据了

-- Method 2 (whole table): number the rows from both ends and keep the rows
-- where the two ordinals meet.
--   odd row count  -> exactly one row has r1 = r2
--   even row count -> exactly two rows have |r1 - r2| = 1
-- Averaging the surviving rows therefore yields the median in both cases.
-- This only holds when the descending order is the exact mirror of the
-- ascending one, so both windows use the same keys with opposite direction,
-- including the tie-break keys (uid, page). Without a tie-break, rows with
-- equal counts may be numbered independently in each window.
-- NOTE(review): assumes (page, uid) is unique per row -- confirm.
-- c is a STRING column and is cast so the sort is numeric, not lexical.
SELECT  AVG(CAST(c AS BIGINT)) AS median_c
FROM    (
            SELECT  c
                    ,ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT) DESC, uid DESC, page DESC) AS r1
                    ,ROW_NUMBER() OVER (ORDER BY CAST(c AS BIGINT) ASC, uid ASC, page ASC) AS r2
            FROM    test_exposure_count
        ) two_way
WHERE   ABS(r1 - r2) <= 1
;
44.0

2.2问题2

2.2.1 方法一 求中间值序号

和问题一思路基本一致

-- Method 1 (per page): return, for each page, the row at the middle ordinal.
--   ranked : rows numbered within their page by ascending exposure count.
--   mid    : per page, the average ordinal (n + 1) / 2 rounded up. A page
--            with an even row count yields the upper of its two middle rows,
--            not their mean.
-- c is a STRING column, so it is cast before sorting; sorted as text, '3'
-- would land after '21'. uid is added as a tie-break so the selected row is
-- deterministic when several rows share the same count.
-- page is listed once; SELECT * over the join would emit it twice.
SELECT  ranked.page
        ,ranked.uid
        ,ranked.c
        ,ranked.r
        ,mid.num
FROM    (
            SELECT  page
                    ,uid
                    ,c
                    ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT), uid) AS r
            FROM    test_exposure_count
        ) ranked
JOIN    (
            SELECT  page
                    ,CEILING(AVG(r)) AS num
            FROM    (
                        SELECT  page
                                ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT), uid) AS r
                        FROM    test_exposure_count
                    ) ordinals
            GROUP BY page
        ) mid
ON      ranked.page = mid.page
AND     ranked.r = mid.num
;
ElementId-8GG5	44.0
ElementId-G5BH	44.0

2.2.2 方法二 正反排序

此时不能再用 ORDER BY … LIMIT 1 只取一行的写法:
LIMIT 作用于整个结果集,无法为每个页面各取一条;需要在外层按 page 分组(GROUP BY page)后再聚合

-- Method 2 (per page): number the rows of each page from both ends and keep
-- the rows where the two ordinals meet (one row for an odd count, two rows
-- for an even count), then average them per page.
-- Picking the result with ORDER BY ... LIMIT 1 cannot work here: LIMIT
-- applies to the whole result set and keeps a single row overall, so the
-- outer query aggregates with GROUP BY page instead.
-- Both windows use the same keys in opposite directions, tie-break included,
-- so the descending numbering is the exact mirror of the ascending one even
-- when counts repeat.
-- NOTE(review): assumes uid is unique within a page -- confirm.
-- c is a STRING column and is cast so the sort is numeric, not lexical.
SELECT  page
        ,AVG(CAST(c AS BIGINT)) AS median_c
FROM    (
            SELECT  page
                    ,c
                    ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT) DESC, uid DESC) AS r1
                    ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT) ASC, uid ASC) AS r2
            FROM    test_exposure_count
        ) two_way
WHERE   ABS(r1 - r2) <= 1
GROUP BY page
;

最优解

两个问题都可以使用

还是求中间值序号,用了count窗口函数

-- Median exposure count per page, using each row's ordinal and the page's
-- row count.
--   rnk : position of the row within its page, ascending by count.
--   cnt : number of rows in the page.
-- Middle rows satisfy cnt <= 2 * rnk <= cnt + 2:
--   cnt = 13 -> rnk = 7        (one middle row)
--   cnt = 14 -> rnk = 7 and 8  (two middle rows, averaged)
-- The test uses integer arithmetic only, so it does not depend on whether
-- the engine's "/" performs integer or floating-point division.
-- c is a STRING column and is cast so the sort is numeric, not lexical;
-- COUNT(*) is used so cnt always matches the rows ROW_NUMBER() numbered,
-- even if uid is NULL.
SELECT  page
        ,AVG(CAST(c AS BIGINT)) AS median_c
FROM    (
            SELECT  page
                    ,c
                    ,ROW_NUMBER() OVER (PARTITION BY page ORDER BY CAST(c AS BIGINT), uid) AS rnk
                    ,COUNT(*) OVER (PARTITION BY page) AS cnt
            FROM    test_exposure_count
        ) ranked
WHERE   rnk * 2 BETWEEN cnt AND cnt + 2
GROUP BY page
;
ElementId-8GG5	43.5
ElementId-G5BH	44.0

猜你喜欢

转载自blog.csdn.net/weixin_43283487/article/details/118577822