前言
场景:从事算法工作,需要了解业务数据和检验算法结果,此时就要写SQL(多是一些简单统计和抽样)
编程语言:Oracle、Python
统计
非空占比
SELECT
t1.f1 amount,
t1.f1/t2.f1 proportion
FROM
(SELECT COUNT(*)f1 FROM {table} WHERE {field} IS NOT NULL)t1,
(SELECT COUNT(*)f1 FROM {table})t2
;
单字段各项占比
SELECT
t1.f2 fullname,
t1.f1 amount,
t1.f1/t2.f1 proportion
FROM
(SELECT COUNT(*)f1,{field} f2 FROM {table} GROUP BY {field})t1,
(SELECT COUNT(*)f1 FROM {table})t2
ORDER BY amount DESC
;
单字段离散化
SELECT {field},COUNT(1) FROM
(
SELECT CASE
WHEN {field}>=0.0 AND {field}<0.2 THEN '[0.0,0.2)'
WHEN {field}>=0.0 AND {field}<0.4 THEN '[0.2,0.4)'
WHEN {field}>=0.4 AND {field}<0.6 THEN '[0.4,0.6)'
WHEN {field}>=0.6 AND {field}<0.8 THEN '[0.6,0.8)'
WHEN {field}>=0.8 AND {field}<1.0 THEN '[0.8,1.0)'
END {field}
FROM {table}
)
GROUP BY {field}
ORDER BY {field}
限量取数
按序
-- 前n行
SELECT * FROM {table} WHERE ROWNUM<=5;
-- 第n行
SELECT * FROM (SELECT ROWNUM a,t.* FROM {table} t)WHERE a=3;
-- 第m~n行
SELECT * FROM (SELECT ROWNUM a,t.* FROM {table} t WHERE ROWNUM<=5)WHERE a>=3;
-- 尾行
SELECT * FROM (SELECT ROWNUM a,t.* FROM {table} t)WHERE a=(SELECT COUNT(*) FROM {table});
-- 尾n行
SELECT * FROM (SELECT ROWNUM a,t.* FROM {table} t)WHERE a=(SELECT COUNT(*)-3 FROM {table});
随机
按百分比
SELECT * FROM {table} SAMPLE(1) WHERE ROWNUM<=10;
按数量
SELECT * FROM
(SELECT * FROM {table} ORDER BY DBMS_RANDOM.VALUE)
WHERE ROWNUM<=10;
随机取某批ID,然后根据ID按序取数
SELECT {fields} FROM
(SELECT * FROM {table} WHERE {field_in} IN
(SELECT {field_in} FROM {table} SAMPLE(1) WHERE ROWNUM<={rownum})
ORDER BY {order_by})
文本
字符串匹配
SELECT {field} FROM {table} WHERE INSTR({field},'{keyword}')>0;
正则表达式
SELECT {field} FROM {table} WHERE REGEXP_INSTR({field},'{pattern}')>0;
python生成sql
按数量随机采样
sql = '''
SELECT
t1.f2 fullname,
t1.f1 amount,
t1.f1/t2.f1 proportion
FROM
(SELECT COUNT(*)f1,{field} f2 FROM {table} GROUP BY {field})t1,
(SELECT COUNT(*)f1 FROM {table})t2
ORDER BY amount DESC
'''.format(table='table_name', field='field_name')
随机取某批ID,然后根据ID按序取数
sql = '''
SELECT {fields} FROM
(SELECT * FROM {table} WHERE {field_in} IN
(SELECT {field_in} FROM {table} SAMPLE(1) WHERE ROWNUM<={rownum})
ORDER BY {order_by})
'''.format(
fields='sid,entity,title,clause,word',
field_in='id',
table='industry',
order_by='tid',
rownum=20,
)
连续数据离散化
def discretize(table, field, n=10, start=0.0, end=1.0):
scope = (end - start) / n
when = '\n'.join(
"WHEN {field}>=%.2f AND {field}<%.2f THEN '[%.2f,%.2f)'".format(field=field) % (
start+i*scope, start+(i+1)*scope, start+i*scope, start+(i+1)*scope)
for i in range(n)
)
return '''
SELECT {field},COUNT(1) FROM
(
SELECT CASE
{when}
END {field}
FROM {table}
)
GROUP BY {field}
ORDER BY {field}
'''.format(table=table, field=field, when=when)