基础数据质量评估方法
1、统计表数据的饱和度(空值率)分析
2、通过码表对照分析阈值正确率(事实表和维度表的关联性)
3、时间饱和度(数据连续性)分析
4、主体表和行为表的关联性(暂无例子,根据具体场景设计)
统计表数据的空值率
# 简介:生成一个表的字段饱和度(空值率) SQL 脚本,可以选择手动跑或自动跑
# 作者:王振东
# 日期:2021-02-07
from odps import ODPS
# Shared MaxCompute (ODPS) entry object used by the functions in this script.
# NOTE(review): the access id, secret key, project name and endpoint look like
# placeholders - presumably they must be replaced with real values before use.
o = ODPS(
    'ak',
    'sk',
    project='project_namme',
    endpoint='http://xxxxx/api',
)
def _build_null_rate_sql(columns, table_name, partition):
    """Build the SELECT that computes the null rate of every commented column.

    Columns whose comment is None, '' or the literal 'null' are skipped,
    because the comment is used as the output column alias.
    """
    select_items = []
    for col in columns:
        comment = col.comment
        if comment is None or comment in ('', 'null'):
            continue
        name = col.name
        # A value counts as "empty" when it is NULL, one of the placeholder
        # strings, or blank after trimming.
        select_items.append(
            "sum(case when (%s is null) or (%s in ('', 'null', 'NULL', '-')) or "
            "(trim(%s) = '') then 1 else 0 end)/count(1) as `%s`,\n"
            % (name, name, name, comment)
        )
    return "select \n%scount(1) as total_cnt \nfrom %s where %s" % (
        ''.join(select_items), table_name, partition)


def check_data_by_execute_sql(table_name, partition):
    """Run a null-rate (field saturation) check on one table and print a report.

    The function reads the table schema through the module-level ODPS object
    ``o``, executes one aggregate query and prints:

    * a Markdown table with one row per commented column and its null rate,
    * the total row count,
    * a histogram of how many columns fall into each 0.1-wide null-rate bucket.

    Args:
        table_name: Name of the table to inspect.
        partition: SQL predicate placed verbatim after ``where`` (typically a
            partition filter). It is interpolated without escaping.

    Returns:
        None. All results are printed to stdout.
    """
    table = o.get_table(table_name)
    sql_str = _build_null_rate_sql(table.schema.columns, table_name, partition)

    # No trailing newline here: a blank line after the delimiter row would
    # terminate the Markdown table before the first data row.
    print('|字段名|空值率|\n|----|----|')

    # Buckets '0.0' .. '1.0'; a ratio is assigned by rounding to one decimal.
    statistic = {'%.1f' % (step / 10.0): 0 for step in range(11)}

    with o.execute_sql(sql_str).open_reader() as reader:
        frame = reader.to_result_frame()
        names = frame.names
        values = frame.values[0]
        # The last column is total_cnt, so it is excluded from the per-field rows.
        for name, ratio in zip(names[:-1], values):
            if ratio is None:
                # The ratio may come back as NULL (e.g. when the filter matches
                # no rows); report it instead of crashing in '%.2f' % None.
                print("|%s|N/A|" % name)
                continue
            print("|%s|%.2f|" % (name, ratio))
            bucket = '%.1f' % ratio
            statistic[bucket] = statistic.get(bucket, 0) + 1
        print('数据总量', values[-1])

    for bucket, count in statistic.items():
        print(bucket, count)
def main():
    """Run the null-rate check once for every configured target."""
    # Each entry is (table name, predicate placed after ``where``).
    targets = (
        ('table_name', "partition_name"),
        ('table_name', "partition_name"),
        ('table_name', "partition_name"),
    )
    for table_name, partition in targets:
        check_data_by_execute_sql(table_name, partition)


if __name__ == '__main__':
    main()
通过码表对照分析阈值正确率
# 简介:拼写 SQL 筛查码表对照情况
# 作者:王振东
# 日期:2021-01-07
"""
把某个字段的字典导入标准字典表模版,字典表结构:
create table dim_gjlcb2d0_zdbm_all (
code string,
name string)
comment '诊断代码表';
create table dim_gjlcb2d0_ssbm_all (
code string,
name string)
comment '手术代码表';
统计内容:码表对照失败的 name/code 对,以及对应的数据条数
"""
"""
var:
code_field 代码字段
name_field 名称字段
code_ch 代码字段注释
name_ch 名称字段注释
table_name 表名称
dim_table 字典表名称
partition 指定数据分区
"""
# Summary query: number of (code, name) pairs that have no match in the
# dictionary table, and how many data rows those pairs cover.
_SUMMARY_SQL_TEMPLATE = """
-- 异常统计
select '%(code_ch)s_%(name_ch)s',count(1) 无法对照的name_code对数,sum(cnt) 涉及数据条数
from (select code, name, cnt
from (select %(code_field)s code, %(name_field)s name,count(1) cnt from %(table_name)s
where %(partition)s and %(code_field)s is not null and %(name_field)s is not null and %(code_field)s !='-' and %(name_field)s != '-' group by %(code_field)s,%(name_field)s) t_1 -- 过滤掉空值
left anti join %(dim_table)s t_2
on t_1.code = t_2.code and t_1.name = t_2.name) t_3;
"""

# Detail query: the unmatched (code, name) pairs themselves with their row counts.
_DETAIL_SQL_TEMPLATE = """
-- 异常明细
select code %(code_ch)s, name %(name_ch)s, cnt 数据量
from (select %(code_field)s code, %(name_field)s name,count(1) cnt from %(table_name)s
where %(partition)s and %(code_field)s is not null and %(name_field)s is not null and %(code_field)s !='-' and %(name_field)s != '-' group by %(code_field)s,%(name_field)s) t_1 -- 过滤掉空值
left anti join %(dim_table)s t_2
on t_1.code = t_2.code and t_1.name = t_2.name;
"""


def fill_sql(var, *, with_detail=False):
    """Render the SQL that checks a code/name field pair against a dictionary table.

    Args:
        var: A 7-item sequence, in this order:
            code_field  - column holding the code,
            name_field  - column holding the name,
            code_ch     - label (comment) of the code column,
            name_ch     - label (comment) of the name column,
            table_name  - table to check,
            dim_table   - dictionary table with ``code`` and ``name`` columns,
            partition   - predicate placed right after ``where``.
            All values are interpolated into the SQL verbatim, without escaping.
        with_detail: When True, the detail query listing every unmatched
            (code, name) pair is appended after the summary query. The default
            returns the summary query only.

    Returns:
        The rendered SQL text.

    Raises:
        ValueError: If ``var`` does not contain exactly seven items.
    """
    code_field, name_field, code_ch, name_ch, table_name, dim_table, partition = var
    # Named placeholders keep the two templates in sync and avoid a long,
    # order-sensitive positional argument tuple.
    params = {
        'code_field': code_field,
        'name_field': name_field,
        'code_ch': code_ch,
        'name_ch': name_ch,
        'table_name': table_name,
        'dim_table': dim_table,
        'partition': partition,
    }
    sql = _SUMMARY_SQL_TEMPLATE % params
    if with_detail:
        sql += _DETAIL_SQL_TEMPLATE % params
    return sql
def main():
    """Print the generated check SQL for every configured field pair."""
    # One tuple per check, in the order fill_sql unpacks it:
    # (code_field, name_field, code label, name label, table, dictionary table,
    #  predicate placed after ``where``).
    # The predicate placeholder must be a well-formed condition: the previous
    # value "partition'" carried an unbalanced quote, which left the generated
    # SQL with an unterminated string literal.
    var_list = [
        ('c', 'n', '诊断编码', '诊断名称', 'xxxxxxxxx',
         'dim_gjlcb2d0_zdbm_all', "pt = 'partition_name'"),
    ]
    for v in var_list:
        print(fill_sql(v))


if __name__ == '__main__':
    main()
时间饱和度
-- NOTE: if the table contains dirty data (for example timestamps such as
-- 9999-01-01 00:00:00), add a WHERE condition that filters it out.
-- Inspect the time range covered by the data.
SELECT
    MAX(time_field),
    MIN(time_field)
FROM table_name
WHERE pt = 'partition_name';
-- Total number of calendar days spanned by the data (day granularity).
-- DATEDIFF yields the difference between the two dates, so 1 is added to count
-- both the first and the last day. Without it the day-level saturation can
-- exceed 1, and a table whose data falls on a single day would give 0 days.
-- Replace '约束条件' with the lower and upper time bounds.
select
    datediff(max(time_field), min(time_field), 'dd') + 1 数据持续总天数
from table_name
where pt = 'partition_name'
  and time_field between '约束条件' and '约束条件';
-- Day-level time saturation per organization:
-- organization, number of days that have data, time saturation, total row count.
-- 数据持续总天数 is a placeholder: substitute the total number of days the data
-- spans before running this query.
select org_code 组织, count(1) 有数据的天数, count(1)/数据持续总天数 时间饱和度, sum(cnt) 总数据量
from (
    select
        org_code, to_char(time_field, 'yyyy-mm-dd') dd, count(1) cnt
    from table_name
    where pt = 'partition_name'
      and time_field between '约束条件' and '约束条件'
    -- Group by the same day-level expression that is selected; grouping by
    -- 'yyyy-mm' would not match the selected column and would bucket by month.
    group by org_code, to_char(time_field, 'yyyy-mm-dd')
) daily
where cnt > 0
group by org_code;