版权声明:原创文章,转载请附带原作者信息([email protected])。 https://blog.csdn.net/jackgo73/article/details/89432363
经常被问到为什么模糊匹配不走索引,验证几种情况(Pg9.4.18)
构造测试数据
create table test(id int, info text);
insert into test select generate_series(1,1000000),md5(random()::text);
create index idx on test(info collate "C");
前模糊
规则
-
索引、查询条件的collate必须一致才能使用索引。
-
collate<>"C"的话,使用默认的索引ops是不能走索引的需要制定对应类型的ops,例如
text_pattern_ops
varchar_pattern_ops
bpchar_pattern_ops
test01 走索引
create table testc(id int, info text collate "C");
insert into testc select generate_series(1,1000000),md5(random()::text);
create index idxc on testc(info collate "C");
explain (analyze,verbose,timing,costs,buffers) select * from testc where info like 'abcd%' collate "C";
test02 不走索引
create table testcz(id int, info text);
insert into testcz select generate_series(1,1000000),md5(random()::text);
create index idxcz on testcz(info collate "zh_CN");
explain (analyze,verbose,timing,costs,buffers) select * from testcz where info like 'abcd%' collate "zh_CN";
test03 使用对应的pattern ops走索引
create table testcp (id int, info text);
insert into testcp select generate_series(1,1000000),md5(random()::text);
create index indexcp on testcp(info text_pattern_ops);
explain (analyze,verbose,timing,costs,buffers) select * from testcp where info like 'abcd%';
test 04 使用对应的pattern ops走索引在zh_CN也是列时也走索引
create table testcc (id int, info text collate "zh_CN");
insert into testcc select generate_series(1,1000000),md5(random()::text);
create index idxcc on testcc(info text_pattern_ops);
explain (analyze,verbose,timing,costs,buffers) select * from testcc where info like 'abcd%';
test05 lc_collate<>C时不走索引,需要走索引的话使用对应的pattern ops
create table testcn (id int, info text collate "zh_CN");
insert into testcn select generate_series(1,1000000),md5(random()::text);
create index idxcn on testcn(info collate "zh_CN");
explain (analyze,verbose,timing,costs,buffers) select * from testcn where info like 'abcd%';
后模糊
reverse()后同前模糊
test01 collate<>“C” 使用对应的text_pattern_ops可以搞定
drop table test;
create table test(id int, info text);
create index idx on test(info collate "C");
create index idx1 on test(reverse(info) collate "C");
insert into test select generate_series(1,1000000),md5(random()::text);
explain (analyze,verbose,timing,costs,buffers) select * from test where reverse(info) like '4152%' collate "C";
create table testz(id int, info text collate "zh_CN");
create index idxz on testz(reverse(info) text_pattern_ops);
insert into testz select generate_series(1,1000000),md5(random()::text);
explain (analyze,verbose,timing,costs,buffers) select * from testz where reverse(info) like '4152%';
前后模糊
test01 走索引
drop table test;
create extension pg_trgm;
create table test(id int, info text);
create index idx_test_text on test using gin (info gin_trgm_ops);
insert into test select generate_series(1,1000000),md5(random()::text);
explain (analyze,verbose,timing,costs,buffers) select * from test where info like '%4152%';
前后模糊中文
规则
- 对于中文,lc_ctype不能为"C",只有TOKEN分割正确效果才是OK的
- 在"C"的库中使用有严重性能问题
test01
DB : test06 | pg9002311 | UTF8 | zh_CN.utf8 | zh_CN.utf8
create or replace function gen_hanzi(int) returns text as $$
declare
res text;
begin
if $1 >=1 then
select string_agg(chr(19968+(random()*20901)::int), '') into res from generate_series(1,$1);
return res;
end if;
return null;
end;
$$ language plpgsql strict;
create extension pg_trgm;
create table test001(c1 text);
insert into test001 select gen_hanzi(20) from generate_series(1,100000);
create index idx_test001_1 on test001 using gin (c1 gin_trgm_ops);
走索引:性能没问题
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '你%'; --0.083 ms
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '%恧顤'; --0.039 ms
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '%燋邢賀%'; --0.042 ms
走索引:有性能问题!
-- DB : postgres | pg9002311 | UTF8 | C | C
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '你%'; --45.050 ms
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '%恧顤'; --61.116 ms
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '%燋邢賀%'; --60.768 ms
-- "C" 建表的时候制定collate "zh_CN"
explain (analyze,verbose,timing,costs,buffers) select * from test001 where c1 like '你%'; --46.050 ms
总结
- 如果只有前模糊查询需求(字符串 like ‘xx%’),使用collate "C"的b-tree索引;当collate不为"C"时,
可以使用类型对应的pattern ops(例如text_pattern_ops)建立b-tree索引。 - 如果只有后模糊的查询需求(字符串 like ‘%abc’ 等价于 reverse(字符串) like ‘cba%’),
使用collate "C"的reverse()表达式的b-tree索引;当collate不为"C"时,
可以使用类型对应的pattern ops(例如text_pattern_ops)建立b-tree索引。 - 如果有前后模糊查询需求,并且包含中文,请使用lc_ctype <> "C"的数据库,同时使用pg_trgm插件的gin索引。
(只有TOKEN分割正确效果才是OK的。(lc_ctype设置正确,才能够正确的逐一分割多字节字符串中的文字:
Character classification (What is a letter? Its upper-case equivalent?))。) - 如果有前后模糊查询需求,并且不包含中文,请使用pg_trgm插件的gin索引。
- 如果有正则表达式查询需求,请使用pg_trgm插件的gin索引。
- 如果有输入条件少于3个字符的模糊查询需求,可以使用GIN表达式索引,通过数组包含的方式进行搜索,性能一样非常好。
参考德哥文章,原文地址
https://github.com/digoal/blog/blob/master/201704/20170426_01.md
附:
测试这样的数据库
zfba=# \l
List of databases
Name | Owner | Encoding | Collation | Ctype | Access privileges
-----------+---------+----------+-------------+-------------+---------------------
zfba | zfba | UTF8 | zh_CN.UTF-8 | zh_CN.UTF-8 |
create database test06 with encoding 'UTF-8' lc_collate='zh_CN.utf8' lc_ctype='zh_CN.utf8' template template0;
test06=# \l+
List of databases
Name | Owner | Encoding | Collation | Ctype |
-----------+-----------+----------+------------+------------+
test06 | pg9002311 | UTF8 | zh_CN.utf8 | zh_CN.utf8 |
(5 rows)
create table testcn (id int, info text);
insert into testcn select generate_series(1,1000000),md5(random()::text);
create index idxcn on testcn(info);
-- 不走索引!
explain (analyze,verbose,timing,costs,buffers) select * from testcn where info like 'abcd%';
-- 走索引!
create index idxcx on testcn(info text_pattern_ops);
explain (analyze,verbose,timing,costs,buffers) select * from testcn where info like 'abcd%';