PostgreSQL fuzzy matching: making LIKE queries use an index

Scenario 1: lower(name) like 'pf%'
-- Test table for the index experiments: an integer primary key plus the
-- name column that every query in this article filters on.
create table users (id int primary key, name varchar(255));

-- random_string(length): build a random string of the requested length
-- from the 62 characters 0-9, A-Z and a-z.
--
-- Parameters:
--   length  number of characters to generate; 0 yields the empty string.
-- Returns:
--   text of exactly `length` characters.
-- Raises:
--   an exception when length is negative.
Create or replace function random_string(length integer) returns text as
$$
declare
  chars constant text[] := '{0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z}';
  n_chars constant integer := array_length(chars, 1);
  result text := '';
begin
  if length < 0 then
    raise exception 'Given length cannot be less than 0';
  end if;
  for i in 1..length loop
    -- random() is in [0, 1), so floor(random() * n_chars) is uniform over
    -- 0 .. n_chars-1. Using floor() instead of relying on the implicit
    -- float-to-integer rounding of the subscript keeps the first and last
    -- characters as likely as every other one.
    result := result || chars[1 + floor(random() * n_chars)::integer];
  end loop;
  return result;
end;
$$ language plpgsql;

-- Populate users with 50,000 rows: ids 1..50000, each with its own random
-- 15-character name. The explicit column list keeps the statement valid if
-- columns are later added to the table.
insert into users (id, name)
select g.id, random_string(15)
from generate_series(1, 50000) as g(id);

Plain B-tree index: LIKE does not use the index

The pg_trgm module provides functions and operators for determining the similarity of alphanumeric text based on trigram matching, as well as index operator classes that support fast searching for similar strings. A trigram is a group of three consecutive characters taken from a string. We can measure the similarity of two strings by counting the number of trigrams they share. This simple idea turns out to be very effective for measuring the similarity of words in many natural languages.

CREATE INDEX users_idx0 ON users (name);
  • 1

Exact-match query (uses the index)

explain select * from users where name='pfDNQVmhqDrF1EY';
                               QUERY PLAN
-------------------------------------------------------------------------
 Index Scan using users_idx0 on users  (cost=0.29..8.31 rows=1 width=20)
   Index Cond: ((name)::text = 'pfDNQVmhqDrF1EY'::text)
(2 rows)

Exact match on a function of the column (does not use the index)

-- NOTE: the literal contains upper-case characters, so lower(name) can never
-- equal it; this query is only meant to show which plan the planner picks.
explain select * from users where lower(name)='pfDNQVmhqDrF1EY';
                        QUERY PLAN
-----------------------------------------------------------
 Seq Scan on users  (cost=0.00..1069.00 rows=250 width=20)
   Filter: (lower((name)::text) = 'pfDNQVmhqDrF1EY'::text)
(2 rows)

Pattern matching with LIKE (does not use the index)

explain select * from users where name like 'pf%';
                       QUERY PLAN
--------------------------------------------------------
 Seq Scan on users  (cost=0.00..944.00 rows=5 width=20)
   Filter: ((name)::text ~~ 'pf%'::text)
explain select * from users where name like 'pf_';
                       QUERY PLAN
--------------------------------------------------------
 Seq Scan on users  (cost=0.00..944.00 rows=5 width=20)
   Filter: ((name)::text ~~ 'pf_'::text)

B-tree expression index on lower(name): the function predicate uses the index

drop index users_idx0;
CREATE INDEX users_dex1 ON users (lower(name));

Exact match on a function of the column (uses the index)

-- NOTE: the literal contains upper-case characters, so lower(name) can never
-- equal it; this query is only meant to show which plan the planner picks.
explain select * from users where lower(name)='pfDNQVmhqDrF1EY';
                                QUERY PLAN
---------------------------------------------------------------------------
 Bitmap Heap Scan on users  (cost=6.23..324.34 rows=250 width=20)
   Recheck Cond: (lower((name)::text) = 'pfDNQVmhqDrF1EY'::text)
   ->  Bitmap Index Scan on users_dex1  (cost=0.00..6.17 rows=250 width=0)
         Index Cond: (lower((name)::text) = 'pfDNQVmhqDrF1EY'::text)
(4 rows)

Pattern matching with LIKE (still does not use the index)

explain select * from users where lower(name) like 'pf%';
                        QUERY PLAN
-----------------------------------------------------------
 Seq Scan on users  (cost=0.00..1069.00 rows=250 width=20)
   Filter: (lower((name)::text) ~~ 'pf%'::text)
(2 rows)

B-tree index declared with an operator class: LIKE uses the index

When defining an index, you can specify an operator class for each column of the index:
CREATE INDEX name ON table (column opclass [sort options] [, …]);
The operator class identifies the operators that the index can use for that column.

CREATE INDEX users_dex2 ON users (lower(name) varchar_pattern_ops);

Pattern matching with LIKE (uses the index)

explain select * from users where lower(name) like 'pf%';
                                              QUERY PLAN
------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on users  (cost=4.82..144.00 rows=5 width=20)
   Filter: (lower((name)::text) ~~ 'pf%'::text)
   ->  Bitmap Index Scan on users_dex2  (cost=0.00..4.82 rows=53 width=0)
         Index Cond: ((lower((name)::text) ~>=~ 'pf'::text) AND (lower((name)::text) ~<~ 'pg'::text))
(4 rows)

Scenario 2: name like '%pf%'

-- random_string(length): build a random string of the requested length
-- from the 62 characters 0-9, A-Z and a-z.
--
-- Parameters:
--   length  number of characters to generate; 0 yields the empty string.
-- Returns:
--   text of exactly `length` characters.
-- Raises:
--   an exception when length is negative.
Create or replace function random_string(length integer) returns text as
$$
declare
  chars constant text[] := '{0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z}';
  n_chars constant integer := array_length(chars, 1);
  result text := '';
begin
  if length < 0 then
    raise exception 'Given length cannot be less than 0';
  end if;
  for i in 1..length loop
    -- random() is in [0, 1), so floor(random() * n_chars) is uniform over
    -- 0 .. n_chars-1. Using floor() instead of relying on the implicit
    -- float-to-integer rounding of the subscript keeps the first and last
    -- characters as likely as every other one.
    result := result || chars[1 + floor(random() * n_chars)::integer];
  end loop;
  return result;
end;
$$ language plpgsql;

-- Test table for scenario 2: an integer primary key plus the name column.
-- NOTE(review): an earlier statement in this article creates a table with the
-- same name; if both scenarios are run in one database this statement will
-- fail unless the earlier table is dropped first — confirm intended setup.
create table users (id int primary key, name varchar(255));

-- Populate users with 50,000 rows: ids 1..50000, each with its own random
-- 15-character name. The explicit column list keeps the statement valid if
-- columns are later added to the table.
insert into users (id, name)
select g.id, random_string(15)
from generate_series(1, 50000) as g(id);

B-tree index with an operator class: a leading-wildcard pattern does not use the index

-- B-tree expression index on lower(name) using the pattern operator class,
-- followed by a LIKE query whose pattern starts with a wildcard.
-- (A stray trailing backslash after the semicolon was removed; psql would
-- treat it as the start of an invalid meta-command.)
CREATE INDEX idx_name ON users USING btree (lower(name) varchar_pattern_ops);
explain (analyze true,format yaml, verbose true, buffers true) select * from users where lower(name) like '%pf%';
                        QUERY PLAN
-----------------------------------------------------------
 - Plan:                                                  +
     Node Type: "Seq Scan"                                +
     Parallel Aware: false                                +
     Relation Name: "users"                               +
     Schema: "public"                                     +
     Alias: "users"                                       +
     Startup Cost: 0.00                                   +
     Total Cost: 1069.00                                  +
     Plan Rows: 5                                         +
     Plan Width: 20                                       +
     Actual Startup Time: 0.320                           +
     Actual Total Time: 86.841                            +
     Actual Rows: 710                                     +
     Actual Loops: 1                                      +
     Output:                                              +
       - "id"                                             +
       - "name"                                           +
     Filter: "(lower((users.name)::text) ~~ '%pf%'::text)"+
     Rows Removed by Filter: 49290                        +
     Shared Hit Blocks: 319                               +
     Shared Read Blocks: 0                                +
     Shared Dirtied Blocks: 0                             +
     Shared Written Blocks: 0                             +
     Local Hit Blocks: 0                                  +
     Local Read Blocks: 0                                 +
     Local Dirtied Blocks: 0                              +
     Local Written Blocks: 0                              +
     Temp Read Blocks: 0                                  +
     Temp Written Blocks: 0                               +
   Planning Time: 0.188                                   +
   Triggers:                                              +
   Execution Time: 86.975

GiST index with the pg_trgm operator class (gist_trgm_ops): the query can use the index

-- Install the trigram extension; IF NOT EXISTS makes the statement safe to
-- re-run in a database where pg_trgm is already installed.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- GiST trigram index on name, then the leading-wildcard LIKE query.
-- In the plan shown below the bitmap index scan returns all 50000 rows and
-- the recheck removes 49807 of them, so the index is used but does not
-- narrow the search for this pattern.
-- NOTE(review): presumably because 'pf' is shorter than one trigram (three
-- characters) — confirm against the pg_trgm documentation.
CREATE INDEX idx_users_name_trgm_gist ON users USING gist (name gist_trgm_ops);
explain (analyze true, verbose true, buffers true) select * from users where name like '%pf%';
                                                                QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on public.users  (cost=32.19..371.08 rows=505 width=20) (actual time=19.314..53.132 rows=193 loops=1)
   Output: id, name
   Recheck Cond: ((users.name)::text ~~ '%pf%'::text)
   Rows Removed by Index Recheck: 49807
   Heap Blocks: exact=319
   Buffers: shared hit=972
   ->  Bitmap Index Scan on idx_users_name_trgm_gist  (cost=0.00..32.06 rows=505 width=0) (actual time=19.175..19.175 rows=50000 loops=1)
         Index Cond: ((users.name)::text ~~ '%pf%'::text)
         Buffers: shared hit=653
 Planning time: 0.188 ms
 Execution time: 53.231 ms
(11 rows)

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325545466&siteId=291194637