5 Common Ways to Create a TABLE


5.1 Creating a Heap Table

drop table if exists test_head;

create table test_head(id int primary key) distributed by (id);

 

distributed by specifies the distribution key, which Greenplum uses to decide which segment stores each row.
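As a quick sanity check, the distribution can be inspected with the system column gp_segment_id; a minimal sketch against the test_head table above:

insert into test_head select generate_series(1, 1000);
-- gp_segment_id is a system column available on every distributed table;
-- roughly equal counts per segment indicate a good distribution key
select gp_segment_id, count(*) from test_head group by 1 order by 1;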

5.2 Creating an AO Table

5.2.1 Uncompressed AO Table

drop table if exists test_ao;

create table test_ao(id int) with (appendonly=true) distributed by (id);

 

appendonly=true marks the table as append-optimized (AO) storage; the parameter accepts true or false, e.g. appendonly=true or appendonly=false.
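To confirm that a table really is append-optimized, one option is to look it up in the pg_appendonly catalog; a minimal sketch (column names as in Greenplum 4/5):

-- one row per AO table, recording its storage options
select relid::regclass, compresstype, compresslevel, columnstore
from pg_appendonly
where relid = 'test_ao'::regclass;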

5.2.2 Compressed AO Table

drop table if exists test_ao;

create table test_ao(id int) with (appendonly=true, compresslevel=5) distributed by (id);

 

compresslevel sets the compression level, from 1 to 9; the higher the value, the stronger the compression. A value of 5 is generally sufficient.
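Greenplum's built-in get_ao_compression_ratio() reports the ratio actually achieved on an AO table; a minimal sketch (the table must contain data for the ratio to be meaningful):

-- load some sample data, then ask for the achieved compression ratio
insert into test_ao select generate_series(1, 100000);
select get_ao_compression_ratio('test_ao');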

5.2.3 Column-Oriented Compressed AO Table

Column-oriented compression for AO tables differs from the row-oriented compression used above.

drop table if exists test_ao;

create table test_ao(id int) with (appendonly=true,compresslevel=5, orientation=column) distributed by (id);

 

orientation sets the storage orientation; orientation=column stores and compresses data column by column (row orientation is the default).
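Column-oriented tables additionally support per-column compression via ENCODING clauses, so different columns can use different codecs; a hedged sketch (table and column names are illustrative):

drop table if exists test_ao_col_enc;
create table test_ao_col_enc(
    id int encoding (compresstype=zlib, compresslevel=5),
    flag char(1) encoding (compresstype=rle_type)
) with (appendonly=true, orientation=column)
distributed by (id);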

5.2.3.1 Testing the orientation Parameter

5.2.3.1.1 Table creation statements

Create the table without column-oriented storage:

CREATE TABLE e_alter_recoder_out_20180810(
s_ext_nodenum varchar,
pripid varchar,
s_ext_sequence varchar,
alt_id varchar,
altitem varchar,
altbe varchar,
altaf varchar,
altdate varchar,
alttime varchar,
s_ext_timestamp varchar,
s_ext_batch varchar,
s_ext_validflag varchar,
jobid varchar,
entid varchar,
handle_type varchar
) WITH (appendonly=true, compresstype=zlib, compresslevel=5)
DISTRIBUTED BY (pripid);

 

Create the table with column-oriented storage:

 

CREATE TABLE e_alter_recoder_out_20180812(
s_ext_nodenum varchar,
pripid varchar,
s_ext_sequence varchar,
alt_id varchar,
altitem varchar,
altbe varchar,
altaf varchar,
altdate varchar,
alttime varchar,
s_ext_timestamp varchar,
s_ext_batch varchar,
s_ext_validflag varchar,
jobid varchar,
entid varchar,
handle_type varchar
) WITH (appendonly=true, compresstype=zlib, compresslevel=5, orientation=column)
DISTRIBUTED BY (pripid);

 

As shown above, each table has 15 columns.

5.2.3.1.2 Check the size of the source data

$ du -sh mv_e_alter_recoder_20180922.csv

48G  mv_e_alter_recoder_20180922.csv

5.2.3.1.3 Load the data with the COPY command

$ time psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY e_alter_recoder_out_20180810 FROM '/data/oracle-export-data/DATA20180922/mv_e_alter_recoder_20180922.csv' WITH csv DELIMITER E'\001' LOG ERRORS SEGMENT REJECT LIMIT 3000 ROWS"

Password for user gpadmin:

 

real 11m49.978s
user 1m17.379s
sys 0m43.668s

 

 

$ time psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY e_alter_recoder_out_20180812 FROM '/data/oracle-export-data/DATA20180922/mv_e_alter_recoder_20180922.csv' WITH csv DELIMITER E'\001' LOG ERRORS SEGMENT REJECT LIMIT 3000 ROWS"

Password for user gpadmin:

 

real 12m11.227s
user 1m27.575s
sys 0m50.548s

 

From these results, the load without column compression took 11m49.978s and the load with column compression took 12m11.227s, a difference of about 21 seconds.

 

5.2.3.1.4 Check the size occupied in the database

select pg_size_pretty(pg_relation_size('e_alter_recoder_out_20180810'));

-- 14 GB

 

select pg_size_pretty(pg_relation_size('e_alter_recoder_out_20180812'));

-- 11 GB

 

Column-oriented compression shaved off a full 3 GB; trading roughly 21 extra seconds of load time for 3 GB of savings is well worth it.

5.2.3.1.5 Check the row counts

select count(*) from e_alter_recoder_out_20180810;

-- 156784862

 

select count(*) from e_alter_recoder_out_20180812;

-- 156784862

5.2.3.2 Measuring the Compression Savings

5.2.3.2.1 Size of the original tables

SQL to report the size occupied by each schema:

select pg_size_pretty(cast(sum(pg_relation_size( schemaname || '.' || tablename)) as bigint)), schemaname from pg_tables t inner join pg_namespace d on t.schemaname=d.nspname group by schemaname;

 

 

The schema holds 750 tables occupying 940 GB, originally stored WITH (appendonly=true, compresstype=zlib, compresslevel=5).

5.2.3.2.2 Size after recompression

The tables were rebuilt WITH (appendonly = true, compresstype = zlib, compresslevel = 7, orientation = column, checksum = false, blocksize = 2097152); see the earlier notes for what each parameter means.

 

As shown above, this saved 320 GB of space; savings like that are candy money, and nobody is stopping me from going to buy some.

 

5.2.3.3 Comparing Compression Types

5.2.3.3.1 Size before changing compresstype

The table below was created with:

with (appendonly = true, compresstype = zlib, compresslevel = 7, orientation = column, checksum = false, blocksize = 2097152)

 

stagging=# select pg_size_pretty(pg_relation_size('xiaoxu.e_alter_recoder_out_20180810_com'));
 pg_size_pretty
----------------
 11 GB
(1 row)

Time: 33.880 ms

 

With compresstype = zlib the table occupies 11 GB. Next, change compresstype and compare sizes; the available values are zlib, quicklz, and rle_type.

 

5.2.3.3.2 quicklz compression

stagging=# create table xiaoxu.e_alter_recoder_out_20180810_com_quicklz
with (appendonly = true, compresstype = quicklz, compresslevel = 1, orientation = column, checksum = false, blocksize = 2097152)
as select * from xiaoxu.e_alter_recoder_out_20180810_com
Distributed by (pripid);
SELECT 156784862
Time: 59158.322 ms

 

 

stagging=# select pg_size_pretty(pg_relation_size('xiaoxu.e_alter_recoder_out_20180810_com_quicklz'));
 pg_size_pretty
----------------
 14 GB
(1 row)

Time: 34.632 ms

 

As shown, quicklz jumped straight to 14 GB, which is hard to accept here.

 

5.2.3.3.3 rle_type compression

stagging=# create table xiaoxu.e_alter_recoder_out_20180810_com_rle_type
with (appendonly = true, compresstype = rle_type, compresslevel = 1, orientation = column, checksum = false, blocksize = 2097152)
as select * from xiaoxu.e_alter_recoder_out_20180810_com
Distributed by (pripid);
SELECT 156784862
Time: 124042.881 ms

 

stagging=# select pg_size_pretty(pg_relation_size('xiaoxu.e_alter_recoder_out_20180810_com_rle_type'));
 pg_size_pretty
----------------
 38 GB
(1 row)

Time: 10.006 ms

 

rle_type is even more dramatic, soaring to 38 GB; I dropped the table in a hurry before the server caught fire.
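Summing up the comparison (all three tables hold the same 156,784,862 rows, with checksum = false and blocksize = 2097152):

compresstype | compresslevel | size on disk
-------------+---------------+-------------
zlib         | 7             | 11 GB
quicklz      | 1             | 14 GB
rle_type     | 1             | 38 GB

rle_type is run-length encoding, which pays off mainly on sorted, highly repetitive column values; on data like this it is a poor fit.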

5.3 Creating an HDFS External Table

5.3.1 Creating an external table

CREATE EXTERNAL TABLE e_alter_recoder_out_20180812(
s_ext_nodenum varchar,
pripid varchar,
s_ext_sequence varchar,
alt_id varchar,
altitem varchar,
altbe varchar,
altaf varchar,
altdate varchar,
alttime varchar,
s_ext_timestamp varchar,
s_ext_batch varchar,
s_ext_validflag varchar,
jobid varchar,
entid varchar,
handle_type varchar )
LOCATION ('gphdfs://nameservice1/tmp/e_alter_recoder_20180812/E_ALTER_RECODER/*')
format 'text' (delimiter E'\u0001' NULL as 'null string' ESCAPE as 'OFF' FILL MISSING FIELDS)
LOG ERRORS SEGMENT REJECT LIMIT 3000 ROWS;

 

EXTERNAL is the keyword that marks this as an external table.

nameservice1 is the HDFS HA nameservice address and must be configured beforehand; a direct host:port such as 192.168.209.105:8090 also works.

/tmp/e_alter_recoder_20180812/E_ALTER_RECODER/ is the path on HDFS.

The delimiter is E'\u0001', i.e. the invisible SOH control character.

LOG ERRORS SEGMENT REJECT puts malformed rows into Greenplum's default error log, readable via gp_read_error_log.

LIMIT 3000 ROWS is the maximum number of rejected rows tolerated; it can be raised or lowered, with a minimum of 2.

5.3.2 Inspecting rejected rows

SELECT gp_read_error_log('tableName');

 

Columns of the error log:

  Column  |           Type           | Meaning
----------+--------------------------+----------------------------
 cmdtime  | timestamp with time zone | when the load ran
 relname  | text                     | target table name
 filename | text                     | source file name
 linenum  | integer                  | line number of the bad row
 bytenum  | integer                  |
 errmsg   | text                     | error message
 rawdata  | text                     | the raw row data
 rawbytes | bytea                    | raw bytes of the row
Distributed randomly

 

For more detail, see the companion post: Common Greenplum data-loading errors and how to resolve them.
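Once the bad rows have been inspected and fixed, the error log can be emptied with Greenplum's built-in cleanup function; a minimal sketch:

-- clears the rows that gp_read_error_log('tableName') would return
SELECT gp_truncate_error_log('tableName');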

5.3.3 Creating a Writable External Table

5.3.3.1 Writable external table syntax

CREATE WRITABLE EXTERNAL TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION ('gpfdist://outputhost[:port]/filename'[, ...]) | ('gphdfs://hdfs_host[:port]/path')
FORMAT 'TEXT'
      [( [DELIMITER [AS] 'delimiter']
         [NULL [AS] 'null string']
         [ESCAPE [AS] 'escape' | 'OFF'] )]
    | 'CSV'
      [( [QUOTE [AS] 'quote']
         [DELIMITER [AS] 'delimiter']
         [NULL [AS] 'null string']
         [FORCE QUOTE column [, ...]]
         [ESCAPE [AS] 'escape'] )]
[ ENCODING 'write_encoding' ]
[ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]

5.3.3.2 Creating a writable external table

stagging=# drop external table xiaoxu.hdfs_writable_test;
DROP EXTERNAL TABLE
Time: 85.875 ms

stagging=# CREATE WRITABLE EXTERNAL TABLE xiaoxu.hdfs_writable_test (LIKE xiaoxu.e_alter_recoder_out_20180810)
LOCATION ('gphdfs://nameservice1/tmp/xiaoxu/hdfs_writable_test')
format 'text' (DELIMITER E'\u0001' NULL as 'null string' ESCAPE as 'OFF')
Distributed by (pripid);
Time: 55.628 ms

 

stagging=# insert into xiaoxu.hdfs_writable_test select * from xiaoxu.e_alter_recoder_out_20180810 limit 10;
INSERT 0 10
Time: 1582.187 ms

 

 

xiaoxu.hdfs_writable_test: the table name

/tmp/xiaoxu/hdfs_writable_test: the target path on HDFS

As shown above, adding the WRITABLE keyword is all it takes to make the external table writable; a read-back check is sketched below.
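A writable external table cannot be SELECTed from, so one way to verify the write is to point a readable external table at the same HDFS path; a sketch, with hdfs_writable_test_read being an illustrative name:

CREATE EXTERNAL TABLE xiaoxu.hdfs_writable_test_read (LIKE xiaoxu.e_alter_recoder_out_20180810)
LOCATION ('gphdfs://nameservice1/tmp/xiaoxu/hdfs_writable_test/*')
format 'text' (DELIMITER E'\u0001' NULL as 'null string');

select count(*) from xiaoxu.hdfs_writable_test_read;  -- expect the 10 rows inserted above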

5.3.3.3 Check the data on HDFS

Listing the path on HDFS shows the data has been written and delimited correctly.

5.4 Creating a Partitioned Table

5.4.1 Partitioned table concept

A partitioned table splits one large table into several physical pieces. Partitioned tables in GPDB are implemented on the same principle as in PostgreSQL, via table inheritance and constraints, but with an important difference: in PostgreSQL, with one parent table and several child tables, data must be inserted into the child tables by hand, and rows inserted into the parent simply stay in the parent; in GPDB you insert directly into the parent, and each row is routed to the matching child automatically based on the constraints. If no matching child partition exists, the insert fails.

5.4.2 创建分区表

drop table if exists t05;
create table t05(id int, createOn date) with (appendonly=true, compresslevel=5, orientation=column) distributed by (id)
Partition By Range(createOn)(
partition p2016 start ('2016/01/01'::date) inclusive end ('2016/12/31'::date) exclusive,
partition p2017 start ('2017/01/01'::date) inclusive end ('2017/12/31'::date) exclusive,
partition p2018 start ('2018/01/01'::date) inclusive end ('2018/12/31'::date) exclusive
-- default partition otherTime
);

 

Range: the keyword for range partitioning

p2016: the name of the partition

start: the beginning of the partition's range

end: the end of the partition's range

default partition otherTime: an optional default partition for rows that match no range

inclusive: the bound is included, e.g. start ('2016/01/01') inclusive includes '2016/01/01'

exclusive: the bound is excluded, e.g. end ('2016/12/31') exclusive excludes '2016/12/31'; see the boundary sketch just below
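One subtlety worth a sketch: with p2016 ending exclusively at 2016/12/31 and p2017 starting inclusively at 2017/01/01, the date 2016-12-31 is covered by no partition, so inserting it fails unless a default partition exists (error text paraphrased):

insert into t05 values (1, date '2016-12-30');  -- routed to t05_1_prt_p2016
insert into t05 values (2, date '2016-12-31');  -- ERROR: no partition covers this key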

5.4.3 Inspecting the Partitions

5.4.3.1 Check the generated table names

GP has already prefixed the child tables for you: in t05_1_prt_p2016, p2016 is the name of the partition. The children can be listed via pg_partitions, as sketched below.
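A sketch using Greenplum's pg_partitions catalog view to list t05's children and their ranges:

select partitiontablename, partitionname, partitionrangestart, partitionrangeend
from pg_partitions
where tablename = 't05';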

 

5.4.3.2 Check the table structure

stagging=# \d+ t05_1_prt_p2016
Append-Only Columnar Table "public.t05_1_prt_p2016"
  Column  |  Type   | Modifiers | Storage | Compression Type | Compression Level | Block Size | Description
----------+---------+-----------+---------+------------------+-------------------+------------+-------------
 id       | integer |           | plain   | zlib             | 5                 | 32768      |
 createon | date    |           | plain   | zlib             | 5                 | 32768      |
Checksum: t
Check constraints:
    "t05_1_prt_p2016_check" CHECK (createon >= '2016-01-01'::date AND createon < '2016-12-31'::date)
Inherits: t05
Has OIDs: no
Options: appendonly=true, compresslevel=5, orientation=column
Distributed by: (id)

 

The output above shows that a child partition is really just a table whose CHECK constraint enforces the partition's range: partition p2016 start ('2016/01/01'::date) end ('2016/12/31'::date) translates to createon >= '2016-01-01'::date AND createon < '2016-12-31'::date, and the distribution key matches the parent's.

5.4.4 Loading Data into the Partitioned Table

5.4.4.1 Data generation script

$ cat date_partition.sh

#!/bin/bash
# $1 start date (YYYYMMDD)
# $2 end date (YYYYMMDD)
# $3 output file for the generated rows

begin_date=$1
end_date=$2

while [ "$begin_date" != "$end_date" ]
do
    echo $begin_date"|"$begin_date >> $3
    # GNU date: "-1 days ago" advances begin_date by one day
    let begin_date=`date -d "-1 days ago ${begin_date}" +%Y%m%d`
done

5.4.4.2 Load the data

When loading, simply insert into the parent table:

 

$ psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY t05 FROM '/home/*******/partition_data/20161231.csv' WITH csv DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 5000 ROWS"

 

Loading rows for which no partition exists produces errors like the following:

$ psql -d stagging -h 192.168.209.11 -p 5432 -U gpadmin -c "\COPY t05 FROM '/home/******/partition_data/20191231.csv' WITH csv DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 5000 ROWS"

NOTICE: Found 364 data formatting errors (364 or more input rows). Rejected related input data

5.4.5 Checking the Data and Its Distribution

5.4.5.1 Query the parent table

select * from t05 where id BETWEEN '20160101' AND '20181231';

5.4.5.2 Query a child partition

select * from  t05_1_prt_p2016;

5.4.5.3 Check the data distribution

Distribution of a child partition across segments:

select gp_segment_id,count(1) from t05_1_prt_p2016 group by 1;

 

Distribution of the parent table across segments:

select gp_segment_id,count(1) from t05 group by 1;

5.4.6 Common Partition Operations

5.4.6.1 Add a partition

ALTER TABLE t05 ADD PARTITION p2019
START (date '2019/01/01') END (date '2019/12/31') EXCLUSIVE;

5.4.6.2 Drop a partition

alter table t05 drop partition p2016;

5.4.6.3 Truncate a partition

Equivalent to dropping and recreating the partition:

alter table t05 truncate partition p2016;

5.4.6.4 Split a partition

alter table t05 split partition p2019 at ('2019-06-30') into (partition p2019, partition p2020);

5.4.6.5 Split the default partition

alter table t05 split default partition start ('2017-03-01') end ('2017-03-31') into (partition p4, default partition);

5.4.7 Creating a Numeric-Range Partitioned Table

drop table if exists t06;
create table t06(id int, dayOfPeriod int) distributed by (id)
partition by range(dayOfPeriod)
(
start (1) end (31) every(1),
default partition none
);

 

every specifies the step width; every(1) advances the range by one, giving each dayOfPeriod value its own partition, so the column effectively serves as the partition key. See the sketch below.
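A quick confirmation sketch, assuming the default inclusive-start/exclusive-end semantics (so start (1) end (31) every(1) should yield 30 range partitions plus the default):

select count(*) from pg_partitions where tablename = 't06';
-- expected: 31 (30 range partitions plus the default partition)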

5.4.8 Creating a List-Partitioned Table on a Character Column

drop table if exists t07;
create table t07(id int, gender char(2)) with(appendonly=true, compresslevel=5) distributed by (id)
partition by list(gender)
(
partition man values('m'),
partition woman values('f'),
default partition Unkown
);
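A small routing sketch for the list partitions (child-table names follow the t05_1_prt_<name> pattern seen earlier, so here t07_1_prt_man and so on):

insert into t07 values (1, 'm'), (2, 'f'), (3, 'x');
select * from t07_1_prt_man;     -- the 'm' row
select * from t07_1_prt_unkown;  -- the 'x' row falls into the default partition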

5.5 Fast Table Copy

CREATE TABLE e_alter_recoder_out_20180814 WITH (
appendonly = TRUE,
compresstype = zlib,
compresslevel = 5,
orientation = column
) AS SELECT * FROM e_alter_recoder_out_20180812
Distributed BY (pripid);

 

 

The client reported the execution time as:

Time: 69.977 s
Rows affected: 156,784,862

So the 14 GB table was copied in 69.977 s, inserting 156,784,862 rows.
