Hive 基础数据结构和基本语法DDL详解

前面介绍了Hive的理论基础以及Hive的安装,这篇文章主要介绍Hive的基础数据结构,以及建库、建表等基本的DDL操作。这里附上Hive官网的语法手册(LanguageManual DDL)作为参考

注:本人使用的Hive版本为Hive1.2.1,hadoop版本为2.6.4,mysql版本为5.7.26(下文部分语法需要更高版本的Hive才能使用,具体见各语句后的Note)

数据结构(DATA TYPE)

data_type
  : primitive_type             //基本数据类型
  | array_type                 //Array类型
  | map_type                   //map类型(key:value)
  | struct_type                 //结构体类型
  | union_type  -- (Note: Available in Hive 0.7.0 and later)  //联合类型
 
primitive_type                 //基本数据类型跳过,只解释与mysql有差别的类型
  : TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
  | STRING                //String类型,mysql中并没有这个类型,我们定义字段时,可以不用varchar而直接像java一样使用String
  | BINARY      -- (Note: Available in Hive 0.8.0 and later)
  | TIMESTAMP   -- (Note: Available in Hive 0.8.0 and later)
  | DECIMAL     -- (Note: Available in Hive 0.11.0 and later)
  | DECIMAL(precision, scale)  -- (Note: Available in Hive 0.13.0 and later)
  | DATE        -- (Note: Available in Hive 0.12.0 and later)
  | VARCHAR     -- (Note: Available in Hive 0.12.0 and later)
  | CHAR        -- (Note: Available in Hive 0.13.0 and later)
array_type                      //数组类型,这使得我们存数组时,不需要像mysql一样对字符串进行分割来获取数组元素
  : ARRAY < data_type >
 
map_type                     //直接存的<k,v>类型
  : MAP < primitive_type, data_type >
 
struct_type                  //struct类型类似于java类,可以存储多种数据类型,
  //如果字段值既有String类型又有int类型,就可以使用struct类型,如address: struct<Country:String,City:String,Street:Int>
  : STRUCT < col_name : data_type [COMMENT col_comment], ...>
 
union_type             //与struct类似,区别是union虽然可以定义很多数据类型,但是同一时间只有其中一个能生效
   : UNIONTYPE < data_type, data_type, ... >  -- (Note: Available in Hive 0.7.0 and later)

创建数据库(Create Database)

//使用DATABASE和SCHEMA效果是一样的
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
  [COMMENT database_comment]
  [LOCATION hdfs_path]
  [WITH DBPROPERTIES (property_name=property_value, ...)];

删除数据库(Drop Database)

DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];

更改数据库(Alter Databases)

//修改配置的属性值
ALTER (DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...);   -- (Note: SCHEMA added in Hive 0.14.0)
 
 //修改owner
ALTER (DATABASE|SCHEMA) database_name SET OWNER [USER|ROLE] user_or_role;   -- (Note: Hive 0.13.0 and later; SCHEMA added in Hive 0.14.0)
  
  //修改数据库在HDFS中的路径
ALTER (DATABASE|SCHEMA) database_name SET LOCATION hdfs_path; -- (Note: Hive 2.2.1, 2.4.0 and later)

创建表(Create Table)

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name    -- (Note: TEMPORARY available in Hive 0.14.0 and later)
  [(col_name data_type [column_constraint_specification] [COMMENT col_comment], ... [constraint_specification])] //列名
  [COMMENT table_comment]   //表注释
  
  //根据列名分区
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]  
  
  //根据列名分桶
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]   
  
  //创建倾斜列,通过指定经常出现的值(严重倾斜),hive将会在元数据中记录这些倾斜的列名和值,在join时能够进行优化
  [SKEWED BY (col_name, col_name, ...)                  -- (Note: Available in Hive 0.10.0 and later)
  
   //倾斜列名称和值
     ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)    
     [STORED AS DIRECTORIES]]
  [

   [ROW FORMAT row_format]    //行格式
   [STORED AS file_format]    //表数据的存储格式
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]  -- (Note: Available in Hive 0.6.0 and later)
  ]
  [LOCATION hdfs_path]        //存储的HDFS路径
  
  //实际上就是table properties,TBLPROPERTIES允许开发者定义一些自己的键值对信息。可以对TBLPROPERTIES进行查看和修改(部分可修改)
  [TBLPROPERTIES (property_name=property_value, ...)]   -- (Note: Available in Hive 0.6.0 and later)
  
  //用于创建普通表或临时表,并物化select的结果(慎用)
  [AS select_statement];   -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)

这里举个例子

//创建一个普通表
CREATE TABLE page_view(viewTime INT, userid BIGINT,
     page_url STRING, referrer_url STRING,
     ip STRING COMMENT 'IP Address of the User')
 COMMENT 'This is the page view table'
 PARTITIONED BY(dt STRING, country STRING)
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '\001'
STORED AS SEQUENCEFILE;

//创建一个外部表
CREATE EXTERNAL TABLE page_view(viewTime INT, userid BIGINT,
     page_url STRING, referrer_url STRING,
     ip STRING COMMENT 'IP Address of the User',
     country STRING COMMENT 'country of origination')
 COMMENT 'This is the staging page view table'
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\054'
 STORED AS TEXTFILE
 LOCATION '<hdfs_location>';

//创建倾斜表(STORED AS DIRECTORIES为可选子句,指定后会为倾斜值创建子目录)
CREATE TABLE list_bucket_multiple (col1 STRING, col2 int, col3 STRING)
  SKEWED BY (col1, col2) ON (('s1',1), ('s3',3), ('s13',13), ('s78',78)) STORED AS DIRECTORIES;

//创建临时表
CREATE TEMPORARY TABLE list_bucket_multiple (col1 STRING, col2 int, col3 STRING);

删除表(Drop Table)

DROP TABLE [IF EXISTS] table_name [PURGE];     -- (Note: PURGE available in Hive 0.14.0 and later)

清空表(Truncate Table)

//从表或分区中删除所有行。但不会删除表结构
TRUNCATE [TABLE] table_name [PARTITION partition_spec];
 
partition_spec:
  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)

修改表(Alter Table)

/**
*这些命令只会修改Hive的元数据,不会重组或重新格式化现有数据。
*/

//修改表名
ALTER TABLE table_name RENAME TO new_table_name;

//修改表属性
ALTER TABLE table_name SET TBLPROPERTIES table_properties;
 
table_properties:
  : (property_name = property_value, property_name = property_value, ... )

//修改表注释
ALTER TABLE table_name SET TBLPROPERTIES ('comment' = new_comment);

//修改表存储属性
ALTER TABLE table_name CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name, ...)]
  INTO num_buckets BUCKETS;

//修改表倾斜,STORED AS DIRECTORIES选项确定倾斜表是否使用列表存储功能,该功能为倾斜值创建子目录。
ALTER TABLE table_name SKEWED BY (col_name1, col_name2, ...)
  ON ((col_name1_value, col_name2_value, ...) [, (col_name1_value, col_name2_value, ...), ...])
  [STORED AS DIRECTORIES];

//修改表不倾斜
ALTER TABLE table_name NOT SKEWED;

//修改倾斜列的存储路径
ALTER TABLE table_name SET SKEWED LOCATION (col_name1="location1" [, col_name2="location2", ...] );

//添加主键约束
ALTER TABLE table_name ADD CONSTRAINT constraint_name PRIMARY KEY (column, ...) DISABLE NOVALIDATE;
//添加外键
ALTER TABLE table_name ADD CONSTRAINT constraint_name FOREIGN KEY (column, ...) REFERENCES table_name(column, ...) DISABLE NOVALIDATE RELY;
//添加唯一约束
ALTER TABLE table_name ADD CONSTRAINT constraint_name UNIQUE (column, ...) DISABLE NOVALIDATE;
//更改一个列,新的列的值为非空
ALTER TABLE table_name CHANGE COLUMN column_name column_name data_type CONSTRAINT constraint_name NOT NULL ENABLE;
//更改一个列,并为这个列赋默认值
ALTER TABLE table_name CHANGE COLUMN column_name column_name data_type CONSTRAINT constraint_name DEFAULT default_value ENABLE;
//更改一个列,并为这个列赋检查表达式
ALTER TABLE table_name CHANGE COLUMN column_name column_name data_type CONSTRAINT constraint_name CHECK check_expression ENABLE;
 //为表移除一个约束
ALTER TABLE table_name DROP CONSTRAINT constraint_name;

修改分区(Alter Partition)

//为表添加一个新的分区
//ADD PARTITION会更改表元数据,但不会加载实际数据。如果分区位置中不存在数据,查询将不会返回任何结果。
//如果表的partition_spec已经存在,则会引发错误。
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
 
partition_spec:
  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)

//更改分区的名字
ALTER TABLE table_name PARTITION partition_spec RENAME TO PARTITION partition_spec;

//将一个分区中的数据从一个表移动到另一个具有相同架构并且还没有该分区的表。
ALTER TABLE table_name_2 EXCHANGE PARTITION (partition_spec) WITH TABLE table_name_1;

//与另一个具有相同架构的表一次交换多个分区
ALTER TABLE table_name_2 EXCHANGE PARTITION (partition_spec, partition_spec2, ...) WITH TABLE table_name_1;

//恢复分区(MSCK修复表)
//它将把HDFS上存在但metastore中不存在的所有分区添加到metastore中
MSCK [REPAIR] TABLE table_name [ADD/DROP/SYNC PARTITIONS];

删除分区(Drop Partitions)

// 删除表的分区。这将删除该分区的数据和元数据
ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec[, PARTITION partition_spec, ...]
  [IGNORE PROTECTION] [PURGE];            -- (Note: PURGE available in Hive 1.2.0 and later, IGNORE PROTECTION not available 2.0.0 and later)
  
//删除指定的分区,而不管保护状态如何
ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec IGNORE PROTECTION;

// 如果指定了PURGE,则分区数据不会进入.Trash / Current目录(这个目录类似于回收站,可以用来恢复数据),慎用,因为无法rollback
ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec PURGE;

更改表或分区(Alter Table/Partition)

//修改表或分区的文件格式
ALTER TABLE table_name [PARTITION partition_spec] SET FILEFORMAT file_format;

//修改表或分区的位置
ALTER TABLE table_name [PARTITION partition_spec] SET LOCATION "new location";

//修改表或分区的安全级别,启用NO_DROP可以防止表(或分区)被删除
ALTER TABLE table_name [PARTITION partition_spec] ENABLE|DISABLE NO_DROP [CASCADE];

//修改表或分区的安全级别,启用OFFLINE可以防止查询表或分区中的数据,但是仍然可以访问元数据。
ALTER TABLE table_name [PARTITION partition_spec] ENABLE|DISABLE OFFLINE;

//这个功能通常不会使用,但当表压缩服务被关闭了时,或者想在系统无法选择的时间压缩表,就可以考虑使用这个功能
//启动压缩
ALTER TABLE table_name [PARTITION (partition_key = 'partition_value' [, ...])]
  COMPACT 'compaction_type'[AND WAIT]
  [WITH OVERWRITE TBLPROPERTIES ("property"="value" [, ...])];

//如果表或分区包含许多小的RCFiles(一种hive文件的存储格式)或ORC文件(RCFile的优化),则这个命令会将它们合并为更大的文件。
//对于RCFile,合并发生在块级别,而对于ORC文件,合并发生在条带级别,从而避免了对数据进行解压缩和解码的开销。
ALTER TABLE table_name [PARTITION (partition_key = 'partition_value' [, ...])] CONCATENATE;

更改列(Alter Column)

/**
*column change命令将仅修改Hive的元数据,而不会修改数据。用户应确保表/分区的实际数据布局符合元数据定义。
*/

//更改列名称/类型/位置/注释
ALTER TABLE table_name [PARTITION partition_spec] CHANGE [COLUMN] col_old_name col_new_name column_type
  [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];

//添加/替换列
ALTER TABLE table_name 
  [PARTITION partition_spec]                 -- (Note: Hive 0.14.0 and later)
  ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
  [CASCADE|RESTRICT]                         -- (Note: Hive 1.1.0 and later)

创建/删除/更改视图(Create/Drop/Alter View)

//创建视图
CREATE VIEW [IF NOT EXISTS] [db_name.]view_name [(column_name [COMMENT column_comment], ...) ]
  [COMMENT view_comment]
  [TBLPROPERTIES (property_name = property_value, ...)]
  AS SELECT ...;

//删除视图
DROP VIEW [IF EXISTS] [db_name.]view_name;

//更改视图属性
ALTER VIEW [db_name.]view_name SET TBLPROPERTIES table_properties;
table_properties:
  : (property_name = property_value, property_name = property_value, ...)

//更改一个已经存在的视图的定义,原视图的定义会被新的select语句替换
ALTER VIEW [db_name.]view_name AS select_statement;

创建/删除/更改物化视图(Create/Drop/Alter Materialized View)

//创建一个物化视图
CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db_name.]materialized_view_name
  [DISABLE REWRITE]
  [COMMENT materialized_view_comment]
  [PARTITIONED ON (col_name, ...)]
  [CLUSTERED ON (col_name, ...) | DISTRIBUTED ON (col_name, ...) SORTED ON (col_name, ...)]
  [
    [ROW FORMAT row_format]
    [STORED AS file_format]
      | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (property_name=property_value, ...)]
AS SELECT ...;

//删除物化视图
DROP MATERIALIZED VIEW [db_name.]materialized_view_name;

//更改物化视图
ALTER MATERIALIZED VIEW [db_name.]materialized_view_name ENABLE|DISABLE REWRITE;

创建/删除/更改索引(Create/Drop/Alter Index)

//创建索引
CREATE INDEX index_name
  ON TABLE base_table_name (col_name, ...)
  AS index_type
  [WITH DEFERRED REBUILD]
  [IDXPROPERTIES (property_name=property_value, ...)]
  [IN TABLE index_table_name]
  [
     [ ROW FORMAT ...] STORED AS ...
     | STORED BY ...
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (...)]
  [COMMENT "index comment"];

//删除索引
DROP INDEX [IF EXISTS] index_name ON table_name;

//更改索引
ALTER INDEX index_name ON table_name [PARTITION partition_spec] REBUILD;

创建/删除Macro(Create/Drop Macro)

/**
*Macro类似于lambda函数
*/
//创建临时Macro
CREATE TEMPORARY MACRO macro_name([col_name col_type, ...]) expression;

//删除Macro
DROP TEMPORARY MACRO [IF EXISTS] macro_name;

创建/删除/重新加载函数UDF(Create/Drop/Reload Function)

//创建临时函数,此函数只存活于session期间,session关闭就消失,使用此功能,您可以注册用户自定义函数(UDF)
CREATE TEMPORARY FUNCTION function_name AS class_name;

//删除临时函数
DROP TEMPORARY FUNCTION [IF EXISTS] function_name;

//创建由class_name实现的永久函数,USING子句指定需要添加到环境中的jar,
//当Hive会话首次引用该函数时,这些资源将被添加到环境中,就像发出了ADD JAR / FILE一样。如果Hive不在本地模式下,则资源位置必须是非本地URI,例如HDFS位置。
CREATE FUNCTION [db_name.]function_name AS class_name
  [USING JAR|FILE|ARCHIVE 'file_uri' [, JAR|FILE|ARCHIVE 'file_uri'] ];

显示(Show)

//显示库
SHOW (DATABASES|SCHEMAS) [LIKE 'identifier_with_wildcards'];

//显示表
SHOW TABLES [IN database_name] ['identifier_with_wildcards'];

//显示视图
SHOW VIEWS [IN/FROM database_name] [LIKE 'pattern_with_wildcards'];

//显示物化视图
SHOW MATERIALIZED VIEWS [IN/FROM database_name] [LIKE 'pattern_with_wildcards'];

//显示分区
SHOW PARTITIONS table_name;

//显示拓展表/分区,如果使用LIKE指定,则由正则表达式过滤
SHOW TABLE EXTENDED [IN|FROM database_name] LIKE 'identifier_with_wildcards' [PARTITION(partition_spec)];

//显示创建表或视图的语句
SHOW CREATE TABLE ([db_name.]table_name|view_name);

//显示索引
SHOW [FORMATTED] (INDEX|INDEXES) ON table_with_index [(FROM|IN) db_name];

//显示列
SHOW COLUMNS (FROM|IN) table_name [(FROM|IN) db_name];

//显示函数,如果使用LIKE指定,则由正则表达式过滤
SHOW FUNCTIONS [LIKE "<pattern>"];

//显示锁
SHOW LOCKS <table_name>;
SHOW LOCKS <table_name> EXTENDED;
SHOW LOCKS <table_name> PARTITION (<partition_spec>);
SHOW LOCKS <table_name> PARTITION (<partition_spec>) EXTENDED;
SHOW LOCKS (DATABASE|SCHEMA) database_name;     -- (Note: Hive 0.13.0 and later; SCHEMA added in Hive 0.14.0)

//显示配置
SHOW CONF <configuration_name>;

//显示事务
SHOW TRANSACTIONS;

//显示当前正在压缩或计划进行压缩的所有表和分区的列表
SHOW COMPACTIONS;

描述(Describe)

//描述数据库/架构
DESCRIBE DATABASE [EXTENDED] db_name;
DESCRIBE SCHEMA [EXTENDED] db_name;     -- (Note: Hive 1.1.0 and later)

//描述表/视图/物化视图/列
DESCRIBE [EXTENDED|FORMATTED] 
  table_name[.col_name ( [.field_name] | [.'$elem$'] | [.'$key$'] | [.'$value$'] )* ];
                                        -- (Note: Hive 1.x.x and 0.x.x only. See "Hive 2.0+: New Syntax" below)

//显示列统计(如果表已分区,则计算所有分区的列统计信息)
DESCRIBE FORMATTED [db_name.]table_name column_name;                              -- (Note: Hive 0.14.0 and later)
DESCRIBE FORMATTED [db_name.]table_name column_name PARTITION (partition_spec);   -- (Note: Hive 0.14.0 to 1.x.x)               

//描述分区
DESCRIBE [EXTENDED|FORMATTED] table_name[.column_name] PARTITION partition_spec;
                                        -- (Note: Hive 1.x.x and 0.x.x only. See "Hive 2.0+: New Syntax" below)                        

举个例子

hive> show partitions part_table;
OK
d=abc
 
 
hive> DESCRIBE extended part_table partition (d='abc');
OK
i                       int                                        
d                       string                                     
                  
# 分区信息       
# col_name              data_type               comment            
                  
d                       string                                     
                  
Detailed Partition Information  Partition(values:[abc], dbName:default, tableName:part_table, createTime:1459382234, lastAccessTime:0, sd:StorageDescriptor(cols:[FieldSchema(name:i, type:int, comment:null), FieldSchema(name:d, type:string, comment:null)], location:file:/tmp/warehouse/part_table/d=abc, inputFormat:org.apache.hadoop.mapred.TextInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), parameters:{numFiles=1, COLUMN_STATS_ACCURATE=true, transient_lastDdlTime=1459382234, numRows=1, totalSize=2, rawDataSize=1})  
Time taken: 0.325 seconds, Fetched: 9 row(s)
 
 
hive> DESCRIBE formatted part_table partition (d='abc');
OK
# col_name              data_type               comment            
                  
i                       int                                        
                  
# 分区信息    
# col_name              data_type               comment            
                  
d                       string                                     
                  
# 详细分区信息             
Partition Value:        [abc]                   
Database:               default                 
Table:                  part_table              
CreateTime:             Wed Mar 30 16:57:14 PDT 2016    
LastAccessTime:         UNKNOWN                 
Protect Mode:           None                    
Location:               file:/tmp/warehouse/part_table/d=abc    
Partition Parameters:           
        COLUMN_STATS_ACCURATE   true               
        numFiles                1                  
        numRows                 1                  
        rawDataSize             1                  
        totalSize               2                  
        transient_lastDdlTime   1459382234         
                  
# 存储信息           
SerDe Library:          org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe      
InputFormat:            org.apache.hadoop.mapred.TextInputFormat        
OutputFormat:           org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat      
Compressed:             No                      
Num Buckets:            -1                      
Bucket Columns:         []                      
Sort Columns:           []                      
Storage Desc Params:            
        serialization.format    1                  
Time taken: 0.334 seconds, Fetched: 35 row(s)
发布了17 篇原创文章 · 获赞 0 · 访问量 339

猜你喜欢

转载自blog.csdn.net/qq_37163925/article/details/105674766