Hive入门详解（二）

Hive的DDL操作

创建表（四种表）

内部表
创建产品表

-- Managed product table; rows are stored as comma-delimited text.
create table t_product (
    id       int,
    name     string,
    price    double,
    category string
)
row format delimited
    fields terminated by ','
stored as textfile;

导入数据(从本地)
load data local inpath '/home/hadoop/product_data' into table t_product;
导入数据(从hdfs)
load data inpath '/data/hive/test/product_data' into table t_product;
查看表数据
select * from t_product;
删除表
drop table t_product;

外部表
创建手机表

-- External phone table; its files live under the given location and are
-- read as comma-delimited text.
create external table t_phone (
    id    int,
    name  string,
    price double
)
row format delimited
    fields terminated by ','
stored as textfile
location '/hive/test/';
注：在hdfs的指定位置上创建表

导入数据
load data local inpath '/home/hadoop/phone_data' into table t_phone;

分区表

创建表(分区表)
根据月份分区
-- Order table partitioned by month; month is a partition column, not part
-- of the comma-delimited row data.
create table t_order (
    id   int,
    name string,
    cost double
)
partitioned by (month string)
row format delimited
    fields terminated by ',';

导入数据到分区6
load data local inpath '/home/hadoop/phone_data' into table t_order
partition(month='6');

查看所有订单的分区
show partitions t_order;

桶表

创建表(桶表)
-- Bucketed product table, clustered on id into 3 buckets.
-- price is declared double so the schema matches t_product, the table this
-- one is populated from.
create table t_product_bucket (
    id       int,
    name     string,
    price    double,
    category string
)
clustered by (id) into 3 buckets
row format delimited
    fields terminated by ',';

桶表中的数据，只能从其他表中用子查询进行插入
-- Turn on enforced bucketing, then fill the bucketed table from t_product.
set hive.enforce.bucketing=true;
insert into table t_product_bucket
select *
from t_product;

查询第2个桶上的数据
select * from t_product_bucket tablesample(bucket 2 out of 3 on id);

创建表（其它常用表）

子查询创建表

-- Create t_product_back from the result of a query over t_product.
create table t_product_back as
select *
from t_product;

数组

创建表(数组)
-- Table with two array columns: columns are separated by tabs, array
-- elements by commas.
create table tab_array (
    a array<int>,
    b array<string>
)
row format delimited
    fields terminated by '\t'
    collection items terminated by ',';

数据样式
1,2,3   hello,world,briup

导入数据
load data local inpath '/home/hadoop/array_data' into table tab_array;

查询数据
select a[2],b[1] from tab_array;

创建表(map)
-- Table with a map column: columns are separated by tabs, map entries by
-- commas, and each key is separated from its value by a colon.
create table tab_map (
    name string,
    info map<string,string>
)
row format delimited
    fields terminated by '\t'
    collection items terminated by ','
    map keys terminated by ':';

数据样式
zhangsan    name:zhangsan,age:18,gender:male

导入数据
load data local inpath '/home/hadoop/map_data' into table tab_map;

查询数据
select  info['name'],info['gender'] from tab_map;

struct

创建表(struct)
-- Table with a struct column: columns are separated by tabs, struct
-- members by commas.
create table tab_struct (
    name string,
    info struct<age:int,tel:string,salary:double>
)
row format delimited
    fields terminated by '\t'
    collection items terminated by ',';
 
数据样式
zhangsan    18,18945883365,22.3

导入数据
load data local inpath '/home/hadoop/struct_data' into table tab_struct;

查询数据
select info.age,info.tel from tab_struct;

修改表

增加/删除分区
-- Add one partition, then drop two partitions in a single statement.
-- Each dropped partition needs its own "partition(...)" spec.
alter table t_phone add partition(color='red');
alter table t_phone drop partition(color='blue'),partition(color='black');
重命名表
alter table t_phone rename to t_phone2;
增加/替换列
-- "add columns" appends new columns to the table; "replace columns" swaps
-- the whole column list for the one given.
alter table t_phone add columns (comment string);
alter table t_phone replace columns (name string,price double);
注：replace会替换该表的所有列

显示命令

展示表：show tables
展示数据库：show databases
展示分区：show partitions
展示内置函数：show functions

Hive的DML操作

Load

基本语法
load data [local] inpath '/home/hadoop/test_data' [overwrite] into table t_test
local关键字
如果指定了local关键字，load命令会去查找本地文件系统中的filepath，如果没有指定local关键字，则根据inpath中的uri查找文件
overwrite关键字
目标表（或者分区）中的内容会被删除，然后再将filepath指向的文件目录中的内容添加到表/分区中，相当于覆盖。

Insert

基本模式插入

-- Overwrite the color='red' partition of t_phone with the red rows of t_test.
insert overwrite table t_phone partition (color='red')
select
    name,
    price
from t_test
where color='red';

多插入模式

-- Multi-insert: the source table is named once in a leading "from" clause,
-- and each insert then selects from it with its own filter.
from t_test
insert overwrite table t_phone partition(color='red')
select name,price where color='red'
insert overwrite table t_phone partition(color='blue')
select name,price where color='blue';

自动分区模式

-- Dynamic partitioning must be enabled, and nonstrict mode is required
-- because no static partition value is supplied for color.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- The dynamic partition column (color) must be the last column in the
-- select list; its value decides which partition each row is written to.
insert overwrite table t_phone partition(color)
select name,price,color from t_test where color='red';

导出文件到本地

insert overwrite local directory '/home/hadoop/test/'
select * from t_phone;

导出数据到HDFS

insert overwrite directory 'hdfs://hadoop1:9000/hive/test'
select * from t_phone;

Select

准备范例

范例表big_data
-- Sample table for the select examples; rows are comma-delimited text.
create table big_data (
    id    int,
    point double
)
row format delimited
    fields terminated by ','
stored as textfile;

范例数据
big_data
1,80.0
4,50.0
3,60.0
8,40.0
6,85.0
2,100.0
5,80.0
7,60.0

导入数据
load data local inpath '/home/hadoop/big_data' into table big_data;

查看数据
select * from big_data;

排序
order by id asc 全局排序
ex:
select * from big_data order by id;
注：order by会对输入做全局排序，因此只有一个reducer，会导致当输入规模较大时，需要较长的计算时间。

sort by id desc 局部排序
ex:
set mapred.reduce.tasks=2;
select * from big_data sort by id;

分区
分区
distribute by 按照指定的字段或表达式对数据进行划分，输出到对应的Reduce或者文件中，且分发的算法是hash散列
ex:
set mapred.reduce.tasks=2;
-- The directory path must be quoted with plain ASCII single quotes.
insert overwrite local directory '/home/hadoop/data'
select id from big_data
distribute by id;
注：overwrite使用千万注意，不要把家目录给覆盖了
分区+排序
cluster by
除了兼具distribute by的功能，还兼具sort by的排序功能
ex:
set mapred.reduce.tasks=2;
-- The directory path must be quoted with plain ASCII single quotes.
insert overwrite local directory '/home/hadoop/data'
select id from big_data
cluster by id;
去重
group by
select point from big_data group by point;

distinct
select distinct point from big_data;
注：如果数据较多，distinct效率会更低一些，一般推荐使用group by。

虚拟列
INPUT__FILE__NAME：数据对应的HDFS文件名；
BLOCK__OFFSET__INSIDE__FILE：该行记录在文件中的偏移量；
ex:
select id,INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE from big_data;

Join

Hive支持等值连接（equality join）、内连接（inner join）、外连接（outer join）以及左/右连接（left/right join）。Hive不支持非等值的连接，因为非等值连接非常难转化到mapreduce任务。另外，Hive支持多于2个表的连接。

只支持等值join
可以join多于两个表
- 案例一：
  如果join中多个表的join的key是同一个，则join会被转化为单个mapreduce任务，如下所示，只使用了b.id作为join key
```
-- Both joins use b.id as the join key.
select
    a.name,
    b.name,
    c.name
from a
join b
    on a.id = b.id
join c
    on c.id = b.id
```
- 案例二：
  如果join中多个表的join的key不是同一个，则join会被转化为多个mapreduce任务，如下所示，b.id1作为第一次join的条件，而b.id2作为第二次join的条件。因此，会产生两个mapreduce任务。
```
-- The first join uses b.id1 and the second uses b.id2.
select
    a.name,
    b.name,
    c.name
from a
join b
    on a.id = b.id1
join c
    on c.id = b.id2
```
join对应产生的mapreduce的逻辑
reducer会缓存join序列中除了最后一个表之外的所有表的记录，最后一个表的记录则以流的方式通过reducer，并将结果序列化到文件系统。这一实现有助于在reduce端减少内存的使用量。实践中，应该把最大的那个表写在最后（否则会因为缓存而浪费大量内存）
join是不能交换位置的
无论是left outer join还是right outer join，join都是左结合的（left-associative），即按照从左到右的顺序依次执行

具体实践

准备范例

范例数据
user_name_data
1   zhangsan
2   lisi
3   wangwu

范例表user_name
-- Sample table of user names; rows are tab-delimited text.
create table user_name (
    id   int,
    name string
)
row format delimited
    fields terminated by '\t'
stored as textfile;

导入数据
load data local inpath '/home/hadoop/user_name_data' into table user_name;

查看数据
select * from user_name;

范例数据
user_age_data
1   30
2   29
4   21

范例表user_age
-- Sample table of user ages; rows are tab-delimited text.
create table user_age (
    id  int,
    age int
)
row format delimited
    fields terminated by '\t'
stored as textfile;

导入数据
load data local inpath '/home/hadoop/user_age_data' into table user_age;

查看数据
select * from user_age;

内连接

-- Inner join of user_name and user_age on id.
select
    a.id,
    a.name,
    b.age
from user_name a
inner join user_age b
    on a.id = b.id;

注：当b中找不到等值的a.id时，没有输出

左外连接

-- Left outer join: user_name is the preserved (left) side.
select
    a.id,
    a.name,
    b.age
from user_name a
left outer join user_age b
    on a.id = b.id;

注：当b中找不到等值的a.id时，记录也会输出：a.id,a.name,null

右外连接

-- Right outer join: user_age is the preserved (right) side.
select
    a.id,
    a.name,
    b.age
from user_name a
right outer join user_age b
    on a.id = b.id;

注：当a中找不到等值的b.id时，记录也会输出：null,null,b.age

全外连接

-- Full outer join of user_name and user_age on id.
select
    a.id,
    a.name,
    b.age
from user_name a
full outer join user_age b
    on a.id = b.id;

注：全外连接会保留两张表的所有记录：当b中找不到等值的a.id时，输出：a.id,a.name,null；当a中找不到等值的b.id时，输出：null,null,b.age

半连接

left semi join
以left semi join关键字前面的表为主表
返回主表的KEY也在副表中的记录
相当于in和exists
-- Left semi join: only columns of user_name are selected.
select
    a.id,
    a.name
from user_name a
left semi join user_age b
    on a.id = b.id;
--等价于：
-- The same query written with an "in" subquery on user_age.id.
select
    a.id,
    a.name
from user_name a
where a.id in (
    select id
    from user_age
);

笛卡尔积关联（CROSS JOIN）

-- Cross join of user_name and user_age; there is no join condition.
select
    a.id,
    a.name,
    b.age
from user_name a
cross join user_age b;