Hadoop-Hive学习笔记（2）

1.Hive基本操作

#创建数据库
hive>create database name;
#创建新表
hive> create table students(id int,name string);
#创建一个新表，结构与其他一样
hive> create table new_students like students;

#创建分区表
hive> create table students(id int,name string) partitioned by(region string);
#删除分区
hive> ALTER TABLE students DROP PARTITION (region="Shanghai");

#从本地文件加载数据,load是单纯的复制/移动操作，overwrite会覆盖已有数据
hive> load data local inpath 'path1' OVERWRITE INTO TABLE students;
#加载分区表的数据
hive> load data local inpath 'path1' into table students partition(region="Shanghai");

#展示数据库
hive> show databases;
#展示表的分区
hive> show partitions students;
#展示所有表
hive> show tables;
hive> show tables '.*s';
#显示表的结构
hive> describe students;
#显示所有函数
hive> show functions;
#查看函数用法
hive> describe function name_function;
#更新表名
hive> ALTER TABLE name1 RENAME TO name2;
#添加新的一列，并注释
hive> ALTER TABLE students ADD COLUMNS(columns_new int comment "a comment");
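#修改列的名字，类型，位置，注释
hive> ALTER TABLE students CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name];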

#删除表
hive> DROP TABLE students;
#删除表中的数据但保留表的结构定义（直接删除表在HDFS上的数据文件）
hive> dfs -rmr /user/hive/warehouse/students;

#将查询结果存入Hive表
hive> INSERT OVERWRITE TABLE students [PARTITION(partcol1=val1,partcol2=val2...)] select_statement from from_statement;
#将查询结果存入HDFS文件系统
hive> insert overwrite [local] DIRECTORY directory1 select_statement from from_statement;
#将结果插入不同表中,还能写入分区，hdfs和本地目录
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE DIRECTORY '/tmp/dest2.out' SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;

#内连接
hive> select sales.*,things.* from sales JOIN things on(sales.id=things.id);

#外连接：
hive> SELECT sales.*, things.* FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);
hive> SELECT sales.*, things.* FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);
hive> SELECT sales.*, things.* FROM sales FULL OUTER JOIN things ON (sales.id = things.id);

#查看hive为某个查询使用多少个MR作业
hive> Explain select sales.*,things.* from sales JOIN things on(sales.id=things.id);

#创建视图
hive> CREATE VIEW valid_records AS SELECT * FROM records2 WHERE temperature !=9999;
#查看视图详细信息
hive> DESCRIBE EXTENDED valid_records;
#删除视图
hive> drop view name_view;

2. Select操作详解

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
   FROM table_reference
   [WHERE where_condition]
   [GROUP BY col_list [HAVING condition]]
   [ORDER BY col_list]
   [CLUSTER BY col_list
     | [DISTRIBUTE BY col_list] [SORT BY col_list]]
   [LIMIT number]

a.使用ALL和DISTINCT选项区分对重复记录的处理，默认是ALL。

b.Where条件，支持AND，OR，IN，NOT IN；Hive 0.13 之前的版本不支持 EXISTS 和 NOT EXISTS 子查询。

c.Order by 做全局排序，只有一个Reduce任务。Sort by 只在每个Reduce任务内部做排序，不保证全局有序。

d.Limit 限制查询记录数

select * from t1 limit 5;

　实现Top K查询

SET mapred.reduce.tasks=1;
SELECT * FROM test SORT BY amount DESC LIMIT k;

e.UNION ALL合并多个select的查询结果

select_statement UNION ALL select_statement UNION ALL select_statement ...

f. LEFT SEMI JOIN 是 IN/EXISTS 子查询的更高效的实现。

SELECT a.key,a.value 
   FROM a
   WHERE a.key in
     (select b.key from B);

=>>
select a.key,a.value from a LEFT SEMI JOIN b on(a.key=b.key);

Hadoop-Hive学习笔记（2）

猜你喜欢