Hadoop-Hive学习笔记(2)

1.Hive基本操作

#创建数据库
hive>create database name;
#创建新表
hive> create table students(id int,name string);
#创建一个新表,结构与其他一样
hive> create table new_students like students;
#创建分区表
hive> create table students(id int,name string) partitioned by(region string);
#删除分区
hive> ALTER TABLE students DROP PARTITION (region="Shanghai");
#从本地文件加载数据,load是单纯的复制/移动操作,overwrite会覆盖已有数据
hive> load data local inpath 'path1' OVERWRITE INTO TABLE students;
#加载分区表的数据
hive> load data local inpath 'path1' into table students partition(region="Shanghai");
#展示数据库
hive> show databases;
#展示表的分区
hive> show partitions students;
#展示所有表
hive> show tables;
hive> show tables '.*s';
#显示表的结构
hive> describe students;
#显示所有函数
hive> show functions;
#查看函数用法
hive> describe function name_function;
#更新表名
hive> ALTER TABLE name1 RENAME TO name2;
#添加新的一列,并注释
hive> ALTER TABLE students ADD COLUMNS(columns_new int comment "a comment");
#修改列的名字,类型,位置,注释
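hive> ALTER TABLE students CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name];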
#删除表
hive> DROP TABLE students;
#删除表中的数据,但保留表的结构定义
hive> dfs -rmr /user/hive/warehouse/students;

#将查询结果存入Hive表
hive> INSERT OVERWRITE TABLE student [PARTITION(partcol1=val1,partcol2=val2...)] select_statement from from_statement;
#将查询结果存入HDFS文件系统
hive> insert overwrite [local] DIRECTORY directory1 select_statement from from_statement;
#将结果插入不同表中,还能写入分区,hdfs和本地目录
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE DIRECTORY '/tmp/dest2.out' SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
#内连接
hive> SELECT sales.*, things.* FROM sales JOIN things ON (sales.id = things.id);
#外连接:
hive> SELECT sales.*, things.* FROM sales LEFT OUTER JOIN things ON (sales.id = things.id);
hive> SELECT sales.*, things.* FROM sales RIGHT OUTER JOIN things ON (sales.id = things.id);
hive> SELECT sales.*, things.* FROM sales FULL OUTER JOIN things ON (sales.id = things.id);
#查看hive为某个查询使用多少个MR作业
hive> EXPLAIN SELECT sales.*, things.* FROM sales JOIN things ON (sales.id = things.id);
#创建视图
hive> CREATE VIEW valid_records AS SELECT * FROM records2 WHERE temperature != 9999;
#查看视图详细信息
hive> DESCRIBE EXTENDED valid_records;
#删除视图
hive> drop view name_view;

2. Select操作详解

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
   FROM table_reference
   [WHERE where_condition]
   [GROUP BY col_list [HAVING condition]]
   [ CLUSTER BY col_list
  | [DISTRIBUTE BY col_list]]
   [SORT BY| ORDER BY col_list]
   [limit number]

a.使用ALL和DISTINCT选项区分对重复记录的处理,默认是ALL。

b.Where条件,支持AND,OR,IN,NOT IN,不支持EXISTS 和 NOT EXISTS。

c.Order by全局排序,只有一个Reduce任务。Sort by 只在本机做排序。

d.Limit 限制查询记录数

select * from t1 limit 5;

 实现Top K查询

SET mapred.reduce.tasks=1;
SELECT * FROM test SORT BY amount DESC LIMIT k;

e.UNION ALL合并多个select的查询结果

select_statement UNION ALL select_statement UNION ALL select_statement ...

f. LEFT SEMI JOIN 是 in/exists 子查询的更高效的实现。

SELECT a.key,a.value 
   FROM a
   WHERE a.key in
     (select b.key from B);

=>>
select a.key,a.value from a LEFT SEMI JOIN b on(a.key=b.key);
  

  

  

 

猜你喜欢

转载自www.cnblogs.com/luhaojie/p/9343330.html