版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/yuanyi0501/article/details/83216331
导入数据
-- Syntax template: load a file into a Hive table.
-- LOCAL = read from the local filesystem (omit to read from HDFS);
-- OVERWRITE = replace existing table data (omit to append).
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename
[PARTITION (partcol1=val1, partcol2=val2 ...)]
eg:
-- 1. Load a local file into a Hive table (file is copied from the local FS)
load data local inpath '/opt/datas/log.txt' into table default.log_20150913;
-- 2. Load an HDFS file into a Hive table.
--    NOTE: the source file is MOVED into the table directory, so it
--    disappears from its original HDFS location after the load.
load data inpath '/user/root/hive/datas/log.txt' into table default.log_20150913;
-- 3. Load and OVERWRITE the data already in the table
load data local inpath '/opt/datas/log.txt' OVERWRITE into table default.log_20150913;
-- 4. Create a table, then populate it with INSERT ... SELECT
create table default.emp_ci like emp;
insert into table default.emp_ci select * from default.emp;
-- 5. Attach the table to existing data via LOCATION at creation time
create table IF NOT EXISTS default.bf_log_20150913(
ip string COMMENT 'remote ip address',
users string COMMENT 'users',
req_url string COMMENT 'user request url')
COMMENT 'beifeng web access logs'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
STORED AS TEXTFILE
-- LOCATION must be an absolute HDFS path; the original relative
-- 'user/root/...' would resolve against the current working directory
LOCATION '/user/root/hive/warehouse/bf_log';
-- 6. IMPORT: restore a table/partition previously written by EXPORT
IMPORT [[EXTERNAL] TABLE new_or_original_tablename [PARTITION (part_column="value"[, ...])]]
FROM 'source_path'
[LOCATION 'import_target_path']
-- eg:
import table imported_dept from 'hdfs_exports_location/department';
- [LOCAL] ———— 原始文件的存储位置:
  - 本地文件系统:加 LOCAL
  - HDFS:不加 LOCAL
- [OVERWRITE] ———— 是否覆盖表中已有的数据:
  - 覆盖:加 OVERWRITE
  - 追加:默认(不加 OVERWRITE)
- [PARTITION (partcol1=val1, partcol2=val2 ...)] ———— 加载到分区表时需指定分区,特殊性
导出数据
-- 1. Export query results to a local directory
insert overwrite local directory '/opt/datas/hive_exp_emp'
select * from default.emp;
-- 2. Export with an explicit row format.
--    NOTE: in Hive's INSERT OVERWRITE DIRECTORY syntax the ROW FORMAT
--    clause must come BEFORE the SELECT, and the field delimiter is a
--    single tab ('\t') — the original ' \t' contained a stray space.
insert overwrite local directory '/opt/datas/hive_exp_emp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp;
-- 3. Redirect hive -e output to a local file (shell command, not HQL)
bin/hive -e "select * from default.emp ;" > /opt/datas/exp_res.txt
-- 4. Export to an HDFS directory; use an absolute path
--    (the original relative 'user/root/...' was missing the leading '/')
insert overwrite directory '/user/root/hive/datas/hive_exp_emp'
select * from default.emp;
-- 5. sqoop: bulk transfer between Hive/HDFS and external relational stores
-- 6. EXPORT: write a table's data AND metadata to an HDFS path
EXPORT TABLE tablename [PARTITION (part_column="value"[, ...])]
TO 'export_target_path' [ FOR replication('eventid') ]
-- eg:
EXPORT TABLE default.emp TO '/user/root/hive/export/emp_exp';
Hive常见查询
-- General SELECT syntax supported by Hive.
-- CLUSTER BY is an alternative to specifying DISTRIBUTE BY and SORT BY
-- separately (it applies both on the same columns), hence the "|" branch.
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT [offset,] rows]
-- Full-table and single-column queries
select * from emp;
select t.empno from emp t;
-- Comparison operators: = > >= <=, plus BETWEEN ... AND ...
select * from emp where sal between 800 and 1500;
-- IS NULL / IS NOT NULL / IN / NOT IN
select * from emp where comm is null;
-- Aggregates: max / min / count / sum / avg
-- (original had a missing space: "cntfrom")
select count(*) cnt from emp;
select max(sal) max_sal from emp;
-- GROUP BY / HAVING
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;
select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job;
-- HAVING vs WHERE:
--   WHERE  filters individual rows before grouping
--   HAVING filters grouped/aggregated results after grouping
-- Departments whose average salary exceeds 2000:
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
-- JOINs between two tables
-- Equi-join: JOIN ... ON
select * from emp e join dept d on e.deptno = d.deptno;
-- Left outer join
select * from emp e left join dept d on e.deptno = d.deptno;
-- Right outer join
select * from emp e right join dept d on e.deptno = d.deptno;
-- Full outer join
select * from emp e full join dept d on e.deptno = d.deptno;
-- ORDER BY: global ordering of the whole result set; runs in a single reducer
select * from emp order by empno desc;
-- SORT BY: orders rows within each reducer only; no global ordering.
-- Set the number of reducers first (original note had a garbled property
-- name "ma[reduce.job.reduces"):
-- set mapreduce.job.reduces = <N>;
select * from emp sort by empno asc;
-- DISTRIBUTE BY: partitions rows across reducers (like MapReduce's
-- partitioner); use together with SORT BY, and it must appear BEFORE sort by
select * from emp distribute by deptno sort by empno asc;
-- CLUSTER BY: shorthand for DISTRIBUTE BY + SORT BY on the same column
select * from emp cluster by empno;