版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/yuanyi0501/article/details/83216331
导入数据
-- Syntax template: load a file into a Hive table.
-- LOCAL = read from the local filesystem (omit to read from HDFS);
-- OVERWRITE = replace existing table data (omit to append).
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename
[PARTITION (partcol1=val1, partcol2=val2 ...)]
eg:
-- 1. Load a local file into a Hive table (file is copied from the local FS)
load data local inpath '/opt/datas/log.txt' into table default.log_20150913;
-- 2. Load an HDFS file into a Hive table.
--    NOTE: the source file is MOVED into the table directory, so it
--    disappears from its original HDFS location after the load.
load data inpath '/user/root/hive/datas/log.txt' into table default.log_20150913;
-- 3. Load and OVERWRITE the data already in the table
load data local inpath '/opt/datas/log.txt' OVERWRITE into table default.log_20150913;
-- 4. Create a table, then populate it with INSERT ... SELECT
create table default.emp_ci like emp;
insert into table default.emp_ci select * from default.emp;
-- 5. Attach the table to existing data via LOCATION at creation time
create table IF NOT EXISTS default.bf_log_20150913(
ip string COMMENT 'remote ip address',
users string COMMENT 'users',
req_url string COMMENT 'user request url')
COMMENT 'beifeng web access logs'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
STORED AS TEXTFILE
-- LOCATION must be an absolute HDFS path; the original relative
-- 'user/root/...' would resolve against the current working directory
LOCATION '/user/root/hive/warehouse/bf_log';
-- 6. IMPORT: restore a table/partition previously written by EXPORT
IMPORT [[EXTERNAL] TABLE new_or_original_tablename [PARTITION (part_column="value"[, ...])]]
FROM 'source_path'
[LOCATION 'import_target_path']
-- eg:
import table imported_dept from 'hdfs_exports_location/department';
- [LOCAL] ———— 原始文件的存储位置:
  - 本地文件系统:加 LOCAL
  - HDFS:不加 LOCAL
- [OVERWRITE] ———— 是否覆盖表中已有的数据:
  - 覆盖:加 OVERWRITE
  - 追加:默认(不加 OVERWRITE)
- [PARTITION (partcol1=val1, partcol2=val2 ...)] ———— 加载到分区表时需指定分区,特殊性
导出数据
-- 1. Export query results to a local directory
insert overwrite local directory '/opt/datas/hive_exp_emp'
select * from default.emp;
-- 2. Export with an explicit row format.
--    NOTE: in Hive's INSERT OVERWRITE DIRECTORY syntax the ROW FORMAT
--    clause must come BEFORE the SELECT, and the field delimiter is a
--    single tab ('\t') — the original ' \t' contained a stray space.
insert overwrite local directory '/opt/datas/hive_exp_emp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp;
-- 3. Redirect hive -e output to a local file (shell command, not HQL)
bin/hive -e "select * from default.emp ;" > /opt/datas/exp_res.txt
-- 4. Export to an HDFS directory; use an absolute path
--    (the original relative 'user/root/...' was missing the leading '/')
insert overwrite directory '/user/root/hive/datas/hive_exp_emp'
select * from default.emp;
-- 5. sqoop: bulk transfer between Hive/HDFS and external relational stores
-- 6. EXPORT: write a table's data AND metadata to an HDFS path
EXPORT TABLE tablename [PARTITION (part_column="value"[, ...])]
TO 'export_target_path' [ FOR replication('eventid') ]
-- eg:
EXPORT TABLE default.emp TO '/user/root/hive/export/emp_exp';
Hive常见查询
-- General SELECT syntax supported by Hive.
-- CLUSTER BY is an alternative to specifying DISTRIBUTE BY and SORT BY
-- separately (it applies both on the same columns), hence the "|" branch.
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT [offset,] rows]
-- Full-table and single-column queries
select * from emp;
select t.empno from emp t;
-- Comparison operators: = > >= <=, plus BETWEEN ... AND ...
select * from emp where sal between 800 and 1500;
-- IS NULL / IS NOT NULL / IN / NOT IN
select * from emp where comm is null;
-- Aggregates: max / min / count / sum / avg
-- (original had a missing space: "cntfrom")
select count(*) cnt from emp;
select max(sal) max_sal from emp;
-- GROUP BY / HAVING
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;
select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job;
-- HAVING vs WHERE:
--   WHERE  filters individual rows before grouping
--   HAVING filters grouped/aggregated results after grouping
-- Departments whose average salary exceeds 2000:
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
-- JOINs between two tables
-- Equi-join: JOIN ... ON
select * from emp e join dept d on e.deptno = d.deptno;
-- Left outer join
select * from emp e left join dept d on e.deptno = d.deptno;
-- Right outer join
select * from emp e right join dept d on e.deptno = d.deptno;
-- Full outer join
select * from emp e full join dept d on e.deptno = d.deptno;
-- ORDER BY: global ordering of the whole result set; runs in a single reducer
select * from emp order by empno desc;
-- SORT BY: orders rows within each reducer only; no global ordering.
-- Set the number of reducers first (original note had a garbled property
-- name "ma[reduce.job.reduces"):
-- set mapreduce.job.reduces = <N>;
select * from emp sort by empno asc;
-- DISTRIBUTE BY: partitions rows across reducers (like MapReduce's
-- partitioner); use together with SORT BY, and it must appear BEFORE sort by
select * from emp distribute by deptno sort by empno asc;
-- CLUSTER BY: shorthand for DISTRIBUTE BY + SORT BY on the same column
select * from emp cluster by empno;