Section 2: Big Data Warehouse Hive (02)

1. Common Ways to Create Tables in Hive


CREATE TABLE IF NOT EXISTS DEFAULT.log_20180605(
ip string comment 'remote ip address',
`user` string,
req_url string comment 'user request url'
)
comment 'web access logs'
row format delimited fields terminated by ' '
stored as textfile;

load data local inpath '/opt/datas/log.txt' into table default.log_20180605;

create table if not exists default.log_20180605_sa
as select ip,req_url from default.log_20180605;
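The third common creation method copies only another table's schema, leaving the new table empty (the name log_20180605_like below is illustrative; CREATE ... LIKE appears again in Section 4):

create table if not exists default.log_20180605_like
like default.log_20180605;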

2. Basic Database Operations in Hive


create database db_hive_01;
create database if not exists db_hive_02;
create database if not exists db_hive_03 location '/user/hadoop/hive/warehouse/db_hive_03.db';

show databases;
show databases like 'db_hive*';

use db_hive_01;

desc database db_hive_03;
desc database extended db_hive_03;

drop database db_hive_03;
drop database db_hive_03 cascade;
drop database if exists db_hive_03;
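To confirm which database is currently selected and list its tables (current_database() is available in Hive 0.13+):

select current_database();
show tables in db_hive_01;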

3. Hive Data Types


Numeric Types

    TINYINT (1-byte signed integer, from -128 to 127)
    SMALLINT (2-byte signed integer, from -32,768 to 32,767)

    INT/INTEGER (4-byte signed integer, from -2,147,483,648 to 2,147,483,647)

    BIGINT (8-byte signed integer, from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807)
    FLOAT (4-byte single precision floating point number)
    DOUBLE (8-byte double precision floating point number)

    DOUBLE PRECISION (alias for DOUBLE, only available starting with Hive 2.2.0)

    DECIMAL
        Introduced in Hive 0.11.0 with a precision of 38 digits
        Hive 0.13.0 introduced user-definable precision and scale
    NUMERIC (same as DECIMAL, starting with Hive 3.0.0)

Date/Time Types

    TIMESTAMP (Note: Only available starting with Hive 0.8.0)
    DATE (Note: Only available starting with Hive 0.12.0)
    INTERVAL (Note: Only available starting with Hive 1.2.0)

String Types

    STRING
    VARCHAR (Note: Only available starting with Hive 0.12.0)
    CHAR (Note: Only available starting with Hive 0.13.0)

Misc Types

    BOOLEAN
    BINARY (Note: Only available starting with Hive 0.8.0)

Complex Types

    arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
    maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
    structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>
    union: UNIONTYPE<data_type, data_type, ...> (Note: Only available starting with Hive 0.7.0.)
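A minimal sketch of how these complex types look in practice, assuming an illustrative table named complex_demo with tab-delimited rows:

create table if not exists default.complex_demo(
id int,
tags array<string>,
props map<string,string>,
addr struct<city:string, zip:string>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':';

-- index into arrays, key into maps, dot into structs:
select tags[0], props['color'], addr.city from default.complex_demo;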

4. Table Operations in Hive


Employee table:
create table if not exists default.emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
row format delimited fields terminated by '\t';
Department table:
create table if not exists default.dept(
deptno int,
dname string,
loc string
)
row format delimited fields terminated by '\t';

load data local inpath '/opt/datas/emp.txt' overwrite into table default.emp;
load data local inpath '/opt/datas/dept.txt' overwrite into table default.dept;

create table if not exists default.dept_cats
as
select * from dept;

truncate table dept_cats;

create table if not exists default.dept_like
like
default.dept;

alter table dept_like rename to dept_like_rename;
drop table if exists dept_like_rename;

5. Table Types in Hive

5.1 Managed Tables

5.2 External Tables


create EXTERNAL table IF NOT EXISTS default.emp_ext(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

5.3 Differences Between Managed and External Tables


Managed tables:
A managed table is also known as a MANAGED_TABLE.
Its data is stored under /user/hive/warehouse by default, though a LOCATION can be specified.
Dropping the table deletes both the table data and the metadata.
External tables:
An external table is known as an EXTERNAL_TABLE.
You can specify the directory yourself at creation time via LOCATION.
Dropping the table deletes only the metadata; the table data is left intact.

create EXTERNAL table IF NOT EXISTS default.emp_ext2(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/user/hadoop/hive/warehouse/emp_ext2';

dfs -put /opt/datas/emp.txt /user/hadoop/hive/warehouse/emp_ext2;

drop table default.emp_ext2;
The data is still on HDFS afterwards.
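You can verify this from the Hive CLI, since the dropped external table's files survive under the same LOCATION:

dfs -ls /user/hadoop/hive/warehouse/emp_ext2;

Recreating the external table with the same LOCATION makes the data queryable again.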

6. Partitioned Tables

6.1 A partitioned table corresponds to a separate directory in the HDFS file system; that directory holds all of the partition's data files. A Hive partition is simply a subdirectory: a large data set is split into smaller data sets according to business needs.

At query time, the expressions in the WHERE clause select only the required partitions, which greatly improves query efficiency.


create EXTERNAL table IF NOT EXISTS default.emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
partitioned by (month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

load data local inpath '/opt/datas/emp.txt' into table default.emp_partition partition (month='201805');

select * from default.emp_partition where month='201805';

select count(distinct empno) from default.emp_partition where month='201805'

union

select count(distinct empno) from default.emp_partition where month='201804'

union

select count(distinct empno) from default.emp_partition where month='201803';

Run these directly, or save them as xx.sql and execute bin/hive -f xx.sql.

Two-level partitions:

At table creation, declare partitioned by (month string, day string); a full sketch follows below.

select * from emp_partition where month='201805' and day='12';
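A minimal sketch of the full two-level setup, assuming an illustrative table name emp_partition2 and the same emp.txt file:

create table if not exists default.emp_partition2(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int
)
partitioned by (month string, day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

load data local inpath '/opt/datas/emp.txt' into table default.emp_partition2 partition (month='201805', day='12');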

6.2 Caveats


1. Non-partitioned table
create table if not exists default.dept_nopart(
deptno int,
dname string,
loc string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_nopart;
select * from dept_nopart;
Data is returned.

2. Partitioned table
create table if not exists default.dept_part(
deptno int,
dname string,
loc string
)
partitioned by(day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

dfs -mkdir -p /user/hive/warehouse/dept_part/day=20180512;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20180512;
select * from dept_part;
No data is returned, because the partition is not registered in the metastore.

----------------------------------------------------------------------------------------------------------------
use metastore;
select * from PARTITIONS; -- the metastore table that records partitions (run in the metastore RDBMS, e.g. MySQL)
Fix 1:
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20180512;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20180512;

msck repair table dept_part;
select * from PARTITIONS;
select * from dept_part;
Now data is returned.
Fix 2:
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20180513;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20180513;

alter table dept_part add partition(day='20180513');
select * from dept_part;
Now data is returned.
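Either way, the registered partitions can be confirmed from Hive itself:

show partitions dept_part;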

7. Six Ways to Load Data into Hive


LOAD DATA [LOCAL] INPATH 'filepath' 
[OVERWRITE] INTO TABLE tablename 
[PARTITION (partcol1=val1, partcol2=val2 ...)]

* Where the source file is stored
	* local: LOCAL
	* HDFS
* Whether existing table data is overwritten
	* overwrite: OVERWRITE
	* append (the default)
* Loading into a partitioned table is special:
	PARTITION (partcol1=val1, partcol2=val2 ...)

1> Load a local file into a Hive table
	load data local inpath '/opt/datas/emp.txt' into table default.emp;
2> Load an HDFS file into a Hive table
	load data inpath '/user/hadoop/hive/datas/emp.txt' into table default.emp;
3> Load and overwrite the table's existing data
	load data inpath '/user/hadoop/hive/datas/emp.txt' overwrite into table default.emp;
4> Load via SELECT when creating or filling the table
	create table default.emp_ci like emp;
	insert into table default.emp_ci select * from default.emp;
5> Load via INSERT ... VALUES (Hive 0.14+); the row below is illustrative
	insert into table default.emp_ci values (7999, 'TEST', 'CLERK', 7902, '2018-06-05', 800.0, 0.0, 20);
6> Specify LOCATION (a directory, not a file) when creating the table
	create table default.emp_ce(
	id int,
	name string
	)
	row format delimited fields terminated by '\t'
	location '/user/hadoop/hive/datas';

8. Ways to Export Hive Table Data


1. Export to a local directory with INSERT OVERWRITE
insert overwrite local directory '/opt/datas/hive_exp_emp'
select * from default.emp;

insert overwrite local directory '/opt/datas/hive_exp_emp2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp;
2. Export by redirecting hive -e output
bin/hive -e "select * from default.emp;" > /opt/datas/exp_res.txt
3. Export to an HDFS directory
insert overwrite directory '/user/hive/warehouse/hive_exp_emp'
select * from default.emp;
=============
sqoop
	hdfs/hive -> rdbms
	rdbms -> hdfs/hive/hbase
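As a hedged illustration of the hive -> rdbms direction (the connection string, credentials, and MySQL table name are all assumptions):

sqoop export \
  --connect jdbc:mysql://hadoop-senior:3306/testdb \
  --username root --password 123456 \
  --table emp \
  --export-dir /user/hive/warehouse/emp \
  --input-fields-terminated-by '\t'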

9. Common Queries in Hive

 
Full-table queries and queries on selected columns
 =, >=, <=, between ... and, limit
 in, not in, is null, is not null
 max, min, count, sum, avg
 group by, having
 join

GROUP BY
 Grouping, on the emp table:
 * Average salary per department
 SELECT T.DEPTNO, AVG(T.SAL) AVG_SAL FROM EMP T GROUP BY T.DEPTNO;
 * Highest salary for each job within each department
 SELECT T.DEPTNO, T.JOB, MAX(T.SAL) MAX_SAL FROM EMP T GROUP BY T.DEPTNO, T.JOB;

HAVING
	* where		filters individual records
	* having	filters the grouped results
Find the departments whose average salary is greater than 2000:
SELECT DEPTNO, AVG(SAL) FROM EMP GROUP BY DEPTNO;
SELECT DEPTNO, AVG(SAL) AVG_SAL FROM EMP GROUP BY DEPTNO HAVING AVG_SAL > 2000;

JOIN
	Joins two tables, m and n:
	a record from m is combined with a matching record from n to form one output record.
Equi-join
	join ... on
	select e.empno,e.ename,d.deptno,d.dname from emp e join dept d on e.deptno=d.deptno;
Left join
select e.empno,e.ename,d.deptno,d.dname from emp e left join dept d on e.deptno=d.deptno;

Right join
select e.empno,e.ename,d.deptno,d.dname from emp e right join dept d on e.deptno=d.deptno;

Full join
select e.empno,e.ename,d.deptno,d.dname from emp e full join dept d on e.deptno=d.deptno;
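Hive also supports LEFT SEMI JOIN, an efficient IN/EXISTS-style filter in which only columns from the left table may be selected (a sketch on the same tables):

select e.empno, e.ename from emp e left semi join dept d on e.deptno = d.deptno;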

10. EXPORT and IMPORT


Export
	Exports a Hive table's data (and metadata) to an external location.
	EXPORT TABLE tablename TO 'export_target_path';
	export_target_path is a path on HDFS.
	export table default.emp to '/user/hadoop/hive/export/emp_exp';
Import
	Imports previously exported data into a Hive table.
	create table db_hive.emp like default.emp;
	import table db_hive.emp from '/user/hadoop/hive/export/emp_exp';

11. order by, sort by, distribute by, and cluster by in Hive


* order by
	A global sort over the full result set; only a single reducer is used.
	SELECT * FROM EMP ORDER BY EMPNO DESC;
* sort by
	Sorts the data within each reducer; the global result set is not fully sorted.
	set mapreduce.job.reduces = 3;
	select * from emp sort by empno asc;
	insert overwrite local directory '/opt/datas/sortby-res' select * from emp sort by empno asc;
* distribute by
	Partitioning, similar to the partition step in MapReduce: distributes rows across reducers; used together with sort by.
	insert overwrite local directory '/opt/datas/distby-res' select * from emp distribute by deptno sort by empno asc;
Note:
		distribute by must appear before sort by.

* cluster by
	When the distribute by and sort by columns are the same, cluster by can be used instead:
	insert overwrite local directory '/opt/datas/clusterby-res' select * from emp cluster by empno;
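The cluster by statement above is shorthand for distributing and sorting on the same column; spelled out it would read:

select * from emp distribute by empno sort by empno;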

12. Built-in Functions and Custom UDF Programming in Hive


Hive ships with some built-in functions, such as max/min, but their number is limited; you can
conveniently extend them with your own UDFs.
UDFs let users extend HiveQL functionality:
UDF (User-Defined Function)
one row in, one row out
UDAF (User-Defined Aggregation Function)
aggregation function, many rows in, one row out;
e.g. count/max/min
UDTF (User-Defined Table-Generating Function)
one row in, many rows out;
e.g. lateral view explode()

	Inspecting available functions:
	show functions;
	desc function split;
	desc function extended split;
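For example, split returns an array (recent Hive versions allow SELECT without a FROM clause; older versions may require one):

select split('hive,hadoop,spark', ',')[0];

This returns 'hive', the first element of the array.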

https://cwiki.apache.org/confluence/display/Hive/HivePlugins
Programming steps:
1. Extend org.apache.hadoop.hive.ql.exec.UDF.
2. Implement an evaluate method; evaluate supports overloading.
Notes:
1. A UDF must have a return type; it may return null, but the return type cannot be void.
2. Prefer Hadoop writable types such as Text/LongWritable in UDFs; plain Java types are not recommended.

1>First, you need to create a new class that extends UDF, with one or more methods named evaluate.
2> Usage
add jar /opt/datas/hiveudf.jar;
create temporary function my_lower as "com.senior.hive.udf.LowerUDF";
select ename,my_lower(ename) lowername from emp limit 5;

--CREATE FUNCTION myfunc AS 'myclass' USING JAR 'hdfs:///path/to/jar';
dfs -mkdir -p /user/hadoop/hive/jars/;
dfs -put /opt/datas/hiveudf.jar /user/hadoop/hive/jars;
CREATE function self_lower AS 'com.senior.hive.udf.LowerUDF' USING JAR 'hdfs://hadoop-senior:8020/user/hadoop/hive/jars/hiveudf.jar';
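To remove the functions afterwards (using the names created above):

drop temporary function if exists my_lower;
drop function if exists self_lower;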

package com.senior.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * 1. Implement one or more methods named
 * "evaluate" which will be called by Hive. 
 * 2."evaluate" should never be a void method.
 *  However it can return "null" if needed.
 * @author Guardian
 *
 */
public class LowerUDF extends UDF{

	public Text evaluate(Text str){
		// validate: a null input yields a null result
		if (str == null) {
			return null;
		}
		// lowercase the input text
		return new Text(str.toString().toLowerCase());
	}
	
	public static void main(String[] args) {
		System.out.println(new LowerUDF().evaluate(new Text("HIVE")));
	}
}






Reposted from blog.csdn.net/hekaihaw/article/details/80628567