Hive advanced learning

Create a database

 create database db_hive_01 ;
 create database if not exists db_hive_02 ;
 # specify where the database's data is stored in HDFS
 create database if not exists db_hive_03 location '/user/hive/warehouse/db_hive_03.db' ;
 # open http://hadoop.jianxin.com:50070/explorer.html#/user/hive/warehouse/db_hive_03.db in a browser to see the newly created database directory
 

View databases

show databases ;
# list the databases that match a pattern
show databases like 'db_hive*' ;
desc database db_hive_03;
#  db_name comment location        parameters  
#  db_hive_03              hdfs://hadoop.jianxin.com:9000/user/hive/warehouse/db_hive_03.db        root

desc database extended db_hive_03 ; # shows more complete information about the database

Creating a Hive data table

create table IF NOT EXISTS default.jianxin_log_20170413(
ip string COMMENT 'remote ip address' ,
user string ,
req_url string COMMENT 'user request url')
COMMENT 'Jianxin Web Access Logs'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
STORED AS TEXTFILE ;

create table IF NOT EXISTS default.jianxin_log_20170415
AS select ip,req_url from default.jianxin_log_20170413 ;

create table if not exists default.dept_cats
as
select * from dept ;

Types of Hive tables

Hive has two types of tables: managed tables (internal tables) and external tables.

A managed table stores its data under /user/hive/warehouse by default, although a
different directory can be specified with LOCATION. When a managed table is dropped,
both the table data and the metadata are deleted.

An external table is usually created with an explicit LOCATION path. When an external
table is dropped, only the metadata is deleted; the table data in HDFS is kept.
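A minimal sketch of the difference between the two table types (the table names emp_managed_demo / emp_ext_demo and the external location used here are made up for illustration):

create table if not exists default.emp_managed_demo(empno int, ename string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
create EXTERNAL table if not exists default.emp_ext_demo(empno int, ename string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/user/jianxin/hive/warehouse/emp_ext_demo' ;
# "Table Type" in the output shows MANAGED_TABLE or EXTERNAL_TABLE
desc formatted default.emp_managed_demo ;
desc formatted default.emp_ext_demo ;
# dropping the managed table also removes its directory under /user/hive/warehouse,
# while dropping the external table removes only the metadata and leaves the HDFS files in place
drop table default.emp_managed_demo ;
drop table default.emp_ext_demo ;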

  • Create an external table

create EXTERNAL table IF NOT EXISTS default.emp_ext2(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/user/jianxin/hive/warehouse/emp_ext2';
  • Create a partitioned table
create EXTERNAL table IF NOT EXISTS default.emp_partition4(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
partitioned by (month string,day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
location '/user/jianxin/hive/warehouse/emp_partition4';
  • Load data into a partitioned table
LOAD DATA [LOCAL] INPATH 'filepath' 
[OVERWRITE] INTO TABLE tablename 
[PARTITION (partcol1=val1, partcol2=val2 ...)]

# Method 1: load with an explicit partition spec
load data local inpath '/opt/modules/data/emp.txt' into table default.emp_partition4 partition (month='201804',day='12') ;
load data local inpath '/opt/modules/data/emp.txt' into table default.emp_partition4 partition (month='201804',day='11') ;
hive (default)> msck repair table emp_partition4 ;
# Method 2: create the partition directory in HDFS and upload the file, then register
# the partition with either of the two commands below
dfs -mkdir -p /user/jianxin/hive/warehouse/emp_partition4/month=201804/day=03 ;
dfs -put /opt/modules/data/emp.txt /user/jianxin/hive/warehouse/emp_partition4/month=201804/day=03 ;
hive (default)> msck repair table emp_partition4 ;
hive (default)> alter table default.emp_partition4 add partition (month='201804', day='03') ;

  • Query a partitioned table's data
select * from emp_partition4 where month = '201804' and day = '12' ;
select * from emp_partition4 where month = '201804' ;
  • Display table partitions
show partitions emp_partition4 ;

Import data

load data [local] inpath 'filepath' [overwrite] into table tablename [partition (partcol1=val1,...)];

- where the source file is stored
	- the local filesystem (local)
	- HDFS (no local keyword)
- whether the existing table data is overwritten
	- overwrite: replace the existing data (overwrite)
	- otherwise the new data is appended
- loading into a partitioned table is a special case: the partition must be specified,
  or the partition metadata repaired afterwards (see the examples below)
	partition (partcol1=val1,...)
load data local inpath '/opt/modules/data/emp.txt' overwrite into table emp_ext2 ;
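For comparison, a few other load variants (a minimal sketch; the HDFS source path /user/jianxin/data/emp.txt and the partition day='13' are made up for illustration):

# load from HDFS (no "local"): the source file is moved into the table's directory
load data inpath '/user/jianxin/data/emp.txt' into table default.emp_ext2 ;
# append instead of overwrite: omitting "overwrite" adds the new rows to the existing data
load data local inpath '/opt/modules/data/emp.txt' into table default.emp_ext2 ;
# load into a single partition of a partitioned table
load data local inpath '/opt/modules/data/emp.txt' overwrite into table default.emp_partition4 partition (month='201804',day='13') ;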

Hive data export

# the export path is an HDFS path, not a local path

EXPORT TABLE default.emp TO '/user/jianxin/hive/export/emp_exp' ;

hive (default)> export table emp to '/user/jianxin/hive/export/emp_exp'
              > ;
Copying data from file:/tmp/root/hive_2018-04-09_21-16-02_852_1347531230491859372-1/-local-10000/_metadata
Copying file: file:/tmp/root/hive_2018-04-09_21-16-02_852_1347531230491859372-1/-local-10000/_metadata
Copying data from hdfs://hadoop.jianxin.com:9000/user/hive/warehouse/emp
Copying file: hdfs://hadoop.jianxin.com:9000/user/hive/warehouse/emp/emp.txt


hive (default)> dfs -ls -R  /user/jianxin/
              > ;
drwxr-xr-x   - root supergroup          0 2018-04-09 21:16 /user/jianxin/hive
drwxr-xr-x   - root supergroup          0 2018-04-09 21:16 /user/jianxin/hive/export
drwxr-xr-x   - root supergroup          0 2018-04-09 21:16 /user/jianxin/hive/export/emp_exp
-rw-r--r--   1 root supergroup       1580 2018-04-09 21:16 /user/jianxin/hive/export/emp_exp/_metadata
drwxr-xr-x   - root supergroup          0 2018-04-09 21:16 /user/jianxin/hive/export/emp_exp/data
-rw-r--r--   1 root supergroup        656 2018-04-09 21:16 /user/jianxin/hive/export/emp_exp/data/emp.txt
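The exported directory can later be loaded back into a table with the companion import statement; a minimal sketch, where the target table name emp_imported is made up:

import table emp_imported from '/user/jianxin/hive/export/emp_exp' ;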

Common Hive operations

  • order by
    sorts the data globally; only one reducer is used
select * from emp order by empno desc ;
  • sort by
    sorts the data within each reducer; the global result set is not sorted
set mapreduce.job.reduces= 3;  
select * from emp sort by empno asc ;  
insert overwrite local directory '/opt/datas/sortby-res' select * from emp sort by empno asc ;  
  • distribute by
    similar to the partitioner in MapReduce: it distributes rows across reducers and is
    usually used together with sort by
insert overwrite local directory '/opt/datas/distby-res' select * from emp distribute by deptno sort by empno asc ;
  • cluster by
    when the distribute by and sort by columns are the same, cluster by can be used instead
insert overwrite local directory '/opt/datas/cluster-res' select * from emp cluster by empno ;
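Since cluster by empno is shorthand for distributing and sorting on the same column, the statement above can also be written as follows (the output directory cluster-equiv-res is made up):

insert overwrite local directory '/opt/datas/cluster-equiv-res' select * from emp distribute by empno sort by empno ;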

Hive UDF programming

A custom UDF extends org.apache.hadoop.hive.ql.exec.UDF and implements one or more evaluate methods. evaluate supports overloading (methods with the same name but different parameter lists).

package com.jianxin.senior.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * 1. Implement one or more methods named
 * "evaluate" which will be called by Hive.
 * 
 * 2. "evaluate" should never be a void method. 
 * However it can return "null" if needed.
 * @author XuanYu
 *
 */
public class LowerUDF extends UDF {
	
	public Text evaluate(Text str){
		// validate: guard against a null input before calling toString()
		if (str == null || str.toString() == null) {
			return null ;
		}
		// lower
		return new Text (str.toString().toLowerCase())  ;
	}
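
	// Illustrative overload (not part of the original class): an "evaluate" method with the
	// same name but a different parameter list. This hypothetical variant lower-cases the
	// input and strips the given prefix if present.
	public Text evaluate(Text str, Text prefix) {
		Text lowered = evaluate(str);
		if (lowered == null || prefix == null) {
			return lowered;
		}
		String s = lowered.toString();
		String p = prefix.toString().toLowerCase();
		return new Text(s.startsWith(p) ? s.substring(p.length()) : s);
	}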
	
	public static void main(String[] args) {
		System.out.println(new LowerUDF().evaluate(new Text("HIVE")));
	}

}


# register the UDF as a temporary function for the current session
add jar /opt/datas/hiveudf.jar ;
create temporary function my_lower as "com.jianxin.senior.hive.udf.LowerUDF" ;
select ename, my_lower(ename) lowername from emp limit 5 ;

# register a permanent function from a jar stored in HDFS
dfs -put /opt/datas/hiveudf.jar /user/jianxin/hive/jars/hiveudf.jar
CREATE FUNCTION self_lower AS 'com.jianxin.senior.hive.udf.LowerUDF' USING JAR 'hdfs://hadoop.jianxin.com:9000/user/jianxin/hive/jars/hiveudf.jar';
select ename, self_lower(ename) lowername from emp limit 5 ;

Displaying Hive functions

show functions;
# show what a particular function does
describe function year;
# the output looks like this:
tab_name
year(date) - Returns the year of date
Time taken: 0.031 seconds, Fetched: 1 row(s)

describe  function extended year;
# the output looks like this:
tab_name
year(date) - Returns the year of date
date is a string in the format of 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'.
Example:
   > SELECT year('2009-30-07') FROM src LIMIT 1;
  2009
Time taken: 0.057 seconds, Fetched: 5 row(s)

