Hive Installation, Configuration, and Usage

Hive only needs to be installed on a single node.

1. Upload the tar package

2. Extract it

[hadoop@Linux1 ~]$ tar -zxvf apache-hive-1.2.1-bin.tar.gz -C apps/

3. Install MySQL to serve as the metastore database, replacing the default Derby database

mysql -uroot -p
1. Set the root password to root
2. Remove anonymous users
3. Allow the user to connect remotely
    # (run the statements below; *.* means all tables in all databases; % means connections from any IP address or host)
    mysql> grant all privileges on *.* to root@'%' identified by 'root';
    mysql> flush privileges;
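
Optionally, instead of granting root remote access, a dedicated metastore account can be used; a minimal sketch (the hive database name, hive user, and password here are assumptions and must then match the hive-site.xml values below):
    mysql> create database hive;
    mysql> grant all privileges on hive.* to 'hive'@'%' identified by 'hive';
    mysql> flush privileges;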

4. Configure Hive
    (a) Configure the environment variables HIVE_HOME and HADOOP_HOME

Configure HIVE_HOME in /etc/profile
export HIVE_HOME=/home/hadoop/apps/hive-1.2.1
export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH:$HIVE_HOME/bin

Configure HADOOP_HOME in conf/hive-env.sh
[hadoop@Linux1 ~]$ cd apps/hive-1.2.1/conf/
[hadoop@Linux1 conf]$ cp hive-env.sh.template  hive-env.sh
[hadoop@Linux1 conf]$ vi  hive-env.sh
# Set HADOOP_HOME to point to a specific hadoop install directory
HADOOP_HOME=/home/hadoop/apps/hadoop-2.6.4/

    (b) Configure the metastore database settings: vi hive-site.xml

Add the following content:
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>

<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>

<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>username to use against metastore database</description>
</property>

<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
<description>password to use against metastore database</description>
</property>
</configuration>


5. After Hive and MySQL are installed, copy the MySQL JDBC connector jar into the $HIVE_HOME/lib directory

[hadoop@Linux1 ~]$ mv mysql-connector-java-5.1.28.jar apps/hive-1.2.1/lib/

6. To fix the Jline version mismatch, copy jline-2.12.jar from Hive's lib directory to replace the older one in Hadoop

[hadoop@Linux1 ~]$ rm ~/apps/hadoop-2.6.4/share/hadoop/yarn/lib/jline-0.9.94.jar
[hadoop@Linux1 ~]$ cd apps/hive-1.2.1/lib/
[hadoop@Linux1 lib]$ cp jline-2.12.jar ~/apps/hadoop-2.6.4/share/hadoop/yarn/lib/

Start Hive

[hadoop@Linux1 ~]$ apps/hive-1.2.1/bin/hive
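
On first start, Hive creates its metastore schema inside the MySQL hive database (thanks to createDatabaseIfNotExist=true in the JDBC URL). As a rough check, the metadata tables can be listed from the mysql client; DBS and TBLS are among the tables Hive creates:
mysql> use hive;
mysql> show tables;    # should list metastore tables such as DBS, TBLS, COLUMNS_V2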

----------------------------------------------------------------------------------------------------

Hive operations

//Show all databases
hive> show databases;
//Create a database
hive> create database demo1;
//Switch to the database
hive> use demo1;
//Show tables
hive> show tables;
//Create a table in the database
hive> create table test1(id int,name string);
//Put data into the table, i.e., into the table's directory on HDFS
[hadoop@Linux1 ~]$ vi sz.dat
[hadoop@Linux1 ~]$ hadoop fs -put sz.dat /user/hive/warehouse/demo1.db/test1
//Truncate: clear only the data in the table
hive> truncate table test1;
//Drop the table
hive> drop table test1;

//Create a table with an explicit format: one record per line, fields separated by commas
hive> create table t_sz01(id int,name string) row format delimited fields terminated by ',';
//Put data into the table, i.e., into the table's directory on HDFS
[hadoop@Linux1 ~]$ hadoop fs -put sz.dat /user/hive/warehouse/demo1.db/t_sz01
//Query the table contents
hive> select * from t_sz01;
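
//Instead of hadoop fs -put, the same file can also be loaded through Hive itself; a minimal sketch, assuming sz.dat sits in /home/hadoop as above
hive> load data local inpath '/home/hadoop/sz.dat' into table t_sz01;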


7. Create tables (managed/internal tables by default)
    create table trade_detail(id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t';
    Create a partitioned table
    create table td_part(id bigint, account string, income double, expenses double, time string) partitioned by (logdate string) row format delimited fields terminated by '\t';
    Create an external table
    create external table td_ext(id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t' location '/td_ext';
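
    The difference shows up on drop: dropping a managed (internal) table deletes its HDFS data, while dropping an external table keeps the files under the given location. To check which kind a table is, describe it:
    hive> desc formatted td_ext;    # the Table Type field shows EXTERNAL_TABLE or MANAGED_TABLE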

8. Create a partitioned table
    Difference between an ordinary table and a partitioned table: when large volumes of data keep being added, use a partitioned table
    create table book (id bigint, name string) partitioned by (pubdate string) row format delimited fields terminated by '\t'; 

    Load data into the partitioned table
    load data local inpath './book.txt' overwrite into table book partition (pubdate='2010-08-22');
    
    load data local inpath '/root/data.am' into table beauty partition (nation="USA");

    
    select nation, avg(size) from beauties group by nation order by avg(size);
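
    Once partitions exist, they appear as subdirectories under the table directory and can be listed or used to prune a query; for example, with the book table above:
    hive> show partitions book;
    hive> select * from book where pubdate='2010-08-22';    # reads only the pubdate=2010-08-22 partition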




Bucketing: its biggest benefit is improving the efficiency of join operations
    The data being loaded must already be bucketed (load it with insert into t_buck select id,name from t_p distribute by (id) sort by (id); only data loaded this way actually ends up bucketed)
0: jdbc:hive2://localhost:10000> create table t_buk(id string,name string)
0: jdbc:hive2://localhost:10000> clustered by(id)    # bucket by id; equivalent to choosing the map-side partition
0: jdbc:hive2://localhost:10000> sorted by(id)       # sort by id; within each of the four buckets, rows are ordered by id
0: jdbc:hive2://localhost:10000> into 4 buckets      # four buckets
0: jdbc:hive2://localhost:10000> row format delimited fields terminated by ',';    # one record per line, fields separated by commas
No rows affected (0.316 seconds)
0: jdbc:hive2://localhost:10000> desc extended  t_buk;    # show the table's details

0: jdbc:hive2://localhost:10000> ALTER TABLE t_buk RENAME TO t_buck; # rename the table
0: jdbc:hive2://localhost:10000> load data local inpath '/home/hadoop/sz.dat' into table t_buck;    # load data (note: a plain load does not bucket the rows)
0: jdbc:hive2://localhost:10000> truncate table t_buck;    # clear the table's data


0: jdbc:hive2://localhost:10000> create table t_p(id string,name string)
0: jdbc:hive2://localhost:10000> row format delimited fields terminated by ','; 
0: jdbc:hive2://localhost:10000> load data local inpath '/home/hadoop/sz.dat' into table t_p;    # not bucketed; to bucket, bucketing must be enabled and the number of reducers must be set

# Set variables: enable bucketing and set the number of reducers to the number of buckets
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
//Check the settings
set hive.enforce.bucketing;
set mapreduce.job.reduces;
set mapreduce.job.maps;


Notes: 1. order by performs a global sort over all input, so there is only one reducer; with large inputs this means long computation times.
2. sort by is not a global sort; it sorts data before it enters each reducer. So if you sort with sort by and set mapred.reduce.tasks > 1, sort by only guarantees that each reducer's output is sorted, not that the result is globally sorted.
3. distribute by (field) sends rows to different reducers based on the given field, using a hash of the field value.
4. cluster by (field) does what distribute by does and additionally sorts by that field.

Therefore, when the bucketing field and the sort field are the same, cluster by = distribute by + sort by.
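
When the distribution field and the sort field differ, cluster by cannot express it and the explicit form is needed; a small sketch on the same t_p table:
0: jdbc:hive2://localhost:10000> select id,name from t_p distribute by (id) sort by (name);    # distribute by id, but order each reducer's output by name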

0: jdbc:hive2://localhost:10000> select id,name from t_p sort by (id);    # with multiple reducers, rows are first distributed (distribution algorithm: ??????), then sorted within each reducer (int: ascending, string: lexicographic)
    hash-partitioned into (4, 8, 12) (2, 6) (1, 3, 5, 10, 11) (7, 9)
        //if id is of type int, after the sort each partition is ordered from small to large
        +-----+-----------+--+
        | id  |   name    |
        +-----+-----------+--+
        | 4   | furong    |
        | 8   | bbbb      |
        | 12  | ffff      |

        | 2   | lisi      |
        | 6   | zengye    |

        | 1   | zhangsan  |
        | 3   | fengjie   |
        | 5   | chunge    |
        | 10  | dddd      |
        | 11  | eeee      |

        | 7   | aaaa      |
        | 9   | cccc      |
        +-----+-----------+--+
        //if id is of type string, after the sort each partition is ordered lexicographically
        +-----+-----------+--+
        | id  |   name    |
        +-----+-----------+--+
        | 12  | ffff      |
        | 4   | furong    |
        | 8   | bbbb      |

        | 2   | lisi      |
        | 6   | zengye    |

        | 1   | zhangsan  |
        | 10  | dddd      |
        | 11  | eeee      |
        | 3   | fengjie   |
        | 5   | chunge    |

        | 7   | aaaa      |
        | 9   | cccc      |
        +-----+-----------+--+
0: jdbc:hive2://localhost:10000> select id,name from t_p distribute by (id) sort by (id);    # first hash-partition (distribution is by hash of the field), then sort within each partition (int: ascending, string: lexicographic)
              int a = "10".hashCode();//2
              int b = Integer.MAX_VALUE;//2147483647
              System.out.println((a & b) % 4);

              int a = "10".hashCode();
              System.out.println((a) % 4);
        +-----+-----------+--+
        | id  |   name    |
        +-----+-----------+--+
        | 11  | eeee      |
        | 4   | furong    |
        | 8   | bbbb      |

        | 1   | zhangsan  |
        | 12  | ffff      |
        | 5   | chunge    |
        | 9   | cccc      |

        | 2   | lisi      |
        | 6   | zengye    |

        | 10  | dddd      |
        | 3   | fengjie   |
        | 7   | aaaa      |
        +-----+-----------+--+
0: jdbc:hive2://localhost:10000> select id,name from t_p cluster by (id);    # first hash-partition (distribution is by hash of the field), then sort within each partition (int: ascending, string: lexicographic)
        +-----+-----------+--+
        | id  |   name    |
        +-----+-----------+--+
        | 11  | eeee      |
        | 4   | furong    |
        | 8   | bbbb      |

        | 1   | zhangsan  |
        | 12  | ffff      |
        | 5   | chunge    |
        | 9   | cccc      |

        | 2   | lisi      |
        | 6   | zengye    |

        | 10  | dddd      |
        | 3   | fengjie   |
        | 7   | aaaa      |
        +-----+-----------+--+


0: jdbc:hive2://localhost:10000> insert into t_buck select id,name from t_p distribute by (id) sort by (id);
0: jdbc:hive2://localhost:10000> dfs -cat /user/hive/warehouse/t_buck/000000_0;
+-------------+--+
| DFS Output  |
+-------------+--+
| 11,eeee     |
| 4,furong    |
| 8,bbbb      |
+-------------+--+
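
Because t_buck is now physically bucketed on id, a single bucket can be read directly, e.g. for sampling (bucket numbering starts at 1; this sketch assumes the 4-bucket layout created above):
0: jdbc:hive2://localhost:10000> select id,name from t_buck tablesample(bucket 1 out of 4 on id);    # roughly the contents of 000000_0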

Start the Hive service so that other machines can connect to it

On one machine (the server), start hiveserver2
bin/hiveserver2
[hadoop@Linux1 ~]$ hiveserver2

On another machine (the client), start beeline
[hadoop@Linux1 ~]$ beeline
//Connect
beeline> !connect jdbc:hive2://localhost:10000
//Authentication: if no credentials are configured, enter the user that started Hive on the server side as the username, and leave the password empty
Enter username for jdbc:hive2://localhost:10000: hadoop
Enter password for jdbc:hive2://localhost:10000:
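
After connecting, HiveQL runs the same way as in the hive CLI, for example:
0: jdbc:hive2://localhost:10000> show databases;
0: jdbc:hive2://localhost:10000> use demo1;
0: jdbc:hive2://localhost:10000> select * from t_sz01;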


Reposted from blog.csdn.net/weixin_38702350/article/details/82591820