基于hive的hadoop日志分析

1.日志格式
日期 时间 级别 相关类 信息
2011-08-01 08:39:08,020 INFO org.apache.hadoop.ipc.Server:IPC server Responder......
2.存储结构
日期、时间、级别、相关类各占一列，信息占3列
create table if not exists loginfo11(rdate string,time array<string>,type string,relateclass string,information1 string,information2 string,information3 string)
row format delimited fields terminated by ' ' collection items terminated by ',' map keys terminated by ':';
3.getConnect.java
新建工程hiveAction，导入 hadoop的jar包，导入mysql的驱动包
getConnect类负责建立与Hive和MYSQL的连接，由于每个连接的开销比较大，所以此类的设计采用的是单例模式：

package com.cstore.transToHive;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
 * Holder for the two JDBC connections used by this tool: one to HiveServer and
 * one to MySQL.
 *
 * <p>Each connection is created lazily on first request and then reused, because
 * opening a connection is expensive. After {@link #closeHive()} or
 * {@link #closemysql()} the cached reference is cleared, so a later request opens
 * a fresh connection instead of handing back a closed one.
 *
 * <p>This class performs no synchronization; it is intended for single-threaded use.
 */
public class getConnect {
	private static Connection conn = null;
	private static Connection conntomysql = null;
	private static final String DBURL="jdbc:hive://192.168.1.13:50031/default";
	private static final String MYSQLURL = "jdbc:mysql://192.168.1.12:3306/hadoop?useUnicode=true&characterEncoding=GBK";
	// NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
	private static final String DBUSERNAME = "hive";
	private static final String DBPASSWORD = "123456";

	// Static-only holder: not meant to be instantiated.
	private getConnect() {
	}

	/**
	 * Returns the shared Hive connection, opening it on first use.
	 *
	 * @return the cached connection to {@code DBURL}
	 * @throws SQLException if the Hive JDBC driver class is missing or the connection fails
	 */
	public static Connection getHiveConn() throws SQLException {
		if (conn == null) {
			loadDriver("org.apache.hadoop.hive.jdbc.HiveDriver");
			conn = DriverManager.getConnection(DBURL, DBUSERNAME, DBPASSWORD);
		}
		return conn;
	}

	/**
	 * Returns the shared MySQL connection, opening it on first use.
	 *
	 * @return the cached connection to {@code MYSQLURL}
	 * @throws SQLException if the MySQL JDBC driver class is missing or the connection fails
	 */
	public static Connection getMysqlConn() throws SQLException {
		if (conntomysql == null) {
			loadDriver("com.mysql.jdbc.Driver");
			conntomysql = DriverManager.getConnection(MYSQLURL, DBUSERNAME, DBPASSWORD);
		}
		return conntomysql;
	}

	/**
	 * Closes the Hive connection, if one is open. Call this once all work of the
	 * current session is finished.
	 *
	 * @throws SQLException if closing the connection fails
	 */
	public static void closeHive() throws SQLException {
		if (conn != null) {
			try {
				conn.close();
			} finally {
				// Drop the reference even if close() failed, so the next
				// getHiveConn() reconnects rather than returning a dead connection.
				conn = null;
			}
		}
	}

	/**
	 * Closes the MySQL connection, if one is open.
	 *
	 * @throws SQLException if closing the connection fails
	 */
	public static void closemysql() throws SQLException {
		if (conntomysql != null) {
			try {
				conntomysql.close();
			} finally {
				conntomysql = null;
			}
		}
	}

	// Loads a JDBC driver class, reporting a missing driver as an SQLException
	// (cause preserved) instead of terminating the JVM from inside a helper.
	private static void loadDriver(String driverClass) throws SQLException {
		try {
			Class.forName(driverClass);
		} catch (ClassNotFoundException e) {
			throw new SQLException("JDBC driver not found on classpath: " + driverClass, e);
		}
	}

}

4.HiveUtil.java

package com.cstore.transToHive;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Static helpers that run HiveQL through the shared Hive connection and copy
 * query results into the MySQL table {@code hadooplog}.
 */
public class HiveUtil {
	/**
	 * Creates a table by running the given HiveQL DDL statement.
	 *
	 * @param hiveql the {@code create table} statement to run
	 * @throws SQLException if the statement fails
	 */
	public static void createTable(String hiveql) throws SQLException
	{
		runAndClose(hiveql);
	}

	/**
	 * Runs a HiveQL query and returns its result set.
	 *
	 * <p>The statement that produced the result set is deliberately left open:
	 * closing it would also close the returned {@link ResultSet}.
	 *
	 * @param hiveql the query to run
	 * @return the rows produced by the query
	 * @throws SQLException if the query fails
	 */
	public static ResultSet queryHive(String hiveql) throws SQLException
	{
		Connection con = getConnect.getHiveConn();
		Statement stmt = con.createStatement();
		try {
			return stmt.executeQuery(hiveql);
		} catch (SQLException e) {
			// No result set was produced, so nothing depends on the statement any more.
			try {
				stmt.close();
			} catch (SQLException ignored) {
				// Keep the original failure; a secondary close error adds nothing.
			}
			throw e;
		}
	}

	/**
	 * Loads data into a Hive table by running the given {@code load data} statement.
	 *
	 * @param hiveql the load statement to run
	 * @throws SQLException if the statement fails
	 */
	public static void loadDate(String hiveql) throws SQLException
	{
		runAndClose(hiveql);
	}

	/**
	 * Copies every row of a Hive result set into the MySQL table {@code hadooplog}.
	 *
	 * <p>Columns 1-4 are read as date, time, level and related class; columns 5-7
	 * are concatenated into a single information string. Each row is also echoed
	 * to standard output. Values are bound as statement parameters, so quotes in
	 * the log text can neither break nor alter the insert statement. A null in
	 * columns 1-4 is stored as SQL NULL.
	 *
	 * @param Hiveres result set with at least seven columns; it is not closed here
	 * @throws SQLException if reading from Hive or writing to MySQL fails
	 */
	public static void hiveTomysql(ResultSet Hiveres) throws SQLException
	{
		Connection con = getConnect.getMysqlConn();
		PreparedStatement insert = con
				.prepareStatement("insert into hadooplog values(0,?,?,?,?,?)");
		try {
			while (Hiveres.next()) {
				String rdate = Hiveres.getString(1);
				String time = Hiveres.getString(2);
				String type = Hiveres.getString(3);
				String relateclass = Hiveres.getString(4);
				// The three information columns could instead be joined on the Hive side
				// (e.g. with a UDAF). NOTE(review): a null column concatenates as "null".
				String information = Hiveres.getString(5) + Hiveres.getString(6)
						+ Hiveres.getString(7);
				System.out.println(rdate + " " + time + " " + type + " "
						+ relateclass + " " + information + " ");
				insert.setString(1, rdate);
				insert.setString(2, time);
				insert.setString(3, type);
				insert.setString(4, relateclass);
				insert.setString(5, information);
				insert.executeUpdate();
			}
		} finally {
			insert.close();
		}
	}

	// Runs a statement whose result is not needed and always releases the Statement.
	// executeQuery is kept (rather than execute) to match how the original code
	// drove the Hive JDBC driver.
	private static void runAndClose(String hiveql) throws SQLException {
		Connection con = getConnect.getHiveConn();
		Statement stmt = con.createStatement();
		try {
			stmt.executeQuery(hiveql);
		} finally {
			stmt.close();
		}
	}
}

5.exeHiveQL.java

新建类，其中包含main方法，它是个驱动类，运行时需要两个参数：日志级别和日期
程序执行时：程序首先在Hive数据仓库中建立表，然后加载hadoop日志，过滤有用的日志信息后并转存到mysql数据库中

package com.cstore.transToHive;

import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * Driver program: creates the Hive table, loads the Hadoop logs into it, selects
 * the entries matching the requested level and date, and stores them in MySQL.
 *
 * <p>Usage: {@code exeHiveQL <level> <date>}, for example {@code ERROR 2011-07-29}.
 */
public class exeHiveQL {
	/**
	 * Runs the whole pipeline.
	 *
	 * @param args {@code args[0]} is the log level to keep (letters only) and
	 *             {@code args[1]} is the date to keep, formatted {@code yyyy-MM-dd}
	 * @throws SQLException if any Hive or MySQL operation fails
	 */
	public static void main(String[] args) throws SQLException {
		if (args.length < 2) {
			System.out.print("请输入你要查询的条件：日志级别 日期");
			System.exit(1);
		}
		String type = args[0];
		String date = args[1];
		// Both values are embedded in the HiveQL text below, so restrict them to
		// shapes that cannot contain quotes or other SQL syntax.
		if (!type.matches("[A-Za-z]+") || !date.matches("\\d{4}-\\d{2}-\\d{2}")) {
			System.out.print("参数格式错误：日志级别只能包含字母，日期格式为 yyyy-MM-dd");
			System.exit(1);
		}
		try {
			// Create the table in Hive
			HiveUtil.createTable("create table if not exists loginfo11 ( rdate String,time ARRAY<string>,type STRING,relateclass STRING,information1 STRING,information2 STRING,information3 STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY ':'");
			// Load the Hadoop log files; * matches every file in the log directory
			HiveUtil.loadDate("load data local inpath '/usr/local/hadoop2/logs/*' overwrite into table loginfo11");
			// Keep only the entries with the requested level and date
			ResultSet res1 = HiveUtil
					.queryHive("select rdate,time[0],type,relateclass,information1,information2,information3 from loginfo11 where type='"
							+ type + "' and rdate='" + date + "' ");
			// Store the selected entries in MySQL
			HiveUtil.hiveTomysql(res1);
		} finally {
			// Release both connections even when a step above failed.
			try {
				getConnect.closeHive();
			} finally {
				getConnect.closemysql();
			}
		}
	}

}

6.测试

运行程序前需要在装有Hive的机器上启动HiveServer服务并指定一个端口监听
hive --service hiveserver 50031
运行exeHiveQL.java，输入参数作为查询的条件查找用户所关注的信息，如查询2011年7月29日所有的ERROR信息，那么参数就是 ERROR 2011-07-29，运行后可以看到Hiveserver的控制台上输出了运行时的信息，程序执行完毕后去mysql的控制台，查看hadooplog表中的结果

7.分析总结
本次程序的实现其实mapreduce也能够做，因为Hive的底层调用的就是mapreduce，所以hive的效率没有mapreduce的高
mapreduce：效率高编程复杂
hive ：效率低编程简单
（上文中的效率高或者效率低仅仅是mapreduce和hive之间相对来说的）

基于hive的hadoop日志分析

猜你喜欢