Sqoop in Practice on a Data Platform

Sqoop is currently used mostly for building data warehouses. In many cases, synchronizing a table means writing a dedicated Sqoop script for it, and it is often unclear whether the job succeeded, how many rows were moved, or whether the source table was even updated; on top of that, the script files themselves are hard to manage.

The data platform introduced in this post provides proper management for all of this.

The table design is shown below.

DROP TABLE IF EXISTS task_info;
CREATE TABLE task_info (
    task_id            INT,
    database_id        INT,
    db_table           VARCHAR(100),
    hive_table         VARCHAR(100),
    db_columns         VARCHAR(5000),
    where_express      VARCHAR(200),
    is_hive_import     VARCHAR(2),
    hive_partition_key VARCHAR(10),
    mappers            INT,
    split_by           VARCHAR(10),
    is_use             VARCHAR(2),
    priority           INT,
    cron               VARCHAR(100),
    alter_date         LONGTEXT,
    meta_data          LONGTEXT,
    PRIMARY KEY (task_id)
);

DROP TABLE IF EXISTS db_info;
CREATE TABLE db_info (
    id       INT,
    uname    VARCHAR(100),
    password VARCHAR(500),
    url      VARCHAR(200),
    PRIMARY KEY (id)
);

DROP TABLE IF EXISTS task_log;
CREATE TABLE task_log (
    name       VARCHAR(100),
    begin_time VARCHAR(100),
    end_time   VARCHAR(100),
    result     VARCHAR(100),
    data_num   VARCHAR(50),
    log        LONGTEXT
);

All the parameters Sqoop needs are made configurable, including the source tables themselves. To change anything you simply run an SQL statement against these tables and let the scheduler, which runs on a timer, pick up the change on its next pass (a sketch of such a pass is shown below).
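
The scheduler itself is not shown in this article. The sketch below is only a rough idea of what one polling pass might look like, reusing the TaskInfo, DBConnection and HDFSExtract classes that appear later; the is_use/priority filter and the TaskInfo setters are assumptions for illustration, not code from the original platform.

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

// Minimal polling sketch (assumption: the real platform schedules by the cron column;
// the is_use/priority filter and the TaskInfo setters below are illustrative only).
public class TaskScheduler {

	public void runOnce() throws Exception {
		Connection con = DBConnection.getInstance();
		Statement st = con.createStatement();
		ResultSet rs = st.executeQuery(
				"select * from task_info where is_use = 'Y' order by priority");
		while (rs.next()) {
			TaskInfo tInfo = new TaskInfo();
			tInfo.setTask_id(rs.getInt("task_id"));
			tInfo.setDatabase_id(rs.getInt("database_id"));
			tInfo.setDb_table(rs.getString("db_table"));
			tInfo.setHive_table(rs.getString("hive_table"));
			tInfo.setDb_columns(rs.getString("db_columns"));
			tInfo.setWhere_express(rs.getString("where_express"));
			tInfo.setIs_hive_import(rs.getString("is_hive_import"));
			tInfo.setHive_partition_key(rs.getString("hive_partition_key"));
			tInfo.setMappers(rs.getInt("mappers"));
			tInfo.setSplit_by(rs.getString("split_by"));
			tInfo.setAlter_date(rs.getString("alter_date"));
			// hand the configured task to the extractor, which builds and runs the sqoop command
			new HDFSExtract().extract(tInfo);
		}
		rs.close();
		st.close();
	}
}

A real scheduler would run such a pass on a timer and honor the cron column; the point is simply that a single row in task_info is all it takes to define a new synchronization task.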

Next comes the core code.

public class SecureEncrypter implements Encrypter {

	private int seed = 0;

	public SecureEncrypter(int seed) {
		this.seed = seed;
	}

	/**
	 * XOR each character with the seed and join the resulting codes with 'u',
	 * e.g. encode("ab") with seed 1 returns "96u99".
	 */
	@Override
	public String encode(String input) {
		StringBuilder builder = new StringBuilder();
		for (int i = 0; i < input.length(); i++) {
			int k = input.charAt(i);
			builder.append("u").append(k ^ seed);
		}
		// drop the leading separator
		return builder.substring(1);
	}

	/**
	 * Reverse of encode(): split on 'u', XOR each code with the seed
	 * and rebuild the original characters.
	 */
	@Override
	public String decode(String input) {
		String[] arr = input.split("u");
		StringBuilder builder = new StringBuilder();
		for (String str : arr) {
			int t = Integer.valueOf(str);
			t = t ^ seed;
			builder.append((char) t);
		}
		return builder.toString();
	}
}

The code above handles password encryption and decryption: database passwords must not be handed around in plain text, so they are stored encrypted in db_info and only decrypted at runtime, when the value is passed to Sqoop as the password argument.
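
A quick round trip might look like the sketch below. EncrypterFactory.getEncryperInstance(1) is the same call used later in createOptions(); the assumption that index 1 resolves to a SecureEncrypter (and whatever seed it is built with) is mine.

// Round-trip sketch (assumption: factory index 1 resolves to a SecureEncrypter instance).
public class EncrypterDemo {
	public static void main(String[] args) throws Exception {
		Encrypter enc = EncrypterFactory.getEncryperInstance(1);
		String stored = enc.encode("myDbPassword"); // what gets saved into db_info.password
		String plain  = enc.decode(stored);         // what is handed to sqoop as the password
		System.out.println(stored + " -> " + plain);
	}
}

Note that the XOR scheme only obfuscates the password; it keeps the plain-text value out of the configuration table, but it is not strong cryptography. The HDFSExtract class below then reads a task's configuration, looks up the connection details in db_info, and assembles and launches the Sqoop import: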

public class HDFSExtract implements Extract {

	/**
	 * This class manages the data extraction into HDFS: it reads a task's
	 * configuration and builds and runs the corresponding Sqoop import.
	 */
	public static Logger log = Logger.getLogger(HDFSExtract.class);
	private Connection con = null;
	private Statement st = null;
	private ResultSet rs = null;

	private int task_id;
	private String db_user            = null;
	private String db_password        = null;

	private String db_table           = null;
	private String db_columns         = null;
	private String url                = null;
	private String where              = null;

	private String is_hive_import     = null;
	private String hive_table         = null;
	private String hive_partition_key = null;
	private String split_by           = null;
	private String count_date         = null;

	private String mappers            = null;
	
	public void extract(TaskInfo tInfo)
	{
		// 1. Read the configuration table content (yhd_extract_to_hdfs)
		// count_date defaults to yesterday
		count_date = ManaUtils.getData(-1);
		try {
			if (tInfo.getAlter_date() != null && !"".equals(tInfo.getAlter_date())) {
				count_date = tInfo.getAlter_date();
			}
			// 2. Look up the source database connection details
			String sql = "select d.uname,d.password,d.url from db_info d where d.id = ";
			sql += tInfo.getDatabase_id();
			log.info("sql: " + sql);
			con = DBConnection.getInstance();
			st = con.createStatement();
			rs = st.executeQuery(sql);
			Runtime run = null;
			if (rs.next()) {
				ETL_vo vo = new ETL_vo();
				setFromResultSet(rs, tInfo);
				vo.setTableName(hive_table);
				run = Runtime.getRuntime();
				try {
					// 3. Remove any previous import directory so the re-import does not fail
					run.exec("hadoop fs -rmr " + db_table.trim());
					run.exec("hadoop fs -rmr " + db_table.trim().toUpperCase());
					run.exec("hadoop fs -rmr " + db_table.trim().toLowerCase());
					log.info("Begin executing task " + task_id + " " + hive_table + " ......" + ManaUtils.sdf.format(new Date()));
					vo.setBeginTime(ManaUtils.sdf.format(new Date()));
					vo.setResult("SUCCESS");
					System.out.println(toString(createOptions()));
					// 4. Launch the generated sqoop command and hand the process over for logging
					Process p = run.exec(createOptions());
					ManaUtils.exeOption(p, tInfo, st, null);
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
			log.info("Finish Extracting ! " + ManaUtils.sdf.format(new Date()));

		} catch (Exception e) {
			log.error("Error: extract failed!");
			e.printStackTrace();
			return;
		}
		finally{
//			try {
//				rs.close();
//				st.close();
//				con.close();
//			} catch (SQLException e) {
//				e.printStackTrace();
//			}
			
		}
	}
	
	public void setFromResultSet(ResultSet rs, TaskInfo tInfo) throws SQLException
	{
		task_id             = tInfo.getTask_id();
		db_user             = rs.getString("uname").trim();
		db_password         = rs.getString("password").trim();
		url                 = rs.getString("url");
		                    
		db_table            = tInfo.getDb_table();
		db_columns          = tInfo.getDb_columns();
		where               = tInfo.getWhere_express();
		                    
		is_hive_import      = tInfo.getIs_hive_import();
		hive_table          = tInfo.getHive_table();
		hive_partition_key  = tInfo.getHive_partition_key();
		mappers             = tInfo.getMappers().toString();
		split_by            = tInfo.getSplit_by();
	}
	
	/** Joins the generated option array into a single command line, mainly for logging. */
	public String toString(String[] args) {
		StringBuilder stringBuilder = new StringBuilder();
		for (String a : args) {
			stringBuilder.append(a);
			stringBuilder.append(" ");
		}
		return stringBuilder.toString();
	}
	
	/** Assembles the sqoop import command line from the task configuration. */
	public String[] createOptions() throws Exception
	{
		List<String> optSb = new ArrayList<String>();
		optSb.add("sqoop");
		optSb.add("import");
		optSb.add("-D");
		optSb.add("mapred.job.queue.name=pms");
		optSb.add("--connect");
		optSb.add(url);
		optSb.add("--username");
		optSb.add(db_user);
		optSb.add("--password");
		optSb.add(EncrypterFactory.getEncryperInstance(1).decode(db_password));
		if (mappers != null && ! "".equals(mappers)) 
		{  
			optSb.add("-m");
			optSb.add(mappers);
		}
		if (split_by != null && !"".equals(split_by)) {
			optSb.add("--split-by");
			optSb.add(split_by.toUpperCase());
		}
		
		optSb.add("--null-string");
		optSb.add("''");
		
		optSb.add("--table");
		optSb.add(db_table);
		// The following task-table fields are optional and must be checked before use
		if(db_columns != null && ! "".equals(db_columns)){
			optSb.add("--columns");
			optSb.add("\""+db_columns.toUpperCase()+"\"");
		}
		if (where != null && ! "".equals(where)) {
			optSb.add("--where");
			where = where.replaceAll("\\$TXDATE", "'"+count_date+"'") ;
			optSb.add("\""+where.trim()+"\"");
		}
		
		if (is_hive_import != null && ! "".equals(is_hive_import)
				&& "Y".equalsIgnoreCase(is_hive_import))
		{
			optSb.add("--hive-overwrite");  // 统一为覆盖模式,要求hive 表必须存在
			optSb.add("--hive-import");
			optSb.add("--hive-drop-import-delims");
			
			if (hive_table == null || "".equals(hive_table)) {
				log.info("Error: hive_table must be set 当--hive-import时 !");
			}else {
				optSb.add("--hive-table");
				optSb.add(hive_table.trim());
			}
			if (hive_partition_key != null && !"".equals(hive_partition_key)) {
				optSb.add("--hive-partition-key");
				optSb.add(hive_partition_key.trim());
				optSb.add("--hive-partition-value");
				optSb.add("\""+count_date.trim()+"\"");
			}
		}
		optSb.add("--null-string");
		optSb.add("'\\\\N'");
		optSb.add("--null-non-string");
		optSb.add("'\\\\N'");
		return optSb.toArray(new String[0]);
	}
	
}

In many cases a table has to be synchronized incrementally every day, so the where condition must carry a date: the $TXDATE placeholder in where_express is replaced with count_date (yesterday by default, or alter_date when it is set), as shown below.
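
For example, createOptions() expands the placeholder like this (the column name and date format here are illustrative assumptions, not values from the original configuration):

// Illustrative only: how the $TXDATE placeholder in where_express is expanded.
public class WhereExpressDemo {
	public static void main(String[] args) {
		String where = "UPDATE_TIME >= TO_DATE($TXDATE,'YYYY-MM-DD')"; // assumed where_express value
		String count_date = "2014-09-01";                              // yesterday, or alter_date if set
		where = where.replaceAll("\\$TXDATE", "'" + count_date + "'");
		System.out.println(where);
		// prints: UPDATE_TIME >= TO_DATE('2014-09-01','YYYY-MM-DD')
		// the sqoop command then carries: --where "UPDATE_TIME >= TO_DATE('2014-09-01','YYYY-MM-DD')"
	}
}

Running the task once a day with this kind of condition gives the daily incremental load.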

Reposted from xiangjinqi.iteye.com/blog/2119636