// Copyright notice: original article by a CSDN blogger; reproduction requires the author's permission. https://blog.csdn.net/qq_29726869/article/details/82844849
package com.zx.utils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.*;
/**
 * Utility for extracting zip archives that are stored on HDFS, writing the
 * decompressed entries back to HDFS.
 *
 * <p>NOTE(review): connects to a hard-coded cluster ({@code hdfs://master:9000/})
 * as user {@code root}; both are deployment-specific and should eventually be
 * made configurable.
 */
public class HdfsUnZip {

    /** Base HDFS URI against which all paths in this utility are resolved. */
    private static final String BASE_PATH = "hdfs://master:9000/";

    /** Copy-buffer size used when streaming decompressed entries to HDFS (10 MB). */
    private static final int COPY_BUFFER_SIZE = 10 * 1024 * 1024;

    /**
     * Opens the {@link FileSystem} for {@link #BASE_PATH}.
     *
     * @param conf Hadoop configuration to use
     * @return the HDFS file system handle (caller manages its lifetime)
     * @throws IOException if the file system cannot be reached
     */
    private static FileSystem openFileSystem(Configuration conf) throws IOException {
        // Impersonate "root"; without this the process user may lack HDFS permissions.
        System.setProperty("HADOOP_USER_NAME", "root");
        return new Path(BASE_PATH).getFileSystem(conf);
    }

    /**
     * Extracts a zip archive stored on HDFS into {@link #BASE_PATH}.
     *
     * <p>The archive is first buffered in memory so that a proprietary "XX-"
     * header block (non-zip metadata prepended to the file) can be stripped via
     * {@link #offXXData(byte[])} before unzipping.
     *
     * @param hdfsFilePath full HDFS path of the zip file to extract
     * @throws IOException if the HDFS file system cannot be opened
     */
    public static void loadZipFileData(String hdfsFilePath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = openFileSystem(conf);
        System.out.println(hdfsFilePath);
        try (FSDataInputStream fsInputStream = fs.open(new Path(hdfsFilePath))) {
            // Buffer the whole archive so the "XX-" header can be removed.
            byte[] data = offXXData(toByteArray(fsInputStream));
            unzipToHdfs(fs, new ByteArrayInputStream(data), BASE_PATH);
        } catch (Exception e) {
            // Preserves the original lenient contract: log and swallow.
            e.printStackTrace();
        }
    }

    /**
     * Extracts a zip archive stored on HDFS directly from the stream (no header
     * stripping) into {@code BASE_PATH + "process/"}.
     *
     * <p>NOTE(review): despite its name this method UNzips an archive; it does
     * not create one. The name is kept for caller compatibility.
     *
     * @param hdfsFilePath full HDFS path of the zip file to extract
     * @throws IOException if the HDFS file system cannot be opened
     */
    public static void zipHdfsFile(String hdfsFilePath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = openFileSystem(conf);
        System.out.println(hdfsFilePath);
        try (FSDataInputStream fsInputStream = fs.open(new Path(hdfsFilePath))) {
            unzipToHdfs(fs, fsInputStream, BASE_PATH + "process/");
        } catch (Exception e) {
            // Preserves the original lenient contract: log and swallow.
            e.printStackTrace();
        }
    }

    /**
     * Streams every regular-file entry of {@code source} (a zip stream) into an
     * HDFS file under {@code targetBase}. Directory entries are logged and
     * skipped. Shared by {@link #loadZipFileData} and {@link #zipHdfsFile},
     * which previously duplicated this loop.
     *
     * @param source raw zip bytes; closed by this method
     * @param targetBase HDFS path prefix the entry names are appended to
     * @throws IOException on read or write failure
     */
    private static void unzipToHdfs(FileSystem fs, InputStream source, String targetBase)
            throws IOException {
        try (ZipArchiveInputStream zipInputStream = new ZipArchiveInputStream(source)) {
            ZipArchiveEntry zipEntry;
            while ((zipEntry = zipInputStream.getNextZipEntry()) != null) {
                String entryName = zipEntry.getName();
                System.out.println("fileName:" + entryName);
                System.out.println(targetBase + entryName);
                if (zipEntry.isDirectory()) {
                    System.out.println("is Direcotry");
                    continue;
                }
                System.out.println("is file");
                // SECURITY(review): entryName is used verbatim; a crafted archive
                // containing "../" components could write outside targetBase
                // (Zip Slip). Validate entry names if archives are untrusted.
                //
                // try-with-resources fixes the original leak where only the LAST
                // per-entry output stream was ever closed.
                try (FSDataOutputStream mergerout = fs.create(new Path(targetBase + entryName))) {
                    byte[] buffer = new byte[COPY_BUFFER_SIZE];
                    int nNumber;
                    while ((nNumber = zipInputStream.read(buffer, 0, COPY_BUFFER_SIZE)) != -1) {
                        mergerout.write(buffer, 0, nNumber);
                    }
                    mergerout.flush();
                }
            }
        }
    }

    public static void main(String[] args) throws IOException {
        zipHdfsFile("hdfs://master:9000/input.zip");
        loadZipFileData("hdfs://master:9000/input/2_74_1517985979_002_00_805690.zip");
    }

    /**
     * Strips the leading "XX-" metadata lines (a proprietary non-zip header)
     * from the raw archive bytes so the remainder is a valid zip stream.
     *
     * <p>If the very first line does not start with {@code "XX-"} the input is
     * returned unchanged.
     *
     * <p>NOTE(review): relies on project class {@code CustomLineInputStream};
     * its {@code readLineBytes(List)} appears to return the decoded line while
     * appending the line's raw bytes to the list — confirm against that class.
     *
     * @param data raw file bytes, possibly prefixed with "XX-" header lines
     * @return the bytes with the header removed, or {@code data} unchanged
     * @throws IOException if reading the buffered stream fails
     */
    public static byte[] offXXData(byte[] data) throws IOException {
        try (ByteArrayOutputStream out = new ByteArrayOutputStream();
             CustomLineInputStream lis = new CustomLineInputStream(new ByteArrayInputStream(data))) {
            String line;
            List<Byte> bytes = new ArrayList<Byte>();
            boolean isFirstLine = true;
            while ((line = lis.readLineBytes(bytes)) != null) {
                if (isFirstLine && (!line.startsWith("XX-"))) {
                    return data; // no header present — nothing to strip
                } else {
                    isFirstLine = false;
                }
                if (!line.startsWith("XX-")) {
                    break; // first payload line; its raw bytes remain in 'bytes'
                }
                bytes.clear(); // header line — discard its bytes
            }
            // Replay the first payload line, dropping any leading CR/LF left over
            // from the last header line's terminator.
            int flag = 0;
            for (int j = 0; j < bytes.size(); j++) {
                byte b = bytes.get(j);
                if ((b == '\n' || b == '\r') && flag == 0) {
                    continue;
                } else {
                    flag++;
                }
                out.write(bytes.get(j));
            }
            // Copy the remainder of the stream verbatim.
            byte[] bs = new byte[1024];
            int i;
            while ((i = lis.read(bs)) != -1) {
                out.write(bs, 0, i);
            }
            out.flush();
            return out.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            throw e;
        }
    }

    /**
     * Reads a local file fully into a byte array.
     *
     * @param file local file to read
     * @return the file contents, or {@code null} if the file cannot be read
     *         (errors are printed — preserves the original lenient contract)
     */
    public static byte[] File2byte(File file) {
        // try-with-resources fixes the original leak where the streams were
        // never closed if read() threw.
        try (FileInputStream fis = new FileInputStream(file);
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            byte[] b = new byte[1024];
            int n;
            while ((n = fis.read(b)) != -1) {
                bos.write(b, 0, n);
            }
            return bos.toByteArray();
        } catch (IOException e) {
            // FileNotFoundException is an IOException; both original catch
            // blocks only printed the trace, so one handler suffices.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Drains an input stream into a byte array. The stream is not closed.
     *
     * @param input stream to drain
     * @return every remaining byte of the stream
     * @throws IOException if reading fails
     */
    public static byte[] toByteArray(InputStream input) throws IOException {
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        byte[] buffer = new byte[4096];
        int n;
        while ((n = input.read(buffer)) != -1) {
            output.write(buffer, 0, n);
        }
        return output.toByteArray();
    }
}