一、周末没事记录下本周遇到的一个问题:需要把一个很大的xlsx文件数据写入hbase,文件数据60W+。
1、一开始用xls方式处理,发现不行,xls格式的文件最多只能存放65536行。
2、然后使用SXSSFWorkbook方式,发现这个类只能用来写出很大的xlsx文件,没有读的功能;而且在本地测试的时候报内存溢出。文档里说这个类会把临时数据写入本地磁盘,但多次修改测试后还是报内存溢出。
3、最后在apache官网找到了解决方法:
官网地址:大xlsx文件或者xml文件读取
4、做简单的修改后,写入hbase只用不到一分钟的时间。当然需要引入apache的poi-ooxml跟rt.jar这两个包,下面的代码拿来就可以用,xlsx文件第一行需要是列名:
package com.xlsx.utils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.function.Consumer;
/**
 * Streams the rows of a large .xlsx file through a SAX parser and hands every
 * data row to a caller-supplied {@link Consumer}, so the workbook never has to
 * be loaded into memory as a whole.
 *
 * <p>The first {@code <row>} element encountered is treated as the header row:
 * its cell values become the keys of the maps delivered for all following rows.
 * Each data row is delivered exactly once, when its {@code </row>} tag is
 * reached, as a fresh {@code Map} of header name to cell text. Cells that are
 * absent from a row are absent from its map; rows that contain no cells are
 * not delivered.
 */
public class ReadBigDataXlsxFile {
    /** Callback invoked once per data row. */
    private final Consumer<Map<String, String>> consumer;
    /** Path of the .xlsx file to read. */
    private final String fileName;
    /** Not read anywhere in this class. */
    private boolean useTextReadFile;

    /**
     * @param fileName path of the .xlsx file to read
     * @param consumer callback that receives one header-name to cell-text map per data row
     */
    public ReadBigDataXlsxFile(String fileName, Consumer<Map<String, String>> consumer) {
        this.fileName = fileName;
        this.consumer = consumer;
    }

    /**
     * Parses a single sheet and feeds its data rows to the consumer.
     *
     * @param sheetId identifier handed to {@code XSSFReader.getSheet}
     *                (presumably the sheet's relationship id such as "rId1" - confirm against the POI docs)
     * @throws OpenXML4JException if the package cannot be opened or read
     * @throws IOException        if reading the file fails
     * @throws SAXException       if the sheet XML cannot be parsed
     */
    public void processOneSheet(String sheetId) throws OpenXML4JException, IOException, SAXException {
        OPCPackage pkg = OPCPackage.open(fileName);
        try {
            XSSFReader reader = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(reader.getSharedStringsTable());
            try (InputStream sheet = reader.getSheet(sheetId)) {
                parser.parse(new InputSource(sheet));
            }
        } finally {
            // revert() releases the package without writing anything back to the file.
            pkg.revert();
        }
    }

    /**
     * Parses every sheet of the workbook in order and feeds the data rows to the consumer.
     * One handler is shared by all sheets, so the header is taken from the first row of the
     * first sheet only; the first row of any later sheet is delivered as a data row.
     *
     * @throws OpenXML4JException if the package cannot be opened or read
     * @throws IOException        if reading the file fails
     * @throws SAXException       if a sheet's XML cannot be parsed
     */
    public void processAllSheets() throws OpenXML4JException, IOException, SAXException {
        OPCPackage pkg = OPCPackage.open(fileName);
        try {
            XSSFReader reader = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(reader.getSharedStringsTable());
            Iterator<InputStream> sheets = reader.getSheetsData();
            while (sheets.hasNext()) {
                try (InputStream sheet = sheets.next()) {
                    parser.parse(new InputSource(sheet));
                }
            }
        } finally {
            // revert() releases the package without writing anything back to the file.
            pkg.revert();
        }
    }

    /**
     * Creates an XML reader whose content handler converts sheet XML into row maps
     * and passes them to this instance's consumer.
     *
     * @param sst shared-strings table used to resolve string-typed cells
     * @return a reader ready to parse one or more sheet streams
     * @throws SAXException if no XML reader can be created
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader xmlReader = XMLReaderFactory.createXMLReader();
        xmlReader.setContentHandler(new SheetHandler(sst, consumer));
        return xmlReader;
    }

    /**
     * SAX handler that turns {@code <row>}/{@code <c>}/{@code <v>} events into one map per row.
     */
    private static class SheetHandler extends DefaultHandler {
        /** Placeholder stored for a string-typed cell that carries no value. */
        private static final String EMPTY_STRING_CELL = " ";

        private final SharedStringsTable sst;
        /** Per-row processing logic supplied by the caller. */
        private final Consumer<Map<String, String>> consumer;
        /** Column letters -> header name, filled from the first row. */
        private final Map<String, String> tableHead = new HashMap<>();
        /** Text collected inside the current {@code <v>} element. */
        private final StringBuilder valueText = new StringBuilder();

        /** Header name -> cell text for the row currently being parsed. */
        private Map<String, String> rowData = new HashMap<>();
        /** Number of {@code <row>} elements seen so far; 1 means the header row. */
        private int rowNum = 0;
        /** Column letters of the current cell, e.g. "A" or "AB". */
        private String key;
        private boolean cellIsSharedString;
        private boolean cellHasValue;
        private boolean insideValue;

        private SheetHandler(SharedStringsTable sst, Consumer<Map<String, String>> consumer) {
            this.sst = sst;
            this.consumer = consumer;
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes attributes)
                throws SAXException {
            if ("row".equals(name)) {
                rowNum++;
                // A fresh map per row keeps values of earlier rows from leaking into this one
                // and lets the consumer keep a reference to the map it was given.
                rowData = new HashMap<>();
            } else if ("c".equals(name)) {
                key = columnOf(attributes.getValue("r"));
                cellIsSharedString = "s".equals(attributes.getValue("t"));
                cellHasValue = false;
                valueText.setLength(0);
            } else if ("v".equals(name)) {
                valueText.setLength(0);
                insideValue = true;
            }
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if ("v".equals(name)) {
                insideValue = false;
                cellHasValue = true;
                String value = resolveValue(valueText.toString());
                if (rowNum == 1) {
                    tableHead.put(key, value);
                } else {
                    rowData.put(headerFor(key), value);
                }
            } else if ("c".equals(name)) {
                // A string-typed data cell without any <v> child is reported as a single space.
                if (cellIsSharedString && !cellHasValue && rowNum > 1) {
                    rowData.put(headerFor(key), EMPTY_STRING_CELL);
                }
            } else if ("row".equals(name)) {
                // Deliver on </row> so that the last row of the sheet is not lost.
                if (rowNum > 1 && !rowData.isEmpty()) {
                    consumer.accept(rowData);
                }
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // The parser may split one text node into several calls, so append rather than overwrite.
            if (insideValue) {
                valueText.append(ch, start, length);
            }
        }

        /**
         * Converts the raw text of a {@code <v>} element into the cell's value: a shared-string
         * index is looked up in the table, any other text is returned unchanged.
         */
        private String resolveValue(String raw) {
            if (!cellIsSharedString) {
                return raw;
            }
            if (raw.isEmpty()) {
                return EMPTY_STRING_CELL;
            }
            int index = Integer.parseInt(raw);
            // NOTE(review): getT() presumably yields only the plain-text form of an entry;
            // confirm how rich-text entries behave with the POI version in use.
            return sst.getItems().get(index).getT();
        }

        /** Returns the header name for a column, falling back to the column letters when the header row had none. */
        private String headerFor(String column) {
            String header = tableHead.get(column);
            return header != null ? header : column;
        }

        /** Extracts the column letters from a cell reference, e.g. "AB12" -> "AB". */
        private String columnOf(String cellRef) throws SAXException {
            if (cellRef == null) {
                throw new SAXException("Cell without an 'r' reference in row " + rowNum);
            }
            int end = 0;
            while (end < cellRef.length() && !Character.isDigit(cellRef.charAt(end))) {
                end++;
            }
            return cellRef.substring(0, end);
        }
    }
}
使用:new ReadBigDataXlsxFile(fileName, e -> {}).processAllSheets();