JAVA读取百万数据的xlsx大文件

一、周末没事记录下本周遇到的一个问题:需要把一个很大的xlsx文件数据写入hbase,文件数据60W+。
1、一开始用xls方式处理,发现不行,xls格式的文件最多只能有65536行。
2、然后使用SXSSFWorkbook方式,发现这个只能写或者操作很大的xlsx文件,没有读的功能,而且在本地测试的时候,报内存溢出,看文档这个类会把临时数据写入本地磁盘,多次修改测试还是报内存溢出。
3、最后在apache官网找到了解决方法:
官网地址:大xlsx文件或者xml文件读取
4、做简单的修改后,写入hbase只用不到一分钟的时间。当然需要引入apache的poi-ooxml跟rj.jar这个包,下面的代码拿来就可以用,xlsx文件第一行需要是列名:

package com.xlsx.utils;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.function.Consumer;

public class ReadBigDataXlsxFile {
    
    

    private Consumer<Map<String,String>> consumer;

    private String fileName;

    private boolean useTextReadFile;

    public ReadBigDataXlsxFile(String fileName,Consumer<Map<String,String>> consumer){
    
    
        this.fileName = fileName;
        this.consumer = consumer;
    }

    public void processOneSheet(String sheetId) throws OpenXML4JException, IOException, SAXException {
    
    
        OPCPackage open = OPCPackage.open(fileName);
        XSSFReader xssfReader = new XSSFReader(open);
        SharedStringsTable sharedStringsTable = xssfReader.getSharedStringsTable();
        XMLReader  parser = fetchSheetParser(sharedStringsTable);
        InputStream sheet = xssfReader.getSheet(sheetId);
        InputSource inputStream = new InputSource(sheet);
        parser.parse(inputStream);
        sheet.close();

    }
    
    public void processAllSheets() throws OpenXML4JException, IOException, SAXException {
    
    
        OPCPackage open = OPCPackage.open(fileName);
        XSSFReader xssfReader = new XSSFReader(open);
        SharedStringsTable sharedStringsTable = xssfReader.getSharedStringsTable();
        XMLReader  parser = fetchSheetParser(sharedStringsTable);
        Iterator<InputStream> sheetsData = xssfReader.getSheetsData();
        while (sheetsData.hasNext()){
    
    
            InputStream next = sheetsData.next();
            InputSource inputSource = new InputSource(next);
            parser.parse(inputSource);
            next.close();
        }
    }

    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
    
    
        XMLReader xmlReader = XMLReaderFactory.createXMLReader();
        SheetHandler sheetHandler = new SheetHandler(sst, consumer);
        xmlReader.setContentHandler(sheetHandler);
        return xmlReader;

    }
    
    
    
    private static class SheetHandler extends DefaultHandler{
    
    
        
        private SharedStringsTable sst;
        
        private String lastConetents;
        
        private boolean nextIsString;

        //存储xlsx文件的第一行作为字段名
        private Map<String, String> tableHead = new HashMap<>();
        
        private Map<String,String> rowData = new HashMap<>();
        
        //记录行,如果是第一行,就写入tableHead,作为数据key
        private int rowNum = 0;
        
        private String key,value;
        
        //每行数据的处理逻辑
        private Consumer<Map<String,String>> consumer;
        
        private SheetHandler(SharedStringsTable sst,Consumer<Map<String,String>> consumer){
    
    
            this.sst = sst;
            this.consumer = consumer;
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
    
    
            if(name.equals("row")){
    
    
                rowNum ++;
                consumer.accept(rowData);
            }
            if(name.equals("c")){
    
    
                key = attributes.getValue("r").substring(0,1);
                String cellType = attributes.getValue("t");
                if(cellType != null && cellType.equals("s")){
    
    
                    nextIsString = true;
                }else{
    
    
                    nextIsString = false;
                }
                lastConetents = "";
            }
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
    
    
            if(nextIsString){
    
    
                if(lastConetents == ""){
    
    //处理表格列值为空的情况
                    lastConetents = " ";
                }else{
    
    
                    int indx = Integer.parseInt(lastConetents);
                    lastConetents = sst.getItems().get(indx).getT();
                }
                nextIsString = false;
            }
            if(name.equals("v")){
    
    
                if(rowNum == 1){
    
    
                    tableHead.put(key,lastConetents);
                }else{
    
    
                    rowData.put(tableHead.get(key),lastConetents);
                }
            }
            if(name.equals("c") && lastConetents.equals(" ")){
    
    
                rowData.put(tableHead.get(key),lastConetents);
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
    
    
            lastConetents = new String(ch,start,length);
        }
    }

}

使用:new ReadBigDataXlsxFile(fileName, e -> {}).processAllSheets();

猜你喜欢

转载自blog.csdn.net/u013326684/article/details/123289476