解决poi解析过大文件导致内存溢出问题

解析 Excel 的时候,我们一般都会用 POI 提供的 usermodel API 来实现,使用起来也很方便。但是如果 Excel 文件较大、行数过多,就很可能造成内存溢出,导致解析失败。
本文主要针对这种情况,给出另外一种性能更好的方式:基于 XSSF 事件模型(SAX)逐行读取并处理,每次只在内存中保留当前行的数据。

import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Streaming reader for {@code .xlsx} workbooks built on POI's XSSF event API and SAX.
 *
 * <p>Sheet XML is consumed as SAX events. Only the cells of the row currently being read are
 * kept in an internal list; every completed, non-empty row is handed to
 * {@link #optRows(int, int, List)} and the list is then cleared for the next row.
 *
 * <p>Parsing state lives in unsynchronized instance fields, so a single instance must not be
 * used by several threads at the same time.
 */
public abstract class XxlsAbstractReader extends DefaultHandler {

    private final Log logger = LogFactory.getLog(getClass());

    /** Shared strings table of the workbook being parsed; set by {@link #fetchSheetParser}. */
    private SharedStringsTable sst;
    /** Character data collected since the most recent start tag. */
    private final StringBuilder lastContents = new StringBuilder();
    /** True while the current cell's {@code <v>} holds an index into the shared strings table. */
    private boolean nextIsString;

    /** Zero-based counter of sheets parsed by this instance; -1 before the first sheet. */
    private int sheetIndex = -1;
    /** Cell values of the row currently being read. */
    private final List<Object> rowlist = new ArrayList<Object>();
    /** Zero-based count of {@code <row>} elements completed so far in the current sheet. */
    private int curRow = 0;
    /** One-based column index of the cell currently being read. */
    private int curCol = 0;
    /** One-based column index of the last cell whose value was stored; 0 at row start. */
    private int preCol = 0;
    /** Index of the title row; normally 0. */
    private int titleRow = 0;
    /** Number of columns found in the title row. */
    private int rowsize = 0;

    /**
     * Callback invoked once for every non-empty row.
     *
     * @param sheetIndex zero-based index of the sheet being parsed
     * @param curRow     zero-based index of the row within the sheet
     * @param rowlist    the row's cell values as strings; the list is reused and cleared after
     *                   this method returns, so copy anything that must be retained
     * @throws SQLException if the implementation fails to persist the row; the exception is
     *                      logged by this class and parsing continues with the next row
     */
    public abstract void optRows(int sheetIndex, int curRow, List<Object> rowlist) throws SQLException;

    /**
     * Parses a single sheet of the given workbook file.
     *
     * @param filename path of the {@code .xlsx} file
     * @param sheetId  one-based sheet number; it is appended to {@code "rId"} to form the
     *                 relationship id passed to {@code XSSFReader.getSheet}.
     *                 NOTE(review): assumes the workbook's sheet relationship ids follow the
     *                 {@code rIdN} pattern in sheet order - confirm for your files
     * @throws Exception if the package cannot be opened or the sheet cannot be parsed
     */
    public void processOneSheet(String filename, int sheetId) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader r = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(r.getSharedStringsTable());
            try (InputStream sheet = r.getSheet("rId" + sheetId)) {
                // Start from a clean row state, exactly like process() does for each sheet.
                resetRowState();
                sheetIndex++;
                parser.parse(new InputSource(sheet));
            }
        } finally {
            // The package is only read: revert() releases it without writing anything back.
            pkg.revert();
        }
    }

    /**
     * Parses every sheet of the workbook stored at {@code filename}.
     *
     * @param filename path of the {@code .xlsx} file
     * @throws Exception if the package cannot be opened or a sheet cannot be parsed
     */
    public void process(String filename) throws Exception {
        parseAllSheets(OPCPackage.open(filename));
    }

    /**
     * Parses every sheet of the workbook read from {@code in}.
     *
     * @param in stream containing an {@code .xlsx} file
     * @throws Exception if the package cannot be opened or a sheet cannot be parsed
     */
    public void process(InputStream in) throws Exception {
        parseAllSheets(OPCPackage.open(in));
    }

    /** Parses all sheets of {@code pkg} in order and always releases the package afterwards. */
    private void parseAllSheets(OPCPackage pkg) throws Exception {
        try {
            XSSFReader r = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(r.getSharedStringsTable());
            Iterator<InputStream> sheets = r.getSheetsData();
            while (sheets.hasNext()) {
                try (InputStream sheet = sheets.next()) {
                    resetRowState();
                    sheetIndex++;
                    parser.parse(new InputSource(sheet));
                }
            }
        } finally {
            // Read-only use: revert() closes the package without saving it.
            pkg.revert();
        }
    }

    /** Clears all per-sheet parsing state so that a new sheet starts at row 0. */
    private void resetRowState() {
        curRow = 0;
        curCol = 0;
        preCol = 0;
        nextIsString = false;
        rowlist.clear();
        lastContents.setLength(0);
    }

    /**
     * Creates a SAX reader that delivers its events to this handler.
     *
     * @param sst shared strings table used to resolve string cells
     * @return the configured reader
     * @throws SAXException if no XML reader can be created
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        // Workbooks are typically uploaded by users: refuse DOCTYPEs and external entities.
        setFeatureQuietly(parser, "http://apache.org/xml/features/disallow-doctype-decl", true);
        setFeatureQuietly(parser, "http://xml.org/sax/features/external-general-entities", false);
        setFeatureQuietly(parser, "http://xml.org/sax/features/external-parameter-entities", false);
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    /** Sets a parser feature; a parser that does not know the feature is logged, not fatal. */
    private void setFeatureQuietly(XMLReader parser, String feature, boolean enabled) {
        try {
            parser.setFeature(feature, enabled);
        } catch (SAXException e) {
            logger.warn("XML parser does not support feature " + feature, e);
        }
    }

    /**
     * Records the column and type of each {@code <c>} (cell) element and resets the text buffer
     * for every start tag.
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        if ("c".equals(name)) {
            String cellRef = attributes.getValue("r");
            // Without an "r" attribute the cell is taken to follow the previous one.
            curCol = (cellRef != null) ? getRowIndex(cellRef) : curCol + 1;
            // t="s" means the cell's <v> holds an index into the shared strings table.
            nextIsString = "s".equals(attributes.getValue("t"));
        }
        lastContents.setLength(0);
    }

    /**
     * Stores a cell value when {@code </v>} is reached and dispatches the row when
     * {@code </row>} is reached.
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("v".equals(name)) {
            addCellValue(resolveCellText());
        } else if ("row".equals(name)) {
            finishRow();
        }
    }

    /** Returns the text of the current {@code <v>}, resolved through the shared strings table. */
    private String resolveCellText() {
        String raw = lastContents.toString();
        if (!nextIsString) {
            return raw;
        }
        // Resolve once per cell; later end tags must not retry the lookup.
        nextIsString = false;
        try {
            int idx = Integer.parseInt(raw);
            return new XSSFRichTextString(sst.getEntryAt(idx)).toString();
        } catch (RuntimeException e) {
            // Keep the raw text, as before, but leave a trace instead of hiding the problem.
            logger.warn("cannot resolve shared string index '" + raw + "'", e);
            return raw;
        }
    }

    /** Appends the trimmed cell text at its column, filling skipped columns with "". */
    private void addCellValue(String text) {
        String value = text.trim();
        if (value.isEmpty()) {
            value = " ";
        }
        // Empty cells are absent from the XML, so pad the gap since the last stored column.
        int gap = curCol - preCol;
        for (int i = 1; i < gap; i++) {
            rowlist.add(preCol, "");
        }
        preCol = curCol;
        rowlist.add(curCol - 1, value);
    }

    /** Pads the finished row to the title width, hands it to optRows and resets row state. */
    private void finishRow() {
        if (!rowlist.isEmpty()) {
            if (curRow > titleRow) {
                // Trailing empty cells are missing from the XML: pad up to the title width.
                while (rowlist.size() < rowsize) {
                    rowlist.add("");
                }
            }
            try {
                optRows(sheetIndex, curRow, rowlist);
            } catch (SQLException e) {
                logger.error("endElement error", e);
            }
        } else {
            // Rows without any value are skipped.
            logger.info("jump blank row ,curRow:" + (curRow + 1));
        }
        if (curRow == titleRow) {
            // The title row defines how many columns every later row is padded to.
            rowsize = rowlist.size();
            if (rowsize == 0) {
                throw new RuntimeException("excel 未被编辑或者内容为空");
            }
        }

        rowlist.clear();
        curRow++;
        curCol = 0;
        preCol = 0;
    }

    /** Accumulates character data; SAX may deliver one text node in several chunks. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        lastContents.append(ch, start, length);
    }

    /**
     * Converts the letter part of a cell reference into a one-based column index.
     *
     * <p>Despite its name, this method returns a column index. The upper-case letters are read
     * as a bijective base-26 number, e.g. {@code "A1" -> 1}, {@code "Z9" -> 26} and
     * {@code "AB45" -> 1 * 26 + 2 = 28}; every other character is ignored.
     *
     * @param rowStr cell reference such as {@code "AB45"}; may be {@code null}
     * @return the one-based column index, or 0 if the reference is {@code null} or contains no
     *         upper-case letter
     */
    public int getRowIndex(String rowStr) {
        if (rowStr == null) {
            return 0;
        }
        int index = 0;
        for (int i = 0; i < rowStr.length(); i++) {
            char ch = rowStr.charAt(i);
            if (ch >= 'A' && ch <= 'Z') {
                index = index * 26 + (ch - 'A' + 1);
            }
        }
        return index;
    }

    /** @return the index of the title row */
    public int getTitleRow() {
        return titleRow;
    }

    /** @param titleRow index of the title row, whose width later rows are padded to */
    public void setTitleRow(int titleRow) {
        this.titleRow = titleRow;
    }

    /** @return the zero-based index of the sheet parsed most recently, or -1 if none yet */
    public int getSheetIndex() {
        return sheetIndex;
    }

}

FileObjectReader 类继承 XxlsAbstractReader 类,实现 optRows() 方法,把每一行数据转换成 Temp 对象:

public class FileObjectReader extends XxlsAbstractReader {
    private List<Temp> voList = new ArrayList<>();
    public FileObjectReader() {
    }

    @Override
    public void optRows(int sheetIndex, int curRow, List<Object> rowlist) throws SQLException {
        if (curRow == 0) {
            return;
        }

        Temp vo = new Temp();
        vo.setField1(String.valueOf(rowlist.get(0));
        vo.setField2(rowlist.get(1);
        vo.setField3(rowlist.get(2);
        ……
        voList.add(vo);
    }

    public List<Temp> getVoList() {
        return voList;
    }
}

文件导入的入口方法,上传文件后调用:

/**
 * Entry point called after a file upload: parses the uploaded workbook row by row.
 *
 * @param inputStream stream containing the uploaded {@code .xlsx} file; it is not closed here
 * @throws IllegalStateException if the workbook cannot be read or parsed
 */
public void importFile(InputStream inputStream){
    FileObjectReader reader = new FileObjectReader();
    try {
        reader.process(inputStream);
    } catch (RuntimeException e) {
        throw e;
    } catch (Exception e) {
        // process() declares a checked Exception; keep the cause for diagnosis.
        throw new IllegalStateException("Failed to parse the uploaded Excel file", e);
    }

    // The objects built from the parsed rows; persist or post-process them from here.
    List<Temp> voList = reader.getVoList();
}

猜你喜欢

转载自blog.csdn.net/huangdi1309/article/details/80146303