Apache POI 用eventmodel 读取大文件Excel (4) 示例代码

前提条件

使用场景:
1,大文件Excel,比如100MB,几十万行
2,仅限于.xlsx格式的Excel
3,要求低内存消耗

日期:2020.12.07
版本:JDK1.8

<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-ooxml</artifactId>
	<version>4.1.2</version>
</dependency>

第一步,参考一下官方文档的示例代码

相信在看到这篇博客之前,你已经了解了POI官网对于Event API的代码示例:
https://poi.apache.org/components/spreadsheet/how-to.html#xssf_sax_api

import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.util.XMLHelper;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException;
/**
 * Minimal example of reading an .xlsx workbook with the XSSF SAX (event) API.
 *
 * <p>Each cell is printed as {@code <cell reference> - <value>}. Values of cells whose
 * type attribute is {@code "s"} are resolved through the workbook's shared strings table;
 * all other values are printed exactly as stored in the sheet XML.
 */
public class ExampleEventUserModel {

    /**
     * Parses the sheet whose relationship id is {@code rId2} and prints its cells.
     *
     * @param filename path of the .xlsx file to read
     * @throws Exception if the package cannot be opened or the sheet cannot be parsed
     */
    public void processOneSheet(String filename) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            XMLReader parser = fetchSheetParser(sst);
            // To look up the Sheet Name / Sheet Order / rID,
            //  you need to process the core Workbook stream.
            // Normally it's of the form rId# or rSheet#
            try (InputStream sheet2 = r.getSheet("rId2")) {
                parser.parse(new InputSource(sheet2));
            }
        } finally {
            // The package is only read here; revert() releases the file
            // without attempting to save anything back to it.
            pkg.revert();
        }
    }

    /**
     * Parses every sheet of the workbook in turn and prints its cells.
     *
     * @param filename path of the .xlsx file to read
     * @throws Exception if the package cannot be opened or a sheet cannot be parsed
     */
    public void processAllSheets(String filename) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            XMLReader parser = fetchSheetParser(sst);
            Iterator<InputStream> sheets = r.getSheetsData();
            while (sheets.hasNext()) {
                System.out.println("Processing new sheet:\n");
                // Close each sheet stream even when parsing fails.
                try (InputStream sheet = sheets.next()) {
                    parser.parse(new InputSource(sheet));
                }
                System.out.println("");
            }
        } finally {
            // Read-only usage: release the file without saving.
            pkg.revert();
        }
    }

    /**
     * Creates an XML reader whose content handler is a new {@link SheetHandler}.
     *
     * @param sst shared strings table used to resolve string cells
     * @return the configured reader
     * @throws SAXException                 if the reader cannot be created
     * @throws ParserConfigurationException if the parser cannot be configured
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException, ParserConfigurationException {
        XMLReader parser = XMLHelper.newXMLReader();
        ContentHandler handler = new SheetHandler(sst);
        parser.setContentHandler(handler);
        return parser;
    }

    /**
     * Prints every cell of a sheet.
     * See org.xml.sax.helpers.DefaultHandler javadocs
     */
    private static class SheetHandler extends DefaultHandler {

        private final SharedStringsTable sst;
        // Text collected by characters() since the last start tag.
        private final StringBuilder lastContents = new StringBuilder();
        // True while the current cell's <v> holds an index into the shared strings table.
        private boolean nextIsString;

        private SheetHandler(SharedStringsTable sst) {
            this.sst = sst;
        }

        @Override
        public void startElement(String uri, String localName, String name,
                                 Attributes attributes) throws SAXException {
            // c => cell
            if (name.equals("c")) {
                // Print the cell reference
                System.out.print(attributes.getValue("r") + " - ");
                // Figure out if the value is an index in the SST
                nextIsString = "s".equals(attributes.getValue("t"));
            }
            // Clear contents cache
            lastContents.setLength(0);
        }

        @Override
        public void endElement(String uri, String localName, String name)
                throws SAXException {
            // v => contents of a cell
            // Resolve and output here (not in characters()), because characters()
            // may be called more than once for a single value. The shared-string
            // lookup is tied to </v> so that a string cell without a <v> child
            // does not try to parse an empty index.
            if (name.equals("v")) {
                String value = lastContents.toString();
                if (nextIsString) {
                    int idx = Integer.parseInt(value);
                    value = sst.getItemAt(idx).getString();
                    nextIsString = false;
                }
                System.out.println(value);
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            lastContents.append(ch, start, length);
        }
    }

    /**
     * Runs both demos against the file named by the first argument.
     *
     * @param args command line arguments; {@code args[0]} is the .xlsx path
     * @throws Exception if reading or parsing fails
     */
    public static void main(String[] args) throws Exception {
        ExampleEventUserModel example = new ExampleEventUserModel();
        example.processOneSheet(args[0]);
        example.processAllSheets(args[0]);
    }
}

第二步,改吧改吧变成自己的代码

如下代码处理每一个Element的数据,并将其拼接成为SQL
很抱歉,暂时为了保密,只能展示部分代码,希望这些代码能对你有所帮助。

SheetHandler

package com.airde.handler;
/**
 * Author : Airde
 * Date: 2020/12/7 14:36
 */

import com.airde.dto.Constants;
import com.airde.dto.DBInfo;
import com.airde.pojo.Column;
import com.airde.pojo.JDBC;
import com.airde.util.JdbcUtil;
import com.airde.util.SqlUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.*;

/**
 * @ClassName XlsxHandler
 * @Description 采用SAX方式依次处理每一个element
 * 使用map限定元素数量,避免VALUES长度不一致的问题。
 * @Author airde
 * @Date 2020/12/7 14:36
 * @Version 1.0
 **/
@Slf4j
public class SheetHandler extends DefaultHandler {
    
    

    //共享区
    private SharedStringsTable sharedStringsTable;
    private String nowCellColumnNo;

    //内容区
    private String lastContents;
    private boolean nextIsSharedString;


    //拼接SQL
    private StringBuilder insertSql;

    private String nowRowNum;


    //通用属性区
    private JDBC jdbc;
    private DBInfo dbInfo;
    private String insertSqlHead;


    //计数器
    private int valuesNum;
    private int commitNum;


    private int valuesNumCount = 0;
    private int commitNumCount = 0;


    private int dataStartNum;
    private int dataStopNum;
    private int skipNumCount = 0;

    //拼接区

    private List<String> columnNoList = new ArrayList<>();
    private Map<String, String> valuesMap = new HashMap<>();

    //对ColumnList的每一个Column,都产生对应的值
    //在每一个columnNo都生成对应的值之后,检查每一对值是否为空
    //如果出现为空的值,那就填补为空字符串,最后再进行拼接(endEle遇到row进行整理拼接)

    public void initValuesMap() {
    
    
        for (String columnNo : columnNoList) {
    
    
            valuesMap.put(columnNo, "");
        }
    }


    @Override
    public void startDocument() throws SAXException {
    
    
        super.startDocument();
        //初始化
        this.insertSqlHead = SqlUtil.generateInsertSqlHead(dbInfo);

        insertSql = new StringBuilder("");
        insertSql.append(insertSqlHead);

        initValuesMap();

        JdbcUtil.openFullConnection(jdbc);

    }

    @Override
    public void endDocument() throws SAXException {
    
    
        //防止没到指定次数的剩余记录未提交,只要不为Null或空字符串,就执行提交
        if (insertSql.indexOf(",") != -1) {
    
    
            if (insertSql != null && !"".equals(insertSql.toString())) {
    
    
                //最后一次拼接与执行
                insertSql.delete(insertSql.lastIndexOf(","), insertSql.length());
                insertSql.append(";");
                //执行
                try {
    
    
                    JdbcUtil.executeByFullConnection(insertSql.toString());
                } catch (Exception e) {
    
    
                    e.printStackTrace();
                    log.error("execute sql error!" + nowRowNum);
                }

            }
        }


        //用完连接就关上
        JdbcUtil.commitFullConnection();

        JdbcUtil.closeFullConnection();
        super.endDocument();
    }

    public SheetHandler(SharedStringsTable sst, DBInfo dbInfo, JDBC jdbc, List<Column> columnList, String dataStartNum, String dataStopNum) {
    
    
        //默认一条insert 包含50个记录
        this.valuesNum = 50;

        //默认每1000个insert commit一次
        this.commitNum = 1000;

        //默认起始数据行
        this.dataStartNum = Integer.parseInt(dataStartNum) - 1;
        this.dataStopNum = Integer.parseInt(dataStopNum) - 2;


        for (Column column : columnList) {
    
    
            columnNoList.add(column.getNo());
        }

        this.dbInfo = dbInfo;
        this.jdbc = jdbc;
        this.sharedStringsTable = sst;
    }


    /**
     * 遇到一个元素之前,进行什么动作
     *
     * @param uri        XML命名空间标识符
     * @param localName  不带前缀的元素名
     * @param qName      当前Element的元素名
     * @param attributes Cell里对应的属性列表
     * @throws SAXException SAX解析异常
     */
    @Override
    public void startElement(String uri, String localName, String qName,
                             Attributes attributes) throws SAXException {
    
    

        if ("row".equals(qName)) {
    
    
            nowRowNum = attributes.getValue("r");
            //新的row对应新的一条插入语句

        } else if ("c".equals(qName)) {
    
    
            // c => cell
            String cellType = attributes.getValue("t");


            nowCellColumnNo = attributes.getValue("r");
            //去掉结尾的数字
            nowCellColumnNo = nowCellColumnNo.substring(0, nowCellColumnNo.lastIndexOf(nowRowNum));

            //下一个元素是否为sharedString?
            nextIsSharedString = "s".equals(cellType);

        }
        // Clear contents cache
        lastContents = "";
    }

    /**
     * 遇到一个元素之后,进行什么动作
     *
     * @param uri       XML命名空间标识符
     * @param localName 不带前缀的元素名
     * @param qName     当前Element的元素名
     * @throws SAXException SAX解析异常
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
    
    



        if ("row".equals(qName)) {
    
    
            //如果当前已积累skipNumCount不等于skipNum,说明当前读取的行还不是数据行,那就不拼接
            if (skipNumCount < dataStartNum || skipNumCount >= dataStopNum) {
    
    
                //log.error("skip this row!");
                //每个row完了,就+1
                skipNumCount += 1;

            }else {
    
    
                //map形式拼接
                StringBuilder singleMapSql = new StringBuilder("");
                singleMapSql.append("(");
                for (String columnNo : columnNoList) {
    
    
                    if ("-1".equals(columnNo)){
    
    
                        singleMapSql.append("'',");
                    }else {
    
    
                        singleMapSql.append("'");
                        singleMapSql.append(valuesMap.get(columnNo));
                        singleMapSql.append("',");
                    }

                }

                //删除多出来的逗号,闭合
                singleMapSql.delete(singleMapSql.lastIndexOf(","), singleMapSql.length());
                singleMapSql.append("),");


                //每处理完一行,就要初始化map
                initValuesMap();

                //拼接完毕,就把它摁到InsertSql里
                insertSql.append(singleMapSql);


                //每一行结束,都得把值+1
                commitNumCount += 1;
                valuesNumCount += 1;
            }

            //如果达到了对应执行的阈值,就进行执行处理
            if (valuesNumCount == valuesNum) {
    
    
                insertSql.delete(insertSql.lastIndexOf(","), insertSql.length());
                insertSql.append(";");


                //执行
                try {
    
    
                    //log.info(insertSql.toString());
                    JdbcUtil.executeByFullConnection(insertSql.toString());
                } catch (Exception e) {
    
    
                    e.printStackTrace();
                    log.error("execute sql error" + nowRowNum);
                }


                //执行完初始化insertSql
                insertSql = new StringBuilder("");
                insertSql.append(insertSqlHead);

                //初始化valuesNumCount
                valuesNumCount = 0;
            }
            //如果达到了对应commit的阈值,就进行commit处理
            if (commitNumCount == commitNum) {
    
    
                JdbcUtil.commitFullConnection();
                //执行完初始化commitNumCount
                commitNumCount = 0;
            }

        } else {
    
    
            // TODO 这里否则之后默认为cell的情况?

            //处理SharedString的情况,可能也可以换成
            //if(Constants.CELL_TYPE_SHAREDSTRING.equals(nowCellType)){
    
    
            //}
            if (nextIsSharedString) {
    
    
                int idx = Integer.parseInt(lastContents);
                lastContents = sharedStringsTable.getItemAt(idx).getString();
                nextIsSharedString = false;
            }

            //如果nowCellNo不在columnNoList里,则不拼接这个属性,否则才拼接这个属性

            boolean ifInColumnNoList = false;
            for (String columnNo : columnNoList) {
    
    
                if (nowCellColumnNo != null && nowCellColumnNo.equals(columnNo)) {
    
    
                    ifInColumnNoList = true;
                    break;
                }
            }
            // v => contents of a cell
            // Output after we've seen the string contents
            // 拼接SQL
            if ("v".equals(qName)) {
    
    
                if (ifInColumnNoList) {
    
    
                    valuesMap.put(nowCellColumnNo, lastContents);
                    //同时检查put进去的是否为Null,不为null就行为null就替换为""空字符串
                    if (valuesMap.get(nowCellColumnNo) == null) {
    
    
                        valuesMap.put(nowCellColumnNo, "");
                    }
                }

            }


        }


    }

    @Override
    public void characters(char[] ch, int start, int length) {
    
    
        lastContents += new String(ch, start, length);
    }



}

SheetHandlerWrapper

package com.airde.handler;
/**
 * Author : Airde
 * Date: 2020/12/9 9:52
 */

import com.airde.pojo.Task;
import com.airde.util.SqlUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import javax.xml.parsers.ParserConfigurationException;
import java.io.InputStream;

/**
 * @ClassName SheetHandlerWrapper
 * @Description 用于触发sheethandler的执行
 * @Author airde
 * @Date 2020/12/9 9:52
 * @Version 1.0
 **/
@Slf4j
public class SheetHandlerWrapper {
    
    


    public static void generateInsertSqlAndExecute(Task task){
    
    
        try {
    
    
            if ("false".equals(task.getIsFtp() )){
    
    
                OPCPackage opcPackage = OPCPackage.open(task.getSrcPath());
                XSSFReader xssfReader = new XSSFReader( opcPackage );
                SharedStringsTable sharedStringsTable = xssfReader.getSharedStringsTable();
                XMLReader xmlReader = getXmlReader(sharedStringsTable, task);

                //虽然现在必须使用rId1,rId2这种形式,但是可以通过workbookdata根据sheetname查找sheetid,
                InputStream sheet = xssfReader.getSheet("rId"+task.getSheet().getSheetId());
                InputSource sheetSource = new InputSource(sheet);
                //智能产生插入SQL并执行
                xmlReader.parse(sheetSource);
                sheet.close();

            }else {
    
    
                log.error("This is a ftp srcPath ,this version can't handle it, skip this Task");
                return;
            }

        }catch (Exception e){
    
    
            e.printStackTrace();
            log.error("GenerateInsertSqlAndExecute ERROR!");
        }

    }


    public static XMLReader getXmlReader(SharedStringsTable sst,Task task) throws SAXException, ParserConfigurationException {
    
    
        try {
    
    
            XMLReader xmlReader = XMLReaderFactory.createXMLReader();
            ContentHandler handler = new SheetHandler(sst,SqlUtil.getDbInfoFromTask(task),task.getJdbc(),task.getColumnList(),task.getSheet().getDataLine(),task.getSheet().getDataEndLine());
            xmlReader.setContentHandler(handler);
            return xmlReader;
        }catch (Exception e){
    
    
            e.printStackTrace();
            log.error("The function getXmlReader Error,it's weired to find this problem.");
        }
       return null;
    }

}

结语

此代码来源于我开发的一个小工具jar包,项目名称为ExcelToDB,顾名思义,从各种文件系统读取Excel提取数据插入到各种数据库。

特点:轻量级jar包(小jar包灵活性强,而不是kettle那么笨重),配置好后一键运行,可搭载到Job调度中一键调用。
内存消耗小,速度快,灵活性强;编写代码自动化配置即可实现一键导入,否则需要简单的手工配置。

目前版本v1.0.3,暂未开源,功能完善优化中

读取几十万行的.xlsx文件,并将数据转换为标准SQL并执行的一个小jar包,打包后17MB左右,优化后可以更小,可配置日志记录内容,可写到多种目标数据库,从多种文件系统读取Excel,只需要简单的XML配置即可。

转换并执行的速度:
实际测试中,8172行,8MB,每行60个字段的Excel里的真实业务数据,8700ms即可导入到数据库,初版未优化,预计优化后速度可以更快。

有兴趣了解的,或者有类似业务场景的请在评论区回复,如果这个需求很普遍,那它值得成为开源项目,防止重复造轮子。

猜你喜欢

转载自blog.csdn.net/weixin_42072754/article/details/110815855