前提条件
使用场景:
1,大文件Excel,比如100MB,几十万行
2,仅限于.xlsx格式的Excel
3,要求低内存消耗
日期:2020.12.07
版本:JDK1.8
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
第一步,参考一下官方文档的示例代码
相信在看到这篇博客之前,你已经了解了POI官网对于Event API的代码示例:
https://poi.apache.org/components/spreadsheet/how-to.html#xssf_sax_api
import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.util.XMLHelper;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException;
/**
 * Minimal example of reading an .xlsx workbook with the Apache POI XSSF
 * SAX (event) API: every cell reference and cell value is printed to stdout.
 */
public class ExampleEventUserModel {

    /**
     * Parses the sheet whose relationship id is {@code rId2} and prints its cells.
     *
     * @param filename path of the .xlsx file to read
     * @throws Exception if the package cannot be opened or the sheet cannot be parsed
     */
    public void processOneSheet(String filename) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            XMLReader parser = fetchSheetParser(sst);

            // To look up the Sheet Name / Sheet Order / rID,
            // you need to process the core Workbook stream.
            // Normally it's of the form rId# or rSheet#
            try (InputStream sheet2 = r.getSheet("rId2")) {
                parser.parse(new InputSource(sheet2));
            }
        } finally {
            // The package is only read here; revert() releases it without
            // writing anything back to the source file.
            pkg.revert();
        }
    }

    /**
     * Parses every sheet of the workbook in turn and prints its cells.
     *
     * @param filename path of the .xlsx file to read
     * @throws Exception if the package cannot be opened or a sheet cannot be parsed
     */
    public void processAllSheets(String filename) throws Exception {
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            XMLReader parser = fetchSheetParser(sst);

            Iterator<InputStream> sheets = r.getSheetsData();
            while (sheets.hasNext()) {
                System.out.println("Processing new sheet:\n");
                // try-with-resources closes the stream even when parsing fails
                try (InputStream sheet = sheets.next()) {
                    parser.parse(new InputSource(sheet));
                }
                System.out.println("");
            }
        } finally {
            pkg.revert();
        }
    }

    /**
     * Creates an XML reader whose content handler is a {@link SheetHandler}.
     *
     * @param sst shared strings table used to resolve cells of type {@code s}
     * @return a reader ready to parse a sheet stream
     * @throws SAXException if the reader cannot be created
     * @throws ParserConfigurationException if the parser cannot be configured
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException, ParserConfigurationException {
        XMLReader parser = XMLHelper.newXMLReader();
        ContentHandler handler = new SheetHandler(sst);
        parser.setContentHandler(handler);
        return parser;
    }

    /**
     * Prints "cellRef - value" for every cell that carries a {@code <v>} element.
     * See org.xml.sax.helpers.DefaultHandler javadocs
     */
    private static class SheetHandler extends DefaultHandler {
        private final SharedStringsTable sst;
        // Text collected for the current element; characters() may be called more than once.
        private final StringBuilder lastContents = new StringBuilder();
        // True while the current cell's <v> holds an index into the shared strings table.
        private boolean nextIsString;

        private SheetHandler(SharedStringsTable sst) {
            this.sst = sst;
        }

        @Override
        public void startElement(String uri, String localName, String name,
                                 Attributes attributes) throws SAXException {
            // c => cell
            if (name.equals("c")) {
                // Print the cell reference
                System.out.print(attributes.getValue("r") + " - ");
                // Figure out if the value is an index in the SST
                nextIsString = "s".equals(attributes.getValue("t"));
            }
            // Clear contents cache
            lastContents.setLength(0);
        }

        @Override
        public void endElement(String uri, String localName, String name)
                throws SAXException {
            // v => contents of a cell
            if (name.equals("v")) {
                String value = lastContents.toString();
                // Resolve the shared string only here, where the collected text
                // is known to be the index; doing it on every end tag would try
                // to parse an empty string for a cell that has no <v> child.
                if (nextIsString) {
                    int idx = Integer.parseInt(value);
                    value = sst.getItemAt(idx).getString();
                    nextIsString = false;
                }
                System.out.println(value);
            } else if (name.equals("c")) {
                // Cell finished: never let the flag leak into the next cell.
                nextIsString = false;
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            lastContents.append(ch, start, length);
        }
    }

    /**
     * Runs both processing variants on the file named by the first argument.
     *
     * @param args command line; {@code args[0]} is the path of the .xlsx file
     * @throws Exception if the file cannot be processed
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: ExampleEventUserModel <file.xlsx>");
            return;
        }
        ExampleEventUserModel example = new ExampleEventUserModel();
        example.processOneSheet(args[0]);
        example.processAllSheets(args[0]);
    }
}
第二步,改巴改巴变成自己的代码
如下代码处理每一个Element的数据,并将其拼接成为SQL
很抱歉,暂时为了保密,只能展示部分代码,希望这些代码能对你有所帮助。
SheetHandler
package com.airde.handler;
/**
* Author : Airde
* Date: 2020/12/7 14:36
*/
import com.airde.dto.Constants;
import com.airde.dto.DBInfo;
import com.airde.pojo.Column;
import com.airde.pojo.JDBC;
import com.airde.util.JdbcUtil;
import com.airde.util.SqlUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.util.*;
/**
* @ClassName XlsxHandler
* @Description 采用SAX方式依次处理每一个element
* 使用map限定元素数量,避免VALUES长度不一致的问题。
* @Author airde
* @Date 2020/12/7 14:36
* @Version 1.0
**/
@Slf4j
public class SheetHandler extends DefaultHandler {
//共享区
private SharedStringsTable sharedStringsTable;
private String nowCellColumnNo;
//内容区
private String lastContents;
private boolean nextIsSharedString;
//拼接SQL
private StringBuilder insertSql;
private String nowRowNum;
//通用属性区
private JDBC jdbc;
private DBInfo dbInfo;
private String insertSqlHead;
//计数器
private int valuesNum;
private int commitNum;
private int valuesNumCount = 0;
private int commitNumCount = 0;
private int dataStartNum;
private int dataStopNum;
private int skipNumCount = 0;
//拼接区
private List<String> columnNoList = new ArrayList<>();
private Map<String, String> valuesMap = new HashMap<>();
//对ColumnList的每一个Column,都产生对应的值
//在每一个columnNo都生成对应的值之后,检查每一对值是否为空
//如果出现为空的值,那就填补为空字符串,最后再进行拼接(endEle遇到row进行整理拼接)
public void initValuesMap() {
for (String columnNo : columnNoList) {
valuesMap.put(columnNo, "");
}
}
@Override
public void startDocument() throws SAXException {
super.startDocument();
//初始化
this.insertSqlHead = SqlUtil.generateInsertSqlHead(dbInfo);
insertSql = new StringBuilder("");
insertSql.append(insertSqlHead);
initValuesMap();
JdbcUtil.openFullConnection(jdbc);
}
@Override
public void endDocument() throws SAXException {
//防止没到指定次数的剩余记录未提交,只要不为Null或空字符串,就执行提交
if (insertSql.indexOf(",") != -1) {
if (insertSql != null && !"".equals(insertSql.toString())) {
//最后一次拼接与执行
insertSql.delete(insertSql.lastIndexOf(","), insertSql.length());
insertSql.append(";");
//执行
try {
JdbcUtil.executeByFullConnection(insertSql.toString());
} catch (Exception e) {
e.printStackTrace();
log.error("execute sql error!" + nowRowNum);
}
}
}
//用完连接就关上
JdbcUtil.commitFullConnection();
JdbcUtil.closeFullConnection();
super.endDocument();
}
public SheetHandler(SharedStringsTable sst, DBInfo dbInfo, JDBC jdbc, List<Column> columnList, String dataStartNum, String dataStopNum) {
//默认一条insert 包含50个记录
this.valuesNum = 50;
//默认每1000个insert commit一次
this.commitNum = 1000;
//默认起始数据行
this.dataStartNum = Integer.parseInt(dataStartNum) - 1;
this.dataStopNum = Integer.parseInt(dataStopNum) - 2;
for (Column column : columnList) {
columnNoList.add(column.getNo());
}
this.dbInfo = dbInfo;
this.jdbc = jdbc;
this.sharedStringsTable = sst;
}
/**
* 遇到一个元素之前,进行什么动作
*
* @param uri XML命名空间标识符
* @param localName 不带前缀的元素名
* @param qName 当前Element的元素名
* @param attributes Cell里对应的属性列表
* @throws SAXException SAX解析异常
*/
@Override
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
if ("row".equals(qName)) {
nowRowNum = attributes.getValue("r");
//新的row对应新的一条插入语句
} else if ("c".equals(qName)) {
// c => cell
String cellType = attributes.getValue("t");
nowCellColumnNo = attributes.getValue("r");
//去掉结尾的数字
nowCellColumnNo = nowCellColumnNo.substring(0, nowCellColumnNo.lastIndexOf(nowRowNum));
//下一个元素是否为sharedString?
nextIsSharedString = "s".equals(cellType);
}
// Clear contents cache
lastContents = "";
}
/**
* 遇到一个元素之后,进行什么动作
*
* @param uri XML命名空间标识符
* @param localName 不带前缀的元素名
* @param qName 当前Element的元素名
* @throws SAXException SAX解析异常
*/
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if ("row".equals(qName)) {
//如果当前已积累skipNumCount不等于skipNum,说明当前读取的行还不是数据行,那就不拼接
if (skipNumCount < dataStartNum || skipNumCount >= dataStopNum) {
//log.error("skip this row!");
//每个row完了,就+1
skipNumCount += 1;
}else {
//map形式拼接
StringBuilder singleMapSql = new StringBuilder("");
singleMapSql.append("(");
for (String columnNo : columnNoList) {
if ("-1".equals(columnNo)){
singleMapSql.append("'',");
}else {
singleMapSql.append("'");
singleMapSql.append(valuesMap.get(columnNo));
singleMapSql.append("',");
}
}
//删除多出来的逗号,闭合
singleMapSql.delete(singleMapSql.lastIndexOf(","), singleMapSql.length());
singleMapSql.append("),");
//每处理完一行,就要初始化map
initValuesMap();
//拼接完毕,就把它摁到InsertSql里
insertSql.append(singleMapSql);
//每一行结束,都得把值+1
commitNumCount += 1;
valuesNumCount += 1;
}
//如果达到了对应执行的阈值,就进行执行处理
if (valuesNumCount == valuesNum) {
insertSql.delete(insertSql.lastIndexOf(","), insertSql.length());
insertSql.append(";");
//执行
try {
//log.info(insertSql.toString());
JdbcUtil.executeByFullConnection(insertSql.toString());
} catch (Exception e) {
e.printStackTrace();
log.error("execute sql error" + nowRowNum);
}
//执行完初始化insertSql
insertSql = new StringBuilder("");
insertSql.append(insertSqlHead);
//初始化valuesNumCount
valuesNumCount = 0;
}
//如果达到了对应commit的阈值,就进行commit处理
if (commitNumCount == commitNum) {
JdbcUtil.commitFullConnection();
//执行完初始化commitNumCount
commitNumCount = 0;
}
} else {
// TODO 这里否则之后默认为cell的情况?
//处理SharedString的情况,可能也可以换成
//if(Constants.CELL_TYPE_SHAREDSTRING.equals(nowCellType)){
//}
if (nextIsSharedString) {
int idx = Integer.parseInt(lastContents);
lastContents = sharedStringsTable.getItemAt(idx).getString();
nextIsSharedString = false;
}
//如果nowCellNo不在columnNoList里,则不拼接这个属性,否则才拼接这个属性
boolean ifInColumnNoList = false;
for (String columnNo : columnNoList) {
if (nowCellColumnNo != null && nowCellColumnNo.equals(columnNo)) {
ifInColumnNoList = true;
break;
}
}
// v => contents of a cell
// Output after we've seen the string contents
// 拼接SQL
if ("v".equals(qName)) {
if (ifInColumnNoList) {
valuesMap.put(nowCellColumnNo, lastContents);
//同时检查put进去的是否为Null,不为null就行为null就替换为""空字符串
if (valuesMap.get(nowCellColumnNo) == null) {
valuesMap.put(nowCellColumnNo, "");
}
}
}
}
}
@Override
public void characters(char[] ch, int start, int length) {
lastContents += new String(ch, start, length);
}
}
SheetHandlerWrapper
package com.airde.handler;
/**
* Author : Airde
* Date: 2020/12/9 9:52
*/
import com.airde.pojo.Task;
import com.airde.util.SqlUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.InputStream;
/**
* @ClassName SheetHandlerWrapper
* @Description 用于触发sheethandler的执行
* @Author airde
* @Date 2020/12/9 9:52
* @Version 1.0
**/
@Slf4j
public class SheetHandlerWrapper {
public static void generateInsertSqlAndExecute(Task task){
try {
if ("false".equals(task.getIsFtp() )){
OPCPackage opcPackage = OPCPackage.open(task.getSrcPath());
XSSFReader xssfReader = new XSSFReader( opcPackage );
SharedStringsTable sharedStringsTable = xssfReader.getSharedStringsTable();
XMLReader xmlReader = getXmlReader(sharedStringsTable, task);
//虽然现在必须使用rId1,rId2这种形式,但是可以通过workbookdata根据sheetname查找sheetid,
InputStream sheet = xssfReader.getSheet("rId"+task.getSheet().getSheetId());
InputSource sheetSource = new InputSource(sheet);
//智能产生插入SQL并执行
xmlReader.parse(sheetSource);
sheet.close();
}else {
log.error("This is a ftp srcPath ,this version can't handle it, skip this Task");
return;
}
}catch (Exception e){
e.printStackTrace();
log.error("GenerateInsertSqlAndExecute ERROR!");
}
}
public static XMLReader getXmlReader(SharedStringsTable sst,Task task) throws SAXException, ParserConfigurationException {
try {
XMLReader xmlReader = XMLReaderFactory.createXMLReader();
ContentHandler handler = new SheetHandler(sst,SqlUtil.getDbInfoFromTask(task),task.getJdbc(),task.getColumnList(),task.getSheet().getDataLine(),task.getSheet().getDataEndLine());
xmlReader.setContentHandler(handler);
return xmlReader;
}catch (Exception e){
e.printStackTrace();
log.error("The function getXmlReader Error,it's weired to find this problem.");
}
return null;
}
}
结语
此代码来源于我开发的一个小工具jar包,项目名称为ExcelToDB,顾名思义,从各种文件系统读取Excel提取数据插入到各种数据库。
特点:轻量级jar包(小jar包灵活性强,而不是kettle那么笨重),配置好后一键运行,可搭载到Job调度中一键调用。
内存消耗小,速度快,灵活性强,编写代码自动化配置即可实现一键导入,否则需要简单的手工配置。
目前版本v1.0.3,暂未开源,功能完善优化中
读取几十万行的.xlsx文件,并将数据转换为标准SQL并执行的一个小jar包,打包后17MB左右,优化后可以更小,可配置日志记录内容,可写到多种目标数据库,从多种文件系统读取Excel,只需要简单的XML配置即可。
转换并执行的速度:
实际测试中,8172行,8MB,每行60个字段的Excel里的真实业务数据,8700ms即可导入到数据库,初版未优化,预计优化后速度可以更快。
有兴趣了解的,或者有类似业务场景的请在评论区回复,如果这个需求很普遍,那它值得成为开源项目,防止重复造轮子。