java excel import and export of large data volume


java excel import and export of large data volume
 

 Large amount of data import and export, pro test can solve the import and export of millions of excel!

 

 basis:

 The maximum length of each sheet of the 03 version (xls) excel file is 65535

 07 version (xlsx) excel file maximum length of each sheet is 1.04 million

 

 After version 07, the underlying file is xml, you can change the xlsx suffix to rar and open it with decompression software to view the source file

 

 Import:

 Required jar package: 3
 
poi-ooxml-3.12-beta1-sources.jar
poi-3.12-beta1-sources.jar

 <!-- Jar required for analyzing excel with large data volume -->

 <dependency>

 <groupId>org.eclipse.birt.runtime.3_7_1</groupId>

 <artifactId>org.apache.xerces</artifactId>

 <version>2.9.0</version>

 </dependency>
xerces-2.7.1.jar
 

 Tool code:
BigDataParseExcel.java
 

 package com.winit.sms.common.excel;

 import java.io.InputStream;

 import java.sql.SQLException;

 import java.util.ArrayList;

 import java.util.Iterator;

 import java.util.List;

 import org.apache.poi.xssf.eventusermodel.XSSFReader;

 import org.apache.poi.xssf.model.SharedStringsTable;

 import org.apache.poi.xssf.usermodel.XSSFRichTextString;

 import org.apache.poi.openxml4j.opc.OPCPackage;

 import org.xml.sax.Attributes;

 import org.xml.sax.InputSource;

 import org.xml.sax.SAXException;

 import org.xml.sax.XMLReader;

 import org.xml.sax.helpers.DefaultHandler;

 import org.xml.sax.helpers.XMLReaderFactory;

 /**

 * XSSF and SAX (Event API)

 */

 public abstract class BigDataParseExcel extends DefaultHandler {

 private SharedStringsTable sst;

 private String lastContents;

 private boolean nextIsString;

 private int sheetIndex = -1;

 private List<String> rowlist = new ArrayList<String>();

 private int curRow = 0; //当前行

 private int curCol = 0; //当前列索引

 private int preCol = 0; //上一列列索引

 private int titleRow = 0; //标题行,一般情况下为0

 private int rowsize = 0; //列数

 //excel记录行操作方法,以sheet索引,行索引和行元素列表为参数,对sheet的一行元素进行操作,元素为String类型

 public abstract void optRows(int sheetIndex,int curRow, List<String> rowlist) throws SQLException;

 //只遍历一个sheet,其中sheetId为要遍历的sheet索引,从1开始,1-3

 /**

 *

 * @param filename

 * @param sheetId sheetId为要遍历的sheet索引,从1开始,1-3

 * @throws Exception

 */

 public void processOneSheet(String filename,int sheetId) throws Exception {

 OPCPackage pkg = OPCPackage.open(filename);

 XSSFReader r = new XSSFReader(pkg);

 SharedStringsTable sst = r.getSharedStringsTable();

 XMLReader parser = fetchSheetParser(sst);

 // rId2 found by processing the Workbook

 // 根据 rId# 或 rSheet# 查找sheet

 InputStream sheet2 = r.getSheet("rId"+sheetId);

 sheetIndex++;

 InputSource sheetSource = new InputSource(sheet2);

 parser.parse(sheetSource);

 sheet2.close();

 }

 /**

 * 遍历 excel 文件

 */

 public void process(String filename) throws Exception {

 OPCPackage pkg = OPCPackage.open(filename);

 XSSFReader r = new XSSFReader(pkg);

 SharedStringsTable sst = r.getSharedStringsTable();

 XMLReader parser = fetchSheetParser(sst);

 Iterator<InputStream> sheets = r.getSheetsData();

 while (sheets.hasNext()) {

 curRow = 0;

 sheetIndex++;

 InputStream sheet = sheets.next();

 InputSource sheetSource = new InputSource(sheet);

 parser.parse(sheetSource);

 sheet.close();

 }

 }

 public XMLReader fetchSheetParser(SharedStringsTable sst)

 throws SAXException {

 XMLReader parser = XMLReaderFactory.createXMLReader();

 //.createXMLReader("org.apache.xerces.parsers.SAXParser");

 this.sst = sst;

 parser.setContentHandler(this);

 return parser;

 }

 public void startElement(String uri, String localName, String name,

 Attributes attributes) throws SAXException {

 // c => 单元格

 if (name.equals("c")) {

 // 如果下一个元素是 SST 的索引,则将nextIsString标记为true

 String cellType = attributes.getValue("t");

 String rowStr = attributes.getValue("r");

 curCol = this.getRowIndex(rowStr);

 if (cellType != null && cellType.equals("s")) {

 nextIsString = true;

 } else {

 nextIsString = false;

 }

 }

 // 置空

 lastContents = "";

 }

 public void endElement(String uri, String localName, String name)

 throws SAXException {

 // 根据SST的索引值的到单元格的真正要存储的字符串

 // 这时characters()方法可能会被调用多次

 if (nextIsString) {

 try {

 int idx = Integer.parseInt(lastContents);

 lastContents = new XSSFRichTextString(sst.getEntryAt(idx))

 .toString();

 } catch (Exception e) {

 }

 }

 // v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引

 // 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符

 if (name.equals("v")) {

 String value = lastContents.trim();

 value = value.equals("")?" ":value;

 int cols = curCol-preCol;

 if (cols>1){

 for (int i = 0;i < cols-1;i++){

 rowlist.add(preCol,"");

 }

 }

 preCol = curCol;

 rowlist.add(curCol-1, value);

 }else {

 //如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法

 if (name.equals("row")) {

 int tmpCols = rowlist.size();

 if(curRow>this.titleRow && tmpCols<this.rowsize){

 for (int i = 0;i < this.rowsize-tmpCols;i++){

 rowlist.add(rowlist.size(), "");

 }

 }

 try {

 optRows(sheetIndex,curRow,rowlist);

 } catch (SQLException e) {

 e.printStackTrace();

 }

 if(curRow==this.titleRow){

 this.rowsize = rowlist.size();

 }

 rowlist.clear();

 curRow++;

 curCol = 0;

 preCol = 0;

 }

 }

 }

 public void characters(char[] ch, int start, int length)

 throws SAXException {

 //得到单元格内容的值

 lastContents += new String(ch, start, length);

 }

 //得到列索引,每一列c元素的r属性构成为字母加数字的形式,字母组合为列索引,数字组合为行索引,

 //如AB45,表示为第(A-A+1)*26+(B-A+1)*26列,45行

 public int getRowIndex(String rowStr){

 rowStr = rowStr.replaceAll("[^A-Z]", "");

 byte[] rowAbc = rowStr.getBytes();

 int len = rowAbc.length;

 float num = 0;

 for (int i=0;i<len;i++){

 num += (rowAbc[i]-'A'+1)*Math.pow(26,len-i-1 );

 }

 return (int) num;

 }

 public int getTitleRow() {

 return titleRow;

 }

 public void setTitleRow(int titleRow) {

 this.titleRow = titleRow;

 }

 //如何使用

 public static void main(String[] args) throws Exception {

 long start = System.currentTimeMillis();

 BigDataParseExcel xlx = new BigDataParseExcel(){


 @Override

 public void optRows(int sheetIndex, int curRow, List<String> rowlist) throws SQLException {

 System.out.println(rowlist);

 }

 };

 xlx.process("C:\\Users\\winitadmin\\导出123.xlsx");

 long end = System.currentTimeMillis();

 System.out.println((end-start)/1000);

 }

 }

  Parse out a row of data as a list: Among them, pay special attention to 2 points:

  1. Because it is parsed line by line based on xml, blank rows or blank columns that have been manipulated may also be parsed

  2. Pay special attention to the date format. The excel-recognizable date will be parsed into numbers (may contain decimals). The meaning of this number represents the number of days from 1900-01-01 to this date. The decimal part is the proportion of hours, minutes and seconds.
 

  The method of processing the date format separately: 

/**

  * 此方法处理excel 的日期格式, sax解析出来的日期为 1900-01-01 到此日期的天数

  * 故而要转换, 此方法暂时精确的天, 如若要更精确,请对datum这个数的小数部分做处理

  *

  * 据经验来看,日期格式有两种, 一种是excel可识别的日期格式(这种用上面的两行方法解决);

  * 另一种是20170215,excel识别为字符串,用simpleDataFormate解析

  *

  * @param datum

  * @return

  * @throws IllegalAccessException

  * @throws IllegalArgumentException

  */

  private Date dealXmlExcelDate(String datum){

  if(StringUtils.isBlank(datum)){

  return new Date();

  };

  Double doubleDay = Double.parseDouble(datum);

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

  Date date = new Date();

  try {

  date = sdf.parse("1900-01-01");

  } catch (ParseException e) {

  e.printStackTrace();

  }

  Calendar ca = Calendar.getInstance();

  ca.setTime(date);

  //42934 7 -20 //39668 8 8

  ca.add(Calendar.DAY_OF_YEAR, doubleDay.intValue());

  ca.add(Calendar.DAY_OF_YEAR, -2);

  return ca.getTime();

  }

 pay attention:

 Because of the core method of loading Excel in the tool class OPCPackage pkg = OPCPackage.open(filename);

 There are two overload methods: OPCPackage.open(file);//file

 OPCPackage.open(fileInputStream);//File stream

 The latter relies heavily on memory, so in actual applications, if you can only get the file stream, it is recommended to copy the file to the local through the stream first, and then use the parsing tool class

 

 A few suggestions:

  1. Because it is a large amount of data analysis, if the parsed electronic data needs to be inserted into the database, it is not recommended to parse out one and insert one. You can parse multiple and insert them in batches. The recommended size is 500~1000! (Can be asynchronous or not)

  2. If you want to ensure the management of things inserted in this big data excel, please use the batch number for management, that is, the design table is added with a batch number, and every excel has a batch number! If the parsing fails or the insertion fails in the middle, delete it by batch number!

  3. It takes about 25 minutes to complete the analysis and insertion of 800,000 data (depending on the actual number of columns). It is recommended that the analysis function displays the status on the foreground page or a progress bar to improve the user experience!

 

 Export:

 The idea is as follows:

  1. First find out total,

  2. According to the total pagination query, while preventing the total from being greater than 1.04 million, it is also necessary to generate excel by sheet
 
 
 

  When exporting big data, please use the SXSSFWorkbook class:

  An example is as follows: 

 Thread export2 = new Thread() {
 
 

  public void run() throws SPIException {

  logger.info("导出报表开始....");

  long start_time = System.currentTimeMillis();

  // 2.新增成功,获取报表id

  final int statementId = saveStatement(command, statementCommand);

  OutputStream outStream = null;

  Workbook wb = null;

  try {

  CreatExcel2007.copyFile(templateFilePath, tmpFilePath);

  if (StringUtils.isNotBlank(entity.getReconciliationResult())) {

  entity.setResult(entity.getReconciliationResult().replace("(", "").replace(")", "").split(","));

  }

  int total =reconciliationDao.getTotal(entity);//得到导出总数

  logger.info("导出总数:"+total);

  int batch = 500;//每次批处理的条数

  int end = 1;

  wb = new SXSSFWorkbook(10000);//通过限制内存中可访问的记录行数来实现其低内存利用,当达到限定值时,新一行数据的加入会引起老一行的数据刷新到硬盘

  Map<Long, CellStyle> styles = getXSSFCellStyles(wb);

  int headRowNum = 2; // 表格头行数

  int sh= 0;

  int max = 1000000;//设置一个sheet最多存储一百万

  // 6.1 生成明细sheet

  while(total>0){//分sheet处理,一个sheet最多存储104万,超过再生成sheet

  Sheet sheet = wb.createSheet();/sheet

  Sheet sheet2 = wb.createSheet();//明细sheet

  wb.setSheetName(sh, "sh"+sh);

  wb.setSheetName(sh+1, "sh明细"+sh);

  sh += 2;//为了控制sheet名称不一样

  createISPHead(sheet.createRow(0), sheet.createRow(1),sheet);//组装表头

  createOrderLineHead(sheet2.createRow(0));//组装明细sheet

  sheet.setForceFormulaRecalculation(true);// 在打开工作簿时是否要求Excel重新计算此工作表上的所有公式。

  sheet2.setForceFormulaRecalculation(true);

  for(int s=batch; s<=max; s+=batch){//分批查询,一次性查询太多,内存无法存下

  logger.info("正在处理"+s+"条");

  if(s>total&& s>batch&&s>(total+batch)){//第一次一定要遍历,除第一次之外的, 后面如果比total大了,就结束循环

  break;

  }

  entity.setStart(end);

  end = end +batch;

  entity.setEnd(end);

  List<Map<String, Object>> listMap = reconciliationDao.queryPageEntities(entity);

  if (listMap.size() > 0) {

  List<String> trackList = new ArrayList<String>();

  //组装Sheet

  buildISPrecSheet(listMap, sheet, s, batch, headRowNum, trackList, styles);

  //组装明细Sheet

  long begin = System.currentTimeMillis();

  buildOrderLineSheet(trackList, command, batch, sheet2, s, styles);

  long endOf = System.currentTimeMillis();

  logger.info("用时:"+(begin-endOf)/1000);

  }

  }

  total -= max;

  }

  // 将数据写入到excel文件

  outStream = new FileOutputStream(tmpFilePath);

  logger.info("生成excel开始>>>>>");

  wb.write(outStream);

  ((SXSSFWorkbook) wb).dispose();

  long end_time = System.currentTimeMillis();

  logger.info("导出结束, 用时:"+(end_time-start_time)/1000); logger.info("生成excel结束>>>>>");

  logger.info("生成excel结束>>>>>");
 
 

  updateStatement(statementId, Constants.SUCCESS);

  } catch (IOException e1) {

  logger.error("导出上传Excel到文件服务器失败:" , e1);

  updateStatement(statementId, Constants.FAIL);

  throw new Exception(ErrorCode.CREATE_EXCEL_IOEXCEPTION);

  } catch (Exception e) {

  logger.error("导出上传Excel到文件服务器失败:" , e);

  updateStatement(statementId, Constants.FAIL);

  throw new Exception(e.getErrorCode());

  } catch(Exception e){

  e.printStackTrace();

  } finally {

  try {

  if (null != wb) {

  wb.close();

  }

  if (null != outStream) {

  outStream.close();

  }

  } catch (IOException e) {

  updateStatement(statementId, Constants.FAIL);

  logger.error("导出上传Excel到文件服务器失败:" , e);

  }

  }

  }

  };
 
 

  export2.start();

 

Guess you like

Origin blog.csdn.net/yucaifu1989/article/details/110494562