前言
工作中通常会有XLS和XLSX文件转换成CSV文件的需求,虽然Excel和WPS可以将这两种格式文件很方便的转换,但是不可能将生产上的数据下载下来转换后再上传上去(这真是个疯狂的想法...)。目前网上的解决方案是通过在python脚本中导入pandas模块然后实现转换,虽然很便捷,但是在java程序中又显得格格不入。
2023/02/27 更新最新版官方方案:
提示:在测试官方的csv转换功能时,生成的CSV文件总会出现乱码,待跟踪......推荐有需要的程序员读取后自行写入csv文件。
0、pom.xml文件需引入依赖
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-examples</artifactId>
<version>5.2.3</version>
</dependency>
1、百万级数据文件读取
// 测试方法
import org.apache.poi.examples.hssf.eventusermodel.XLS2CSVmra;
import org.apache.poi.examples.xssf.eventusermodel.XLSX2CSV;
public class Test001 {
public static void main(String[] args) throws Exception {
// xlsx文件读取
Xlsx2Csv.main(new String[]{"D:\\通知消息记录浏览.xlsx"});
// xls文件读取
XLS2CSVmra.main(new String[]{"D:\\通知消息记录浏览.xls"});
}
}
------------------------------------------------时间分割线------------------------------------------------
旧版官方推荐方案正文代码:
0、pom.xml文件需引入依赖
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.2</version>
</dependency>
1、百万级数据Xlsx文件读取
import org.apache.poi.ooxml.util.SAXHelper;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
public class Xlsx2Csv {
private final OPCPackage xlsxPackage;
private final int minColumns;
private final PrintStream output;
private class SheetToCSV implements XSSFSheetXMLHandler.SheetContentsHandler {
private boolean firstCellOfRow = false;
private int currentRow = -1;
private int currentCol = -1;
private void outputMissingRows(int number) {
for (int i = 0; i < number; i++) {
for (int j = 0; j < minColumns; j++) {
output.append(',');
}
output.append('\n');
}
}
@Override
public void startRow(int rowNum) {
outputMissingRows(rowNum - currentRow - 1);
firstCellOfRow = true;
currentRow = rowNum;
currentCol = -1;
}
public void endRow(int rowNum) {
for (int i = currentCol; i < minColumns; i++) {
output.append(',');
}
output.append(',');
}
public void cell(String cellReference, String formattedValue, XSSFComment xssfComment) {
if (firstCellOfRow) {
firstCellOfRow = false;
} else {
output.append(',');
}
if (cellReference == null) {
cellReference = new CellAddress(currentRow, currentCol).formatAsString();
}
int thisCol = (new CellReference(cellReference)).getCol();
int missedCols = thisCol - currentCol - 1;
for (int i = 0; i < missedCols; i++) {
output.append(',');
}
currentCol = thisCol;
try {
Double.parseDouble(formattedValue);
output.append(formattedValue);
} catch (NumberFormatException e) {
output.append('"');
output.append(formattedValue);
output.append('"');
}
}
@Override
public void headerFooter(String s, boolean b, String s1) {
//CSV文件没有页眉页脚
}
}
public Xlsx2Csv(OPCPackage pkg, PrintStream output, int minColumns) {
this.xlsxPackage = pkg;
this.output = output;
this.minColumns = minColumns;
}
public void processSheet(StylesTable styles, ReadOnlySharedStringsTable readOnlyTableString, XSSFSheetXMLHandler.SheetContentsHandler sheetHandler, InputStream sheetInputStream) {
DataFormatter formatter = new DataFormatter();
InputSource sheetSource = new InputSource(sheetInputStream);
try {
XMLReader reader = SAXHelper.newXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(styles, readOnlyTableString, sheetHandler, formatter, false);
reader.setContentHandler(handler);
reader.parse(sheetSource);
} catch (SAXException | ParserConfigurationException | IOException e) {
e.printStackTrace();
}
}
public void process() throws IOException, SAXException, OpenXML4JException {
ReadOnlySharedStringsTable readOnlyTableString = new ReadOnlySharedStringsTable(this.xlsxPackage);
XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
while (iter.hasNext()) {
InputStream inputStream = iter.next();
processSheet(styles, readOnlyTableString, new SheetToCSV(), inputStream);
inputStream.close();
}
}
public static void main(String[] args) throws OpenXML4JException, SAXException, IOException {
File xlsxFile = new File(args[0]);
if (!xlsxFile.exists()) {
return;
}
int minColumns = -1;
OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
Xlsx2Csv xlsx2Csv = new Xlsx2Csv(p, System.out, minColumns);
xlsx2Csv.process();
p.close();
}
}
// 测试方法
public class Test001 {
public static void main(String[] args) throws OpenXML4JException, IOException, SAXException {
Xlsx2Csv.main(new String[]{"D:\\通知消息记录浏览.xls"});
}
}
2、百万级数据Xls文件读取
import org.apache.poi.examples.hssf.eventusermodel.XLS2CSVmra;
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
public class Xls2CsvMra implements HSSFListener {
private int minColumns;
private POIFSFileSystem fs;
private PrintStream output;
private int lastRowNumber;
private int lastColumnNumber;
// 是否输出表达式或者值
private boolean outputFormulaValues = false;
// 解析公式
private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
private HSSFWorkbook stubWorkbook;
// 处理过程中收集的记录
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
// 正在处理的sheet
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
private List<BoundSheetRecord> boundSheetRecords = new ArrayList<>();
// 处理带有字符串结果的公式
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
// 创建新的XLS到CSV的转化器
public Xls2CsvMra(POIFSFileSystem fs, PrintStream output, int minColumns) {
this.fs = fs;
this.output = output;
this.minColumns = minColumns;
}
// 创建新的XLS到CSV的转换器
public Xls2CsvMra(String filename, int minColumns) throws IOException {
this(
new POIFSFileSystem(new FileInputStream(filename)), System.out, minColumns
);
}
public Xls2CsvMra(String inputFilePath, String outputFilePath) throws IOException {
fs = new POIFSFileSystem(new FileInputStream(inputFilePath));
output = new PrintStream(outputFilePath);
minColumns = -1;
}
// 开始将XLS文件处理为CSV
public void process() throws IOException {
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
}
@Override
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add((BoundSheetRecord) record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
// 创建子工作簿
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
sheetIndex++;
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
output.println();
output.println(orderedBSRs[sheetIndex].getSheetname() + " [" + (sheetIndex + 1) + "]:");
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord berec = (BlankRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = "";
break;
case FormulaRecord.sid:
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = formatListener.formatNumberDateCell(frec);
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
break;
case StringRecord.sid:
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
thisStr = '"' + lrec.getValue() + '"';
break;
case LabelSSTRecord.sid:
LabelSSTRecord lsrec = (LabelSSTRecord) record;
thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
thisStr = '"' + "(No SST Record, can't identify string)" + '"';
} else {
thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex()).toString() + '"';
}
break;
case NoteRecord.sid:
NoteRecord nrec = (NoteRecord) record;
thisRow = nrec.getRow();
thisColumn = nrec.getColumn();
thisStr = '"' + "(TODO)" + '"';
break;
case NumberRecord.sid:
NumberRecord numrec = (NumberRecord) record;
thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
thisStr = formatListener.formatNumberDateCell(numrec);
break;
case RKRecord.sid:
RKRecord rkrec = (RKRecord) record;
thisRow = rkrec.getRow();
thisColumn = rkrec.getColumn();
thisStr = '"' + "(TODO)" + '"';
break;
default:
break;
}
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
thisRow = mc.getRow();
thisColumn = mc.getColumn();
thisStr = "";
}
if (thisStr != null) {
if (thisColumn > 0) {
output.print(',');
}
output.print(thisStr);
}
if (thisRow > -1) {
lastRowNumber = thisRow;
}
if (thisColumn > -1) {
lastColumnNumber = thisColumn;
}
if (record instanceof LastCellOfRowDummyRecord) {
if (minColumns > 0) {
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
for (int i = lastColumnNumber; i < (minColumns); i++) {
output.print(',');
}
}
lastColumnNumber = -1;
output.println();
}
}
public static void main(String[] args) throws IOException {
if (args.length < 1) {
System.err.println("Use:");
System.err.println(" XLS2CSVmra <xls file> [min columns]");
System.exit(1);
}
int minColumns = -1;
if (args.length >= 2) {
minColumns = Integer.parseInt(args[1]);
}
XLS2CSVmra xls2csv = new XLS2CSVmra(args[0], minColumns);
xls2csv.process();
}
}
// 测试方法
public class Test001 {
public static void main(String[] args) throws Exception {
XLS2CSVmra.main(new String[]{"D:\\通知消息记录浏览.xls"});
}
}
如有问题请在下方留言,我会认真的查看每一条并尽量的给予回复。谢谢观看^^