Java代码实现百万级数据XLS文件和XLSX文件的读取(已经过测试, 安心使用)

前言

工作中通常会有XLS和XLSX文件转换成CSV文件的需求,虽然Excel和WPS可以将这两种格式文件很方便的转换,但是不可能将生产上的数据下载下来转换后再上传上去(这真是个疯狂的想法...)。目前网上的解决方案是通过在python脚本中导入pandas模块然后实现转换,虽然很便捷,但是在java程序中又显得格格不入。

2023/02/27 更新最新版官方方案:

提示:在测试官方的csv转换功能时,生成的CSV文件总会出现乱码,待跟踪......推荐有需要的程序员读取后自行写入csv文件。

0、pom.xml文件需引入依赖

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-examples</artifactId>
    <version>5.2.3</version>
</dependency>

1、百万级数据文件读取

// 测试方法
import org.apache.poi.examples.hssf.eventusermodel.XLS2CSVmra;
import org.apache.poi.examples.xssf.eventusermodel.XLSX2CSV;

public class Test001 {
    public static void main(String[] args) throws Exception {
        // xlsx文件读取
        Xlsx2Csv.main(new String[]{"D:\\通知消息记录浏览.xlsx"});

        // xls文件读取
        XLS2CSVmra.main(new String[]{"D:\\通知消息记录浏览.xls"});
    }
}

------------------------------------------------时间分割线------------------------------------------------


旧版官方推荐方案正文代码​​​​​​​:

0、pom.xml文件需引入依赖

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.2</version>
</dependency>

1、百万级数据Xlsx文件读取

import org.apache.poi.ooxml.util.SAXHelper;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;

public class Xlsx2Csv {
    private final OPCPackage xlsxPackage;
    private final int minColumns;
    private final PrintStream output;

    private class SheetToCSV implements XSSFSheetXMLHandler.SheetContentsHandler {
        private boolean firstCellOfRow = false;
        private int currentRow = -1;
        private int currentCol = -1;

        private void outputMissingRows(int number) {
            for (int i = 0; i < number; i++) {
                for (int j = 0; j < minColumns; j++) {
                    output.append(',');
                }
                output.append('\n');
            }
        }

        @Override
        public void startRow(int rowNum) {
            outputMissingRows(rowNum - currentRow - 1);
            firstCellOfRow = true;
            currentRow = rowNum;
            currentCol = -1;
        }

        public void endRow(int rowNum) {
            for (int i = currentCol; i < minColumns; i++) {
                output.append(',');
            }
            output.append(',');
        }

        public void cell(String cellReference, String formattedValue, XSSFComment xssfComment) {
            if (firstCellOfRow) {
                firstCellOfRow = false;
            } else {
                output.append(',');
            }

            if (cellReference == null) {
                cellReference = new CellAddress(currentRow, currentCol).formatAsString();
            }

            int thisCol = (new CellReference(cellReference)).getCol();
            int missedCols = thisCol - currentCol - 1;
            for (int i = 0; i < missedCols; i++) {
                output.append(',');
            }
            currentCol = thisCol;
            try {
                Double.parseDouble(formattedValue);
                output.append(formattedValue);
            } catch (NumberFormatException e) {
                output.append('"');
                output.append(formattedValue);
                output.append('"');
            }
        }

        @Override
        public void headerFooter(String s, boolean b, String s1) {
            //CSV文件没有页眉页脚
        }

    }

    public Xlsx2Csv(OPCPackage pkg, PrintStream output, int minColumns) {
        this.xlsxPackage = pkg;
        this.output = output;
        this.minColumns = minColumns;
    }

    public void processSheet(StylesTable styles, ReadOnlySharedStringsTable readOnlyTableString, XSSFSheetXMLHandler.SheetContentsHandler sheetHandler, InputStream sheetInputStream) {
        DataFormatter formatter = new DataFormatter();
        InputSource sheetSource = new InputSource(sheetInputStream);
        try {
            XMLReader reader = SAXHelper.newXMLReader();
            ContentHandler handler = new XSSFSheetXMLHandler(styles, readOnlyTableString, sheetHandler, formatter, false);
            reader.setContentHandler(handler);
            reader.parse(sheetSource);
        } catch (SAXException | ParserConfigurationException | IOException e) {
            e.printStackTrace();
        }
    }

    public void process() throws IOException, SAXException, OpenXML4JException {
        ReadOnlySharedStringsTable readOnlyTableString = new ReadOnlySharedStringsTable(this.xlsxPackage);
        XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
        StylesTable styles = xssfReader.getStylesTable();
        XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
        while (iter.hasNext()) {
            InputStream inputStream = iter.next();
            processSheet(styles, readOnlyTableString, new SheetToCSV(), inputStream);
            inputStream.close();
        }
    }

    public static void main(String[] args) throws OpenXML4JException, SAXException, IOException {
        File xlsxFile = new File(args[0]);
        if (!xlsxFile.exists()) {
            return;
        }
        int minColumns = -1;

        OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
        Xlsx2Csv xlsx2Csv = new Xlsx2Csv(p, System.out, minColumns);
        xlsx2Csv.process();
        p.close();
    }
}
// 测试方法
public class Test001 {
    public static void main(String[] args) throws OpenXML4JException, IOException, SAXException {

        Xlsx2Csv.main(new String[]{"D:\\通知消息记录浏览.xls"});
    }
}

2、百万级数据Xls文件读取

import org.apache.poi.examples.hssf.eventusermodel.XLS2CSVmra;
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

public class Xls2CsvMra implements HSSFListener {

    private int minColumns;
    private POIFSFileSystem fs;
    private PrintStream output;
    private int lastRowNumber;
    private int lastColumnNumber;
    // 是否输出表达式或者值
    private boolean outputFormulaValues = false;
    // 解析公式
    private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
    private HSSFWorkbook stubWorkbook;
    // 处理过程中收集的记录
    private SSTRecord sstRecord;
    private FormatTrackingHSSFListener formatListener;
    // 正在处理的sheet
    private int sheetIndex = -1;
    private BoundSheetRecord[] orderedBSRs;
    private List<BoundSheetRecord> boundSheetRecords = new ArrayList<>();
    // 处理带有字符串结果的公式
    private int nextRow;
    private int nextColumn;
    private boolean outputNextStringRecord;

    // 创建新的XLS到CSV的转化器
    public Xls2CsvMra(POIFSFileSystem fs, PrintStream output, int minColumns) {
        this.fs = fs;
        this.output = output;
        this.minColumns = minColumns;
    }

    // 创建新的XLS到CSV的转换器
    public Xls2CsvMra(String filename, int minColumns) throws IOException {
        this(
                new POIFSFileSystem(new FileInputStream(filename)), System.out, minColumns
        );
    }

    public Xls2CsvMra(String inputFilePath, String outputFilePath) throws IOException {
        fs = new POIFSFileSystem(new FileInputStream(inputFilePath));
        output = new PrintStream(outputFilePath);
        minColumns = -1;
    }

    // 开始将XLS文件处理为CSV
    public void process() throws IOException {
        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
        formatListener = new FormatTrackingHSSFListener(listener);

        HSSFEventFactory factory = new HSSFEventFactory();
        HSSFRequest request = new HSSFRequest();

        if (outputFormulaValues) {
            request.addListenerForAllRecords(formatListener);
        } else {
            workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
            request.addListenerForAllRecords(workbookBuildingListener);
        }

        factory.processWorkbookEvents(request, fs);
    }

    @Override
    public void processRecord(Record record) {
        int thisRow = -1;
        int thisColumn = -1;
        String thisStr = null;

        switch (record.getSid()) {
            case BoundSheetRecord.sid:
                boundSheetRecords.add((BoundSheetRecord) record);
                break;
            case BOFRecord.sid:
                BOFRecord br = (BOFRecord) record;
                if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
                    // 创建子工作簿
                    if (workbookBuildingListener != null && stubWorkbook == null) {
                        stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
                    }

                    sheetIndex++;
                    if (orderedBSRs == null) {
                        orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
                    }
                    output.println();
                    output.println(orderedBSRs[sheetIndex].getSheetname() + " [" + (sheetIndex + 1) + "]:");
                }
                break;
            case SSTRecord.sid:
                sstRecord = (SSTRecord) record;
                break;
            case BlankRecord.sid:
                BlankRecord berec = (BlankRecord) record;

                thisRow = berec.getRow();
                thisColumn = berec.getColumn();
                thisStr = "";
                break;
            case FormulaRecord.sid:
                FormulaRecord frec = (FormulaRecord) record;

                thisRow = frec.getRow();
                thisColumn = frec.getColumn();

                if (outputFormulaValues) {
                    if (Double.isNaN(frec.getValue())) {
                        outputNextStringRecord = true;
                        nextRow = frec.getRow();
                        nextColumn = frec.getColumn();
                    } else {
                        thisStr = formatListener.formatNumberDateCell(frec);
                    }
                } else {
                    thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
                }
                break;
            case StringRecord.sid:
                if (outputNextStringRecord) {
                    StringRecord srec = (StringRecord) record;
                    thisStr = srec.getString();
                    thisRow = nextRow;
                    thisColumn = nextColumn;
                    outputNextStringRecord = false;
                }
                break;
            case LabelRecord.sid:
                LabelRecord lrec = (LabelRecord) record;
                thisRow = lrec.getRow();
                thisColumn = lrec.getColumn();
                thisStr = '"' + lrec.getValue() + '"';
                break;
            case LabelSSTRecord.sid:
                LabelSSTRecord lsrec = (LabelSSTRecord) record;
                thisRow = lsrec.getRow();
                thisColumn = lsrec.getColumn();
                if (sstRecord == null) {
                    thisStr = '"' + "(No SST Record, can't identify string)" + '"';
                } else {
                    thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex()).toString() + '"';
                }
                break;
            case NoteRecord.sid:
                NoteRecord nrec = (NoteRecord) record;
                thisRow = nrec.getRow();
                thisColumn = nrec.getColumn();
                thisStr = '"' + "(TODO)" + '"';
                break;
            case NumberRecord.sid:
                NumberRecord numrec = (NumberRecord) record;
                thisRow = numrec.getRow();
                thisColumn = numrec.getColumn();
                thisStr = formatListener.formatNumberDateCell(numrec);
                break;
            case RKRecord.sid:
                RKRecord rkrec = (RKRecord) record;
                thisRow = rkrec.getRow();
                thisColumn = rkrec.getColumn();
                thisStr = '"' + "(TODO)" + '"';
                break;
            default:
                break;
        }

        if (thisRow != -1 && thisRow != lastRowNumber) {
            lastColumnNumber = -1;
        }

        if (record instanceof MissingCellDummyRecord) {
            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
            thisRow = mc.getRow();
            thisColumn = mc.getColumn();
            thisStr = "";
        }

        if (thisStr != null) {
            if (thisColumn > 0) {
                output.print(',');
            }
            output.print(thisStr);
        }

        if (thisRow > -1) {
            lastRowNumber = thisRow;
        }
        if (thisColumn > -1) {
            lastColumnNumber = thisColumn;
        }

        if (record instanceof LastCellOfRowDummyRecord) {
            if (minColumns > 0) {
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
                for (int i = lastColumnNumber; i < (minColumns); i++) {
                    output.print(',');
                }
            }
            lastColumnNumber = -1;
            output.println();
        }
    }

    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Use:");
            System.err.println("  XLS2CSVmra <xls file> [min columns]");
            System.exit(1);
        }

        int minColumns = -1;
        if (args.length >= 2) {
            minColumns = Integer.parseInt(args[1]);
        }

        XLS2CSVmra xls2csv = new XLS2CSVmra(args[0], minColumns);
        xls2csv.process();
    }
}
// 测试方法
public class Test001 {
    public static void main(String[] args) throws Exception {

        XLS2CSVmra.main(new String[]{"D:\\通知消息记录浏览.xls"});
    }
}

如有问题请在下方留言,我会认真的查看每一条并尽量的给予回复。谢谢观看^^

猜你喜欢

转载自blog.csdn.net/Damon_Cheung/article/details/128032292