python xlsx2csv

本文件是用于创建 UVM 寄存器模型的 Python 脚本,主要功能:将 xlsx 文件转换为 csv 文件。

#!/usr/bin/env python
#
#   Copyright information
#
#	Copyright (C) 2010-2018 Dilshod Temirkhodjaev <[email protected]>
#
#   License
#
#	This program is free software; you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation; either version 2 of the License, or
#	(at your option) any later version.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with this program. If not, see <http://www.gnu.org/licenses/>.

__author__ = "Dilshod Temirkhodjaev <[email protected]>"
__license__ = "GPL-2+"
__version__ = "0.7.4"

import csv, datetime, zipfile, string, sys, os, re, signal
import xml.parsers.expat
from xml.dom import minidom
try:
    # python2.4
    from cStringIO import StringIO
except:
    pass
try:
    from argparse import ArgumentParser
except:
    # python2.4
    from optparse import OptionParser

# see also ruby-roo lib at: http://github.com/hmcgowan/roo
#
# Maps a (lower-cased) number-format code to the conversion category that
# Sheet.handleCharData applies to the raw cell value: 'float', 'percentage',
# 'date' or 'time'.  Format codes that are not listed here are classified
# heuristically by Sheet.handleCharData.
# NOTE(review): '@' is mapped to 'float' although it looks like Excel's text
# placeholder -- confirm this is intended.
# NOTE(review): Styles.parse strips backslashes from custom format codes, so
# the keys containing '\\' can presumably never match a custom format -- verify.
FORMATS = {
  'general' : 'float',
  '0' : 'float',
  '0.00' : 'float',
  '#,##0' : 'float',
  '#,##0.00' : 'float',
  '0%' : 'percentage',
  '0.00%' : 'percentage',
  '0.00e+00' : 'float',
  'mm-dd-yy' : 'date',
  'd-mmm-yy' : 'date',
  'd-mmm' : 'date',
  'mmm-yy' : 'date',
  'h:mm am/pm' : 'date',
  'h:mm:ss am/pm' : 'date',
  'h:mm' : 'time',
  'h:mm:ss' : 'time',
  'm/d/yy h:mm' : 'date',
  '#,##0 ;(#,##0)' : 'float',
  '#,##0 ;[red](#,##0)' : 'float',
  '#,##0.00;(#,##0.00)' : 'float',
  '#,##0.00;[red](#,##0.00)' : 'float',
  'mm:ss' : 'time',
  '[h]:mm:ss' : 'time',
  'mmss.0' : 'time',
  '##0.0e+0' : 'float',
  '@' : 'float',
  'yyyy\\-mm\\-dd' : 'date',
  'dd/mm/yy' : 'date',
  'hh:mm:ss' : 'time',
  "dd/mm/yy\\ hh:mm" : 'date',
  'dd/mm/yyyy hh:mm:ss' : 'date',
  'yy-mm-dd' : 'date',
  'd-mmm-yyyy' : 'date',
  'm/d/yy' : 'date',
  'm/d/yyyy' : 'date',
  'dd-mmm-yyyy' : 'date',
  'dd/mm/yyyy' : 'date',
  'mm/dd/yy h:mm am/pm' : 'date',
  'mm/dd/yy hh:mm' : 'date',
  'mm/dd/yyyy h:mm am/pm' : 'date',
  'mm/dd/yyyy hh:mm:ss' : 'date',
  'yyyy-mm-dd hh:mm:ss' : 'date',
  '#,##0;(#,##0)' : 'float',
  '_(* #,##0_);_(* (#,##0);_(* "-"??_);_(@_)' : 'float',
  '_(* #,##0.00_);_(* (#,##0.00);_(* "-"??_);_(@_)' : 'float'
}
# Number-format ids mapped to their (lower-cased) format code.  The table is
# consulted by Styles.chk_exists and Sheet.handleCharData only when an id is
# not found among the formats declared by the workbook itself (Styles.numFmts).
# Ids that are absent from this table (e.g. 5-8, 23-36) have no entry here.
STANDARD_FORMATS = {
  0 : 'general',
  1 : '0',
  2 : '0.00',
  3 : '#,##0',
  4 : '#,##0.00',
  9 : '0%',
  10 : '0.00%',
  11 : '0.00e+00',
  12 : '# ?/?',
  13 : '# ??/??',
  14 : 'mm-dd-yy',
  15 : 'd-mmm-yy',
  16 : 'd-mmm',
  17 : 'mmm-yy',
  18 : 'h:mm am/pm',
  19 : 'h:mm:ss am/pm',
  20 : 'h:mm',
  21 : 'h:mm:ss',
  22 : 'm/d/yy h:mm',
  37 : '#,##0 ;(#,##0)',
  38 : '#,##0 ;[red](#,##0)',
  39 : '#,##0.00;(#,##0.00)',
  40 : '#,##0.00;[red](#,##0.00)',
  45 : 'mm:ss',
  46 : '[h]:mm:ss',
  47 : 'mmss.0',
  48 : '##0.0e+0',
  49 : '@',
}
# Logical names of the package parts this module looks up.  ContentTypes
# creates one entry per name (initially None) and fills in the part path
# announced for it in "[Content_Types].xml".
CONTENT_TYPES = {
  'shared_strings',
  'styles',
  'workbook',
  'worksheet',
  'relationships',
}

class XlsxException(Exception):
    """Common base class of the errors raised by this module."""

class InvalidXlsxFileException(XlsxException):
    """The input could not be opened as a zip archive."""

class SheetNotFoundException(XlsxException):
    """No worksheet part was found for the requested sheet id."""

class OutFileAlreadyExistsException(XlsxException):
    """The output directory path already exists as a regular file."""

class Xlsx2csv:
    """
     Usage: Xlsx2csv("test.xlsx", **params).convert("test.csv", sheetid=1)
     Input:
       xlsxfile - path to file or filehandle
     options:
       sheetid - sheet no to convert (0 for all sheets)
       dateformat - override date/time format
       timeformat - override time format
       floatformat - override float format
       scifloat - also recognise values in scientific notation as floats
       quoting - if and how to quote
       delimiter - csv columns delimiter symbol
       lineterminator - csv line terminator
       sheetdelimiter - sheets delimiter used when processing all sheets
       skip_empty_lines - skip empty lines
       skip_trailing_columns - skip trailing columns
       escape_strings - write \\r, \\n and \\t in strings as escape sequences
       hyperlinks - include hyperlinks
       merge_cells - give every cell of a merged range the range's value
       ignore_formats - format types (e.g. 'date') left unconverted
       outputencoding - encoding of output files opened by path (python3)
       include_sheet_pattern - only include sheets named matching given pattern
       exclude_sheet_pattern - exclude sheets named matching given pattern
    """

    def __init__(self, xlsxfile, **options):
        """Open the archive and parse the workbook-level parts.

        Raises InvalidXlsxFileException when xlsxfile is not a readable
        zip archive.
        """
        options.setdefault("delimiter", ",")
        options.setdefault("quoting", csv.QUOTE_MINIMAL)
        options.setdefault("sheetdelimiter", "--------")
        options.setdefault("dateformat", None)
        options.setdefault("timeformat", None)
        options.setdefault("floatformat", None)
        options.setdefault("scifloat", False)
        options.setdefault("skip_empty_lines", False)
        options.setdefault("skip_trailing_columns", False)
        options.setdefault("escape_strings", False)
        options.setdefault("hyperlinks", False)
        options.setdefault("include_sheet_pattern", ["^.*$"])
        options.setdefault("exclude_sheet_pattern", [])
        options.setdefault("merge_cells", False)
        options.setdefault("ignore_formats", [''])
        options.setdefault("lineterminator", "\n")
        # _convert() reads this key unconditionally on python3; without a
        # default, converting to a path raised KeyError for library callers.
        options.setdefault("outputencoding", "utf-8")

        self.options = options
        try:
            self.ziphandle = zipfile.ZipFile(xlsxfile)
        except (zipfile.BadZipfile, IOError):
            raise InvalidXlsxFileException("Invalid xlsx file: " + str(xlsxfile))

        self.py3 = sys.version_info[0] == 3

        self.content_types = self._parse(ContentTypes, "/[Content_Types].xml")
        self.shared_strings = self._parse(SharedStrings, self.content_types.types["shared_strings"])
        self.styles = self._parse(Styles, self.content_types.types["styles"])
        self.workbook = self._parse(Workbook, self.content_types.types["workbook"])
        self.workbook.relationships = self._parse(Relationships, self.content_types.types["relationships"])
        if self.options['escape_strings']:
            self.shared_strings.escape_strings()

    def __del__(self):
        # Make sure to close the zip file.  The attribute is missing when
        # ZipFile() failed in __init__, so look it up defensively.
        ziphandle = getattr(self, "ziphandle", None)
        if ziphandle is not None:
            ziphandle.close()

    def getSheetIdByName(self, name):
        """Return the id of the first sheet called name, or None."""
        for s in self.workbook.sheets:
            if s['name'] == name:
                return s['id']
        return None

    @staticmethod
    def _as_pattern_list(patterns):
        """Wrap a single pattern string in a list (optparse hands over a bare string)."""
        if isinstance(patterns, str):
            return [patterns]
        return patterns

    @staticmethod
    def _matches_any(patterns, sheetname):
        """Return True when sheetname matches at least one non-empty pattern."""
        for pattern in patterns:
            if pattern and re.match(pattern, sheetname):
                return True
        return False

    @staticmethod
    def _escape_cell_whitespace(filedata):
        """Rewrite &#13;, &#9; and &#10; inside <v> elements as \\r, \\t, \\n escapes.

        filedata is whatever the sheet file handle returned, i.e. bytes on
        python3; the patterns are encoded to match so re.sub() does not fail
        with "cannot use a string pattern on a bytes-like object".
        """
        substitutions = (
            (r"(<v>[^<>]+)&#13;([^<>]+</v>)", r"\1\\r\2"),
            (r"(<v>[^<>]+)&#9;([^<>]+</v>)", r"\1\\t\2"),
            (r"(<v>[^<>]+)&#10;([^<>]+</v>)", r"\1\\n\2"),
        )
        as_bytes = not isinstance(filedata, str)
        for pattern, replacement in substitutions:
            if as_bytes:
                pattern = pattern.encode("ascii")
                replacement = replacement.encode("ascii")
            filedata = re.sub(pattern, replacement, filedata)
        return filedata

    def convert(self, outfile, sheetid=1):
        """Convert one sheet, or all matching sheets, to csv.

        outfile - path to file or filehandle
        sheetid - id of the sheet to convert; with a value that is not
                  greater than 0 every sheet passing the include/exclude
                  patterns is converted.  In that mode a string outfile is
                  a directory (created when missing) receiving one
                  "<sheetname>.csv" per sheet, and a filehandle receives all
                  sheets separated by the sheetdelimiter line.

        Raises OutFileAlreadyExistsException when the directory path is an
        existing regular file.
        """
        if sheetid > 0:
            self._convert(sheetid, outfile)
            return

        if isinstance(outfile, str):
            if not os.path.exists(outfile):
                os.makedirs(outfile)
            elif os.path.isfile(outfile):
                raise OutFileAlreadyExistsException("File " + str(outfile) + " already exists!")

        include_sheet_pattern = self._as_pattern_list(self.options['include_sheet_pattern'])
        exclude_sheet_pattern = self._as_pattern_list(self.options['exclude_sheet_pattern'])

        for s in self.workbook.sheets:
            sheetname = s['name']

            # an empty include list means "no filtering"
            if len(include_sheet_pattern) > 0 and not self._matches_any(include_sheet_pattern, sheetname):
                continue
            if self._matches_any(exclude_sheet_pattern, sheetname):
                continue

            if not self.py3:
                sheetname = sheetname.encode('utf-8')
            of = outfile
            if isinstance(outfile, str):
                of = os.path.join(outfile, sheetname + '.csv')
            elif self.options['sheetdelimiter'] and len(self.options['sheetdelimiter']):
                of.write(self.options['sheetdelimiter'] + " " + str(s['id']) + " - " + sheetname + self.options['lineterminator'])
            self._convert(s['id'], of)

    def _convert(self, sheetid, outfile):
        """Write the sheet with the given id to outfile (path or filehandle).

        Raises SheetNotFoundException when no worksheet part can be located.
        """
        closefile = False
        if isinstance(outfile, str):
            if sys.version_info[0] == 2:
                outfile = open(outfile, 'wb+')
            elif sys.version_info[0] == 3:
                outfile = open(outfile, 'w+', encoding=self.options['outputencoding'], newline="")
            else:
                sys.stderr.write("error: version of your python is not supported: " + str(sys.version_info) + "\n")
                sys.exit(1)
            closefile = True
        try:
            writer = csv.writer(outfile, quoting=self.options['quoting'], delimiter=self.options['delimiter'], lineterminator=self.options['lineterminator'])
            sheetfile = self._filehandle("/xl/worksheets/sheet%i.xml" % sheetid)
            if not sheetfile:
                sheetfile = self._filehandle("/xl/worksheets/worksheet%i.xml" % sheetid)
            if not sheetfile and sheetid == 1:
                sheetfile = self._filehandle(self.content_types.types["worksheet"])
            if not sheetfile:
                raise SheetNotFoundException("Sheet %s not found" %sheetid)
            sheet = None
            try:
                # constructed inside the try so the file handle is released
                # even when Sheet() itself fails
                sheet = Sheet(self.workbook, self.shared_strings, self.styles, sheetfile)
                sheet.relationships = self._parse(Relationships, "/xl/worksheets/_rels/sheet%i.xml.rels" % sheetid)
                sheet.set_dateformat(self.options['dateformat'])
                sheet.set_timeformat(self.options['timeformat'])
                sheet.set_floatformat(self.options['floatformat'])
                sheet.set_skip_empty_lines(self.options['skip_empty_lines'])
                sheet.set_skip_trailing_columns(self.options['skip_trailing_columns'])
                sheet.set_include_hyperlinks(self.options['hyperlinks'])
                sheet.set_merge_cells(self.options['merge_cells'])
                sheet.set_scifloat(self.options['scifloat'])
                sheet.set_ignore_formats(self.options['ignore_formats'])
                # filedata is only populated when hyperlinks/merge cells
                # forced the sheet to be read up front
                if self.options['escape_strings'] and sheet.filedata:
                    sheet.filedata = self._escape_cell_whitespace(sheet.filedata)
                sheet.to_csv(writer)
            finally:
                sheetfile.close()
                if sheet is not None:
                    sheet.close()
        finally:
            if closefile:
                outfile.close()

    def _filehandle(self, filename):
        """Open the archive member named filename (leading "/" dropped, case-insensitive).

        Returns None when filename is empty or no member matches.
        """
        if not filename:
            return None
        wanted = filename.lower()[1:]
        for name in self.ziphandle.namelist():
            if name.lower() != wanted:
                continue
            # python2.4 fix
            if not hasattr(self.ziphandle, "open"):
                return StringIO(self.ziphandle.read(name))
            return self.ziphandle.open(name, "r")
        return None

    def _parse(self, klass, filename):
        """Instantiate klass and feed it the archive member, if that member exists."""
        instance = klass()
        filehandle = self._filehandle(filename)
        if filehandle:
            try:
                instance.parse(filehandle)
            finally:
                filehandle.close()
        return instance

class Workbook:
    """Sheet list and workbook-level settings read from the workbook part.

    Attributes set by parse():
      sheets   - list of {'name': ..., 'id': ...} dicts, in document order
      appName  - appName attribute of <fileVersion>, or 'unknown'
      date1904 - True when <workbookPr date1904=...> holds a true value
    """

    def __init__(self):
        self.sheets = []
        self.date1904 = False

    def parse(self, filehandle):
        """Parse the workbook XML read from filehandle.

        Raises IndexError when the document has no <sheets> element and
        KeyError when a <sheet> lacks its name or any usable id attribute.
        """
        root = minidom.parseString(filehandle.read()).documentElement
        namespace = root.namespaceURI

        def find(node, tag):
            # namespace-aware lookup whenever the document declares one
            if namespace:
                return node.getElementsByTagNameNS(namespace, tag)
            return node.getElementsByTagName(tag)

        # The public attribute API is used throughout: the private _attrs
        # mapping is None for elements that carry no attributes at all.
        self.appName = 'unknown'
        fileVersion = find(root, "fileVersion")
        if len(fileVersion) > 0 and fileVersion[0].hasAttribute('appName'):
            self.appName = fileVersion[0].getAttribute('appName')

        workbookPr = find(root, "workbookPr")
        if len(workbookPr) > 0 and workbookPr[0].hasAttribute('date1904'):
            flag = workbookPr[0].getAttribute('date1904').lower().strip()
            # the flag is a boolean attribute: "0" means false just like "false"
            self.date1904 = flag not in ("false", "0")

        sheets = find(root, "sheets")[0]
        for sheetNode in find(sheets, "sheet"):
            name = sheetNode.attributes["name"].value
            rel_id = None
            if sheetNode.hasAttribute("r:id"):
                rel_id = sheetNode.getAttribute("r:id")

            if self.appName == 'xl' and rel_id is not None and len(rel_id) > 2:
                # presumably the relationship id looks like "rId<N>"; the
                # number after the three-character prefix is the sheet id
                sheet_id = int(rel_id[3:])
            elif sheetNode.hasAttribute("sheetId"):
                sheet_id = int(sheetNode.getAttribute("sheetId"))
            elif rel_id is not None:
                sheet_id = int(rel_id[3:])
            else:
                raise KeyError("r:id")
            self.sheets.append({'name': name, 'id': sheet_id})

class ContentTypes:
    """Part paths of the main workbook parts, taken from "[Content_Types].xml".

    types maps every name in CONTENT_TYPES to the PartName of the matching
    <Override> element, or to None while no such element has been seen.
    """

    def __init__(self):
        # every known part starts out unresolved
        self.types = dict.fromkeys(CONTENT_TYPES)

    def parse(self, filehandle):
        """Record the PartName of each recognised <Override> content type."""
        root = minidom.parseString(filehandle.read()).firstChild
        if not root:
            return

        namespace = root.namespaceURI
        if namespace:
            overrides = root.getElementsByTagNameNS(namespace, "Override")
        else:
            overrides = root.getElementsByTagName("Override")

        key_by_content_type = {
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml": "workbook",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.styles+xml": "styles",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml": "worksheet",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml": "shared_strings",
            "application/vnd.openxmlformats-package.relationships+xml": "relationships",
        }

        for node in overrides:
            attributes = node._attrs
            content_type = attributes.get('ContentType').value
            part_name = attributes.get('PartName').value
            key = key_by_content_type.get(content_type)
            # when a content type occurs several times the last one wins
            if key is not None:
                self.types[key] = part_name

class Relationships:
    """Relationship table of a ".rels" part.

    relationships maps each relationship Id to a dict with the keys
    "type" (str or None) and "target" (utf-8 encoded bytes or None).
    """

    def __init__(self):
        self.relationships = {}

    def parse(self, filehandle):
        """Collect every <Relationship> that carries an Id attribute."""
        doc = minidom.parseString(filehandle.read())
        # The namespace has to come from the root element: the Document node
        # itself never has one, which left the namespace-aware lookup unused
        # and made prefixed documents (<x:Relationships>) yield nothing.
        namespace = doc.documentElement.namespaceURI

        def find(node, tag):
            if namespace:
                return node.getElementsByTagNameNS(namespace, tag)
            return node.getElementsByTagName(tag)

        relationships = find(doc, "Relationships")
        if not relationships:
            return
        for rel in find(relationships[0], "Relationship"):
            # hasAttribute/getAttribute also cope with attribute-less
            # elements, for which the private _attrs mapping is None
            if not rel.hasAttribute('Id'):
                continue
            vtype = rel.getAttribute('Type')
            target = rel.getAttribute('Target')
            self.relationships[str(rel.getAttribute('Id'))] = {
                # missing and empty attributes are both reported as None
                "type" : str(vtype) if vtype else None,
                "target" : target.encode("utf-8") if target else None
            }

class Styles:
    """Number formats of the styles part.

    numFmts - custom formats: numFmtId -> lower-cased format code with
              backslashes removed
    cellXfs - one entry per <xf> of <cellXfs>, in document order: the
              number-format id of that cell style, or None when the <xf>
              has no numFmtId attribute
    """

    def __init__(self):
        self.numFmts = {}
        self.cellXfs = []

    def parse(self, filehandle):
        """Fill numFmts and cellXfs from the styles XML read from filehandle."""
        styles = minidom.parseString(filehandle.read()).firstChild
        # numFmts
        if styles.namespaceURI:
            numFmtsElement = styles.getElementsByTagNameNS(styles.namespaceURI, "numFmts")
        else:
            numFmtsElement = styles.getElementsByTagName("numFmts")
        if len(numFmtsElement) == 1:
            for numFmt in numFmtsElement[0].childNodes:
                if numFmt.nodeType == minidom.Node.ELEMENT_NODE:
                    numFmtId = int(numFmt.attributes['numFmtId'].value)
                    formatCode = numFmt.attributes['formatCode'].value.lower().replace('\\', '')
                    self.numFmts[numFmtId] = formatCode

        # cellXfs
        if styles.namespaceURI:
            cellXfsElement = styles.getElementsByTagNameNS(styles.namespaceURI, "cellXfs")
        else:
            cellXfsElement = styles.getElementsByTagName("cellXfs")
        if len(cellXfsElement) == 1:
            for xf in cellXfsElement[0].childNodes:
                if xf.nodeType != minidom.Node.ELEMENT_NODE or not (xf.nodeName == "xf" or xf.nodeName.endswith(":xf")):
                    continue
                if not xf.hasAttribute('numFmtId'):
                    self.cellXfs.append(None)
                    continue
                numFmtId = int(xf.getAttribute('numFmtId'))
                # When Unknown Numformat ID assign applyNumberFormat.  The
                # attribute is optional and may hold a non-numeric value; in
                # both cases the unknown id is kept instead of raising.
                if self.chk_exists(numFmtId) is None and xf.hasAttribute('applyNumberFormat'):
                    try:
                        numFmtId = int(xf.getAttribute('applyNumberFormat'))
                    except ValueError:
                        pass
                self.cellXfs.append(numFmtId)

    def chk_exists(self, numFmtId):
        """Return the format code known for numFmtId, or None.

        Custom formats (numFmts) take precedence over STANDARD_FORMATS.
        """
        if numFmtId in self.numFmts:
            return self.numFmts[numFmtId]
        if numFmtId in STANDARD_FORMATS:
            return STANDARD_FORMATS[numFmtId]
        return None

class SharedStrings:
    """Streaming (expat) reader of the shared strings part.

    After parse(), strings holds one entry per <si> element, in document
    order: the concatenated text of its <t> elements, leaving out any <t>
    that starts inside an <rPh> element.
    """

    def __init__(self):
        self.parser = None
        self.strings = []
        # parser state: inside <si> / collecting a <t> / inside <rPh>
        self.si = False
        self.t = False
        self.rPh = False
        self.value = ""

    def parse(self, filehandle):
        """Parse the shared strings XML from filehandle into strings."""
        parser = xml.parsers.expat.ParserCreate()
        self.parser = parser
        parser.CharacterDataHandler = self.handleCharData
        parser.StartElementHandler = self.handleStartElement
        parser.EndElementHandler = self.handleEndElement
        parser.ParseFile(filehandle)

    def escape_strings(self):
        """Replace CR, LF and TAB in every string by \\r, \\n and \\t escapes (in place)."""
        self.strings[:] = [
            text.replace("\r", "\\r").replace("\n", "\\n").replace("\t", "\\t")
            for text in self.strings
        ]

    def handleCharData(self, data):
        # text only counts while a <t> is being collected
        if not self.t:
            return
        self.value += data

    def handleStartElement(self, name, attrs):
        # ignore namespace: keep what follows the first ":"
        tag = name.split(":", 1)[-1]

        if tag == 'si':
            self.si = True
            self.value = ""
        elif tag == 't':
            if self.rPh:
                self.t = False
            elif self.si:
                self.t = True
        elif tag == 'rPh':
            self.rPh = True

    def handleEndElement(self, name):
        # ignore namespace: keep what follows the first ":"
        tag = name.split(":", 1)[-1]

        if tag == 'si':
            self.si = False
            self.strings.append(self.value)
        elif tag == 't':
            self.t = False
        elif tag == 'rPh':
            self.rPh = False

class Sheet:
    def __init__(self, workbook, sharedString, styles, filehandle):
        self.py3 = sys.version_info[0] == 3
        self.parser = None
        self.writer = None
        self.sharedString = None
        self.styles = None
        self.relationships = None
        self.columns_count = -1

        self.in_sheet = False
        self.in_row = False
        self.in_cell = False
        self.in_cell_value = False

        self.columns = {}
        self.lastRowNum = 0
        self.rowNum = None
        self.colType = None
        self.cellId = None
        self.s_attr = None
        self.data = None
        self.max_columns = -1

        self.dateformat = None
        self.timeformat = "%H:%M" # default time format
        self.floatformat = None
        self.skip_empty_lines = False
        self.skip_trailing_columns = False

        self.filedata = None
        self.filehandle = filehandle
        self.workbook = workbook
        self.sharedStrings = sharedString.strings
        self.styles = styles

        self.hyperlinks = {}
        self.mergeCells = {}
        self.ignore_formats = []

        self.colIndex = 0
        self.colNum = ""

    def close(self):
        # Make sure Worksheet is closed, parsers lib does not have a close() function, so simply delete it
        self.parser = None

    def set_dateformat(self, dateformat):
        self.dateformat = dateformat

    def set_timeformat(self, timeformat):
        if timeformat:
            self.timeformat = timeformat

    def set_floatformat(self, floatformat):
        self.floatformat = floatformat

    def set_skip_empty_lines(self, skip):
        self.skip_empty_lines = skip

    def set_skip_trailing_columns(self, skip):
        self.skip_trailing_columns = skip

    def set_ignore_formats(self, ignore_formats):
        self.ignore_formats = ignore_formats

    def set_merge_cells(self, mergecells):
        if not mergecells:
            return
        if not self.filedata:
            self.filedata = self.filehandle.read()
        data = str(self.filedata) # python3: convert byte buffer to string

        # find worksheet tag, we need namespaces from it
        start = data.find("<worksheet")
        if start < 0:
            return
        end = data.find(">", start)
        worksheet = data[start : end + 1]

        # find hyperlinks part
        start = data.find("<mergeCells")
        if start < 0:
            # hyperlinks not found
            return
        end = data.find("</mergeCells>")
        data = data[start : end + 13]

        # parse hyperlinks
        doc = minidom.parseString(worksheet + data + "</worksheet>").firstChild

        if doc.namespaceURI:
            mergeCells = doc.getElementsByTagNameNS(doc.namespaceURI, "mergeCell")
        else:
            mergeCells = doc.getElementsByTagName("mergeCell")
        for mergeCell in mergeCells:
            attrs = mergeCell._attrs
            if 'ref' in attrs.keys():
                rangeStr = attrs['ref'].value
                rng = rangeStr.split(":")
                if len(rng) > 1:
                    for cell in self._range(rangeStr):
                        self.mergeCells[cell] = {}
                        self.mergeCells[cell]['copyFrom'] = rng[0]

    def set_scifloat(self, scifloat):
        self.scifloat = scifloat

    def set_include_hyperlinks(self, hyperlinks):
        if not hyperlinks or not self.relationships or not self.relationships.relationships:
            return
        # we must read file first to get hyperlinks, but we don't wont to parse whole file
        if not self.filedata:
            self.filedata = self.filehandle.read()
        data = str(self.filedata) # python3: convert byte buffer to string

        # find worksheet tag, we need namespaces from it
        start = data.find("<worksheet")
        if start < 0:
            return
        end = data.find(">", start)
        worksheet = data[start : end + 1]

        # find hyperlinks part
        start = data.find("<hyperlinks>")
        if start < 0:
            # hyperlinks not found
            return
        end = data.find("</hyperlinks>")
        data = data[start : end + 13]

        # parse hyperlinks
        doc = minidom.parseString(worksheet + data + "</worksheet>").firstChild
        if doc.namespaceURI:
            hiperlinkNodes = doc.getElementsByTagNameNS(doc.namespaceURI, "hyperlink")
        else:
            hiperlinkNodes = doc.getElementsByTagName("hyperlink")
        for hlink in hiperlinkNodes:
            attrs = hlink._attrs
            ref = rId = None
            for k in attrs.keys():
                if k == "ref":
                    ref = str(attrs[k].value)
                if k.endswith(":id"):
                    rId = str(attrs[k].value)
            if not ref or not rId:
                continue
            rel = self.relationships.relationships.get(rId)
            if not rel:
                continue
            target = rel.get('target')
            for cell in self._range(ref):
                self.hyperlinks[cell] = target

    def to_csv(self, writer):
        self.writer = writer
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.buffer_text = True
        self.parser.CharacterDataHandler = self.handleCharData
        self.parser.StartElementHandler = self.handleStartElement
        self.parser.EndElementHandler = self.handleEndElement
        if self.filedata:
            self.parser.Parse(self.filedata)
        else:
            self.parser.ParseFile(self.filehandle)

    def handleCharData(self, data):
        if self.in_cell_value:
            self.collected_string+= data
            self.data = self.collected_string
            if self.colType == "s": # shared string
                self.data = self.sharedStrings[int(self.data)]
            elif self.colType == "b": # boolean
                self.data = (int(data) == 1 and "TRUE") or (int(data) == 0 and "FALSE") or data
            elif self.colType == "str" or self.colType == "inlineStr":
                self.data = data
            elif self.s_attr:
                s = int(self.s_attr)

                # get cell format
                format_str = "general"
                xfs_numfmt = self.styles.cellXfs[s]
                if xfs_numfmt in self.styles.numFmts:
                    format_str = self.styles.numFmts[xfs_numfmt]
                elif xfs_numfmt in STANDARD_FORMATS:
                    format_str = STANDARD_FORMATS[xfs_numfmt]

                # get format type
                if not format_str:
                    print("unknown format %s at %d" %(format_str,xfs_numfmt))
                    return

                format_type = None
                if format_str in FORMATS:
                    format_type = FORMATS[format_str]
                elif re.match("^\d+(\.\d+)?$", self.data) and re.match(".*[hsmdyY]", format_str) and not re.match('.*\[.*[dmhys].*\]', format_str):
                    # it must be date format
                    if float(self.data) < 1:
                        format_type = "time"
                    else:
                        format_type = "date"
                elif re.match("^-?\d+(.\d+)?$", self.data) or (self.scifloat and re.match("^-?\d+(.\d+)?([eE]-?\d+)?$", self.data)):
                    format_type = "float"
                if format_type == 'date' and self.dateformat == 'float':
                    format_type = "float"
                if format_type and not format_type in self.ignore_formats:
                    try:
                        if format_type == 'date': # date/time
                            if self.workbook.date1904:
                                date = datetime.datetime(1904, 1, 1) + datetime.timedelta(float(self.data))
                            else:
                                date = datetime.datetime(1899, 12, 30) + datetime.timedelta(float(self.data))
                            if self.dateformat:
                                # str(dateformat) - python2.5 bug, see: http://bugs.python.org/issue2782
                                self.data = date.strftime(str(self.dateformat))
                            else:
                                # ignore ";@", don't know what does it mean right now
                                # ignore "[$-409], [$-f409], [$-16001]" and similar format codes
                                dateformat = re.sub(r"\[\$\-[A-z0-9]*\]", "", format_str, 1). \
                                  replace(";@", ""). \
                                  replace("yyyy", "%Y").replace("yy", "%y"). \
                                  replace("hh:mm", "%H:%M").replace("h", "%I").replace("%H%H", "%H").replace("ss", "%S"). \
                                  replace("dddd", "d"). \
                                  replace("dd", "d").replace("d", "%d"). \
                                  replace("am/pm", "%p"). \
                                  replace("mmmm", "%B").replace("mmm", "%b").replace(":mm", ":%M").replace("m", "%m").replace("%m%m", "%m")
                                self.data = date.strftime(str(dateformat)).strip()
                        elif format_type == 'time': # time
                            t = int(round((float(self.data) % 1) * 24*60*60, 6)) # it should be in seconds
                            d = datetime.time((t // 3600) % 24, (t // 60) % 60, t % 60)
                            self.data = d.strftime(self.timeformat)
                        elif format_type == 'float' and ('E' in self.data or 'e' in self.data):
                            self.data = str(self.floatformat or '%f') % float(self.data)
                        # if cell is general, be aggressive about stripping any trailing 0s, decimal points, etc.
                        elif format_type == 'float' and format_str == 'general':
                            self.data = ("%f" %(float(self.data))).rstrip('0').rstrip('.')
                        elif format_type == 'float' and format_str[0:3] == '0.0':
                            if self.floatformat:
                                self.data = str(self.floatformat) % float(self.data)
                            else:
                                L = len(format_str.split(".")[1])
                                if '%' in format_str:
                                    L += 1
                                self.data = ("%." + str(L) + "f") % float(self.data)
                        elif format_type == 'float':
                            # unsupported float formatting
                            self.data = ("%f" %(float(self.data))).rstrip('0').rstrip('.')

                    except (ValueError, OverflowError): # this catch must be removed, it's hiding potential problems
                        # invalid date format
                        pass

    def handleStartElement(self, name, attrs):
        """Update the parsing state for an opening worksheet XML tag.

        name  -- tag name as delivered by the XML parser; it may carry a
                 namespace prefix (e.g. "x:row")
        attrs -- dict of the tag's attributes

        Recognised tags: c (cell), v / is (cell value), row, t (text run),
        sheetData and dimension. Any other tag is ignored.
        """
        has_namespace = name.find(":") > 0

        def is_tag(tag):
            # true for the bare tag and for its namespace-prefixed form
            return name == tag or (has_namespace and name.endswith(":" + tag))

        if self.in_row and is_tag('c'):
            self.colType = attrs.get("t")
            self.s_attr = attrs.get("s")
            self.cellId = attrs.get("r")
            if self.cellId:
                # the cell id is column letters + row number; keep the letters
                self.colNum = self.cellId[:len(self.cellId) - len(self.rowNum)]
                self.colIndex = 0
            else:
                # cell without an "r" attribute: advance the offset that
                # handleEndElement adds to the last known column
                self.colIndex += 1
            self.data = ""
            self.in_cell = True
        elif self.in_cell and (is_tag('v') or is_tag('is')):
            self.in_cell_value = True
            self.collected_string = ""
        elif self.in_sheet and is_tag('row') and ('r' in attrs):
            self.rowNum = attrs['r']
            self.in_row = True
            self.colIndex = 0
            self.colNum = ""
            self.columns = {}
            self.spans = None
            if 'spans' in attrs:
                # "1:4 6:9" -> [6, 9]; only the last span is kept
                self.spans = [int(i) for i in attrs['spans'].split(" ")[-1].split(":")]
        elif name == 't':
            # reset collected string
            self.collected_string = ""

        elif is_tag('sheetData'):
            self.in_sheet = True
        elif name == 'dimension':
            # "ref" is expected to look like "A1:F20". A missing attribute, a
            # single-cell reference, a corner that is not column letters plus
            # row digits, or an end column left of the start column leaves
            # columns_count untouched instead of raising or looping forever.
            rng = (attrs.get("ref") or "").split(":")
            if len(rng) > 1:
                start = re.match(r"^([A-Z]+)(\d+)$", rng[0])
                end = re.match(r"^([A-Z]+)(\d+)$", rng[1])
                if start and end:
                    # column letters -> 1-based column number ("A" = 1)
                    first = 0
                    for ch in start.group(1):
                        first = first * 26 + ord(ch) - 64
                    last = 0
                    for ch in end.group(1):
                        last = last * 26 + ord(ch) - 64
                    if last >= first:
                        # number of columns from start to end, inclusive
                        self.columns_count = last - first + 1

    def handleEndElement(self, name):
        """Finish a cell, row or sheetData element when its closing tag is seen.

        name -- tag name as delivered by the XML parser; it may carry a
                namespace prefix (e.g. "x:row")

        Closing a cell stores its text in self.columns under a zero-based
        column index. Closing a row writes the collected cells through
        self.writer, honouring skip_empty_lines and skip_trailing_columns.
        """
        has_namespace = name.find(":") > 0

        def is_tag(tag):
            # true for the bare tag and for its namespace-prefixed form
            return name == tag or (has_namespace and name.endswith(":" + tag))

        if self.in_cell and (name == 't' or is_tag('v') or is_tag('is')):
            self.in_cell_value = False
        elif self.in_cell and is_tag('c'):
            # column letters -> 1-based column number ("A" = 1, "AA" = 27)
            t = 0
            for i in self.colNum:
                t = t * 26 + ord(i) - 64
            d = self.data
            if self.hyperlinks:
                hyperlink = self.hyperlinks.get(self.cellId)
                if hyperlink:
                    # Only byte strings need decoding. A Python 3 str has no
                    # decode() and used to raise AttributeError here.
                    if hasattr(hyperlink, "decode"):
                        hyperlink = hyperlink.decode("utf-8")
                    d = "<a href='" + hyperlink + "'>" + d + "</a>"
            cellRef = self.colNum + self.rowNum
            if cellRef in self.mergeCells:
                merged = self.mergeCells[cellRef]
                if merged.get('copyFrom') == cellRef:
                    # this cell is the one others copy from: remember its value
                    merged['value'] = d
                else:
                    # take the value recorded for the cell named by copyFrom
                    d = self.mergeCells[merged['copyFrom']]['value']

            self.columns[t - 1 + self.colIndex] = d

        if self.in_row and is_tag('row'):
            if self.columns:
                # lay the sparse {index: value} mapping out as a dense list
                d = [""] * (max(self.columns) + 1)
                for k in self.columns:
                    val = self.columns[k]
                    if not self.py3:
                        val = val.encode("utf-8")
                    d[k] = val
                if self.spans:
                    # pad up to the end of the row's declared span
                    l = self.spans[1]
                    if len(d) < l:
                        d += (l - len(d)) * ['']

                # write empty lines
                if not self.skip_empty_lines:
                    for i in range(self.lastRowNum, int(self.rowNum) - 1):
                        self.writer.writerow([])
                    self.lastRowNum = int(self.rowNum)

                # write line to csv
                if not self.skip_empty_lines or d.count('') != len(d):
                    while len(d) < self.columns_count:
                        d.append("")

                    if self.skip_trailing_columns:
                        if self.max_columns < 0:
                            # first written row: its width without trailing
                            # empty cells becomes the limit for later rows
                            self.max_columns = len(d)
                            while len(d) > 0 and d[-1] == "":
                                d = d[0:-1]
                                self.max_columns = self.max_columns - 1
                        elif self.max_columns > 0:
                            d = d[0:self.max_columns]
                    self.writer.writerow(d)

            self.in_row = False
        elif self.in_sheet and is_tag('sheetData'):
            self.in_sheet = False

    # rangeStr: "A3:C12" or "D5"
    # example: for cell in _range("A1:Z12"): print cell
    def _range(self, rangeStr):
        rng = rangeStr.split(":")
        if len(rng) == 1:
            yield rangeStr
        else:
            start = re.match("^([A-Z]+)(\d+)$", rng[0])
            end = re.match("^([A-Z]+)(\d+)$", rng[1])
            if not start or not end:
                return
            startCol = start.group(1)
            startRow = int(start.group(2))
            endCol = end.group(1)
            endRow = int(end.group(2))
            col = startCol
            while True:
                for row in range(startRow, endRow + 1):
                    yield col + str(row)
                if col == endCol:
                    break
                t = 0
                for i in col: t = t * 26 + ord(i) - 64
                col = ""
                while t >= 0:
                  col = chr(t % 26 + 65) + col
                  t = t // 26 - 1


def convert_recursive(path, sheetid, outfile, kwargs):
    """Convert every file below the directory path, descending into subdirectories.

    path    -- directory to walk
    sheetid -- sheet number handed to Xlsx2csv.convert
    outfile -- output destination. An empty value, or the sys.stdout stream
               the command line substitutes when no output file was named,
               makes each "*.xlsx" file write to a "*.csv" next to it.
    kwargs  -- keyword arguments forwarded to the Xlsx2csv constructor

    Files that are not zip archives are reported and skipped.
    """
    # len() was previously called on outfile, which raised TypeError for the
    # sys.stdout stream (and for None) before any file could be converted
    derive_per_file = not outfile or outfile is sys.stdout

    for name in os.listdir(path):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            convert_recursive(fullpath, sheetid, outfile, kwargs)
        else:
            outfilepath = outfile
            if derive_per_file and fullpath.lower().endswith(".xlsx"):
                # swap the "xlsx" suffix for "csv", keeping the dot
                outfilepath = fullpath[:-4] + 'csv'

            print("Converting %s to %s" %(fullpath, outfilepath))
            try:
                Xlsx2csv(fullpath, **kwargs).convert(outfilepath, sheetid)
            except zipfile.BadZipfile:
                print("File %s is not a zip file" %fullpath)

if __name__ == "__main__":
    # Command-line entry point: parse the options, normalise the textual
    # delimiter / quoting / terminator choices and run the conversion.
    try:
        # restore default signal behaviour; the constants are not available
        # on every platform, hence the AttributeError guard
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
        signal.signal(signal.SIGINT, signal.SIG_DFL)
    except AttributeError:
        pass

    if "ArgumentParser" in globals():
        parser = ArgumentParser(description = "xlsx to csv converter")
        parser.add_argument('infile', metavar='xlsxfile', help="xlsx file path")
        parser.add_argument('outfile', metavar='outfile', nargs='?', help="output csv file path")
        parser.add_argument('-v', '--version', action='version', version=__version__)
        nargs_plus = "+"
        argparser = True
    else:
        # optparse fallback: alias add_option so the option table below
        # works with either parser
        parser = OptionParser(usage = "%prog [options] infile [outfile]", version=__version__)
        parser.add_argument = parser.add_option
        nargs_plus = 1
        argparser = False


    if sys.version_info[0] == 2 and sys.version_info[1] < 5:
        inttype = "int"
    else:
        inttype = int
    parser.add_argument("-a", "--all", dest="all", default=False, action="store_true",
      help="export all sheets")
    parser.add_argument("-c", "--outputencoding", dest="outputencoding", default="utf-8", action="store",
      help="encoding of output csv ** Python 3 only ** (default: utf-8)")
    parser.add_argument("-d", "--delimiter", dest="delimiter", default=",",
      help="delimiter - columns delimiter in csv, 'tab' or 'x09' for a tab (default: comma ',')")
    # the option string used to be listed twice
    parser.add_argument("--hyperlinks", dest="hyperlinks", action="store_true", default=False,
      help="include hyperlinks")
    parser.add_argument("-e", "--escape", dest='escape_strings', default=False, action="store_true",
      help="Escape \\r\\n\\t characters")
    parser.add_argument("-E", "--exclude_sheet_pattern", nargs=nargs_plus, dest="exclude_sheet_pattern", default="",
      help="exclude sheets named matching given pattern, only effects when -a option is enabled.")
    parser.add_argument("-f", "--dateformat", dest="dateformat",
      help="override date/time format (ex. %%Y/%%m/%%d)")
    parser.add_argument("-t", "--timeformat", dest="timeformat",
      help="override time format (ex. %%H/%%M/%%S)")
    parser.add_argument("--floatformat", dest="floatformat",
      help="override float format (ex. %%.15f)")
    parser.add_argument("--sci-float", dest="scifloat", default=False, action="store_true",
      help="force scientific notation to float")
    parser.add_argument("-I", "--include_sheet_pattern", nargs=nargs_plus, dest="include_sheet_pattern", default="^.*$",
      help="only include sheets named matching given pattern, only effects when -a option is enabled.")
    parser.add_argument("--ignore-formats", nargs=nargs_plus, type=str, dest="ignore_formats", default=[''],
      help="Ignores format for specific data types.")
    parser.add_argument("-l", "--lineterminator", dest="lineterminator", default="\n",
      help="line terminator - lines terminator in csv, '\\n' '\\r\\n' or '\\r' (default: \\n)")
    parser.add_argument("-m", "--merge-cells", dest="merge_cells", default=False, action="store_true",
      help="merge cells")
    parser.add_argument("-n", "--sheetname", dest="sheetname", default=None,
      help="sheet name to convert")
    parser.add_argument("-i", "--ignoreempty", dest="skip_empty_lines", default=False, action="store_true",
      help="skip empty lines")
    parser.add_argument("--skipemptycolumns", dest="skip_trailing_columns", default=False, action="store_true",
      help="skip trailing empty columns")
    parser.add_argument("-p", "--sheetdelimiter", dest="sheetdelimiter", default="--------",
      help="sheet delimiter used to separate sheets, pass '' if you do not need delimiter, or 'x07' or '\\f' for form feed (default: '--------')")
    parser.add_argument("-q", "--quoting", dest="quoting", default="minimal",
      help="quoting - fields quoting in csv, 'none' 'minimal' 'nonnumeric' or 'all' (default: minimal)")
    parser.add_argument("-s", "--sheet", dest="sheetid", default=1, type=inttype,
      help="sheet number to convert")

    if argparser:
        options = parser.parse_args()
    else:
        # optparse returns positionals separately; map them onto the same
        # attribute names argparse would have produced
        (options, args) = parser.parse_args()
        if len(args) < 1:
            parser.print_usage()
            sys.stderr.write("error: too few arguments" + os.linesep)
            sys.exit(1)
        options.infile = args[0]
        options.outfile = len(args) > 1 and args[1] or None

    # delimiter: a single character, a symbolic name, or "x" + decimal code
    if len(options.delimiter) == 1:
        pass
    elif options.delimiter == 'tab' or options.delimiter == '\\t':
        options.delimiter = '\t'
    elif options.delimiter == 'comma':
        options.delimiter = ','
    elif options.delimiter.startswith('x'):
        # startswith() also copes with an empty delimiter, which used to
        # raise IndexError; a bad code now gives the regular error message
        try:
            options.delimiter = chr(int(options.delimiter[1:]))
        except (ValueError, OverflowError):
            sys.stderr.write("error: invalid delimiter\n")
            sys.exit(1)
    else:
        sys.stderr.write("error: invalid delimiter\n")
        sys.exit(1)

    if options.quoting == 'none':
        options.quoting = csv.QUOTE_NONE
    elif options.quoting == 'minimal':
        options.quoting = csv.QUOTE_MINIMAL
    elif options.quoting == 'nonnumeric':
        options.quoting = csv.QUOTE_NONNUMERIC
    elif options.quoting == 'all':
        options.quoting = csv.QUOTE_ALL
    else:
        sys.stderr.write("error: invalid quoting\n")
        sys.exit(1)

    # line terminator: the default newline, or an escaped spelling typed on
    # the command line
    if options.lineterminator == '\n':
        pass
    elif options.lineterminator == '\\n':
        options.lineterminator = '\n'
    elif options.lineterminator == '\\r':
        options.lineterminator = '\r'
    elif options.lineterminator == '\\r\\n':
        options.lineterminator = '\r\n'
    else:
        sys.stderr.write("error: invalid line terminator\n")
        sys.exit(1)

    if options.sheetdelimiter == '--------':
        pass
    elif options.sheetdelimiter == '':
        pass
    elif options.sheetdelimiter == '\\f':
        options.sheetdelimiter = '\f'
    elif options.sheetdelimiter[0] == 'x':
        # a non-numeric code is reported like any other invalid value
        try:
            options.sheetdelimiter = chr(int(options.sheetdelimiter[1:]))
        except (ValueError, OverflowError):
            sys.stderr.write("error: invalid sheet delimiter\n")
            sys.exit(1)
    else:
        sys.stderr.write("error: invalid sheet delimiter\n")
        sys.exit(1)

    kwargs = {
      'delimiter' : options.delimiter,
      'quoting' : options.quoting,
      'sheetdelimiter' : options.sheetdelimiter,
      'dateformat' : options.dateformat,
      'timeformat' : options.timeformat,
      'floatformat' : options.floatformat,
      'scifloat' : options.scifloat,
      'skip_empty_lines' : options.skip_empty_lines,
      'skip_trailing_columns' : options.skip_trailing_columns,
      'escape_strings' : options.escape_strings,
      'hyperlinks' : options.hyperlinks,
      'include_sheet_pattern' : options.include_sheet_pattern,
      'exclude_sheet_pattern' : options.exclude_sheet_pattern,
      'merge_cells' : options.merge_cells,
      'outputencoding' : options.outputencoding,
      'lineterminator' : options.lineterminator,
      'ignore_formats' : options.ignore_formats
    }
    sheetid = options.sheetid
    if options.all:
        # sheet id 0 is what "--all" passes to convert()
        sheetid = 0

    outfile = options.outfile or sys.stdout
    try:
        if os.path.isdir(options.infile):
            # an empty path asks convert_recursive to write "<name>.csv"
            # next to each workbook; the stdout stream is not a valid path
            convert_recursive(options.infile, sheetid, options.outfile or "", kwargs)
        else:
            xlsx2csv = Xlsx2csv(options.infile, **kwargs)
            if options.sheetname:
                sheetid = xlsx2csv.getSheetIdByName(options.sheetname)
                if not sheetid:
                    raise XlsxException("Sheet '%s' not found" % options.sheetname)
            xlsx2csv.convert(outfile, sheetid)
    except XlsxException:
        # sys.exc_info() instead of "except ... as e" keeps the syntax valid
        # on the very old interpreters this script still supports
        _, e, _ = sys.exc_info()
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)

猜你喜欢

转载自blog.csdn.net/qq_41394155/article/details/84618469
今日推荐