diff options
Diffstat (limited to 'Tools/XlsToCsv/xlsx2csv.py')
-rw-r--r-- | Tools/XlsToCsv/xlsx2csv.py | 1203 |
1 files changed, 0 insertions, 1203 deletions
diff --git a/Tools/XlsToCsv/xlsx2csv.py b/Tools/XlsToCsv/xlsx2csv.py deleted file mode 100644 index fe7874b..0000000 --- a/Tools/XlsToCsv/xlsx2csv.py +++ /dev/null @@ -1,1203 +0,0 @@ -#!/usr/bin/env python -# -# Copyright information -# -# Copyright (C) 2010-2018 Dilshod Temirkhodjaev <tdilshod@gmail.com> -# -# License -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. -from __future__ import print_function - -__author__ = "Dilshod Temirkhodjaev <tdilshod@gmail.com>" -__license__ = "GPL-2+" -__version__ = "0.7.9" - -import csv, datetime, zipfile, string, sys, os, re, signal -import xml.parsers.expat -from xml.dom import minidom - -try: - # python2.4 - from cStringIO import StringIO -except: - pass -try: - from argparse import ArgumentParser -except: - # python2.4 - from optparse import OptionParser - -# see also ruby-roo lib at: http://github.com/hmcgowan/roo -FORMATS = { - 'general': 'float', - '0': 'float', - '0.00': 'float', - '#,##0': 'float', - '#,##0.00': 'float', - '0%': 'percentage', - '0.00%': 'percentage', - '0.00e+00': 'float', - 'mm-dd-yy': 'date', - 'd-mmm-yy': 'date', - 'd-mmm': 'date', - 'mmm-yy': 'date', - 'h:mm am/pm': 'date', - 'h:mm:ss am/pm': 'date', - 'h:mm': 'time', - 'h:mm:ss': 'time', - 'm/d/yy h:mm': 'date', - '#,##0 ;(#,##0)': 'float', - '#,##0 ;[red](#,##0)': 'float', - '#,##0.00;(#,##0.00)': 'float', - '#,##0.00;[red](#,##0.00)': 'float', - 'mm:ss': 'time', - '[h]:mm:ss': 'time', - 'mmss.0': 'time', - '##0.0e+0': 'float', - '@': 'float', - 'yyyy\\-mm\\-dd': 'date', - 'dd/mm/yy': 'date', - 'hh:mm:ss': 'time', - "dd/mm/yy\\ hh:mm": 'date', - 'dd/mm/yyyy hh:mm:ss': 'date', - 'yy-mm-dd': 'date', - 'd-mmm-yyyy': 'date', - 'm/d/yy': 'date', - 'm/d/yyyy': 'date', - 'dd-mmm-yyyy': 'date', - 'dd/mm/yyyy': 'date', - 'mm/dd/yy h:mm am/pm': 'date', - 'mm/dd/yy hh:mm': 'date', - 'mm/dd/yyyy h:mm am/pm': 'date', - 'mm/dd/yyyy hh:mm:ss': 'date', - 'yyyy-mm-dd hh:mm:ss': 'date', - '#,##0;(#,##0)': 'float', - '_(* #,##0_);_(* (#,##0);_(* "-"??_);_(@_)': 'float', - '_(* #,##0.00_);_(* (#,##0.00);_(* "-"??_);_(@_)': 'float' -} -STANDARD_FORMATS = { - 0: 'general', - 1: '0', - 2: '0.00', - 3: '#,##0', - 4: '#,##0.00', - 9: '0%', - 10: '0.00%', - 11: '0.00e+00', - 12: '# ?/?', - 13: '# ??/??', - 14: 'mm-dd-yy', - 15: 'd-mmm-yy', - 16: 'd-mmm', - 17: 'mmm-yy', - 18: 'h:mm am/pm', - 19: 'h:mm:ss am/pm', - 20: 'h:mm', - 21: 'h:mm:ss', - 22: 'm/d/yy h:mm', - 37: '#,##0 ;(#,##0)', - 38: '#,##0 ;[red](#,##0)', - 39: '#,##0.00;(#,##0.00)', - 40: '#,##0.00;[red](#,##0.00)', - 45: 'mm:ss', - 46: '[h]:mm:ss', - 47: 'mmss.0', - 48: '##0.0e+0', - 49: '@', -} -CONTENT_TYPES = { - 'shared_strings', - 'styles', - 'workbook', - 'worksheet', - 'relationships', -} - -DEFAULT_APP_PATH = "/xl" -DEFAULT_WORKBOOK_PATH = DEFAULT_APP_PATH + "/workbook.xml" - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - -class XlsxException(Exception): - pass - - -class InvalidXlsxFileException(XlsxException): - pass - - -class SheetNotFoundException(XlsxException): - pass - - -class OutFileAlreadyExistsException(XlsxException): - pass - - -class Xlsx2csv: - """ - Usage: Xlsx2csv("test.xslx", **params).convert("test.csv", sheetid=1) - Input: - xlsxfile - path to file or filehandle - options: - sheetid - sheet no to convert (0 for all sheets) - sheetname - sheet name to convert - dateformat - override date/time format - timeformat - override time format - floatformat - override float format - quoting - if and how to quote - delimiter - csv columns delimiter symbol - sheetdelimiter - sheets delimiter used when processing all sheets - skip_empty_lines - skip empty lines - skip_trailing_columns - skip trailing columns - hyperlinks - include hyperlinks - include_sheet_pattern - only include sheets named matching given pattern - exclude_sheet_pattern - exclude sheets named matching given pattern - exclude_hidden_sheets - exclude hidden sheets - """ - - def __init__(self, xlsxfile, **options): - options.setdefault("delimiter", ",") - options.setdefault("quoting", csv.QUOTE_MINIMAL) - options.setdefault("sheetdelimiter", "--------") - options.setdefault("dateformat", None) - options.setdefault("timeformat", None) - options.setdefault("floatformat", None) - options.setdefault("scifloat", False) - options.setdefault("skip_empty_lines", False) - options.setdefault("skip_trailing_columns", False) - options.setdefault("escape_strings", False) - options.setdefault("no_line_breaks", False) - options.setdefault("hyperlinks", False) - options.setdefault("include_sheet_pattern", ["^.*$"]) - options.setdefault("exclude_sheet_pattern", []) - options.setdefault("exclude_hidden_sheets", False) - options.setdefault("merge_cells", False) - options.setdefault("ignore_formats", ['']) - options.setdefault("lineterminator", "\n") - options.setdefault("outputencoding", "utf-8") - - self.options = options - try: - self.ziphandle = zipfile.ZipFile(xlsxfile) - except (zipfile.BadZipfile, IOError): - raise InvalidXlsxFileException("Invalid xlsx file: " + str(xlsxfile)) - - self.py3 = sys.version_info[0] == 3 - - self.content_types = self._parse(ContentTypes, "/[Content_Types].xml") - self.shared_strings = self._parse(SharedStrings, self.content_types.types["shared_strings"]) - self.styles = self._parse(Styles, self.content_types.types["styles"]) - self.workbook = self._parse(Workbook, self.content_types.types["workbook"]) - workbook_relationships = list(filter(lambda r: "book" in r, self.content_types.types["relationships"]))[0] - self.workbook.relationships = self._parse(Relationships, workbook_relationships) - if self.options['no_line_breaks']: - self.shared_strings.replace_line_breaks() - elif self.options['escape_strings']: - self.shared_strings.escape_strings() - - def __del__(self): - # make sure to close zip file, ziphandler does have a close() method - self.ziphandle.close() - - def getSheetIdByName(self, name): - for s in self.workbook.sheets: - if s['name'] == name: - return s['index'] - return None - - def convert(self, outfile, sheetid=1, sheetname=None): - """outfile - path to file or filehandle""" - if sheetname: - sheetid = self.getSheetIdByName(sheetname) - if not sheetid: - raise XlsxException("Sheet '%s' not found" % sheetname) - if sheetid > 0: - self._convert(sheetid, outfile) - else: - if isinstance(outfile, str): - if not os.path.exists(outfile): - os.makedirs(outfile) - elif os.path.isfile(outfile): - raise OutFileAlreadyExistsException("File " + str(outfile) + " already exists!") - for s in self.workbook.sheets: - sheetname = s['name'] - sheetstate = s['state'] - - # filter hidden sheets - if sheetstate in ('hidden', 'veryHidden') and self.options['exclude_hidden_sheets']: - continue - - # filter sheets by include pattern - include_sheet_pattern = self.options['include_sheet_pattern'] - if type(include_sheet_pattern) == type(""): # optparser lib fix - include_sheet_pattern = [include_sheet_pattern] - if len(include_sheet_pattern) > 0: - include = False - for pattern in include_sheet_pattern: - include = pattern and len(pattern) > 0 and re.match(pattern, sheetname) - if include: - break - if not include: - continue - - # filter sheets by exclude pattern - exclude_sheet_pattern = self.options['exclude_sheet_pattern'] - if type(exclude_sheet_pattern) == type(""): # optparser lib fix - exclude_sheet_pattern = [exclude_sheet_pattern] - exclude = False - for pattern in exclude_sheet_pattern: - exclude = pattern and len(pattern) > 0 and re.match(pattern, sheetname) - if exclude: - break - if exclude: - continue - - if not self.py3: - sheetname = sheetname.encode('utf-8') - of = outfile - if isinstance(outfile, str): - of = os.path.join(outfile, sheetname + '.csv') - elif self.options['sheetdelimiter'] and len(self.options['sheetdelimiter']): - of.write(self.options['sheetdelimiter'] + " " + str(s['index']) + " - " + sheetname + self.options['lineterminator']) - self._convert(s['index'], of) - - def _convert(self, sheet_index, outfile): - closefile = False - if isinstance(outfile, str): - if sys.version_info[0] == 2: - outfile = open(outfile, 'wb+') - elif sys.version_info[0] == 3: - outfile = open(outfile, 'w+', encoding=self.options['outputencoding'], newline="") - else: - sys.stderr.write("error: version of your python is not supported: " + str(sys.version_info) + "\n") - sys.exit(1) - closefile = True - try: - writer = csv.writer(outfile, quoting=self.options['quoting'], delimiter=self.options['delimiter'], - lineterminator=self.options['lineterminator']) - - sheets_filtered = list(filter(lambda s: s['index'] == sheet_index, self.workbook.sheets)) - if len(sheets_filtered) == 0: - eprint("Sheet with index %i not found or can't be handled" % sheet_index) - return 1 - - sheet_path = None - # using sheet relation information - if 'relation_id' in sheets_filtered[0] and sheets_filtered[0]['relation_id'] is not None: - - relation_id = sheets_filtered[0]['relation_id'] - if relation_id in self.workbook.relationships.relationships and \ - 'target' in self.workbook.relationships.relationships[relation_id]: - relationship = self.workbook.relationships.relationships[relation_id] - sheet_path = relationship['target'] - if not (sheet_path.startswith("/xl/") or sheet_path.startswith("xl/")): - sheet_path = "/xl/" + sheet_path - - sheet_file = None - if sheet_path is None: - sheet_path = "/xl/worksheets/sheet%i.xml" % sheet_index - sheet_file = self._filehandle(sheet_path) - if sheet_file is None: - sheet_path = None - if sheet_path is None: - sheet_path = "/xl/worksheets/worksheet%i.xml" % sheet_index - sheet_file = self._filehandle(sheet_path) - if sheet_file is None: - sheet_path = None - if sheet_path is None and sheet_index == 1: - sheet_path = self.content_types.types["worksheet"] - sheet_file = self._filehandle(sheet_path) - if sheet_file is None: - sheet_path = None - if sheet_file is None and sheet_path is not None: - sheet_file = self._filehandle(sheet_path) - if sheet_file is None: - raise SheetNotFoundException("Sheet %i not found" % sheet_index) - sheet = Sheet(self.workbook, self.shared_strings, self.styles, sheet_file) - try: - relationships_path = os.path.join(os.path.dirname(sheet_path), - "_rels", - os.path.basename(sheet_path) + ".rels") - sheet.relationships = self._parse(Relationships, relationships_path) - sheet.set_dateformat(self.options['dateformat']) - sheet.set_timeformat(self.options['timeformat']) - sheet.set_floatformat(self.options['floatformat']) - sheet.set_skip_empty_lines(self.options['skip_empty_lines']) - sheet.set_skip_trailing_columns(self.options['skip_trailing_columns']) - sheet.set_include_hyperlinks(self.options['hyperlinks']) - sheet.set_merge_cells(self.options['merge_cells']) - sheet.set_scifloat(self.options['scifloat']) - sheet.set_ignore_formats(self.options['ignore_formats']) - if self.options['escape_strings'] and sheet.filedata: - sheet.filedata = re.sub(r"(<v>[^<>]+) ([^<>]+</v>)", r"\1\\n\2", - re.sub(r"(<v>[^<>]+)	([^<>]+</v>)", r"\1\\t\2", - re.sub(r"(<v>[^<>]+) ([^<>]+</v>)", r"\1\\r\2", sheet.filedata))) - sheet.to_csv(writer) - finally: - sheet_file.close() - sheet.close() - finally: - if closefile: - outfile.close() - - def _filehandle(self, filename): - for name in filter(lambda f: filename and f.lower() == filename.lower()[1:], self.ziphandle.namelist()): - # python2.4 fix - if not hasattr(self.ziphandle, "open"): - return StringIO(self.ziphandle.read(name)) - return self.ziphandle.open(name, "r") - return None - - def _parse(self, klass, filename): - instance = klass() - filehandle = self._filehandle(filename) - if filehandle: - instance.parse(filehandle) - filehandle.close() - return instance - - -class Workbook: - def __init__(self): - self.sheets = list() - self.date1904 = False - - def parse(self, filehandle): - workbookDoc = minidom.parseString(filehandle.read()) - if workbookDoc.firstChild.namespaceURI: - fileVersion = workbookDoc.firstChild.getElementsByTagNameNS(workbookDoc.firstChild.namespaceURI, - "fileVersion") - else: - fileVersion = workbookDoc.firstChild.getElementsByTagName("fileVersion") - if len(fileVersion) == 0: - self.appName = DEFAULT_APP_PATH - else: - try: - if workbookDoc.firstChild.namespaceURI: - self.appName = \ - workbookDoc.firstChild.getElementsByTagNameNS( - workbookDoc.firstChild.namespaceURI, "fileVersion")[0]._attrs['appName'].value - else: - self.appName = workbookDoc.firstChild.getElementsByTagName("fileVersion")[0]._attrs['appName'].value - except KeyError: - # no app name - self.appName = DEFAULT_APP_PATH - try: - if workbookDoc.firstChild.namespaceURI: - self.date1904 = \ - workbookDoc.firstChild.getElementsByTagNameNS( - workbookDoc.firstChild.namespaceURI, "workbookPr")[0]._attrs['date1904'].value.lower().strip() \ - != "false" - else: - self.date1904 = \ - workbookDoc.firstChild.getElementsByTagName("workbookPr")[0] \ - ._attrs['date1904'].value.lower().strip() \ - != "false" - except: - pass - - if workbookDoc.firstChild.namespaceURI: - sheets = workbookDoc.firstChild.getElementsByTagNameNS(workbookDoc.firstChild.namespaceURI, "sheets")[0] - else: - sheets = workbookDoc.firstChild.getElementsByTagName("sheets")[0] - if workbookDoc.firstChild.namespaceURI: - sheetNodes = sheets.getElementsByTagNameNS(workbookDoc.firstChild.namespaceURI, "sheet") - else: - sheetNodes = sheets.getElementsByTagName("sheet") - for i, sheetNode in enumerate(sheetNodes): - attrs = sheetNode._attrs - name = attrs["name"].value - state = None - if 'state' in attrs: - state = attrs["state"].value - relation_id = None - if 'r:id' in attrs: - relation_id = attrs['r:id'].value - self.sheets.append( - { - 'name': name, - 'relation_id': relation_id, - 'index': i + 1, - 'id': i + 1, # remove id starting 0.8.0 version - 'state': state - } - ) - - -class ContentTypes: - def __init__(self): - self.types = {} - for type in CONTENT_TYPES: - self.types[type] = None - - def parse(self, filehandle): - types = minidom.parseString(filehandle.read()).firstChild - if not types: - return - if types.namespaceURI: - overrideNodes = types.getElementsByTagNameNS(types.namespaceURI, "Override") - else: - overrideNodes = types.getElementsByTagName("Override") - for override in overrideNodes: - attrs = override._attrs - type = attrs.get('ContentType').value - name = attrs.get('PartName').value - if type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml": - self.types["workbook"] = name - elif type == "application/vnd.openxmlformats-officedocument.spreadsheetml.styles+xml": - self.types["styles"] = name - elif type == "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml": - # BUG preserved only last sheet - self.types["worksheet"] = name - elif type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml": - self.types["shared_strings"] = name - elif type == "application/vnd.openxmlformats-package.relationships+xml": - if self.types["relationships"] is None: - self.types["relationships"] = list() - self.types["relationships"].append(name) - - if self.types["workbook"] is None: - self.types["workbook"] = DEFAULT_WORKBOOK_PATH - if self.types["relationships"] is None: - self.types["relationships"] = [os.path.dirname(self.types["workbook"]) + "/_rels/" + \ - os.path.basename(self.types["workbook"]) + ".rels"] - - -class Relationships: - def __init__(self): - self.relationships = {} - - def parse(self, filehandle): - doc = minidom.parseString(filehandle.read()) - if doc.namespaceURI: - relationships = doc.getElementsByTagNameNS(doc.namespaceURI, "Relationships") - else: - relationships = doc.getElementsByTagName("Relationships") - if not relationships: - return - if doc.namespaceURI: - relationshipNodes = relationships[0].getElementsByTagNameNS(doc.namespaceURI, "Relationship") - else: - relationshipNodes = relationships[0].getElementsByTagName("Relationship") - for rel in relationshipNodes: - attrs = rel._attrs - rId = attrs.get('Id') - if rId: - vtype = attrs.get('Type') - target = attrs.get('Target') - self.relationships[str(rId.value)] = { - "type": vtype and str(vtype.value) or None, - "target": target and str(target.value) or None - } - - -class Styles: - def __init__(self): - self.numFmts = {} - self.cellXfs = [] - - def parse(self, filehandle): - styles = minidom.parseString(filehandle.read()).firstChild - # numFmts - if styles.namespaceURI: - numFmtsElement = styles.getElementsByTagNameNS(styles.namespaceURI, "numFmts") - else: - numFmtsElement = styles.getElementsByTagName("numFmts") - if len(numFmtsElement) == 1: - for numFmt in numFmtsElement[0].childNodes: - if numFmt.nodeType == minidom.Node.ELEMENT_NODE: - numFmtId = int(numFmt._attrs['numFmtId'].value) - formatCode = numFmt._attrs['formatCode'].value.lower().replace('\\', '') - self.numFmts[numFmtId] = formatCode - - if styles.namespaceURI: - cellXfsElement = styles.getElementsByTagNameNS(styles.namespaceURI, "cellXfs") - else: - cellXfsElement = styles.getElementsByTagName("cellXfs") - if len(cellXfsElement) == 1: - for cellXfs in cellXfsElement[0].childNodes: - if cellXfs.nodeType != minidom.Node.ELEMENT_NODE or not ( - cellXfs.nodeName == "xf" or cellXfs.nodeName.endswith(":xf")): - continue - if cellXfs._attrs and 'numFmtId' in cellXfs._attrs: - numFmtId = int(cellXfs._attrs['numFmtId'].value) - if self.chk_exists(numFmtId) == None: - numFmtId = int(cellXfs._attrs['applyNumberFormat'].value) - self.cellXfs.append(numFmtId) - else: - self.cellXfs.append(None) - - # When Unknown Numformat ID assign applyNumberFormat - def chk_exists(self, numFmtId): - xfs_numfmt = numFmtId - format_str = None - if xfs_numfmt in self.numFmts: - format_str = self.numFmts[xfs_numfmt] - elif xfs_numfmt in STANDARD_FORMATS: - format_str = STANDARD_FORMATS[xfs_numfmt] - return format_str - - -class SharedStrings: - def __init__(self): - self.parser = None - self.strings = [] - self.si = False - self.t = False - self.rPh = False - self.value = "" - - def parse(self, filehandle): - self.parser = xml.parsers.expat.ParserCreate() - self.parser.CharacterDataHandler = self.handleCharData - self.parser.StartElementHandler = self.handleStartElement - self.parser.EndElementHandler = self.handleEndElement - self.parser.ParseFile(filehandle) - - def escape_strings(self): - for i in range(0, len(self.strings)): - self.strings[i] = self.strings[i].replace("\r", "\\r").replace("\n", "\\n").replace("\t", "\\t") - - def replace_line_breaks(self): - for i in range(0, len(self.strings)): - self.strings[i] = self.strings[i].replace("\r", " ").replace("\n", " ").replace("\t", " ") - - def handleCharData(self, data): - if self.t: - self.value += data - - def handleStartElement(self, name, attrs): - # ignore namespace - i = name.find(":") - if i >= 0: - name = name[i + 1:] - - if name == 'si': - self.si = True - self.value = "" - elif name == 't' and self.rPh: - self.t = False - elif name == 't' and self.si: - self.t = True - elif name == 'rPh': - self.rPh = True - - def handleEndElement(self, name): - # ignore namespace - i = name.find(":") - if i >= 0: - name = name[i + 1:] - - if name == 'si': - self.si = False - self.strings.append(self.value) - elif name == 't': - self.t = False - elif name == 'rPh': - self.rPh = False - - -class Sheet: - def __init__(self, workbook, sharedString, styles, filehandle): - self.py3 = sys.version_info[0] == 3 - self.parser = None - self.writer = None - self.sharedString = None - self.styles = None - self.relationships = None - self.columns_count = -1 - - self.in_sheet = False - self.in_row = False - self.in_cell = False - self.in_cell_value = False - - self.columns = {} - self.lastRowNum = 0 - self.rowNum = None - self.colType = None - self.cellId = None - self.s_attr = None - self.data = None - self.max_columns = -1 - - self.dateformat = None - self.timeformat = "%H:%M" # default time format - self.floatformat = None - self.skip_empty_lines = False - self.skip_trailing_columns = False - - self.filedata = None - self.filehandle = filehandle - self.workbook = workbook - self.sharedStrings = sharedString.strings - self.styles = styles - - self.hyperlinks = {} - self.mergeCells = {} - self.ignore_formats = [] - - self.colIndex = 0 - self.colNum = "" - - def close(self): - # Make sure Worksheet is closed, parsers lib does not have a close() function, so simply delete it - self.parser = None - - def set_dateformat(self, dateformat): - self.dateformat = dateformat - - def set_timeformat(self, timeformat): - if timeformat: - self.timeformat = timeformat - - def set_floatformat(self, floatformat): - self.floatformat = floatformat - - def set_skip_empty_lines(self, skip): - self.skip_empty_lines = skip - - def set_skip_trailing_columns(self, skip): - self.skip_trailing_columns = skip - - def set_ignore_formats(self, ignore_formats): - self.ignore_formats = ignore_formats - - def set_merge_cells(self, mergecells): - if not mergecells: - return - if not self.filedata: - self.filedata = self.filehandle.read() - data = str(self.filedata) # python3: convert byte buffer to string - - # find worksheet tag, we need namespaces from it - start = data.find("<worksheet") - if start < 0: - return - end = data.find(">", start) - worksheet = data[start: end + 1] - - # find hyperlinks part - start = data.find("<mergeCells") - if start < 0: - # hyperlinks not found - return - end = data.find("</mergeCells>") - data = data[start: end + 13] - - # parse hyperlinks - doc = minidom.parseString(worksheet + data + "</worksheet>").firstChild - - if doc.namespaceURI: - mergeCells = doc.getElementsByTagNameNS(doc.namespaceURI, "mergeCell") - else: - mergeCells = doc.getElementsByTagName("mergeCell") - for mergeCell in mergeCells: - attrs = mergeCell._attrs - if 'ref' in attrs.keys(): - rangeStr = attrs['ref'].value - rng = rangeStr.split(":") - if len(rng) > 1: - for cell in self._range(rangeStr): - self.mergeCells[cell] = {} - self.mergeCells[cell]['copyFrom'] = rng[0] - - def set_scifloat(self, scifloat): - self.scifloat = scifloat - - def set_include_hyperlinks(self, hyperlinks): - if not hyperlinks or not self.relationships or not self.relationships.relationships: - return - # we must read file first to get hyperlinks, but we don't wont to parse whole file - if not self.filedata: - self.filedata = self.filehandle.read() - data = str(self.filedata) # python3: convert byte buffer to string - - # find worksheet tag, we need namespaces from it - start = data.find("<worksheet") - if start < 0: - return - end = data.find(">", start) - worksheet = data[start: end + 1] - - # find hyperlinks part - start = data.find("<hyperlinks>") - if start < 0: - # hyperlinks not found - return - end = data.find("</hyperlinks>") - data = data[start: end + 13] - - # parse hyperlinks - doc = minidom.parseString(worksheet + data + "</worksheet>").firstChild - if doc.namespaceURI: - hiperlinkNodes = doc.getElementsByTagNameNS(doc.namespaceURI, "hyperlink") - else: - hiperlinkNodes = doc.getElementsByTagName("hyperlink") - for hlink in hiperlinkNodes: - attrs = hlink._attrs - ref = rId = None - for k in attrs.keys(): - if k == "ref": - ref = str(attrs[k].value) - if k.endswith(":id"): - rId = str(attrs[k].value) - if not ref or not rId: - continue - rel = self.relationships.relationships.get(rId) - if not rel: - continue - target = rel.get('target') - for cell in self._range(ref): - self.hyperlinks[cell] = target - - def to_csv(self, writer): - self.writer = writer - self.parser = xml.parsers.expat.ParserCreate() - self.parser.buffer_text = True - self.parser.CharacterDataHandler = self.handleCharData - self.parser.StartElementHandler = self.handleStartElement - self.parser.EndElementHandler = self.handleEndElement - if self.filedata: - self.parser.Parse(self.filedata) - else: - self.parser.ParseFile(self.filehandle) - - def handleCharData(self, data): - if self.in_cell_value: - format_type = None - format_str = "general" - self.collected_string += data - self.data = self.collected_string - if self.colType == "s": # shared string - format_type = "string" - self.data = self.sharedStrings[int(self.data)] - elif self.colType == "b": # boolean - format_type = "boolean" - self.data = (int(data) == 1 and "TRUE") or (int(data) == 0 and "FALSE") or data - elif self.colType == "str" or self.colType == "inlineStr": - format_type = "string" - self.data = data - elif self.s_attr: - s = int(self.s_attr) - - # get cell format - xfs_numfmt = None - if s < len(self.styles.cellXfs): - xfs_numfmt = self.styles.cellXfs[s] - if xfs_numfmt in self.styles.numFmts: - format_str = self.styles.numFmts[xfs_numfmt] - elif xfs_numfmt in STANDARD_FORMATS: - format_str = STANDARD_FORMATS[xfs_numfmt] - - # get format type - if not format_str: - eprint("unknown format %s at %d" % (format_str, xfs_numfmt)) - return - - if format_str in FORMATS: - format_type = FORMATS[format_str] - elif re.match("^\d+(\.\d+)?$", self.data) and re.match(".*[hsmdyY]", format_str) and not re.match( - '.*\[.*[dmhys].*\]', format_str): - # it must be date format - if float(self.data) < 1: - format_type = "time" - else: - format_type = "date" - elif re.match("^-?\d+(.\d+)?$", self.data) or ( - self.scifloat and re.match("^-?\d+(.\d+)?([eE]-?\d+)?$", self.data)): - format_type = "float" - if format_type == 'date' and self.dateformat == 'float': - format_type = "float" - elif self.colType == "n": - format_type = "float" - - if format_type and not format_type in self.ignore_formats: - try: - if format_type == 'date': # date/time - if self.workbook.date1904: - date = datetime.datetime(1904, 1, 1) + datetime.timedelta(float(self.data)) - else: - date = datetime.datetime(1899, 12, 30) + datetime.timedelta(float(self.data)) - if self.dateformat: - # str(dateformat) - python2.5 bug, see: http://bugs.python.org/issue2782 - self.data = date.strftime(str(self.dateformat)) - else: - # ignore ";@", don't know what does it mean right now - # ignore "[$-409], [$-f409], [$-16001]" and similar format codes - dateformat = re.sub(r"\[\$\-[A-z0-9]*\]", "", format_str, 1) \ - .replace(";@", "").replace("yyyy", "%Y").replace("yy", "%y") \ - .replace("hh:mm", "%H:%M").replace("h", "%I").replace("%H%H", "%H") \ - .replace("ss", "%S").replace("dddd", "d").replace("dd", "d").replace("d", "%d") \ - .replace("am/pm", "%p").replace("mmmm", "%B").replace("mmm", "%b") \ - .replace(":mm", ":%M").replace("m", "%m").replace("%m%m", "%m") - self.data = date.strftime(str(dateformat)).strip() - elif format_type == 'time': # time - t = int(round((float(self.data) % 1) * 24 * 60 * 60, 6)) # it should be in seconds - d = datetime.time(int((t // 3600) % 24), int((t // 60) % 60), int(t % 60)) - self.data = d.strftime(self.timeformat) - elif format_type == 'float' and ('E' in self.data or 'e' in self.data): - self.data = str(self.floatformat or '%f') % float(self.data) - # if cell is general, be aggressive about stripping any trailing 0s, decimal points, etc. - elif format_type == 'float' and format_str == 'general': - self.data = ("%f" % (float(self.data))).rstrip('0').rstrip('.') - elif format_type == 'float' and format_str[0:3] == '0.0': - if self.floatformat: - self.data = str(self.floatformat) % float(self.data) - else: - L = len(format_str.split(".")[1]) - if '%' in format_str: - L += 1 - self.data = ("%." + str(L) + "f") % float(self.data) - elif format_type == 'float': - # unsupported float formatting - self.data = ("%f" % (float(self.data))).rstrip('0').rstrip('.') - - except (ValueError, OverflowError): # this catch must be removed, it's hiding potential problems - eprint("Error: potential invalid date format.") - # invalid date format - pass - - def handleStartElement(self, name, attrs): - has_namespace = name.find(":") > 0 - if self.in_row and (name == 'c' or (has_namespace and name.endswith(':c'))): - self.colType = attrs.get("t") - self.s_attr = attrs.get("s") - self.cellId = attrs.get("r") - if self.cellId: - self.colNum = self.cellId[:len(self.cellId) - len(self.rowNum)] - self.colIndex = 0 - else: - self.colIndex += 1 - self.data = "" - self.in_cell = True - elif self.in_cell and ( - (name == 'v' or name == 'is') or (has_namespace and (name.endswith(':v') or name.endswith(':is')))): - self.in_cell_value = True - self.collected_string = "" - elif self.in_sheet and (name == 'row' or (has_namespace and name.endswith(':row'))) and ('r' in attrs): - self.rowNum = attrs['r'] - self.in_row = True - self.colIndex = 0 - self.colNum = "" - self.columns = {} - self.spans = None - if 'spans' in attrs: - self.spans = [int(i) for i in attrs['spans'].split(" ")[-1].split(":")] - elif name == 't': - # reset collected string - self.collected_string = "" - - elif name == 'sheetData' or (has_namespace and name.endswith(':sheetData')): - self.in_sheet = True - elif name == 'dimension': - rng = attrs.get("ref").split(":") - if len(rng) > 1: - start = re.match("^([A-Z]+)(\d+)$", rng[0]) - if (start): - end = re.match("^([A-Z]+)(\d+)$", rng[1]) - startCol = start.group(1) - endCol = end.group(1) - self.columns_count = 0 - for cell in self._range(startCol + "1:" + endCol + "1"): - self.columns_count += 1 - - def handleEndElement(self, name): - has_namespace = name.find(":") > 0 - if self.in_cell and ((name == 'v' or name == 'is' or name == 't') or ( - has_namespace and (name.endswith(':v') or name.endswith(':is')))): - self.in_cell_value = False - elif self.in_cell and (name == 'c' or (has_namespace and name.endswith(':c'))): - t = 0 - for i in self.colNum: t = t * 26 + ord(i) - 64 - d = self.data - if self.hyperlinks: - hyperlink = self.hyperlinks.get(self.cellId) - if hyperlink: - d = "<a href='" + hyperlink + "'>" + d + "</a>" - if self.colNum + self.rowNum in self.mergeCells.keys(): - if 'copyFrom' in self.mergeCells[self.colNum + self.rowNum].keys() and \ - self.mergeCells[self.colNum + self.rowNum]['copyFrom'] == self.colNum + self.rowNum: - self.mergeCells[self.colNum + self.rowNum]['value'] = d - else: - d = self.mergeCells[self.mergeCells[self.colNum + self.rowNum]['copyFrom']]['value'] - - self.columns[t - 1 + self.colIndex] = d - - if self.in_row and (name == 'row' or (has_namespace and name.endswith(':row'))): - if len(self.columns.keys()) > 0: - if min(self.columns.keys()) < 0: # Weird - d = [] - keys = self.columns.keys() - keys.sort() - for k in keys: - val = self.columns[k] - if not self.py3: - val = val.encode("utf-8") - d.append(val) - else: - d = [""] * (max(self.columns.keys()) + 1) - for k in self.columns.keys(): - val = self.columns[k] - if not self.py3: - val = val.encode("utf-8") - d[k] = val - if self.spans: - l = self.spans[1] - if len(d) < l: - d += (l - len(d)) * [''] - - # write empty lines - if not self.skip_empty_lines: - for i in range(self.lastRowNum, int(self.rowNum) - 1): - self.writer.writerow([]) - self.lastRowNum = int(self.rowNum) - - # write line to csv - if not self.skip_empty_lines or d.count('') != len(d): - while len(d) < self.columns_count: - d.append("") - - if self.skip_trailing_columns: - if self.max_columns < 0: - self.max_columns = len(d) - while len(d) > 0 and d[-1] == "": - d = d[0:-1] - self.max_columns = self.max_columns - 1 - elif self.max_columns > 0: - d = d[0:self.max_columns] - self.writer.writerow(d) - - self.in_row = False - elif self.in_sheet and (name == 'sheetData' or (has_namespace and name.endswith(':sheetData'))): - self.in_sheet = False - - # rangeStr: "A3:C12" or "D5" - # example: for cell in _range("A1:Z12"): print cell - def _range(self, rangeStr): - rng = rangeStr.split(":") - if len(rng) == 1: - yield rangeStr - else: - start = re.match("^([A-Z]+)(\d+)$", rng[0]) - end = re.match("^([A-Z]+)(\d+)$", rng[1]) - if not start or not end: - return - startCol = start.group(1) - startRow = int(start.group(2)) - endCol = end.group(1) - endRow = int(end.group(2)) - col = startCol - while True: - for row in range(startRow, endRow + 1): - yield col + str(row) - if col == endCol: - break - t = 0 - for i in col: t = t * 26 + ord(i) - 64 - col = "" - while t >= 0: - col = chr(t % 26 + 65) + col - t = t // 26 - 1 - - -def convert_recursive(path, sheetid, outfile, kwargs): - for name in os.listdir(path): - fullpath = os.path.join(path, name) - if os.path.isdir(fullpath): - convert_recursive(fullpath, sheetid, outfile, kwargs) - else: - outfilepath = outfile - if len(outfilepath) == 0 and fullpath.lower().endswith(".xlsx"): - outfilepath = fullpath[:-4] + 'csv' - - print("Converting %s to %s" % (fullpath, outfilepath)) - try: - Xlsx2csv(fullpath, **kwargs).convert(outfilepath, sheetid) - except zipfile.BadZipfile: - print("File %s is not a zip file" % fullpath) - - -if __name__ == "__main__": - try: - signal.signal(signal.SIGPIPE, signal.SIG_DFL) - signal.signal(signal.SIGINT, signal.SIG_DFL) - except AttributeError: - pass - - if "ArgumentParser" in globals(): - parser = ArgumentParser(description="xlsx to csv converter") - parser.add_argument('infile', metavar='xlsxfile', help="xlsx file path") - parser.add_argument('outfile', metavar='outfile', nargs='?', help="output csv file path") - parser.add_argument('-v', '--version', action='version', version=__version__) - nargs_plus = "+" - argparser = True - else: - parser = OptionParser(usage="%prog [options] infile [outfile]", version=__version__) - parser.add_argument = parser.add_option - nargs_plus = 1 - argparser = False - - if sys.version_info[0] == 2 and sys.version_info[1] < 5: - inttype = "int" - else: - inttype = int - parser.add_argument("-a", "--all", dest="all", default=False, action="store_true", - help="export all sheets") - parser.add_argument("-c", "--outputencoding", dest="outputencoding", default="utf-8", action="store", - help="encoding of output csv ** Python 3 only ** (default: utf-8)") - parser.add_argument("-d", "--delimiter", dest="delimiter", default=",", - help="delimiter - columns delimiter in csv, 'tab' or 'x09' for a tab (default: comma ',')") - parser.add_argument("--hyperlinks", "--hyperlinks", dest="hyperlinks", action="store_true", default=False, - help="include hyperlinks") - parser.add_argument("-e", "--escape", dest='escape_strings', default=False, action="store_true", - help="Escape \\r\\n\\t characters") - parser.add_argument("--no-line-breaks", "--no-line-breaks", dest='no_line_breaks', default=False, action="store_true", - help="Replace \\r\\n\\t with space") - parser.add_argument("-E", "--exclude_sheet_pattern", nargs=nargs_plus, dest="exclude_sheet_pattern", default="", - help="exclude sheets named matching given pattern, only effects when -a option is enabled.") - parser.add_argument("-f", "--dateformat", dest="dateformat", - help="override date/time format (ex. %%Y/%%m/%%d)") - parser.add_argument("-t", "--timeformat", dest="timeformat", - help="override time format (ex. %%H/%%M/%%S)") - parser.add_argument("--floatformat", dest="floatformat", - help="override float format (ex. %%.15f)") - parser.add_argument("--sci-float", dest="scifloat", default=False, action="store_true", - help="force scientific notation to float") - parser.add_argument("-I", "--include_sheet_pattern", nargs=nargs_plus, dest="include_sheet_pattern", default="^.*$", - help="only include sheets named matching given pattern, only effects when -a option is enabled.") - parser.add_argument("--exclude_hidden_sheets", default=False, action="store_true", - help="Exclude hidden sheets from the output, only effects when -a option is enabled.") - parser.add_argument("--ignore-formats", nargs=nargs_plus, type=str, dest="ignore_formats", default=[''], - help="Ignores format for specific data types.") - parser.add_argument("-l", "--lineterminator", dest="lineterminator", default="\n", - help="line terminator - lines terminator in csv, '\\n' '\\r\\n' or '\\r' (default: \\n)") - parser.add_argument("-m", "--merge-cells", dest="merge_cells", default=False, action="store_true", - help="merge cells") - parser.add_argument("-n", "--sheetname", dest="sheetname", default=None, - help="sheet name to convert") - parser.add_argument("-i", "--ignoreempty", dest="skip_empty_lines", default=False, action="store_true", - help="skip empty lines") - parser.add_argument("--skipemptycolumns", dest="skip_trailing_columns", default=False, action="store_true", - help="skip trailing empty columns") - parser.add_argument("-p", "--sheetdelimiter", dest="sheetdelimiter", default="--------", - help="sheet delimiter used to separate sheets, pass '' if you do not need delimiter, or 'x07' " - "or '\\f' for form feed (default: '--------')") - parser.add_argument("-q", "--quoting", dest="quoting", default="minimal", - help="quoting - fields quoting in csv, 'none' 'minimal' 'nonnumeric' or 'all' (default: minimal)") - parser.add_argument("-s", "--sheet", dest="sheetid", default=1, type=inttype, - help="sheet number to convert") - - if argparser: - options = parser.parse_args() - else: - (options, args) = parser.parse_args() - if len(args) < 1: - parser.print_usage() - sys.stderr.write("error: too few arguments" + os.linesep) - sys.exit(1) - options.infile = args[0] - options.outfile = len(args) > 1 and args[1] or None - - if len(options.delimiter) == 1: - pass - elif options.delimiter == 'tab' or options.delimiter == '\\t': - options.delimiter = '\t' - elif options.delimiter == 'comma': - options.delimiter = ',' - elif options.delimiter[0] == 'x': - options.delimiter = chr(int(options.delimiter[1:])) - else: - sys.stderr.write("error: invalid delimiter\n") - sys.exit(1) - - if options.quoting == 'none': - options.quoting = csv.QUOTE_NONE - elif options.quoting == 'minimal': - options.quoting = csv.QUOTE_MINIMAL - elif options.quoting == 'nonnumeric': - options.quoting = csv.QUOTE_NONNUMERIC - elif options.quoting == 'all': - options.quoting = csv.QUOTE_ALL - else: - sys.stderr.write("error: invalid quoting\n") - sys.exit(1) - - if options.lineterminator == '\n': - pass - elif options.lineterminator == '\\n': - options.lineterminator = '\n' - elif options.lineterminator == '\\r': - options.lineterminator = '\r' - elif options.lineterminator == '\\r\\n': - options.lineterminator = '\r\n' - else: - sys.stderr.write("error: invalid line terminator\n") - sys.exit(1) - - if options.sheetdelimiter == '--------': - pass - elif options.sheetdelimiter == '': - pass - elif options.sheetdelimiter == '\\f': - options.sheetdelimiter = '\f' - elif options.sheetdelimiter[0] == 'x': - options.sheetdelimiter = chr(int(options.sheetdelimiter[1:])) - else: - sys.stderr.write("error: invalid sheet delimiter\n") - sys.exit(1) - - kwargs = { - 'delimiter': options.delimiter, - 'quoting': options.quoting, - 'sheetdelimiter': options.sheetdelimiter, - 'dateformat': options.dateformat, - 'timeformat': options.timeformat, - 'floatformat': options.floatformat, - 'scifloat': options.scifloat, - 'skip_empty_lines': options.skip_empty_lines, - 'skip_trailing_columns': options.skip_trailing_columns, - 'escape_strings': options.escape_strings, - 'no_line_breaks': options.no_line_breaks, - 'hyperlinks': options.hyperlinks, - 'include_sheet_pattern': options.include_sheet_pattern, - 'exclude_sheet_pattern': options.exclude_sheet_pattern, - 'exclude_hidden_sheets': options.exclude_hidden_sheets, - 'merge_cells': options.merge_cells, - 'outputencoding': options.outputencoding, - 'lineterminator': options.lineterminator, - 'ignore_formats': options.ignore_formats - } - sheetid = options.sheetid - if options.all: - sheetid = 0 - - outfile = options.outfile or sys.stdout - try: - if os.path.isdir(options.infile): - convert_recursive(options.infile, sheetid, outfile, kwargs) - else: - xlsx2csv = Xlsx2csv(options.infile, **kwargs) - if options.sheetname: - sheetid = xlsx2csv.getSheetIdByName(options.sheetname) - if not sheetid: - raise XlsxException("Sheet '%s' not found" % options.sheetname) - xlsx2csv.convert(outfile, sheetid) - except XlsxException: - _, e, _ = sys.exc_info() - sys.stderr.write(str(e) + "\n") - sys.exit(1) |