Folgendes liest die Zellenwerte in eine Matrix und gibt diese dann aus.
Code: Alles auswählen
#!/usr/bin/env python
# -*- coding: windows-1251 -*-
# Copyright (C) 2005 Kiseliov Roman
__rev_id__ = """$Id: xls2csv.py,v 1.1 2005/10/26 07:44:24 rvk Exp $"""
from pyExcelerator import *
import sys
me, args = sys.argv[0], sys.argv[1:]
if args:
for arg in args:
print >>sys.stderr, 'extracting data from', arg
for sheet_name, values in parse_xls(arg, 'cp1251'): # parse_xls(arg) -- default encoding
matrix = [[]]
print 'Sheet = "%s"' % sheet_name.encode('cp866', 'backslashreplace')
print '----------------'
for row_idx, col_idx in sorted(values.keys()):
v = values[(row_idx, col_idx)]
if isinstance(v, unicode):
v = v.encode('cp866', 'backslashreplace')
else:
v = `v`
v = '"%s"' % v.strip()
last_row, last_col = len(matrix), len(matrix[-1])
while last_row <= row_idx:
matrix.extend([[]])
last_row = len(matrix)
while last_col < col_idx:
matrix[-1].extend([''])
last_col = len(matrix[-1])
matrix[-1].extend([v])
for row in matrix:
csv_row = ', '.join(row)
print csv_row
else:
print 'usage: %s (inputfile)+' % me
Code: Alles auswählen
#!/usr/bin/env python
# -*- coding: windows-1251 -*-
# Copyright (C) 2005 Roman V. Kiseliov
# All rights reserved.
# ...
import struct
encodings = {
0x016F: 'ascii', #ASCII
0x01B5: 'cp437', #IBM PC CP-437 (US)
0x02D0: 'cp720', #IBM PC CP-720 (OEM Arabic)
0x02E1: 'cp737', #IBM PC CP-737 (Greek)
#...
}
biff8 = True
SST = {}
sheets = []
sheet_names = []
values = {}
ws_num = 0
BOFs = 0
EOFs = 0
# Inside MS Office document looks like filesystem
# We need extract stream named 'Workbook' or 'Book'
ole_streams = CompoundDoc.Reader(filename).STREAMS
if 'Workbook' in ole_streams:
workbook_stream = ole_streams['Workbook']
elif 'Book' in ole_streams:
workbook_stream = ole_streams['Book']
else:
raise Exception, 'No workbook stream in file.'
workbook_stream_len = len(workbook_stream)
stream_pos = 0
# Excel's method of data storing is based on
# ancient technology "TLV" (Type, Length, Value).
# In addition, if record size grows to some limit
# Excel writes CONTINUE records
while stream_pos < workbook_stream_len and EOFs <= ws_num:
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
stream_pos += data_size
if rec_id == 0x0809: # BOF
#print 'BOF',
BOFs += 1
ver, substream_type = unpack('<2H', rec_data[:4])
if substream_type == 0x0005:
# workbook global substream
biff8 = ver >= 0x0600
elif substream_type == 0x0010:
# worksheet substream
pass
else: # skip chart stream or unknown stream
# stream offsets may be used from BOUNDSHEET record
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
while rec_id != 0x000A: # EOF
#print 'SST CONTINUE'
stream_pos += 4
stream_pos += data_size
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
#print 'BIFF8 == ', biff8
elif rec_id == 0x000A: # EOF
#print 'EOF'
if BOFs > 1:
sheets.extend([values])
values = {}
EOFs += 1
elif rec_id == 0x0042: # CODEPAGE
cp , = unpack('<H', rec_data)
#print 'CODEPAGE', hex(cp)
if not encoding:
encoding = encodings[cp]
#print encoding
elif rec_id == 0x0085: # BOUNDSHEET
#print 'BOUNDSHEET',
ws_num += 1
b = process_BOUNDSHEET(biff8, rec_data)
sheet_names.extend([b])
#print b.encode('cp866')
elif rec_id == 0x00FC: # SST
#print 'SST'
sst_data = rec_data
sst_continues = []
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
while rec_id == 0x003C: # CONTINUE
#print 'SST CONTINUE'
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
sst_continues.extend([rec_data])
stream_pos += data_size
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
SST = process_SST(sst_data, sst_continues)
elif rec_id == 0x00FD: # LABELSST
#print 'LABELSST',
r, c, i = process_LABELSST(rec_data)
values[(r, c)] = SST[i]
#print r, c, SST[i].encode('cp866')
elif rec_id == 0x0204: # LABEL
#print 'LABEL',
r, c, b = process_LABEL(biff8, rec_data)
values[(r, c)] = b
#print r, c, b.encode('cp866')
elif rec_id == 0x00D6: # RSTRING
#print 'RSTRING',
r, c, b = process_RSTRING(biff8, rec_data)
values[(r, c)] = b
#print r, c, b.encode('cp866')
elif rec_id == 0x027E: # RK
#print 'RK',
r, c, b = process_RK(rec_data)
values[(r, c)] = b
#print r, c, b
elif rec_id == 0x00BD: # MULRK
#print 'MULRK',
for r, c, b in process_MULRK(rec_data):
values[(r, c)] = b
#print r, c, b
elif rec_id == 0x0203: # NUMBER
#print 'NUMBER',
r, c, b = process_NUMBER(rec_data)
values[(r, c)] = b
#print r, c, b
elif rec_id == 0x0006: # FORMULA
#print 'FORMULA',
r, c, x = unpack('<3H', rec_data[0:6])
if rec_data[12] == '\xFF' and rec_data[13] == '\xFF':
if rec_data[6] == '\x00':
got_str = False
if ord(rec_data[14]) & 8:
# part of shared formula
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
stream_pos += data_size
if rec_id == 0x0207: # STRING
got_str = True
elif rec_id not in (0x0221, 0x04BC, 0x0236, 0x0037, 0x0036):
raise Exception("Expected ARRAY, SHRFMLA, TABLEOP* or STRING record")
if not got_str:
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
stream_pos += data_size
if rec_id != 0x0207: # STRING
raise Exception("Expected STRING record")
values[(r, c)] = unpack2str(biff8, rec_data)
elif rec_data[6] == '\x01':
# boolean
v = ord(rec_data[8])
values[(r, c)] = bool(v)
elif rec_data[6] == '\x02':
# error
v = ord(rec_data[8])
if v in ExcelMagic.error_msg_by_code:
values[(r, c)] = ExcelMagic.error_msg_by_code[v]
else:
values[(r, c)] = u'#UNKNOWN ERROR!'
elif rec_data[6] == '\x03':
# empty
values[(r, c)] = u''
else:
raise Exception("Unknown value for formula result")
else:
# 64-bit float
d, = unpack("<d", rec_data[6:14])
values[(r, c)] = d
encoding = None
return zip(sheet_names, sheets)
Das was mir fehlt ist eine Auswertung des XF-Records und FONT.
gefunden.