Das Programm befindet sich leider noch in einem frühen Stadium, hat aber bereits einiges an Lernen und Arbeit erfordert und gibt schon einige Antworten auf meine Fragen. Ich will mehr über die 'harte' und 'weiche' Formatierung verstehen bzw. das Dokument auch maschinell verändern sowie kontrolliert in anderen Formaten ausgeben können. Bei entsprechender Einstellung von .print_flag liefert es sehr viele Angaben. Ich bitte um Anregungen und Verbesserungsvorschläge.
Code: Alles auswählen
#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
'''Tool for the analysis of XML documents
Gerhard Enders
start 2008.07.18
working version 2008.08.07
To Do:
find another solution instead of 'c_' / 's_' for marking the style source, to
show equal styles in style and content, perhaps a new attribute 'source'
check for .zip files
many others
'''
import os
import sys
import zipfile
import xml.dom.minidom
#------------------------------------------------------------------------------
class HandleOpenDocumentFiles(object):
    '''Access to the components of an ODF file, i.e. a .zip container.

    It should become more general; ODF is only a special case of a zip file.
    '''
    #----------------------------------------------------------------------------
    def __init__(self, odf_filename):
        '''Remember the location of the ODF file.

        odf_filename -- name of the file; a relative name is resolved against
                        the current working directory.

        If no name is given, a message is printed and self.odf_file is set
        to None, so the attribute always exists.
        '''
        if odf_filename:
            # os.path.join uses the separator of the running platform and
            # returns an absolute odf_filename unchanged.
            self.odf_file = os.path.join(os.getcwd(), odf_filename)
        else:
            print('Sorry, you forgot to give an ODF file name.')
            self.odf_file = None
        # TODO: check with zipfile.is_zipfile() that the file is a valid
        # pkzip file.
    #----------------------------------------------------------------------------
    def get_odf_component(self, component_name):
        '''Return the raw content of the named component of the ODF file.

        component_name -- archive member name, e.g. 'content.xml'.
        Exceptions raised by the zipfile module are passed on to the caller.
        '''
        odf_zipfile = zipfile.ZipFile(self.odf_file, 'r')
        try:
            return odf_zipfile.read(component_name)
        finally:
            # close the archive even when the member cannot be read
            odf_zipfile.close()
    #----------------------------------------------------------------------------
    def print_odf_zipfile_info(self):
        '''Print name, compressed and uncompressed size of every component.

        Returns the sum of the compressed sizes.
        '''
        text_row = ' %-40s %12s %12s'
        size_row = ' %-40s %12d %12d'
        total_compressed_size = 0
        total_uncompressed_size = 0
        odf_zipfile = zipfile.ZipFile(self.odf_file, 'r')
        try:
            self.print_separator()
            print('The components in the %s ODF zip file are:\n'
                  % (self.odf_file,))
            print(text_row % ('Name', 'Compressed', 'Uncompressed'))
            print(text_row % (' ', 'Size', 'Size') + '\n')
            for info in odf_zipfile.infolist():
                total_compressed_size += info.compress_size
                total_uncompressed_size += info.file_size
                print(size_row % (info.filename,
                                  info.compress_size,
                                  info.file_size))
            print(text_row % (' ', '--------', '--------'))
            # No newline after the total line: the separator printed below
            # starts with one.
            sys.stdout.write(size_row % ('Total',
                                         total_compressed_size,
                                         total_uncompressed_size))
        finally:
            odf_zipfile.close()
        self.print_separator()
        return total_compressed_size
    #----------------------------------------------------------------------------
    def print_separator(self):
        '''Print an empty line followed by a line of 79 dashes.'''
        print('\n' + 79 * '-')
    #----------------------------------------------------------------------------
    def rewrite_zipfile(self, name, old):
        '''Create the archive name + '.zip' and open the archive old.

        Unfinished: no component is copied yet, so the new archive stays
        empty. Both archives are closed again before returning.
        '''
        new_name = name + '.zip'
        new_zipfile = zipfile.ZipFile(new_name, 'w')
        try:
            old_zipfile = zipfile.ZipFile(old, 'r')
            old_zipfile.close()
        finally:
            new_zipfile.close()
    #----------------------------------------------------------------------------
    def print_attr(self, list):
        '''Print the type of every item of the given sequence (debug aid).

        The parameter name shadows the builtin 'list'; it is kept so that
        existing keyword calls continue to work.
        '''
        for item in list:
            print(type(item))
#------------------------------------------------------------------------------
class AnalyseXmlDocument(object):
    '''Tools for the analysis of an XML document.

    Walks a DOM tree, collects the attributes of every node in node_hash
    and builds a text protocol of the document in document_content.
    '''
    def __init__(self):
        '''Set up the lookup tables and the empty analysis state.'''
        # Names of the DOM node types, indexed by node.nodeType (1..12).
        self.ntype = [
            '',
            'ELEMENT_NODE',
            'ATTRIBUTE_NODE',
            'TEXT_NODE',
            'CDATA_SECTION_NODE',
            'ENTITY_REFERENCE_NODE',
            'ENTITY_NODE',
            'PROCESSING_INSTRUCTION_NODE',
            'COMMENT_NODE',
            'DOCUMENT_NODE',
            'DOCUMENT_TYPE_NODE',
            'DOCUMENT_FRAGMENT_NODE',
            'NOTATION_NODE',
        ]
        # Attributes whose value is appended to the node name.
        self.attr_names = [
            'style:name',
            'style:font-name',
            'style:list-style-name',
            'text:name',
            'text:style-name',
        ]
        self.document_content = ''  # text protocol: BEG/END lines and text
        self.floor = 0              # current depth in the tree
        self.node_hash = {}         # part + node name -> list of attribute dicts
        self.stairs = {}            # depth -> name of the last node seen there
        self.print_flag = False     # True: printInfo() writes to stdout
        self.text_flag = False      # True after the first text node was recorded
    #----------------------------------------------------------------------------
    def walkingThroughDocument(self, element, part):
        '''Walk recursively through the whole element.

        element -- DOM node to start with; every visited object is a node.
        part    -- prefix for the keys of node_hash (e.g. 'c_' or 's_').
        '''
        self.documentNode(element, part)
        self.floor += 1
        for node in element.childNodes:
            self.walkingThroughDocument(node, part)
        self.floor -= 1
        # take care of formatting end
        if self.text_flag and self.stairs[self.floor] != '#text':
            self.document_content += self.formatingInfo('END', self.floor)
        return
    #----------------------------------------------------------------------------
    def formatingInfo(self, case, floor):
        '''Return the line '<floor> <case>: <node name on that floor>'.'''
        return '%2s %s: %s\n' % (floor, case, self.stairs[floor])
    #----------------------------------------------------------------------------
    def printInfo(self, info):
        '''Print info, but only if print_flag is set.'''
        if self.print_flag:
            print(info)
        return
    #----------------------------------------------------------------------------
    def walkingThroughDocumentFinish(self):
        '''Reset text_flag after a document has been walked through.'''
        self.text_flag = False
    #----------------------------------------------------------------------------
    def documentNode(self, node, part):
        '''Record one node and print all information about it.

        node -- the DOM node found on the current floor.
        part -- prefix for the key under which the attributes are stored.
        '''
        self.printInfo('%s %r' % (self.floor, node))
        self.printInfo('Type : %s %s' % (node.nodeType,
                                         self.ntype[node.nodeType]))
        self.printInfo('NodeName : %s %s' % (node.nodeName, node.localName))
        attr = {}
        name = node.nodeName
        if node.nodeType != node.TEXT_NODE:
            # Only elements are asked for attributes; other node types
            # (e.g. comments) are reported as having none.
            if node.nodeType == node.ELEMENT_NODE and node.hasAttributes():
                keys = list(node.attributes.keys())
                self.printInfo('Attributes: %d %s' % (len(keys), keys))
                for item in keys:
                    value = node.getAttribute(item)
                    self.printInfo(" %-35s %-15s " % (item, value))
                    attr[item] = value
                    if item in self.attr_names:
                        # the style name becomes part of the node name
                        name += '_' + value
                        self.printInfo(value)
            else:
                self.printInfo('Attributes: None')
        self.node_hash.setdefault(part + name, []).append(attr)
        self.printInfo(attr)
        # update floor-name
        self.stairs[self.floor] = name
        # take care of formatting start
        if self.text_flag and self.stairs[self.floor] != '#text':
            self.document_content += self.formatingInfo('BEG', self.floor)
        # take care of text content
        if node.nodeType == node.TEXT_NODE:
            content = node.nodeValue.strip()
            self.printInfo('Content : ' + content)
            if not self.text_flag and self.floor > 1:
                # First text node: write the BEG lines of the floors above.
                # NOTE(review): range(self.floor - 1) leaves out the direct
                # parent, which later still gets an END line - confirm
                # whether that is intended.
                for i in range(self.floor - 1):
                    self.document_content += self.formatingInfo('BEG', i)
                self.text_flag = True
            self.document_content += content + '\n'
        self.printInfo('')
        return
    #----------------------------------------------------------------------------
    def print_attr(self, liste):
        '''Print a table of a list of dictionaries (test version).

        liste -- list of dictionaries attribute name -> value (strings).

        One row per attribute name, one column per dictionary; 'n.e.' marks
        an attribute that a dictionary does not contain.
        '''
        missing = 'n.e.'
        # all attribute names occurring in any of the dictionaries
        attr = sorted(set(key for entry in liste for key in entry))
        # largest length of one of the attribute names
        attr_len = max([len(key) for key in attr] + [0])
        # largest length of one of the values, at least wide enough for
        # the placeholder so that the columns stay aligned
        value_len = max([len(value) for entry in liste
                         for value in entry.values()] + [len(missing)])
        mask1 = '%-' + str(attr_len) + 's'
        mask2 = '%-' + str(value_len) + 's'
        print('')
        for key in attr:
            cells = [mask1 % (key,)]
            for entry in liste:
                cells.append(mask2 % (entry.get(key, missing),))
            print(' '.join(cells))
        print('')
        return
#------------------------------------------------------------------------------
if __name__ == "__main__":
    # Command line entry point: analyse content.xml and styles.xml of the
    # ODF file named as first argument and print the results.
    if len(sys.argv) < 2:
        # without this check sys.argv[1] would fail with an IndexError
        print('Sorry, you forgot to give an ODF file name.')
        sys.exit(1)
    odt = HandleOpenDocumentFiles(sys.argv[1])
    odt.print_odf_zipfile_info()
    # odt.rewrite_zipfile('ZipNeu', odt.odf_file)
    analyse = AnalyseXmlDocument()
    print('')
    # 'c_' marks nodes found in the content, 's_' nodes found in the styles
    for component, part in (('content.xml', 'c_'), ('styles.xml', 's_')):
        dom = xml.dom.minidom.parseString(odt.get_odf_component(component))
        analyse.walkingThroughDocument(dom.documentElement, part)
        analyse.walkingThroughDocumentFinish()
    if analyse.print_flag:
        print(analyse.node_hash)
        print('')
        print('')
    # sorted() also works where dict.keys() is not a list
    keys = sorted(analyse.node_hash)
    # from here on the overview is always printed
    analyse.print_flag = True
    print(keys)
    for item in keys:
        print('%s: %d' % (item, len(analyse.node_hash[item])))
        analyse.print_attr(analyse.node_hash[item])
    print('')
    print('')
    print(analyse.document_content)