Code: Alles auswählen
import urllib2
from xml.dom.minidom import parse, parseString
def parseFeed(url):
request=urllib2.Request(url)
request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5;Windows NT')
opener = urllib2.build_opener()
data = opener.open(request).read()
return data
def parseRssDoc(doc):
rootNode = doc.documentElement
channelNode = rootNode.firstChild
itemNodeList = channelNode.getElementsByTagName('item')
for item in itemNodeList:
rssItemNodes = item.childNodes
try:
for n in rssItemNodes:
prefix = ""
if n.nodeName == 'title' or n.nodeName == 'link':
print "prefix=%s val=%s" % (n.nodeName, n.firstChild.nodeValue)
except Exception, e:
print "*************"
print "WARN: unable to parse node"
print e
print "*************"
def parse(url):
data = parseFeed(url)
doc = parseString(data)
parseRssDoc(doc)
if __name__ == '__main__':
print "running"
initialFeed = 'http://www.spiegel.de/schlagzeilen/rss/0,5291,20,00.xml'
#initialFeed = 'http://news.google.com/nwshp?sourceid=navclient&ie=UTF-8&output=rss'
parse(initialFeed)
print "done"
>>>
running
Traceback (most recent call last):
File "C:\burn\rssparser.py", line 38, in <module>
parse(initialFeed)
File "C:\burn\rssparser.py", line 32, in parse
parseRssDoc(doc)
File "C:\burn\rssparser.py", line 14, in parseRssDoc
itemNodeList = channelNode.getElementsByTagName('item')
AttributeError: Text instance has no attribute 'getElementsByTagName'
>>>
Wenn ich den auskommentierten String (google) parse, kommt kein Fehler.
Liegt vielleicht ein Unicode/Latin-Fehler vor?
Was kann ich machen?
MfG
Andreas