Bei der Suche bin ich auf die Seite http://www.iana.org/assignments/uri-schemes.html gestoßen... Ich habe ein kleines Skript gehackt, damit man die Daten der HTML-Seite in Python verwenden kann.
Nicht schön, tut aber zurzeit, was es soll:
Code: Alles auswählen
#!/usr/bin/env python
# coding: utf-8
from HTMLParser import HTMLParser
import os
import urllib2
import pprint
URL = "http://www.iana.org/assignments/uri-schemes.html"
TEMP_FILE = "uri-schemes.html.tmp"
if os.path.exists(TEMP_FILE):
print "Use temp file %r..." % TEMP_FILE,
f = file(TEMP_FILE, "rb")
html_page = f.read()
f.close()
print "OK"
else:
print "Request %r..." % URL,
req = urllib2.urlopen(URL)
html_page = req.read()
req.close()
f = file(TEMP_FILE, "wb")
f.write(html_page)
f.close()
print "OK"
class IanaHTMLParser(HTMLParser):
    """Extract the scheme table from the IANA "uri-schemes" HTML page.

    Feed the page with ``feed()``; afterwards ``url_schemes_data`` maps the
    text of each row's first cell (the scheme name) to a
    ``(href, link_text)`` tuple taken from the bracketed link in the row's
    third cell, e.g. ``{'aaa': ('http://.../rfc3588.txt', 'RFC3588')}``.

    Only rows that produce exactly six collected items of the form
    ``[name, text, "[", href, link_text, "]"]`` are stored; every other
    row is silently skipped.
    """

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so super() cannot be
        # used.  Call the base initializer explicitly instead of only
        # reset(): it performs the reset itself and also sets up any other
        # state the base class needs.
        HTMLParser.__init__(self)
        # Whether the parser is currently inside an open <td> / <tr>.
        self._tag_info = {"td": False, "tr": False}
        # 1-based index of the current <td> within the current row.
        self._td_no = 0
        # Items (cell texts and link targets) collected for the current row.
        self.last_data = []
        # Result: first cell text -> (href, link text).
        self.url_schemes_data = {}

    def handle_starttag(self, tag, attrs):
        """Track open <td>/<tr> tags and record the link of the third cell."""
        if tag in self._tag_info:
            if tag == "td":
                self._td_no += 1
            self._tag_info[tag] = True

        if tag == "a" and self._td_no == 3:
            # Look the href up by name: it is not guaranteed to be the first
            # attribute, and an <a> may carry no attributes at all.
            href = dict(attrs).get("href")
            if href is not None:
                self.last_data.append(href)

    def handle_endtag(self, tag):
        """Close <td>/<tr> tracking; on </tr> store the row if it matches."""
        if tag in self._tag_info:
            self._tag_info[tag] = False

        if tag == "tr":
            row = self.last_data
            # Expected layout: [name, text, "[", href, link_text, "]"]
            if len(row) == 6 and row[2] == "[" and row[5] == "]":
                self.url_schemes_data[row[0]] = (row[3], row[4])
            # Start the next row from a clean state, matched or not.
            self.last_data = []
            self._td_no = 0

    def handle_data(self, data):
        """Collect stripped text that appears inside a table cell."""
        if self._tag_info["td"] and self._tag_info["tr"]:
            self.last_data.append(data.strip())
# Run the parser over the fetched page and pretty-print the resulting
# scheme -> (href, link text) mapping.
p = IanaHTMLParser()
p.feed(html_page)
schemes = p.url_schemes_data
pprint.pprint(schemes)
Code: Alles auswählen
{'aaa': ('http://www.rfc-editor.org/rfc/rfc3588.txt', 'RFC3588'),
'aaas': ('http://www.rfc-editor.org/rfc/rfc3588.txt', 'RFC3588'),
'acap': ('http://www.rfc-editor.org/rfc/rfc2244.txt', 'RFC2244'),
'afs': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
'cap': ('http://www.rfc-editor.org/rfc/rfc4324.txt', 'RFC4324'),
'cid': ('http://www.rfc-editor.org/rfc/rfc2392.txt', 'RFC2392'),
'crid': ('http://www.rfc-editor.org/rfc/rfc4078.txt', 'RFC4078'),
'data': ('http://www.rfc-editor.org/rfc/rfc2397.txt', 'RFC2397'),
'dav': ('http://www.rfc-editor.org/rfc/rfc4918.txt', 'RFC4918'),
'dict': ('http://www.rfc-editor.org/rfc/rfc2229.txt', 'RFC2229'),
'dns': ('http://www.rfc-editor.org/rfc/rfc4501.txt', 'RFC4501'),
'dtn': ('http://www.rfc-editor.org/rfc/rfc5050.txt', 'RFC5050'),
'dvb': ('http://tools.ietf.org/id/draft-mcroberts-uri-dvb',
'draft-mcroberts-uri-dvb'),
'fax': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC2806'),
'file': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
'ftp': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
'geo': ('http://www.rfc-editor.org/rfc/rfc5870.txt', 'RFC5870'),
'go': ('http://www.rfc-editor.org/rfc/rfc3368.txt', 'RFC3368'),
'gopher': ('http://www.rfc-editor.org/rfc/rfc4266.txt', 'RFC4266'),
'h323': ('http://www.rfc-editor.org/rfc/rfc3508.txt', 'RFC3508'),
'http': ('http://www.rfc-editor.org/rfc/rfc2616.txt', 'RFC2616'),
'https': ('http://www.rfc-editor.org/rfc/rfc2818.txt', 'RFC2818'),
'iax': ('http://www.rfc-editor.org/rfc/rfc5456.txt', 'RFC5456'),
'icap': ('http://www.rfc-editor.org/rfc/rfc3507.txt', 'RFC3507'),
'icon': ('http://tools.ietf.org/html/draft-lafayette-icon-uri-scheme',
'draft-lafayette-icon-uri-scheme'),
'im': ('http://www.rfc-editor.org/rfc/rfc3860.txt', 'RFC3860'),
'imap': ('http://www.rfc-editor.org/rfc/rfc5092.txt', 'RFC5092'),
'info': ('http://www.rfc-editor.org/rfc/rfc4452.txt', 'RFC4452'),
'ipn': ('http://www.rfc-editor.org/rfc/rfc6260.txt', 'RFC6260'),
'ipp': ('http://www.rfc-editor.org/rfc/rfc3510.txt', 'RFC3510'),
'iris': ('http://www.rfc-editor.org/rfc/rfc3981.txt', 'RFC3981'),
'iris.beep': ('http://www.rfc-editor.org/rfc/rfc3983.txt', 'RFC3983'),
'iris.lwz': ('http://www.rfc-editor.org/rfc/rfc4993.txt', 'RFC4993'),
'iris.xpc': ('http://www.rfc-editor.org/rfc/rfc4992.txt', 'RFC4992'),
'iris.xpcs': ('http://www.rfc-editor.org/rfc/rfc4992.txt', 'RFC4992'),
'jms': ('http://www.rfc-editor.org/rfc/rfc6167.txt', 'RFC6167'),
'ldap': ('http://www.ietf.org/rfc/rfc4516.txt', 'RFC4516'),
'mailserver': ('http://www.rfc-editor.org/rfc/rfc6196.txt', 'RFC6196'),
'mailto': ('http://www.rfc-editor.org/rfc/rfc6068.txt', 'RFC6068'),
'mid': ('http://www.rfc-editor.org/rfc/rfc2392.txt', 'RFC2392'),
'modem': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC2806'),
'msrp': ('http://www.rfc-editor.org/rfc/rfc4975.txt', 'RFC4975'),
'msrps': ('http://www.rfc-editor.org/rfc/rfc4975.txt', 'RFC4975'),
'mtqp': ('http://www.rfc-editor.org/rfc/rfc3887.txt', 'RFC3887'),
'mupdate': ('http://www.rfc-editor.org/rfc/rfc3656.txt', 'RFC3656'),
'news': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
'nfs': ('http://www.rfc-editor.org/rfc/rfc2224.txt', 'RFC2224'),
'nntp': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
'oid': ('http://tools.ietf.org/id/draft-larmouth-oid-iri',
'draft-larmouth-oid-iri'),
'opaquelocktoken': ('http://www.rfc-editor.org/rfc/rfc4918.txt', 'RFC4918'),
'pack': ('http://tools.ietf.org/id/draft-shur-pack-uri-scheme',
'draft-shur-pack-uri-scheme'),
'pop': ('http://www.rfc-editor.org/rfc/rfc2384.txt', 'RFC2384'),
'pres': ('http://www.rfc-editor.org/rfc/rfc3859.txt', 'RFC3859'),
'prospero': ('http://www.rfc-editor.org/rfc/rfc4157.txt', 'RFC4157'),
'rsync': ('http://www.rfc-editor.org/rfc/rfc5781.txt', 'RFC5781'),
'rtsp': ('http://www.rfc-editor.org/rfc/rfc2326.txt', 'RFC2326'),
'service': ('http://www.rfc-editor.org/rfc/rfc2609.txt', 'RFC2609'),
'shttp': ('http://www.rfc-editor.org/rfc/rfc2660.txt', 'RFC2660'),
'sieve': ('http://www.rfc-editor.org/rfc/rfc5804.txt', 'RFC5804'),
'sip': ('http://www.rfc-editor.org/rfc/rfc3261.txt', 'RFC3261'),
'sips': ('http://www.rfc-editor.org/rfc/rfc3261.txt', 'RFC3261'),
'sms': ('http://www.rfc-editor.org/rfc/rfc5724.txt', 'RFC5724'),
'snews': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
'snmp': ('http://www.rfc-editor.org/rfc/rfc4088.txt', 'RFC4088'),
'soap.beep': ('http://www.rfc-editor.org/rfc/rfc4227.txt', 'RFC4227'),
'soap.beeps': ('http://www.rfc-editor.org/rfc/rfc4227.txt', 'RFC4227'),
'tag': ('http://www.rfc-editor.org/rfc/rfc4151.txt', 'RFC4151'),
'tel': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC3966'),
'telnet': ('http://www.rfc-editor.org/rfc/rfc4248.txt', 'RFC4248'),
'tftp': ('http://www.rfc-editor.org/rfc/rfc3617.txt', 'RFC3617'),
'thismessage': ('http://www.rfc-editor.org/rfc/rfc2557.txt', 'RFC2557'),
'tip': ('http://www.rfc-editor.org/rfc/rfc2371.txt', 'RFC2371'),
'tn3270': ('http://www.rfc-editor.org/rfc/rfc6270.txt', 'RFC6270'),
'tv': ('http://www.rfc-editor.org/rfc/rfc2838.txt', 'RFC2838'),
'urn': ('http://www.rfc-editor.org/rfc/rfc2141.txt', 'RFC2141'),
'vemmi': ('http://www.rfc-editor.org/rfc/rfc2122.txt', 'RFC2122'),
'view-source': ('http://www.iana.org/assignments/contact-people.html#Yevstifeyev',
'Yevstifeyev'),
'wais': ('http://www.rfc-editor.org/rfc/rfc4156.txt', 'RFC4156'),
'ws': ('http://tools.ietf.org/html/draft-ietf-hybi-thewebsocketprotocol',
'draft-ietf-hybi-thewebsocketprotocol'),
'wss': ('http://tools.ietf.org/html/draft-ietf-hybi-thewebsocketprotocol',
'draft-ietf-hybi-thewebsocketprotocol'),
'xmlrpc.beep': ('http://www.rfc-editor.org/rfc/rfc3529.txt', 'RFC3529'),
'xmlrpc.beeps': ('http://www.rfc-editor.org/rfc/rfc3529.txt', 'RFC3529'),
'xmpp': ('http://www.rfc-editor.org/rfc/rfc5122.txt', 'RFC5122'),
'z39.50r': ('http://www.rfc-editor.org/rfc/rfc2056.txt', 'RFC2056'),
'z39.50s': ('http://www.rfc-editor.org/rfc/rfc2056.txt', 'RFC2056')}