Seite 1 von 1

IANA URI Schemes parsen...

Verfasst: Mittwoch 3. August 2011, 10:09
von jens
Ich wollte eine Liste aller offiziellen "URI Schemes" haben...

Bei der Suche bin ich auf die Seite http://www.iana.org/assignments/uri-schemes.html gestoßen... Ich habe ein kleines Skript gehackt, damit man die Daten der HTML-Seite in Python verwenden kann.

Nicht schön, tut aber z.Z. was es soll:

Code: Alles auswählen

#!/usr/bin/env python
# coding: utf-8

from HTMLParser import HTMLParser
import os
import urllib2
import pprint

URL = "http://www.iana.org/assignments/uri-schemes.html"
TEMP_FILE = "uri-schemes.html.tmp"

if os.path.exists(TEMP_FILE):
    print "Use temp file %r..." % TEMP_FILE,
    f = file(TEMP_FILE, "rb")
    html_page = f.read()
    f.close()
    print "OK"
else:
    print "Request %r..." % URL,
    req = urllib2.urlopen(URL)
    html_page = req.read()
    req.close()
    f = file(TEMP_FILE, "wb")
    f.write(html_page)
    f.close()
    print "OK"


class IanaHTMLParser(HTMLParser):
    """Collect URI scheme entries from table rows of an HTML page.

    Text found inside ``<td>`` cells of a ``<tr>`` row is gathered in
    ``last_data``; the ``href`` of an ``<a>`` tag seen while the third
    cell of the row is current is stored as well. When the row ends and
    the gathered items look like::

        [name, description, "[", href, link_text, "]"]

    the entry ``name -> (href, link_text)`` is added to
    ``url_schemes_data``. Rows that do not produce exactly this shape
    are silently ignored.
    """

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so super() cannot
        # be used. Call the base initializer explicitly instead of only
        # reset(), so that all base-class state is set up.
        HTMLParser.__init__(self)

        # Which of the tracked tags is currently open.
        self._tag_info = dict.fromkeys(("td", "tr"), False)
        # 1-based index of the current <td> within the current row.
        self._td_no = 0
        # Items collected for the row that is currently being parsed.
        self.last_data = []
        # Result: first cell text -> (href, link text).
        self.url_schemes_data = {}

    def handle_starttag(self, tag, attrs):
        """Track opening td/tr tags and record link targets in cell 3."""
        if tag in self._tag_info:
            if tag == "td":
                self._td_no += 1
            self._tag_info[tag] = True

        if tag == "a" and self._td_no == 3:
            # Look the target up by name: "href" is not guaranteed to be
            # the first attribute, and an anchor may have none at all.
            href = dict(attrs).get("href")
            if href is not None:
                self.last_data.append(href)

    def handle_endtag(self, tag):
        """Track closing td/tr tags; evaluate the row on ``</tr>``."""
        if tag not in self._tag_info:
            return

        self._tag_info[tag] = False
        if tag != "tr":
            return

        row = self.last_data
        if len(row) == 6 and row[2] == "[" and row[5] == "]":
            # Drop the surrounding brackets (higher index first so the
            # lower one stays valid), leaving
            # [name, description, href, link_text].
            del row[5]
            del row[2]
            self.url_schemes_data[row[0]] = tuple(row[2:])

        # Start over for the next row.
        self.last_data = []
        self._td_no = 0

    def handle_data(self, data):
        """Store stripped text that occurs inside a table cell of a row."""
        if self._tag_info["td"] and self._tag_info["tr"]:
            self.last_data.append(data.strip())


# Parse the page and pretty-print the collected scheme table.
p = IanaHTMLParser()
p.feed(html_page)
p.close()  # process any input the parser may still be buffering
pprint.pprint(p.url_schemes_data)
Raus kommt dann:

Code: Alles auswählen

{'aaa': ('http://www.rfc-editor.org/rfc/rfc3588.txt', 'RFC3588'),
 'aaas': ('http://www.rfc-editor.org/rfc/rfc3588.txt', 'RFC3588'),
 'acap': ('http://www.rfc-editor.org/rfc/rfc2244.txt', 'RFC2244'),
 'afs': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
 'cap': ('http://www.rfc-editor.org/rfc/rfc4324.txt', 'RFC4324'),
 'cid': ('http://www.rfc-editor.org/rfc/rfc2392.txt', 'RFC2392'),
 'crid': ('http://www.rfc-editor.org/rfc/rfc4078.txt', 'RFC4078'),
 'data': ('http://www.rfc-editor.org/rfc/rfc2397.txt', 'RFC2397'),
 'dav': ('http://www.rfc-editor.org/rfc/rfc4918.txt', 'RFC4918'),
 'dict': ('http://www.rfc-editor.org/rfc/rfc2229.txt', 'RFC2229'),
 'dns': ('http://www.rfc-editor.org/rfc/rfc4501.txt', 'RFC4501'),
 'dtn': ('http://www.rfc-editor.org/rfc/rfc5050.txt', 'RFC5050'),
 'dvb': ('http://tools.ietf.org/id/draft-mcroberts-uri-dvb',
         'draft-mcroberts-uri-dvb'),
 'fax': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC2806'),
 'file': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
 'ftp': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
 'geo': ('http://www.rfc-editor.org/rfc/rfc5870.txt', 'RFC5870'),
 'go': ('http://www.rfc-editor.org/rfc/rfc3368.txt', 'RFC3368'),
 'gopher': ('http://www.rfc-editor.org/rfc/rfc4266.txt', 'RFC4266'),
 'h323': ('http://www.rfc-editor.org/rfc/rfc3508.txt', 'RFC3508'),
 'http': ('http://www.rfc-editor.org/rfc/rfc2616.txt', 'RFC2616'),
 'https': ('http://www.rfc-editor.org/rfc/rfc2818.txt', 'RFC2818'),
 'iax': ('http://www.rfc-editor.org/rfc/rfc5456.txt', 'RFC5456'),
 'icap': ('http://www.rfc-editor.org/rfc/rfc3507.txt', 'RFC3507'),
 'icon': ('http://tools.ietf.org/html/draft-lafayette-icon-uri-scheme',
          'draft-lafayette-icon-uri-scheme'),
 'im': ('http://www.rfc-editor.org/rfc/rfc3860.txt', 'RFC3860'),
 'imap': ('http://www.rfc-editor.org/rfc/rfc5092.txt', 'RFC5092'),
 'info': ('http://www.rfc-editor.org/rfc/rfc4452.txt', 'RFC4452'),
 'ipn': ('http://www.rfc-editor.org/rfc/rfc6260.txt', 'RFC6260'),
 'ipp': ('http://www.rfc-editor.org/rfc/rfc3510.txt', 'RFC3510'),
 'iris': ('http://www.rfc-editor.org/rfc/rfc3981.txt', 'RFC3981'),
 'iris.beep': ('http://www.rfc-editor.org/rfc/rfc3983.txt', 'RFC3983'),
 'iris.lwz': ('http://www.rfc-editor.org/rfc/rfc4993.txt', 'RFC4993'),
 'iris.xpc': ('http://www.rfc-editor.org/rfc/rfc4992.txt', 'RFC4992'),
 'iris.xpcs': ('http://www.rfc-editor.org/rfc/rfc4992.txt', 'RFC4992'),
 'jms': ('http://www.rfc-editor.org/rfc/rfc6167.txt', 'RFC6167'),
 'ldap': ('http://www.ietf.org/rfc/rfc4516.txt', 'RFC4516'),
 'mailserver': ('http://www.rfc-editor.org/rfc/rfc6196.txt', 'RFC6196'),
 'mailto': ('http://www.rfc-editor.org/rfc/rfc6068.txt', 'RFC6068'),
 'mid': ('http://www.rfc-editor.org/rfc/rfc2392.txt', 'RFC2392'),
 'modem': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC2806'),
 'msrp': ('http://www.rfc-editor.org/rfc/rfc4975.txt', 'RFC4975'),
 'msrps': ('http://www.rfc-editor.org/rfc/rfc4975.txt', 'RFC4975'),
 'mtqp': ('http://www.rfc-editor.org/rfc/rfc3887.txt', 'RFC3887'),
 'mupdate': ('http://www.rfc-editor.org/rfc/rfc3656.txt', 'RFC3656'),
 'news': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
 'nfs': ('http://www.rfc-editor.org/rfc/rfc2224.txt', 'RFC2224'),
 'nntp': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
 'oid': ('http://tools.ietf.org/id/draft-larmouth-oid-iri',
         'draft-larmouth-oid-iri'),
 'opaquelocktoken': ('http://www.rfc-editor.org/rfc/rfc4918.txt', 'RFC4918'),
 'pack': ('http://tools.ietf.org/id/draft-shur-pack-uri-scheme',
          'draft-shur-pack-uri-scheme'),
 'pop': ('http://www.rfc-editor.org/rfc/rfc2384.txt', 'RFC2384'),
 'pres': ('http://www.rfc-editor.org/rfc/rfc3859.txt', 'RFC3859'),
 'prospero': ('http://www.rfc-editor.org/rfc/rfc4157.txt', 'RFC4157'),
 'rsync': ('http://www.rfc-editor.org/rfc/rfc5781.txt', 'RFC5781'),
 'rtsp': ('http://www.rfc-editor.org/rfc/rfc2326.txt', 'RFC2326'),
 'service': ('http://www.rfc-editor.org/rfc/rfc2609.txt', 'RFC2609'),
 'shttp': ('http://www.rfc-editor.org/rfc/rfc2660.txt', 'RFC2660'),
 'sieve': ('http://www.rfc-editor.org/rfc/rfc5804.txt', 'RFC5804'),
 'sip': ('http://www.rfc-editor.org/rfc/rfc3261.txt', 'RFC3261'),
 'sips': ('http://www.rfc-editor.org/rfc/rfc3261.txt', 'RFC3261'),
 'sms': ('http://www.rfc-editor.org/rfc/rfc5724.txt', 'RFC5724'),
 'snews': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
 'snmp': ('http://www.rfc-editor.org/rfc/rfc4088.txt', 'RFC4088'),
 'soap.beep': ('http://www.rfc-editor.org/rfc/rfc4227.txt', 'RFC4227'),
 'soap.beeps': ('http://www.rfc-editor.org/rfc/rfc4227.txt', 'RFC4227'),
 'tag': ('http://www.rfc-editor.org/rfc/rfc4151.txt', 'RFC4151'),
 'tel': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC3966'),
 'telnet': ('http://www.rfc-editor.org/rfc/rfc4248.txt', 'RFC4248'),
 'tftp': ('http://www.rfc-editor.org/rfc/rfc3617.txt', 'RFC3617'),
 'thismessage': ('http://www.rfc-editor.org/rfc/rfc2557.txt', 'RFC2557'),
 'tip': ('http://www.rfc-editor.org/rfc/rfc2371.txt', 'RFC2371'),
 'tn3270': ('http://www.rfc-editor.org/rfc/rfc6270.txt', 'RFC6270'),
 'tv': ('http://www.rfc-editor.org/rfc/rfc2838.txt', 'RFC2838'),
 'urn': ('http://www.rfc-editor.org/rfc/rfc2141.txt', 'RFC2141'),
 'vemmi': ('http://www.rfc-editor.org/rfc/rfc2122.txt', 'RFC2122'),
 'view-source': ('http://www.iana.org/assignments/contact-people.html#Yevstifeyev',
                 'Yevstifeyev'),
 'wais': ('http://www.rfc-editor.org/rfc/rfc4156.txt', 'RFC4156'),
 'ws': ('http://tools.ietf.org/html/draft-ietf-hybi-thewebsocketprotocol',
        'draft-ietf-hybi-thewebsocketprotocol'),
 'wss': ('http://tools.ietf.org/html/draft-ietf-hybi-thewebsocketprotocol',
         'draft-ietf-hybi-thewebsocketprotocol'),
 'xmlrpc.beep': ('http://www.rfc-editor.org/rfc/rfc3529.txt', 'RFC3529'),
 'xmlrpc.beeps': ('http://www.rfc-editor.org/rfc/rfc3529.txt', 'RFC3529'),
 'xmpp': ('http://www.rfc-editor.org/rfc/rfc5122.txt', 'RFC5122'),
 'z39.50r': ('http://www.rfc-editor.org/rfc/rfc2056.txt', 'RFC2056'),
 'z39.50s': ('http://www.rfc-editor.org/rfc/rfc2056.txt', 'RFC2056')}

Re: IANA URI Schemes parsen...

Verfasst: Mittwoch 3. August 2011, 15:24
von /me
jens hat geschrieben:Ich wollte eine Liste aller offiziellen "URI Schemes" haben...
Das ist eine sehr interessante Lösung. Ich wusste gar nicht, dass es für sieve einen RFC gibt.

Nur ... warum ist htcpcp (http://www.rfc-editor.org/rfc/rfc2324.txt) nicht in der Liste?