IANA URI Schemes parsen...

Code-Stücke können hier veröffentlicht werden.
Antworten
Benutzeravatar
jens
Python-Forum Veteran
Beiträge: 8502
Registriert: Dienstag 10. August 2004, 09:40
Wohnort: duisburg
Kontaktdaten:

Ich wollte eine Liste aller offiziellen "URI Schemes" haben...

Bei der Suche bin ich auf die Seite http://www.iana.org/assignments/uri-schemes.html gestoßen... Ich hab ein kleines Skript gehackt, damit man die Daten der HTML-Seite in Python verwenden kann.

Nicht schön, tut aber z. Z., was es soll:

Code: Alles auswählen

#!/usr/bin/env python
# coding: utf-8

from HTMLParser import HTMLParser
import os
import urllib2
import pprint

URL = "http://www.iana.org/assignments/uri-schemes.html"
TEMP_FILE = "uri-schemes.html.tmp"

if os.path.exists(TEMP_FILE):
    print "Use temp file %r..." % TEMP_FILE,
    f = file(TEMP_FILE, "rb")
    html_page = f.read()
    f.close()
    print "OK"
else:
    print "Request %r..." % URL,
    req = urllib2.urlopen(URL)
    html_page = req.read()
    req.close()
    f = file(TEMP_FILE, "wb")
    f.write(html_page)
    f.close()
    print "OK"


class IanaHTMLParser(HTMLParser):
    """Collect URI schemes and their reference links from an HTML table.

    After the page has been passed to ``feed()``, ``url_schemes_data`` maps
    the text of the first table cell (the scheme name) to a tuple
    ``(href, link_text)`` taken from the link in the third cell.

    A row is only recorded when its cells produced exactly six pieces of
    data in the form ``[scheme, description, "[", href, link_text, "]"]``,
    i.e. a single bracketed reference link. Every other row (header rows,
    rows with several references or without a link) is silently skipped.
    """

    # 1-based index of the <td> within a row that holds the reference link.
    _REFERENCE_COLUMN = 3

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so super() is not
        # available; call the base initializer explicitly. It performs the
        # reset() and sets up any further state the base class needs.
        HTMLParser.__init__(self)

        # Which of the tracked tags are currently open.
        self._tag_info = dict([(t, False) for t in ("td", "tr")])
        # Number of <td> start tags seen in the current row.
        self._td_no = 0
        # Data pieces collected for the current row.
        self.last_data = []
        # Result: scheme name -> (href, link text).
        self.url_schemes_data = {}

    def handle_starttag(self, tag, attrs):
        """Track open <tr>/<td> tags and pick up the reference link target.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.
        """
        if tag in self._tag_info:
            if tag == "td":
                self._td_no += 1
            self._tag_info[tag] = True

        if tag == "a" and self._td_no == self._REFERENCE_COLUMN:
            # Look the target up by attribute name instead of assuming that
            # href is the first attribute; anchors without a href (or
            # without any attribute at all) are ignored.
            href = dict(attrs).get("href")
            if href is not None:
                self.last_data.append(href)

    def handle_endtag(self, tag):
        """Close tracked tags; at the end of a row store it if it matches."""
        if tag not in self._tag_info:
            return
        self._tag_info[tag] = False

        if tag != "tr":
            return

        row = self.last_data
        if len(row) == 6 and row[2] == "[" and row[5] == "]":
            # row = [scheme, description, "[", href, link text, "]"]
            self.url_schemes_data[row[0]] = (row[3], row[4])

        # Start the next row with a clean state.
        self.last_data = []
        self._td_no = 0

    def handle_data(self, data):
        """Collect the stripped text found inside a table cell."""
        if self._tag_info["td"] and self._tag_info["tr"]:
            self.last_data.append(data.strip())


# Run the parser over the loaded page and pretty-print the collected
# scheme -> (href, link text) mapping.
p = IanaHTMLParser()
p.feed(html_page)
schemes = p.url_schemes_data
pprint.pprint(schemes)
Raus kommt dann:

Code: Alles auswählen

{'aaa': ('http://www.rfc-editor.org/rfc/rfc3588.txt', 'RFC3588'),
 'aaas': ('http://www.rfc-editor.org/rfc/rfc3588.txt', 'RFC3588'),
 'acap': ('http://www.rfc-editor.org/rfc/rfc2244.txt', 'RFC2244'),
 'afs': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
 'cap': ('http://www.rfc-editor.org/rfc/rfc4324.txt', 'RFC4324'),
 'cid': ('http://www.rfc-editor.org/rfc/rfc2392.txt', 'RFC2392'),
 'crid': ('http://www.rfc-editor.org/rfc/rfc4078.txt', 'RFC4078'),
 'data': ('http://www.rfc-editor.org/rfc/rfc2397.txt', 'RFC2397'),
 'dav': ('http://www.rfc-editor.org/rfc/rfc4918.txt', 'RFC4918'),
 'dict': ('http://www.rfc-editor.org/rfc/rfc2229.txt', 'RFC2229'),
 'dns': ('http://www.rfc-editor.org/rfc/rfc4501.txt', 'RFC4501'),
 'dtn': ('http://www.rfc-editor.org/rfc/rfc5050.txt', 'RFC5050'),
 'dvb': ('http://tools.ietf.org/id/draft-mcroberts-uri-dvb',
         'draft-mcroberts-uri-dvb'),
 'fax': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC2806'),
 'file': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
 'ftp': ('http://www.rfc-editor.org/rfc/rfc1738.txt', 'RFC1738'),
 'geo': ('http://www.rfc-editor.org/rfc/rfc5870.txt', 'RFC5870'),
 'go': ('http://www.rfc-editor.org/rfc/rfc3368.txt', 'RFC3368'),
 'gopher': ('http://www.rfc-editor.org/rfc/rfc4266.txt', 'RFC4266'),
 'h323': ('http://www.rfc-editor.org/rfc/rfc3508.txt', 'RFC3508'),
 'http': ('http://www.rfc-editor.org/rfc/rfc2616.txt', 'RFC2616'),
 'https': ('http://www.rfc-editor.org/rfc/rfc2818.txt', 'RFC2818'),
 'iax': ('http://www.rfc-editor.org/rfc/rfc5456.txt', 'RFC5456'),
 'icap': ('http://www.rfc-editor.org/rfc/rfc3507.txt', 'RFC3507'),
 'icon': ('http://tools.ietf.org/html/draft-lafayette-icon-uri-scheme',
          'draft-lafayette-icon-uri-scheme'),
 'im': ('http://www.rfc-editor.org/rfc/rfc3860.txt', 'RFC3860'),
 'imap': ('http://www.rfc-editor.org/rfc/rfc5092.txt', 'RFC5092'),
 'info': ('http://www.rfc-editor.org/rfc/rfc4452.txt', 'RFC4452'),
 'ipn': ('http://www.rfc-editor.org/rfc/rfc6260.txt', 'RFC6260'),
 'ipp': ('http://www.rfc-editor.org/rfc/rfc3510.txt', 'RFC3510'),
 'iris': ('http://www.rfc-editor.org/rfc/rfc3981.txt', 'RFC3981'),
 'iris.beep': ('http://www.rfc-editor.org/rfc/rfc3983.txt', 'RFC3983'),
 'iris.lwz': ('http://www.rfc-editor.org/rfc/rfc4993.txt', 'RFC4993'),
 'iris.xpc': ('http://www.rfc-editor.org/rfc/rfc4992.txt', 'RFC4992'),
 'iris.xpcs': ('http://www.rfc-editor.org/rfc/rfc4992.txt', 'RFC4992'),
 'jms': ('http://www.rfc-editor.org/rfc/rfc6167.txt', 'RFC6167'),
 'ldap': ('http://www.ietf.org/rfc/rfc4516.txt', 'RFC4516'),
 'mailserver': ('http://www.rfc-editor.org/rfc/rfc6196.txt', 'RFC6196'),
 'mailto': ('http://www.rfc-editor.org/rfc/rfc6068.txt', 'RFC6068'),
 'mid': ('http://www.rfc-editor.org/rfc/rfc2392.txt', 'RFC2392'),
 'modem': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC2806'),
 'msrp': ('http://www.rfc-editor.org/rfc/rfc4975.txt', 'RFC4975'),
 'msrps': ('http://www.rfc-editor.org/rfc/rfc4975.txt', 'RFC4975'),
 'mtqp': ('http://www.rfc-editor.org/rfc/rfc3887.txt', 'RFC3887'),
 'mupdate': ('http://www.rfc-editor.org/rfc/rfc3656.txt', 'RFC3656'),
 'news': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
 'nfs': ('http://www.rfc-editor.org/rfc/rfc2224.txt', 'RFC2224'),
 'nntp': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
 'oid': ('http://tools.ietf.org/id/draft-larmouth-oid-iri',
         'draft-larmouth-oid-iri'),
 'opaquelocktoken': ('http://www.rfc-editor.org/rfc/rfc4918.txt', 'RFC4918'),
 'pack': ('http://tools.ietf.org/id/draft-shur-pack-uri-scheme',
          'draft-shur-pack-uri-scheme'),
 'pop': ('http://www.rfc-editor.org/rfc/rfc2384.txt', 'RFC2384'),
 'pres': ('http://www.rfc-editor.org/rfc/rfc3859.txt', 'RFC3859'),
 'prospero': ('http://www.rfc-editor.org/rfc/rfc4157.txt', 'RFC4157'),
 'rsync': ('http://www.rfc-editor.org/rfc/rfc5781.txt', 'RFC5781'),
 'rtsp': ('http://www.rfc-editor.org/rfc/rfc2326.txt', 'RFC2326'),
 'service': ('http://www.rfc-editor.org/rfc/rfc2609.txt', 'RFC2609'),
 'shttp': ('http://www.rfc-editor.org/rfc/rfc2660.txt', 'RFC2660'),
 'sieve': ('http://www.rfc-editor.org/rfc/rfc5804.txt', 'RFC5804'),
 'sip': ('http://www.rfc-editor.org/rfc/rfc3261.txt', 'RFC3261'),
 'sips': ('http://www.rfc-editor.org/rfc/rfc3261.txt', 'RFC3261'),
 'sms': ('http://www.rfc-editor.org/rfc/rfc5724.txt', 'RFC5724'),
 'snews': ('http://www.rfc-editor.org/rfc/rfc5538.txt', 'RFC5538'),
 'snmp': ('http://www.rfc-editor.org/rfc/rfc4088.txt', 'RFC4088'),
 'soap.beep': ('http://www.rfc-editor.org/rfc/rfc4227.txt', 'RFC4227'),
 'soap.beeps': ('http://www.rfc-editor.org/rfc/rfc4227.txt', 'RFC4227'),
 'tag': ('http://www.rfc-editor.org/rfc/rfc4151.txt', 'RFC4151'),
 'tel': ('http://www.rfc-editor.org/rfc/rfc3966.txt', 'RFC3966'),
 'telnet': ('http://www.rfc-editor.org/rfc/rfc4248.txt', 'RFC4248'),
 'tftp': ('http://www.rfc-editor.org/rfc/rfc3617.txt', 'RFC3617'),
 'thismessage': ('http://www.rfc-editor.org/rfc/rfc2557.txt', 'RFC2557'),
 'tip': ('http://www.rfc-editor.org/rfc/rfc2371.txt', 'RFC2371'),
 'tn3270': ('http://www.rfc-editor.org/rfc/rfc6270.txt', 'RFC6270'),
 'tv': ('http://www.rfc-editor.org/rfc/rfc2838.txt', 'RFC2838'),
 'urn': ('http://www.rfc-editor.org/rfc/rfc2141.txt', 'RFC2141'),
 'vemmi': ('http://www.rfc-editor.org/rfc/rfc2122.txt', 'RFC2122'),
 'view-source': ('http://www.iana.org/assignments/contact-people.html#Yevstifeyev',
                 'Yevstifeyev'),
 'wais': ('http://www.rfc-editor.org/rfc/rfc4156.txt', 'RFC4156'),
 'ws': ('http://tools.ietf.org/html/draft-ietf-hybi-thewebsocketprotocol',
        'draft-ietf-hybi-thewebsocketprotocol'),
 'wss': ('http://tools.ietf.org/html/draft-ietf-hybi-thewebsocketprotocol',
         'draft-ietf-hybi-thewebsocketprotocol'),
 'xmlrpc.beep': ('http://www.rfc-editor.org/rfc/rfc3529.txt', 'RFC3529'),
 'xmlrpc.beeps': ('http://www.rfc-editor.org/rfc/rfc3529.txt', 'RFC3529'),
 'xmpp': ('http://www.rfc-editor.org/rfc/rfc5122.txt', 'RFC5122'),
 'z39.50r': ('http://www.rfc-editor.org/rfc/rfc2056.txt', 'RFC2056'),
 'z39.50s': ('http://www.rfc-editor.org/rfc/rfc2056.txt', 'RFC2056')}

GitHub | Open HUB | Xing | Linked in
Bitcoins to: 1JEgSQepxGjdprNedC9tXQWLpS424AL8cd
Benutzeravatar
/me
User
Beiträge: 3561
Registriert: Donnerstag 25. Juni 2009, 14:40
Wohnort: Bonn

jens hat geschrieben:Ich wollte eine Liste aller offiziellen "URI Schemes" haben...
Das ist eine sehr interessante Lösung. Ich wusste gar nicht, dass es für Sieve einen RFC gibt.

Nur ... warum ist htcpcp (http://www.rfc-editor.org/rfc/rfc2324.txt) nicht in der Liste?
Antworten