BBCode Parser
Verfasst: Samstag 28. Mai 2005, 15:57
Hier ist ein kleiner BBCode-Parser, dessen Funktionsumfang in etwa dem von phpBB entspricht.
Er sollte gültigen XHTML-Code erzeugen; bei Fragen einfach hier posten.
Code: Alles auswählen
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------- #
# BBCode Parser #
# Author: Armin Ronacher <armin.ronacher@active-4.com> #
# licensed under the GPL #
# ---------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or #
# modify it under the terms of the GNU General Public License #
# as published by the Free Software Foundation; either version 2 #
# of the License, or (at your option) any later version. #
# ---------------------------------------------------------------------------- #
import re
from random import random
from md5 import md5
from time import time
# Captions placed in the heading of a rendered [quote] block.
# "%s" in AUTHOR_QUOTE is substituted with the quoted author's name.
LANGUAGE = dict(
    SIMPLE_QUOTE="Zitat:",
    AUTHOR_QUOTE="%s schrieb:"
)
"""
This module implements a simple BBCode parser for pythonBB.
Here is a small example of how to use it:
    text = open("input.txt", "r").read()
    p = parser(text)
    text = p.parse()
    open("output.html", "w").write(text)
"""
class parser(object):
    """Convert BBCode markup into XHTML.

    The input is HTML-escaped first.  ``[code]`` and ``[list]`` areas are then
    swapped for opaque placeholder tokens so that inline formatting and
    paragraph splitting cannot touch their contents; the rendered markup for
    those areas is put back as the very last step.

    Usage::

        html = parser(text).parse()
    """

    # Flags are passed to re.compile() instead of being embedded as a trailing
    # "(?uism)" group: inline global flags that are not at the start of the
    # pattern are rejected by newer versions of the re module.
    _FLAGS = re.IGNORECASE | re.DOTALL | re.MULTILINE | re.UNICODE

    # Block-level areas that are extracted before inline formatting runs.
    _CODE_RE = re.compile(r"\[code\](.*?)\[/code\]", _FLAGS)
    _LIST_RE = re.compile(r"\[list(=(.*?))?\](.*?)\[/list\]", _FLAGS)

    # Inline tags.
    _BOLD_RE = re.compile(r"\[b\](.*?)\[/b\]", _FLAGS)
    _ITALIC_RE = re.compile(r"\[i\](.*?)\[/i\]", _FLAGS)
    _UNDERLINE_RE = re.compile(r"\[u\](.*?)\[/u\]", _FLAGS)
    _URL_RE = re.compile(r"\[url(=(.*?))?\](.*?)\[/url\]", _FLAGS)
    _IMG_RE = re.compile(r'\[img(="(.*?)")?\](.*?)\[/img\]', _FLAGS)
    _QUOTE_RE = re.compile(r'\[quote(="(.*?)")?\](.*?)\[/quote\]', _FLAGS)

    # "[*]" followed by the rest of its line; also matches a final item that
    # is not terminated by a newline.
    _LIST_ITEM_RE = re.compile(r"\[\*\]([^\n]*)")
    _PARAGRAPH_BREAK_RE = re.compile(r"\n{2,}")
    _EMPTY_PARAGRAPH_RE = re.compile(r"<p>\s*?</p>")

    # Value of "[list=...]" -> CSS class of the generated <ol>.
    _LIST_CLASSES = {"1": "numeric", "a": "alpha", "i": "roman"}

    def __init__(self, text):
        """Store the BBCode source *text*; nothing is parsed until parse()."""
        self.text = text
        # Maps placeholder token -> (handler, match object) for every
        # extracted block-level area.
        self.table = {}

    def parse(self):
        """Return the XHTML rendering of ``self.text``."""
        self.table = {}
        # Normalise line endings so that paragraph detection and the
        # leading-newline strip in do_code() also work for CRLF input.
        text = self.text.replace("\r\n", "\n").replace("\r", "\n")
        text = self.htmlspecialchars(text)
        text = self.extract(text)
        text = self.inlineformat(text)
        text = self.paragraphs(text)
        text = self.assamble(text)
        self.table = {}
        return text

    def htmlspecialchars(self, text):
        """Escape ``&``, ``<`` and ``>`` in *text* as HTML entities.

        Double quotes are deliberately left alone here because the
        ``[img="..."]`` and ``[quote="..."]`` patterns match on a literal
        quote character; quotes are escaped where attribute values are built.
        """
        # "&" must be handled first, otherwise the ampersands introduced by
        # the other two replacements would be escaped a second time.
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = text.replace(raw, entity)
        return text

    def extract(self, text):
        """Replace every ``[code]`` and ``[list]`` area by a placeholder token.

        Each token is recorded in ``self.table`` together with the handler
        that renders the area and the match object describing it.  Returns
        *text* with the areas replaced.
        """
        areas = (
            (self._CODE_RE, self.do_code),
            (self._LIST_RE, self.do_list),
        )
        for pattern, handler in areas:
            match = pattern.search(text)
            while match is not None:
                token_id = self.__create_token_id()
                self.table[token_id] = (handler, match)
                # Splice by position so exactly the matched span is replaced.
                text = text[:match.start()] + token_id + text[match.end():]
                match = pattern.search(text)
        return text

    def inlineformat(self, text):
        """Render the inline tags b, i, u, url, img and quote in *text*."""
        rules = (
            (self._BOLD_RE, r"<strong>\1</strong>"),
            (self._ITALIC_RE, r"<em>\1</em>"),
            (self._UNDERLINE_RE, r"<u>\1</u>"),
            (self._URL_RE, self.do_link),
            (self._IMG_RE, self.do_image),
            (self._QUOTE_RE, self.do_quote),
        )
        for pattern, replacement in rules:
            text = pattern.sub(replacement, text)
        return text

    def paragraphs(self, text):
        """Wrap blank-line separated chunks of *text* in ``<p>`` elements.

        Single newlines inside a chunk become ``<br />`` line breaks.
        """
        rendered = []
        for chunk in self._PARAGRAPH_BREAK_RE.split(text):
            chunk = chunk.strip().replace("\n", "<br />\n")
            rendered.append("<p>" + chunk + "</p>\n\n")
        return "".join(rendered)

    def assamble(self, text):
        """Put the rendered block-level areas back in place of their tokens.

        An area may itself contain the token of another area (for example a
        ``[code]`` block inside a ``[list]``), so substitution is repeated
        until no recorded token occurs in *text* any more.  Empty paragraphs
        left behind by the ``</p>...<p>`` wrappers are removed afterwards.
        """
        pending = dict(self.table)
        progressed = True
        while pending and progressed:
            progressed = False
            for token in list(pending):
                if token in text:
                    handler, match = pending.pop(token)
                    text = text.replace(token, handler(match))
                    progressed = True
        return self._EMPTY_PARAGRAPH_RE.sub("", text)

    # Correctly spelled alias; the original name is kept for existing callers.
    assemble = assamble

    def __create_token_id(self):
        """Return a placeholder token that is not yet used in ``self.table``."""
        import uuid
        while True:
            token = "~~" + uuid.uuid4().hex + "~~"
            if token not in self.table:
                return token

    def _escape_attribute(self, value):
        """Escape double quotes so *value* cannot end a quoted attribute."""
        return value.replace('"', "&quot;")

    def do_link(self, matchobj):
        """Render a ``[url]`` / ``[url=target]`` match as an ``<a>`` element."""
        caption = matchobj.group(3)
        if matchobj.group(1) is None:
            link = caption
        else:
            link = matchobj.group(2)
        # NOTE(security): the URL scheme is not validated here, so a
        # "javascript:" target is passed through unchanged.
        return '<a href="' + self._escape_attribute(link) + '">' + caption + "</a>"

    def do_image(self, matchobj):
        """Render an ``[img]`` / ``[img="title"]`` match as an ``<img>`` element."""
        href = matchobj.group(3)
        if matchobj.group(1) is None:
            title = href
        else:
            title = matchobj.group(2)
        href = self._escape_attribute(href)
        title = self._escape_attribute(title)
        return ('<img src="' + href + '" title="' + title
                + '" alt="' + title + '" />')

    def do_quote(self, matchobj):
        """Render a ``[quote]`` / ``[quote="author"]`` match as a quote box."""
        text = matchobj.group(3)
        if matchobj.group(1) is None:
            title = LANGUAGE["SIMPLE_QUOTE"]
        else:
            title = matchobj.group(2)
            # A title that already ends in a colon is used verbatim.
            if not title.endswith(":"):
                title = LANGUAGE["AUTHOR_QUOTE"].replace("%s", title)
        # The surrounding paragraph is closed before and reopened after the
        # block; assamble() removes the empty paragraphs this can leave.
        return ('</p><div class="quote"><h2>' + title + "</h2><p>"
                + text + "</p></div><p>")

    def do_code(self, matchobj):
        """Render a ``[code]`` match as a ``<code>`` element."""
        content = matchobj.group(1)
        # Drop the newline that directly follows the opening tag; startswith()
        # keeps an empty code block from raising.
        if content.startswith("\n"):
            content = content[1:]
        return "</p><code>" + content + "</code><p>"

    def do_list(self, matchobj):
        """Render a ``[list]`` / ``[list=1|a|i]`` match as ``<ul>`` or ``<ol>``."""
        content = self.inlineformat(matchobj.group(3))
        content = self._LIST_ITEM_RE.sub(r"<li>\1</li>", content)
        if matchobj.group(1) is None:
            list_type = "*"
        else:
            list_type = matchobj.group(2)
        css_class = self._LIST_CLASSES.get(list_type)
        if css_class is None:
            result = "<ul>" + content + "</ul>"
        else:
            result = '<ol class="' + css_class + '">' + content + "</ol>"
        return "</p>" + result + "<p>"