Das deutsche Python-Forum

Ich habe momentan einen kleinen Denkfehler (oder zumindest glaube ich das).
Ich habe folgendes Modul geschrieben:

# -*- coding: utf-8 -*-

import re
from random import random
from md5 import md5
from time import time

class parser:
    """Convert a small BBCode subset ([b], [i], [u], [code]) to HTML.

    Create the parser with the raw text, call ``parse()`` and read the
    result from ``self.text``.

    ``[code]`` areas are cut out of the text before the inline rules run and
    are parked in ``self.table`` under unique placeholder tokens, so markup
    inside them is not formatted.
    """

    def __init__(self, text):
        self.text = text
        # placeholder token -> (handler, extracted content)
        self.table = {}

    def parse(self):
        """Run extraction, inline formatting and re-assembly on self.text."""
        self.extract()
        self.textformat()
        self.assamble()

    def extract(self):
        """Swap every protected area in self.text for a placeholder token.

        All occurrences are handled, not just the first one, and a text
        without any protected area is simply left alone (no exception is
        raised or swallowed).
        """
        areas = [
            (r"\[code\]", r"\[/code\]", self.do_code),
        ]
        for opening, closing, handler in areas:
            regex = re.compile(opening + "(.*?)" + closing, re.I | re.S | re.M)
            while True:
                result = regex.search(self.text)
                if result is None:
                    break
                token_id = self.__create_token_id()
                self.table[token_id] = (handler, result.group(1))
                # Splice by position so exactly the matched span is replaced.
                self.text = (self.text[:result.start()] + token_id
                             + self.text[result.end():])

    def textformat(self):
        """Apply the inline rules and turn newlines into ``<br />``."""
        rules = [
            (r"\[b\](.*?)\[/b\]", self.do_bold),
            (r"\[i\](.*?)\[/i\]", self.do_italic),
            (r"\[u\](.*?)\[/u\]", self.do_underline),
        ]
        text = self.text
        for pattern, handler in rules:
            regex = re.compile(pattern, re.I | re.S | re.M)
            text = handler(regex, text)
        self.text = text.replace("\n", "<br />\n")

    def assamble(self):
        """Put the rendered protected areas back in place of their tokens."""
        text = self.text
        for token, (handler, content) in self.table.items():
            text = text.replace(token, handler(content))
        self.text = text

    def __create_token_id(self):
        """Return a ``~~<md5 hex>~~`` token that is not yet in self.table."""
        # Local import: hashlib replaces the long-deprecated md5 module.
        import hashlib
        while True:
            seed = str(time() + random()).encode("utf-8")
            token = "~~" + hashlib.md5(seed).hexdigest() + "~~"
            if token not in self.table:
                return token

    def do_bold(self, regex, text):
        """Render all matches of *regex* in *text* as <strong>."""
        return regex.sub(r"<strong>\1</strong>", text)

    def do_italic(self, regex, text):
        """Render all matches of *regex* in *text* as <em>."""
        return regex.sub(r"<em>\1</em>", text)

    def do_underline(self, regex, text):
        """Render all matches of *regex* in *text* as <u>."""
        return regex.sub(r"<u>\1</u>", text)

    def do_code(self, content):
        """Wrap *content* in ``<div class="code">`` and keep its indentation.

        The replacements are an ordered list (a dict has no guaranteed order
        on old interpreters) and use explicit ``&nbsp;`` entities, because
        plain runs of spaces are collapsed by browsers.
        """
        rules = [
            ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),
            ("\n ", "\n&nbsp;"),
            ("  ", "&nbsp;&nbsp;"),
        ]
        for needle, replacement in rules:
            content = content.replace(needle, replacement)
        return "<div class=\"code\">" + content + "</div>"

Soweit funktioniert alles. Nur würde ich gerne in textformat auch [url]...[/url] parsen lassen.
Aber alles zwischen diesen beiden Tags soll zuerst durch eine Funktion laufen.
Nur: Wie kann ich das machen?

Meine Idee war in etwa so:

Code: Alles auswählen

    def do_link(self, regex, text):
        """Replace every match of *regex* in *text* via a per-match callback.

        A compiled pattern has no ``group()``; only match objects do. Passing
        a function to ``sub()`` makes it get called once per match (the
        equivalent of PHP's preg_replace_callback), so all occurrences are
        handled and the caption can be processed before it is inserted.
        """
        def render(matchobj):
            caption = matchobj.group(1)
            return "<u>" + caption + "</u>"

        return regex.sub(render, text)

Aber so will das nicht funktionieren, kann es auch nicht, weil ich ja alle ersetzen will und nicht nur das erste Vorkommen.

Genauergesagt will ich sowas wie PHPs preg_replace_callback

Hi blackbird!

Ich bin mir nicht ganz sicher, ob ich das richtig verstanden habe.

Vielleicht hilft dir dieses Beispiel weiter:

Code: Alles auswählen

>>> s = "hallo welta walda; hallo weltb waldb"
>>> re.sub("(welt.).*?(wald.)", r"du grosser \1 eingefuegt \2", s, re.M | re.I | re.S)
'hallo du grosser welta eingefuegt walda; hallo du grosser weltb eingefuegt waldb'
>>>

lg
Gerold

Das funktioniert so natürlich auch:

>>> s = "hallo [b]welta[/b] walda; hallo [b]weltb[/b] waldb"
>>> re.sub(r"\[b\](.*?)\[/b\]", r"\1", s, re.M | re.I | re.S)
'hallo welta walda; hallo weltb waldb'

lg
Gerold

Sorry. Ich habe mich leider ziemlich schlecht ausgedrückt.
Ich will in etwa sowas machen:

Code: Alles auswählen

<?php
$text = "Ich bin ein kleiner [url]www.google.de[/url] Link";
// '#' is used as the delimiter so the '/' inside [/url] does not end the
// pattern early; U makes (.*) ungreedy, so each [url] pair matches on its own.
$text = preg_replace_callback('#\[url\](.*)\[/url\]#Uism', "foobar", $text);

/**
 * Callback for preg_replace_callback: turn one [url]...[/url] match into a link.
 *
 * @param array $args match array; $args[1] is the text between the tags
 * @return string HTML anchor; "http://" is prepended to targets starting with "www."
 */
function foobar($args)
{
    $caption = trim($args[1]);
    $link = trim($args[1]);
    if (substr($link, 0, 4) == "www.") {
        $link = "http://" . $link;
    }
    // Quote and escape both values: they come straight from user input.
    return '<a href="' . htmlspecialchars($link, ENT_QUOTES) . '">'
        . htmlspecialchars($caption, ENT_QUOTES) . '</a>';
}
?>

Hab aber keine Idee, wie ich das mit Python hinkriege

Tja. Wer lesen kann ist klar im Vorteil.
re.sub kann als 2. Parameter eine Funktion aufnehmen.
Das sieht dann in etwa so aus:

Code: Alles auswählen

# -*- coding: utf-8 -*-

import re
from random import random
from md5 import md5
from time import time

class parser:
    """Convert a small BBCode subset ([b], [i], [u], [url], [code]) to HTML.

    Create the parser with the raw text, call ``parse()`` and read the
    result from ``self.text``.

    Inline tags are rendered through ``re.sub`` callbacks. ``[code]`` areas
    are cut out first and parked in ``self.table`` under unique placeholder
    tokens, so markup inside them is not formatted.
    """

    def __init__(self, text):
        self.text = text
        # placeholder token -> (handler, extracted content)
        self.table = {}

    def parse(self):
        """Run extraction, inline formatting and re-assembly on self.text."""
        self.extract()
        self.textformat()
        self.assamble()

    def extract(self):
        """Swap every protected area in self.text for a placeholder token.

        All occurrences are handled, not just the first one, and a text
        without any protected area is simply left alone (no exception is
        raised or swallowed).
        """
        areas = [
            (r"\[code\]", r"\[/code\]", self.do_code),
        ]
        for opening, closing, handler in areas:
            regex = re.compile(opening + "(.*?)" + closing, re.I | re.S | re.M)
            while True:
                result = regex.search(self.text)
                if result is None:
                    break
                token_id = self.__create_token_id()
                self.table[token_id] = (handler, result.group(1))
                # Splice by position so exactly the matched span is replaced.
                self.text = (self.text[:result.start()] + token_id
                             + self.text[result.end():])

    def textformat(self):
        """Apply the inline rules and turn newlines into ``<br />``.

        Each rule pairs a pattern with a callback that receives the match
        object. Global inline flags must lead the pattern; trailing ones are
        rejected by newer versions of the re module.
        """
        rules = [
            (r"(?uism)\[b\](.*?)\[/b\]", self.do_bold),
            (r"(?uism)\[i\](.*?)\[/i\]", self.do_italic),
            (r"(?uism)\[u\](.*?)\[/u\]", self.do_underline),
            (r"(?uism)\[url\](.*?)\[/url\]", self.do_link),
        ]
        text = self.text
        for pattern, handler in rules:
            text = re.sub(pattern, handler, text)
        self.text = text.replace("\n", "<br />\n")

    def assamble(self):
        """Put the rendered protected areas back in place of their tokens."""
        text = self.text
        for token, (handler, content) in self.table.items():
            text = text.replace(token, handler(content))
        self.text = text

    def __create_token_id(self):
        """Return a ``~~<md5 hex>~~`` token that is not yet in self.table."""
        # Local import: hashlib replaces the long-deprecated md5 module.
        import hashlib
        while True:
            seed = str(time() + random()).encode("utf-8")
            token = "~~" + hashlib.md5(seed).hexdigest() + "~~"
            if token not in self.table:
                return token

    def do_bold(self, matchobj):
        """Render one [b] match as <strong>."""
        return "<strong>" + matchobj.group(1) + "</strong>"

    def do_italic(self, matchobj):
        """Render one [i] match as <em>."""
        return "<em>" + matchobj.group(1) + "</em>"

    def do_underline(self, matchobj):
        """Render one [u] match as <u>."""
        return "<u>" + matchobj.group(1) + "</u>"

    def do_link(self, matchobj):
        """Render one [url] match as an anchor pointing at the tag content."""
        link = matchobj.group(1)
        # NOTE(review): fixed caption, presumably a placeholder - confirm
        # whether the link text itself should be shown instead.
        caption = "foobar"
        return "<a href=\"" + link + "\">" + caption + "</a>"

    def do_code(self, content):
        """Wrap *content* in ``<div class="code">`` and keep its indentation.

        The replacements are an ordered list (a dict has no guaranteed order
        on old interpreters) and use explicit ``&nbsp;`` entities, because
        plain runs of spaces are collapsed by browsers.
        """
        rules = [
            ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),
            ("\n ", "\n&nbsp;"),
            ("  ", "&nbsp;&nbsp;"),
        ]
        for needle, replacement in rules:
            content = content.replace(needle, replacement)
        return "<div class=\"code\">" + content + "</div>"
        
if __name__ == "__main__":
    # Small manual demo: parse a text with a short and a long [url] tag and
    # show both the rendered text and the placeholder table.
    text = "Ich bin eine kleine [url]url[/url] und ich bin eine lange: [url]http://www.google.de/adfasdf/fafaf[/url]"
    p = parser(text)
    p.parse()
    # print(x) with a single argument behaves the same as the print statement
    # on old interpreters and is valid syntax on current ones.
    print(p.text)
    print(p.table)

Finde ich ja super Interessant, was du da machst. Weil ich selber mal versucht hab einen Parser zu schreiben: http://www.jensdiemer.de/?face Ist aber schon lange her, deswegen sieht der Code auch dementsprechend aus

Warum machst du es bei den einfachen Textformatierungen so umständlich? Man kann doch die Formatierung direkt durch re.sub durchführen lassen, z.B.:

statt

Code: Alles auswählen

("\[b\](.*?)\[/b\](?uism)", self.do_bold)

einfach:

Code: Alles auswählen

("\[b\](?P<txt>(.|\s)*?)\[/b\]", "<strong>\g<txt></strong>")

Mir gefällt der erzeugte Code nicht so ganz. Aber ich weiß, dass es wirklich schwer ist, anständigen HTML-Code zu erzeugen, der die Leerzeilen richtig mit <p> und <br /> wiedergibt. Ich hatte mir bei meinem TextParser auch ein halbes Bein ausgerissen, damit es halbwegs funktionierte... Aber vielleicht hast du da auch noch eine bessere Lösung?

Dennoch finde ich es sehr interessant. Ich könnte nämlich eine schlankere Version vom Textile-Markup für mein PyLucid gebrauchen! s. auch http://www.python-forum.de/viewtopic.php?p=18484#18484
(Im Übrigen erzeugt Textile die Absätze richtig mit <p> und <br />, jedoch ist dazu einiges an Code nötig. Schau dir das mal im Sourcecode an.)

Ich hab mal selbst rumprobiert:

Code: Alles auswählen

# -*- coding: utf-8 -*-

import re
from random import random
from md5 import md5
from time import time

class parser:
    """Convert a small BBCode subset to HTML with real paragraphs.

    Create the parser with the raw text, call ``parse()`` and read the
    result from ``self.text``.

    Pipeline: protected ``[code]`` areas are cut out and parked in
    ``self.table``, inline tags are rendered, the text is split into
    ``<p>`` paragraphs, and finally the protected areas are put back.
    """

    def __init__(self, text):
        self.text = text
        # placeholder token -> (handler, extracted content)
        self.table = {}

    def parse(self):
        """Run the whole pipeline on self.text."""
        self.extract()
        self.inlineformat()
        self.paragraph()
        self.assamble()

    def extract(self):
        """Extract the special text blocks that assamble() re-inserts later.

        Every occurrence is swapped for a placeholder token, not just the
        first one, and a text without any such block is simply left alone
        (no exception is raised or swallowed).
        """
        areas = [
            (r"\[code\]", r"\[/code\]", self.do_code),
        ]
        for opening, closing, handler in areas:
            regex = re.compile(opening + "(.*?)" + closing, re.I | re.S | re.M)
            while True:
                result = regex.search(self.text)
                if result is None:
                    break
                token_id = self.__create_token_id()
                self.table[token_id] = (handler, result.group(1))
                # Splice by position so exactly the matched span is replaced.
                self.text = (self.text[:result.start()] + token_id
                             + self.text[result.end():])

    def paragraph(self):
        """Split self.text into paragraphs.

        Chunks separated by two or more newlines are each wrapped in a
        <p> tag; single newlines inside a chunk become <br />. Chunks that
        are empty after stripping are dropped, so no empty <p></p> is
        produced.
        """
        output = []
        for chunk in re.split(r'\n{2,}', self.text):
            chunk = chunk.strip()
            if not chunk:
                continue

            # Break lines.
            chunk = self.preg_replace(r'(<br />|\n)+', '<br />\n', chunk)

            # Remove <br /> from inside broken HTML tags.
            chunk = self.preg_replace(r'(<[^>]*)<br />\n(.*?>)', r'\1 \2', chunk)

            output.append("<p>" + chunk + "</p>")

        self.text = '\n\n'.join(output)

    def preg_replace(self, pattern, replacement, text):
        """Alternative re.sub that handles empty groups.

        Acts like re.sub with ``\\N`` backreferences in *replacement*, except
        that a group which did not take part in the match is inserted as ''
        instead of raising an exception. References to groups the pattern
        does not have are left untouched.
        """

        def replacement_func(matchobj):
            groups = matchobj.groups()

            def fill(ref):
                index = int(ref.group(1))
                if 1 <= index <= len(groups):
                    return groups[index - 1] or ''
                return ref.group(0)

            # One pass over the template: text inserted for one group is
            # never re-scanned, and "\1" cannot eat the front of "\10".
            return re.sub(r'\\(\d+)', fill, replacement)

        return re.compile(pattern).sub(replacement_func, text)

    def inlineformat(self):
        """Render the inline tags ([b], [i], [u], [url]) in self.text."""
        rules = [
            # (?s) lets "." span newlines; same matches as "(.|\s)*?" without
            # the per-character alternation.
            (r"(?s)\[b\](?P<txt>.*?)\[/b\]", r"<strong>\g<txt></strong>"),
            (r"(?s)\[i\](?P<txt>.*?)\[/i\]", r"<em>\g<txt></em>"),
            (r"(?s)\[u\](?P<txt>.*?)\[/u\]", r"<u>\g<txt></u>"),
            # Global inline flags must lead the pattern; trailing ones are
            # rejected by newer versions of the re module.
            (r"(?uism)\[url\](.*?)\[/url\]", self.do_link),
        ]
        for pattern, repl in rules:
            self.text = re.sub(pattern, repl, self.text)

    def assamble(self):
        """Re-insert the previously extracted text blocks."""
        text = self.text
        for token, (handler, content) in self.table.items():
            text = text.replace(token, handler(content))
        self.text = text

    def __create_token_id(self):
        """Return a ``~~<md5 hex>~~`` token that is not yet in self.table."""
        # Local import: hashlib replaces the long-deprecated md5 module.
        import hashlib
        while True:
            seed = str(time() + random()).encode("utf-8")
            token = "~~" + hashlib.md5(seed).hexdigest() + "~~"
            if token not in self.table:
                return token

    def do_link(self, matchobj):
        """Render one [url] match as an anchor pointing at the tag content."""
        link = matchobj.group(1)
        # NOTE(review): fixed caption, presumably a placeholder - confirm
        # whether the link text itself should be shown instead.
        caption = "foobar"
        return "<a href=\"" + link + "\">" + caption + "</a>"

    def do_code(self, content):
        """Wrap *content* in ``<div class="code">`` and keep its indentation.

        The replacements are an ordered list (a dict has no guaranteed order
        on old interpreters) and use explicit ``&nbsp;`` entities, because
        plain runs of spaces are collapsed by browsers.
        """
        rules = [
            ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"),
            ("\n ", "\n&nbsp;"),
            ("  ", "&nbsp;&nbsp;"),
        ]
        for needle, replacement in rules:
            content = content.replace(needle, replacement)
        return "<div class=\"code\">" + content + "</div>"

if __name__ == "__main__":
    # Manual demo input, reproduced exactly as posted.
    # NOTE(review): the sample looks altered by the forum software (e.g. the
    # "Code: Alles auswählen" line and the "&#56480;" sequence) - verify
    # against the author's original before relying on it.
    text = """Ein Kleiner [b]TEST[/b]
Ich bin eine kleine [url]http://www.google.de[/url] und ich bin eine lange: http://www.google.de/adfasdf/fafaf

*und wie sieht das aus?*

und das?

hier ist ein wenig code:
[code]
Formatieren kann man mit [Befehl] d.h.:
[url] - F&#56480;Links
[b] - Fett
[code] - Codezeilen

Noch eine Normale Zeile, und wieder code:

Code: Alles auswählen

Hier ist noch mal [code] drin

Geht's?
"""
    # Everything below belongs inside the guard: at module level it would
    # run on import, where ``text`` is not defined.
    p = parser(text)
    p.parse()
    print(p.text)
    # print(p.table)  # uncomment to inspect the placeholder table
Ausgabe:

Ein Kleiner TEST 
Ich bin eine kleine <a href="http://www.google.de">foobar</a> und ich bin eine lange: http://www.google.de/adfasdf/fafaf

*und wie sieht das aus?*

und das?

hier ist ein wenig code: 
<div class="code">
Formatieren kann man mit [Befehl] d.h.:
[url] - F&#56480;Links
- Fett
Code: Alles auswählen
 - Codezeilen
</div>

Noch eine Normale Zeile, und wieder code: 
[code] 
Hier ist noch mal [code] drin 
 
Geht's?

Ich habe folgendes gemacht:

textformat() nach inlineformat() umbenannt und nur noch self.do_link übrig gelassen.
paragraph eingebaut

Ich weiß nicht genau ob man preg_replace() nicht irgendwie kürzer machen kann, bzw. ob die überhaupt notwendig ist.

Wie man an den Ausgaben sieht, ist allerdings der zweite Code-Block nicht erkannt worden ?!?

Ist es eigentlich wirklich nötig mit md5 zu arbeiten??? Könnte man nicht einfach ein dict anlegen und mit einfachen ID's arbeiten? Also einfach: ~~1~~

Hi, Danke fürs feedback.
Also ich bin noch am Arbeiten, also ist auch der Code noch nicht perfekt.
Ich werde einfache Formatierungen natürlich nicht in eigene Funktionen stecken, das war auch nur zum Testen.
Aber der Parser soll durch eine Pluginschnittstelle erweiterbar sein.

Der Grund, warum ich md5 Tokens verwende ist der, dass ich diese Im Text an der Position einsetze, wo später wieder der extrahierte Block hinkommt.
Ein ~~1~~ kann schon mal von einem Benutzer hingeschrieben werden, aber ein ~~irgendein_md5_wert~~ kommt wohl selten vor.
Ich habe vorher sowas drin gehabt: <token(1)> und den Text mit einem regex überprüfen lassen, ob <token([0-9]+)> vorkommt, aber habe mich dann doch für die md5 Lösung entschieden.

Wenn ich fertig damit bin werde ich es posten.

//Edit: Das Parserkonzept habe ich auch schon bei meinem StormCatWiki verwendet, aber dort will ich es bei Gelegenheit überarbeiten. Aber mir macht die Arbeit mit Python mehr Spaß als mit PHP, und so steht dieses Projekt wieder einmal still.

blackbird hat geschrieben:Aber der Parser soll durch eine Pluginschnittstelle erweiterbar sein.

Was auch nicht schlecht wäre: wenn die Formatierungsregeln ausgelagert werden würden... Also der eine nimmt [code], der andere lieber <code>...

[quote="blackbird"]Ein ~~1~~ kann schon mal von einem Benutzer hingeschrieben werden...[/quote]
Das stimmt natürlich, aber man könnte es vorher Escapen!

Ok. Also die Basisfunktionen habe ich mal fertig.
Jetzt werde ich noch den Code aufräumen, die Plugin Schnittstelle einbauen und das Ganze etwas Stresstesten.

Code: Alles auswählen

# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------- #
#   BBCode Parser                                                              #
#   part of pythonBB                                                           #
#       licensed under the GPL                                                 #
# ---------------------------------------------------------------------------- #
#      This program is free software; you can redistribute it and/or           #
#       modify it under the terms of the GNU General Public License            #
#     as published by the Free Software Foundation;  either version 2          #
#         of the License, or (at your option) any later version.               #
# ---------------------------------------------------------------------------- #

import re
from random import random
from md5 import md5
from time import time

# Package metadata: (name, e-mail) of the author and the version triple.
PBB_AUTHOR = (
    "Armin Ronacher",
    "armin.ronacher@active-4.com",
)
PBB_VERSION = (0, 3, 46)

## FIX ME
# User-visible captions used for quote headings; "%s" stands for the author.
LANGUAGE = dict(
    SIMPLE_QUOTE="Zitat:",
    AUTHOR_QUOTE="%s schrieb:",
)

# Usage notes. This string follows other statements, so it is a plain
# expression and does not become the module's __doc__.
"""
    this module implements a simple BBCode parser for pythonBB
    Here a small example for using it:
        from buildin.parser import parser
        
        text = file("input.txt", "r").read()
        p = parser(text)
        text = p.parse()
        file("output.txt", "w").write(text)
"""

class parser:
    """BBCode to HTML converter.

    Create the parser with the raw text and call ``parse()``, which returns
    the generated HTML; ``self.text`` keeps the unmodified input.

    Supported tags: [b], [i], [u], [url], [img], [quote], [code], [list].
    [code] and [list] areas are cut out before the inline rules run and are
    parked in ``self.table`` under unique placeholder tokens.
    """

    def __init__(self, text):
        self.text = text
        # placeholder token -> (handler, match object of the extracted area)
        self.table = {}
        # No parse() call here: its return value would be thrown away and
        # the caller has to call parse() anyway to get the result.

    def parse(self):
        """Convert self.text and return the HTML string."""
        text = self.htmlspecialchars(self.text)
        text = self.extract(text)
        text = self.inlineformat(text)
        text = self.paragraphs(text)
        text = self.assamble(text)
        self.table = {}
        return text

    def htmlspecialchars(self, text):
        """Escape ``&``, ``<`` and ``>`` so user input cannot inject markup.

        ``&`` has to be handled first, otherwise the ampersands of the
        entities produced for ``<`` and ``>`` would be escaped again.
        Quotes are left alone because the [img="..."] and [quote="..."]
        rules match on them later.
        """
        rules = [
            ("&", "&amp;"),
            ("<", "&lt;"),
            (">", "&gt;"),
        ]
        for needle, entity in rules:
            text = text.replace(needle, entity)
        return text

    def extract(self, text):
        """Swap every [code] and [list] area for a placeholder token.

        Returns the text with the tokens in place; the match objects are
        stored in self.table for assamble().
        """
        # Global inline flags must lead the pattern; trailing ones are
        # rejected by newer versions of the re module.
        areas = [
            (r"(?uism)\[code\](.*?)\[/code\]", self.do_code),
            (r"(?uism)\[list(=(.*?))?\](.*?)\[/list\]", self.do_list),
        ]
        for pattern, handler in areas:
            regex = re.compile(pattern)
            while True:
                result = regex.search(text)
                if result is None:
                    break
                token_id = self.__create_token_id()
                self.table[token_id] = (handler, result)
                # Splice by position so exactly the matched span is replaced.
                text = text[:result.start()] + token_id + text[result.end():]
        return text

    def inlineformat(self, text):
        """Render the inline tags in *text* and return the result."""
        rules = [
            (r"(?uism)\[b\](.*?)\[/b\]", r"<strong>\1</strong>"),
            (r"(?uism)\[i\](.*?)\[/i\]", r"<em>\1</em>"),
            (r"(?uism)\[u\](.*?)\[/u\]", r"<u>\1</u>"),
            (r'(?uism)\[url(=(.*?))?\](.*?)\[/url\]', self.do_link),
            (r'(?uism)\[img(="(.*?)")?\](.*?)\[/img\]', self.do_image),
            (r'(?uism)\[quote(="(.*?)")?\](.*?)\[/quote\]', self.do_quote),
        ]
        for pattern, repl in rules:
            text = re.sub(pattern, repl, text)
        return text

    def paragraphs(self, text):
        """Wrap chunks separated by blank lines in <p> and break single lines."""
        return "".join(
            "<p>" + chunk.strip().replace("\n", "<br />\n") + "</p>\n\n"
            for chunk in re.split(r"\n{2,}", text)
        )

    def assamble(self, text):
        """Replace the placeholder tokens in *text* with their rendered areas.

        An extracted area can itself contain the token of another area (a
        [code] block inside a [list]), which only shows up in the text once
        the outer area has been rendered. The table is therefore scanned
        repeatedly until a pass replaces nothing.
        """
        pending = dict(self.table)
        progressed = True
        while pending and progressed:
            progressed = False
            for token in list(pending):
                if token in text:
                    handler, matchobj = pending.pop(token)
                    text = text.replace(token, handler(matchobj))
                    progressed = True
        text = re.sub(r"<p>([\s]*?)</p>", "", text)  # removes empty paragraphs
        return text

    def __create_token_id(self):
        """Return a ``~~<md5 hex>~~`` token that is not yet in self.table."""
        # Local import: hashlib replaces the long-deprecated md5 module.
        import hashlib
        while True:
            seed = str(time() + random()).encode("utf-8")
            token = "~~" + hashlib.md5(seed).hexdigest() + "~~"
            if token not in self.table:
                return token

    def do_link(self, matchobj):
        """Render [url]target[/url] or [url=target]caption[/url]."""
        caption = matchobj.group(3)
        if matchobj.group(1) is None:
            link = matchobj.group(3)
        else:
            link = matchobj.group(2)
        return "<a href=\"" + link + "\">" + caption + "</a>"

    def do_image(self, matchobj):
        """Render [img]src[/img] or [img="title"]src[/img]."""
        href = matchobj.group(3)
        if matchobj.group(1) is None:
            title = matchobj.group(3)
        else:
            title = matchobj.group(2)
        return ("<img src=\"" + href + "\" title=\"" + title
                + "\" alt=\"" + title + "\" />")

    def do_quote(self, matchobj):
        """Render [quote]text[/quote] or [quote="author"]text[/quote].

        The heading comes from the module-level LANGUAGE table; an author
        that already ends with ":" is used as the heading unchanged.
        """
        text = matchobj.group(3)
        if matchobj.group(1) is None:
            title = LANGUAGE["SIMPLE_QUOTE"]
        else:
            title = matchobj.group(2)
            if not title.endswith(":"):
                title = LANGUAGE["AUTHOR_QUOTE"].replace("%s", title)
        # Close the surrounding paragraph first: a <div> may not sit in <p>.
        return ("</p><div class=\"quote\"><h2>" + title + "</h2><p>" + text
                + "</p></div><p>")

    def do_code(self, matchobj):
        """Render an extracted [code] area as a <code> block."""
        content = matchobj.group(1)
        # startswith() instead of content[0]: the area may be empty.
        if content.startswith("\n"):
            content = content[1:]
        return "</p><code>" + content + "</code><p>"

    def do_list(self, matchobj):
        """Render an extracted [list] area.

        ``[list=1]``, ``[list=a]`` and ``[list=i]`` give an <ol> with the
        class numeric, alpha or roman; everything else gives a <ul>. Each
        ``[*]`` line becomes one <li>.
        """
        content = self.inlineformat(matchobj.group(3))
        content = re.sub(r"\[\*\](.*?)\n", r"<li>\1</li>\n", content)
        if matchobj.group(1) is None:
            list_type = "*"
        else:
            list_type = matchobj.group(2)
        ordered_classes = {"1": "numeric", "a": "alpha", "i": "roman"}
        if list_type in ordered_classes:
            result = ("<ol class=\"" + ordered_classes[list_type] + "\">"
                      + content + "</ol>")
        else:
            result = "<ul>" + content + "</ul>"
        return "</p>" + result + "<p>"

Es gibt Probleme beim Zusammenspiel von paragraphs() und [code] :( Bei mir habe ich dasselbe Problem mit Überschriften. Leider weiß paragraphs() nicht, dass Blockelemente separat zu handhaben sind :(

Hier mal ein Beispiel:
[quote]hier ist ein wenig code:
[code]
Formatieren kann man mit [Befehl] d.h.:
[url] - Für Links
[b] - Fett
[code] - Codezeilen

Noch eine Normale Zeile, und wieder code:

Code: Alles auswählen

Hier ist noch mal [code] drin

Geht's?[/quote]
Produziert:
[quote]hier ist ein wenig code: 
<code>Formatieren kann man mit [Befehl] d.h.:
[url] - Für Links
- Fett

Code: Alles auswählen

 - Codezeilen
</code>

<p>Noch eine Normale Zeile, und wieder code:<br />
</p><code>Hier ist noch mal [code] drin</code><p><br />
Geht's?</p>[/quote]

blackbird hat geschrieben:Ok. Also die Basisfunktionen habe ich mal fertig.
Jetzt werde ich noch den Code aufräumen, die Plugin Schnittstelle einbauen und das Ganze etwas Stresstesten.

Wie sieht's eigentlich aus? Hast du weiter gemacht???

EDIT: Ist wohl das hier http://trac.pocoo.org/browser/pocoo/tru ... /bbcode.py daraus geworden

Das deutsche Python-Forum

regex und Funktionen

regex und Funktionen