rudimentären Interpreter basteln

dvdlly · Samstag 17. Juli 2021, 18:44

Hi,

Ich bastle gerade einen einfachen Interpreter, der bisher additionsbefehle ausführt. Für die Eingabe wird ein Fehler geworfen:

File "interpreter.py", line 111, in parseInt
return int(result)
ValueError: invalid literal for int() with base 10: ''

Ich verstehe nicht warum, denn der Eingabestring hat keine whitespaces. Kann mir jemand sagen was falsch ist?

Code: Alles auswählen

# Token types
#
# EOF (end-of-file) token is used to indicate that
# there is no more input left for lexical analysis
INTEGER, PLUS, EOF = 'INTEGER', 'PLUS','EOF'


class Token(object):
    def __init__(self, type, value):
        # token type: INTEGER, PLUS, or EOF
        self.type = type
        # token value: 0, 1, 2. 3, 4, 5, 6, 7, 8, 9, '+', or None
        self.value = value

    def __str__(self):
        """
        String representation of the class instance.

        Examples:
            Token(INTEGER, 3)
            Token(PLUS '+')
        """
        return 'Token({type}, {value})'.format(
            type=self.type,
            value=repr(self.value)
        )

    def __repr__(self):
        return self.__str__()


class Interpreter(object):
    def __init__(self, text):
        # client string input, e.g. "3+5"
        self.text = text
        # self.pos is an index into self.text
        self.pos = 0
        # current token instance
        self.current_token = None

        self.current_char = self.text[self.pos]

    def error(self):
        raise Exception('Error parsing input')


    def get_next_token(self):
        """Lexical analyzer (also known as scanner or tokenizer)

        This method is responsible for breaking a sentence
        apart into tokens. One token at a time.
        """
        text = self.text

        # is self.pos index past the end of the self.text ?
        # if so, then return EOF token because there is no more
        # input left to convert into tokens
        if self.pos > len(text) - 1:
            return Token(EOF, None)

        # get a character at the position self.pos and decide
        # what token to create based on the single character
        current_char = text[self.pos]

        while (current_char == " "):
            self.pos +=1
            current_char = text[self.pos]

        # if the character is a digit then convert it to
        # integer, create an INTEGER token, increment self.pos
        # index to point to the next character after the digit,
        # and return the INTEGER token
        if current_char.isdigit():
            token = Token(INTEGER, self.parseInt())
            return token

        if current_char == '+':
            token = Token(PLUS, current_char)
            self.pos += 1
            return token

        if current_char == ' ':
            self.pos += 1
            return None

        self.error()

    def eat(self, token_type):
        # compare the current token type with the passed token
        # type and if they match then "eat" the current token
        # and assign the next token to the self.current_token,
        # otherwise raise an exception.
        if self.current_token.type == token_type:
            self.current_token = self.get_next_token()
        else:
            self.error()

    def advance(self):
        self.pos += 1
        if self.pos > len(self.text)-1:
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def parseInt(self):
        result = ""
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        print(result)
        return int(result)



    def expr(self):
        """expr -> INTEGER PLUS INTEGER"""
        # set current token to the first token taken from the input
        self.current_token = self.get_next_token()

        # we expect the current token to be a single-digit integer

        lval = self.current_token
        self.eat(INTEGER)
        # we expect the current token to be a '+' token
        self.current_token = self.get_next_token()
        self.eat(PLUS)

        # we expect the current token to be a single-digit integer
        self.current_token = self.get_next_token()
        rval = self.current_token
        self.eat(INTEGER)
        # after the above call the self.current_token is set to
        # EOF token

        # at this point INTEGER PLUS INTEGER sequence of tokens
        # has been successfully found and the method can just
        # return the result of adding two integers, thus
        # effectively interpreting client input
        result = lval + rval
        return result


def main():
    while True:
        try:
            # To run under Python3 replace 'raw_input' call
            # with 'input'
            text = input('calc> ')
        except EOFError:
            break
        if not text:
            continue
        interpreter = Interpreter(text)
        result = interpreter.expr()
        print(result)


if __name__ == '__main__':
    main()

__blackjack__ · Sonntag 18. Juli 2021, 00:15

@dvdlly: Also für meinen Geschmack wird `get_next_token()` zu oft aufgerufen. Das passiert jedes mal in `eat()` und `expr()` ruft `eat()` *und* `get_next_token()` auf.

Weitere Anmerkungen: Von `object` braucht man nicht mehr explizit erben.

Für die Token-Typ-Konstanten könnte man das `enum`-Modul verwenden.

`__str__()` und `__repr__()` sind ”falsch herum” implementiert. Du willst hier nur `__repr__()` implementieren, und zwar so wie Du `__str__()` implementiert hast, denn `object.__str__()` ruft `__repr__()` auf. Das braucht man dann gar nicht selbst implementieren.

Letztlich würde ich bei der Klasse aber einfach einen Typ mit `collections.namedtuple()` erstellen.

Code: Alles auswählen

In [263]: from collections import namedtuple                                    

In [264]: INTEGER, PLUS, EOF = "INTEGER", "PLUS", "EOF"  # TODO Use `enum`? 
     ...:  
     ...: Token = namedtuple("Token", "type value")                             

In [265]: Token(INTEGER, 42)                                                    
Out[265]: Token(type='INTEGER', value=42)

In [266]: Token(INTEGER, 42).type                                               
Out[266]: 'INTEGER'

In [267]: Token(INTEGER, 42).value                                              
Out[267]: 42

Die `error()`-Methode ist nicht wirklich eine Methode.

Es fehlen Unit-Tests. Die sind bei so etwas wirklich hilfreich, weil man einen Haufen Testfälle schreiben kann, die man dann immer bei Code-Änderungen laufen lassen kann, um zu sehen ob man nichts kaputt verbessert hat. Zum Beispiel könntest Du mal eine ansonsten gültige Eingabe mit Leerzeichen am Ende verfüttern, also "1+2 ". Für mein empfinden sollte das gehen, der Code macht da aber im Moment sicher nicht mit.

In welchem Fall würde denn das ``if current_char == " ":`` am Ende von `get_next_token()` ausgeführt? Die ``while``-Schleife die Leerzeichen überspringt sollte das eigentlich zu totem Code machen, oder? Auch ein Fall für Unit-Tests + Coverage.

`parseInt()` sollte `parse_int()` geschrieben werden.

``continue`` würde ich vermeiden. Man kann das in den allermeisten Fällen genau so gut ohne diese Anweisung ausdrücken, deren Sprung im Code nicht durch die eingerückte Struktur sichtbar gemacht wird.

Man muss nicht jedes Zwischenergebnis an einen Namen binden.

Zwischenstand, ungetestet und immer noch mit dem Ursprungsproblem:

Code: Alles auswählen

#!/usr/bin/env python3
from collections import namedtuple

# Token types
#
# EOF (end-of-file) token is used to indicate that
# there is no more input left for lexical analysis
INTEGER, PLUS, EOF = "INTEGER", "PLUS", "EOF"  # TODO Use `enum`?


Token = namedtuple("Token", "type value")


class Interpreter:
    def __init__(self, text):
        # client string input, e.g. "3+5"
        self.text = text
        # self.pos is an index into self.text
        self.pos = 0
        # current token instance
        self.current_token = None

        self.current_char = self.text[self.pos]

    @staticmethod
    def error():
        raise Exception("Error parsing input")

    def advance(self):
        self.pos += 1
        self.current_char = (
            None if self.pos > len(self.text) - 1 else self.text[self.pos]
        )

    def parse_int(self):
        result = ""
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        # print(result)
        return int(result)

    def get_next_token(self):
        """Lexical analyzer (also known as scanner or tokenizer)

        This method is responsible for breaking a sentence
        apart into tokens. One token at a time.
        """
        text = self.text

        # is self.pos index past the end of the self.text ?
        # if so, then return EOF token because there is no more
        # input left to convert into tokens
        if self.pos > len(text) - 1:
            return Token(EOF, None)

        # get a character at the position self.pos and decide
        # what token to create based on the single character
        current_char = text[self.pos]

        while current_char == " ":
            self.pos += 1
            current_char = text[self.pos]

        # if the character is a digit then convert it to
        # integer, create an INTEGER token, increment self.pos
        # index to point to the next character after the digit,
        # and return the INTEGER token
        if current_char.isdigit():
            return Token(INTEGER, self.parse_int())

        if current_char == "+":
            self.pos += 1
            return Token(PLUS, current_char)
        #
        # FIXME Dead Code.
        #
        if current_char == " ":
            self.pos += 1
            return None

        self.error()

    def eat(self, token_type):
        # compare the current token type with the passed token
        # type and if they match then "eat" the current token
        # and assign the next token to the self.current_token,
        # otherwise raise an exception.
        if self.current_token.type == token_type:
            self.current_token = self.get_next_token()
        else:
            self.error()

    def expr(self):
        """expr -> INTEGER PLUS INTEGER"""
        # set current token to the first token taken from the input
        self.current_token = self.get_next_token()

        # we expect the current token to be a single-digit integer

        lval = self.current_token
        self.eat(INTEGER)
        # we expect the current token to be a "+" token
        self.current_token = self.get_next_token()
        self.eat(PLUS)

        # we expect the current token to be a single-digit integer
        self.current_token = self.get_next_token()
        rval = self.current_token
        self.eat(INTEGER)
        # after the above call the self.current_token is set to
        # EOF token

        # at this point INTEGER PLUS INTEGER sequence of tokens
        # has been successfully found and the method can just
        # return the result of adding two integers, thus
        # effectively interpreting client input
        return lval + rval


def main():
    while True:
        try:
            text = input("calc> ")
        except EOFError:
            break
        
        if text:
            print(Interpreter(text).expr())


if __name__ == "__main__":
    main()

Ich finde es übrigens komisch das die kleineste Eingabeeinheit eine Addition ist und nicht eine einfache Zahl. Und die Kommentare suggerieren, dass die Zahlen nur aus einer Ziffer bestehen dürfen, der Code parst aber scheinbar auch ganze Zahlen mit mehr Ziffern.

dirk009 · Sonntag 18. Juli 2021, 00:18

Hallo @dvdlly,

a) in der Methode get_next_token hast Du vergessen self vor current_char zu schreiben. (current_char -> self.current_char)
b) in der Methode expr wird get_next_token zu oft aufgerufen (zweimal auskommentiert)
c) in der Methode expr fehlt der Zusatz .value (result = lval.value + rval.value)

Testlauf (mit Debugausgaben):
calc> 1+3
1
INTEGER
PLUS
3
INTEGER
4
calc>

Cheers,
Dirk

Code: Alles auswählen

# Token types
#
# EOF (end-of-file) token is used to indicate that
# there is no more input left for lexical analysis
INTEGER, PLUS, EOF = 'INTEGER', 'PLUS','EOF'


class Token(object):
    def __init__(self, type, value):
        # token type: INTEGER, PLUS, or EOF
        self.type = type
        # token value: 0, 1, 2. 3, 4, 5, 6, 7, 8, 9, '+', or None
        self.value = value

    def __str__(self):
        """
        String representation of the class instance.

        Examples:
            Token(INTEGER, 3)
            Token(PLUS '+')
        """
        return 'Token({type}, {value})'.format(
            type=self.type,
            value=repr(self.value)
        )

    def __repr__(self):
        return self.__str__()


class Interpreter(object):
    def __init__(self, text):
        # client string input, e.g. "3+5"
        self.text = text
        # self.pos is an index into self.text
        self.pos = 0
        # current token instance
        self.current_token = None

        self.current_char = self.text[self.pos]

    def error(self):
        raise Exception('Error parsing input')


    def get_next_token(self):
        """Lexical analyzer (also known as scanner or tokenizer)

        This method is responsible for breaking a sentence
        apart into tokens. One token at a time.
        """
        text = self.text

        # is self.pos index past the end of the self.text ?
        # if so, then return EOF token because there is no more
        # input left to convert into tokens
        if self.pos > len(text) - 1:
            return Token(EOF, None)

        # get a character at the position self.pos and decide
        # what token to create based on the single character
        self.current_char = text[self.pos]

        while (self.current_char == " "):
            self.pos +=1
            self.current_char = text[self.pos]

        # if the character is a digit then convert it to
        # integer, create an INTEGER token, increment self.pos
        # index to point to the next character after the digit,
        # and return the INTEGER token
        if self.current_char.isdigit():
            token = Token(INTEGER, self.parseInt())
            return token

        if self.current_char == '+':
            token = Token(PLUS, self.current_char)
            self.pos += 1
            return token

        if self.current_char == ' ':
            self.pos += 1
            return None

        self.error()

    def eat(self, token_type):
        # compare the current token type with the passed token
        # type and if they match then "eat" the current token
        # and assign the next token to the self.current_token,
        # otherwise raise an exception.
        print(self.current_token.type)
        if self.current_token.type == token_type:
            self.current_token = self.get_next_token()
        else:
            self.error()

    def advance(self):
        self.pos += 1
        if self.pos > len(self.text)-1:
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def parseInt(self):
        result = ""
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        print(result)
        return int(result)



    def expr(self):
        """expr -> INTEGER PLUS INTEGER"""
        # set current token to the first token taken from the input
        self.current_token = self.get_next_token()

        # we expect the current token to be a single-digit integer

        lval = self.current_token
        self.eat(INTEGER)
        # we expect the current token to be a '+' token
        # self.current_token = self.get_next_token()
        self.eat(PLUS)

        # we expect the current token to be a single-digit integer
        #self.current_token = self.get_next_token()
        rval = self.current_token
        self.eat(INTEGER)
        # after the above call the self.current_token is set to
        # EOF token

        # at this point INTEGER PLUS INTEGER sequence of tokens
        # has been successfully found and the method can just
        # return the result of adding two integers, thus
        # effectively interpreting client input
        result = lval.value + rval.value
        return result


def main():
    while True:
        try:
            # To run under Python3 replace 'raw_input' call
            # with 'input'
            text = input('calc> ')
        except EOFError:
            break
        if not text:
            continue
        interpreter = Interpreter(text)
        result = interpreter.expr()
        print(result)


if __name__ == '__main__':
    main()