c-sscanf

Code-Stücke können hier veröffentlicht werden.
Antworten
jerch
User
Beiträge: 1669
Registriert: Mittwoch 4. März 2009, 14:19

Zwar empfiehlt die Python-Dokumentation, scanf mittels regulärer Ausdrücke nachzubilden, allerdings war mir der Parse-Aufwand für Spezialfälle zu hoch. Deshalb hier eine Variante, die per ctypes direkt sscanf bzw. swscanf der C-Bibliothek aufruft (nur Linux, Python 2):

Code: Alles auswählen

# coding: utf-8

import re
from ctypes import (
    CDLL, create_string_buffer, create_unicode_buffer, byref,
    c_byte, c_ubyte,
    c_short, c_ushort,
    c_int, c_uint,
    c_long, c_ulong,
    c_longlong, c_ulonglong,
    c_size_t, c_ssize_t,
    c_float, c_double, c_longdouble,
    c_char_p, c_wchar_p, c_void_p)

# Handle to the C library; its sscanf/swscanf entry points do the actual
# parsing. The soname is Linux/glibc specific.
libc = CDLL(name="libc.so.6")

# Maps a scanf conversion (length modifier + conversion character) to a
# callable that creates the ctypes object receiving the converted value.
# Numeric entries are ctypes types and are called without arguments; the
# string entries ('c', 's', ']' and the 'l' variants) are buffer factories
# and are called with the buffer size in elements.
#
# Length modifiers: 'j' = intmax_t/uintmax_t, 'z' = size_t or its signed
# counterpart, 't' = ptrdiff_t or its unsigned counterpart. ctypes has no
# ptrdiff_t type; the pointer-sized c_ssize_t/c_size_t pair is used instead.
C_SCANF_TYPES = {
    # signed integers, base auto-detected
    'i'   : c_int,
    'hhi' : c_byte,
    'hi'  : c_short,
    'li'  : c_long,
    'lli' : c_longlong,
    'ji'  : c_longlong,
    'zi'  : c_ssize_t,
    'ti'  : c_ssize_t,

    # signed decimal integers
    'd'   : c_int,
    'hhd' : c_byte,
    'hd'  : c_short,
    'ld'  : c_long,
    'lld' : c_longlong,
    'jd'  : c_longlong,
    'zd'  : c_ssize_t,
    'td'  : c_ssize_t,

    # unsigned decimal integers
    'u'   : c_uint,
    'hhu' : c_ubyte,
    'hu'  : c_ushort,
    'lu'  : c_ulong,
    'llu' : c_ulonglong,
    'ju'  : c_ulonglong,
    'zu'  : c_size_t,
    'tu'  : c_size_t,

    # unsigned octal integers
    'o'   : c_uint,
    'hho' : c_ubyte,
    'ho'  : c_ushort,
    'lo'  : c_ulong,
    'llo' : c_ulonglong,
    'jo'  : c_ulonglong,
    'zo'  : c_size_t,
    'to'  : c_size_t,

    # unsigned hexadecimal integers
    'x'   : c_uint,
    'hhx' : c_ubyte,
    'hx'  : c_ushort,
    'lx'  : c_ulong,
    'llx' : c_ulonglong,
    'jx'  : c_ulonglong,
    'zx'  : c_size_t,
    'tx'  : c_size_t,

    # floating point
    'f'   : c_float,
    'lf'  : c_double,
    'Lf'  : c_longdouble,
    'e'   : c_float,
    'le'  : c_double,
    'Le'  : c_longdouble,
    'g'   : c_float,
    'lg'  : c_double,
    'Lg'  : c_longdouble,
    'a'   : c_float,
    'la'  : c_double,
    'La'  : c_longdouble,

    # characters and strings: char buffers, wchar_t buffers for 'l'
    'c'   : create_string_buffer,
    'lc'  : create_unicode_buffer,
    's'   : create_string_buffer,
    'ls'  : create_unicode_buffer,

    # scanset "%[...]"; the wide variant "%l[...]" is selected in sscanf()
    ']'   : create_string_buffer,

    # pointer value
    'p'   : c_void_p,

    # number of characters consumed so far (always a signed type)
    'n'   : c_int,
    'hhn' : c_byte,
    'hn'  : c_short,
    'ln'  : c_long,
    'lln' : c_longlong,
    'jn'  : c_longlong,
    'zn'  : c_ssize_t,
    'tn'  : c_ssize_t,
}


# Matches one scanf directive: either the literal "%%" or a conversion made
# of an optional "*" (assignment suppression), an optional field width, an
# optional length modifier and a conversion letter or a "[...]" scanset.
_SCANF_DIRECTIVE = re.compile(r"""
    %
    (?:
        (?P<percent>%)
      |
        (?P<suppress>\*)?
        (?P<width>[0-9]+)?
        (?P<modifier>hh|ll|[hljztL])?
        (?P<conv>\[\^?\]?[^\]]*\]|[A-Za-z])
    )
    """, re.VERBOSE)

# Upper bound for the bytes one wide character may expand to when swscanf
# stores into a narrow buffer (value of MB_LEN_MAX in glibc's <limits.h>).
_MB_LEN_MAX = 16


def sscanf(fmt, s):
    """
    Parse ``s`` with the C library's sscanf (swscanf for unicode input).

    ``fmt`` is a scanf format string. One ctypes object is created per
    storing conversion, looked up in C_SCANF_TYPES by length modifier and
    conversion character. "%%" and suppressed conversions ("%*d") take no
    object. The upper-case forms %A, %E, %F, %G and %X are treated like
    their lower-case counterparts. For wide character targets use the
    l-versions of the string specifiers (%ls instead of %s).

    Returns a list with the values of the conversions that were actually
    performed, in format order. A %n conversion is included when the scan
    reached it, although the C function does not count it in its return
    value.

    Raises ValueError for a conversion that has no entry in C_SCANF_TYPES.
    """
    wide_input = isinstance(s, unicode)

    # A conversion can consume the whole input, and scanf then appends a
    # terminating NUL, so every buffer needs one element more than len(s).
    wide_len = len(s) + 1
    if wide_input:
        # swscanf converts wide characters to multibyte sequences when the
        # target is a char buffer; each may take several bytes.
        narrow_len = len(s) * _MB_LEN_MAX + 1
    else:
        narrow_len = wide_len

    # One entry per storing conversion:
    # (ctypes object, counted in scanf's return value?, "never written" value)
    slots = []
    for match in _SCANF_DIRECTIVE.finditer(fmt):
        if match.group('percent') or match.group('suppress'):
            continue

        modifier = match.group('modifier') or ''
        conv = match.group('conv')
        is_scanset = conv[0] == '['

        if is_scanset:
            if modifier == 'l':
                ctor = create_unicode_buffer
            else:
                ctor = C_SCANF_TYPES.get(modifier + ']')
        else:
            if conv in 'AEFGX':
                conv = conv.lower()
            ctor = C_SCANF_TYPES.get(modifier + conv)
        if ctor is None:
            raise ValueError('cannot handle token "%s"' % match.group(0))

        if is_scanset or conv in ('c', 's'):
            size = wide_len if modifier == 'l' else narrow_len
            slots.append((ctor(size), True, None))
        elif conv == 'n':
            # %n only ever stores a non-negative count, so a preset of -1
            # reveals afterwards whether the scan got this far.
            obj = ctor(-1)
            slots.append((obj, False, obj.value))
        else:
            slots.append((ctor(), True, None))

    args = [byref(obj) for obj, _, _ in slots]
    if wide_input:
        filled = libc.swscanf(
            create_unicode_buffer(s), create_unicode_buffer(fmt), *args)
    else:
        filled = libc.sscanf(
            create_string_buffer(s), create_string_buffer(fmt), *args)

    # `filled` counts only successful non-%n conversions (and is negative on
    # EOF), so walk the slots and stop at the first one that was not stored.
    result = []
    remaining = filled
    for obj, counted, unset in slots:
        if counted:
            if remaining <= 0:
                break
            remaining -= 1
        elif obj.value == unset:
            break
        result.append(obj.value)
    return result


if __name__ == '__main__':
    # Smoke tests: every (format, input) pair is scanned and the resulting
    # list is printed, one line per pair.
    demo_cases = (
        # narrow and wide strings
        ('%s %s %%', 'abc defg'),
        (u'%ls %ls %%', u'abc defg'),

        # non-ASCII wide input
        (u'%ls', u'äüöß'),

        # mixed string and numeric conversions
        ('%5c %s - %d %f %x', 'ttttt abc - 123 -123.12345e-12 1b'),
        (u'%5lc %ls - %d %f %x', u'ttttt abc - 123 -123.12345e-12 1b'),

        # suppressed conversions
        ('%*5c%s', 'tttttabc'),
        (u'%*5lc%s', u'tttttabc'),

        # wide scansets
        (u'%3l[ä]%*l[ä] %d', u'ääääääääääää 1'),
    )
    for demo_fmt, demo_input in demo_cases:
        print(sscanf(demo_fmt, demo_input))
Antworten