c-sscanf
Verfasst: Mittwoch 9. März 2016, 17:32
Zwar empfiehlt die Pythondoku, scanf mittels Regex zu implementieren, allerdings war mir der Parseaufwand für Spezialfälle zu hoch, deshalb hier eine Variante mittels ctypes unter Linux (Python 2):
Code: Alles auswählen
# coding: utf-8
import re
from ctypes import (
    CDLL, create_string_buffer, create_unicode_buffer, byref,
    c_byte, c_ubyte,
    c_short, c_ushort,
    c_int, c_uint,
    c_long, c_ulong,
    c_longlong, c_ulonglong,
    c_size_t, c_ssize_t,
    c_float, c_double, c_longdouble,
    c_char_p, c_wchar_p, c_void_p)
# Handle to the C library, loaded by its shared-object name "libc.so.6";
# sscanf() in this file calls libc.sscanf / libc.swscanf through it.
libc = CDLL("libc.so.6")
# Maps "<length modifier><conversion>" of a scanf directive to a callable
# that creates the ctypes object the directive writes into.
#
# Numeric entries are ctypes scalar types and are called without arguments.
# The string entries ('c', 's', ']' and their 'l' variants) are buffer
# factories and are called with the buffer capacity.
#
# Signed conversions (d, i, n) use signed types and unsigned conversions
# (u, o, x) use unsigned types, so negative / large values read back
# correctly. ctypes has no ptrdiff_t type, therefore the 't' modifier uses
# c_ssize_t / c_size_t.
# NOTE(review): this assumes ptrdiff_t has the width of ssize_t on the
# target platform -- confirm when porting beyond Linux.
C_SCANF_TYPES = {
    # %i -- signed integer, base detected from the prefix
    'i': c_int,
    'hhi': c_byte,
    'hi': c_short,
    'li': c_long,
    'lli': c_longlong,
    'ji': c_longlong,
    'zi': c_ssize_t,
    'ti': c_ssize_t,
    # %d -- signed decimal integer
    'd': c_int,
    'hhd': c_byte,
    'hd': c_short,
    'ld': c_long,
    'lld': c_longlong,
    'jd': c_longlong,
    'zd': c_ssize_t,
    'td': c_ssize_t,
    # %u -- unsigned decimal integer
    'u': c_uint,
    'hhu': c_ubyte,
    'hu': c_ushort,
    'lu': c_ulong,
    'llu': c_ulonglong,
    'ju': c_ulonglong,
    'zu': c_size_t,
    'tu': c_size_t,
    # %o -- unsigned octal integer
    'o': c_uint,
    'hho': c_ubyte,
    'ho': c_ushort,
    'lo': c_ulong,
    'llo': c_ulonglong,
    'jo': c_ulonglong,
    'zo': c_size_t,
    'to': c_size_t,
    # %x -- unsigned hexadecimal integer
    'x': c_uint,
    'hhx': c_ubyte,
    'hx': c_ushort,
    'lx': c_ulong,
    'llx': c_ulonglong,
    'jx': c_ulonglong,
    'zx': c_size_t,
    'tx': c_size_t,
    # %f, %e, %g, %a -- floating point
    'f': c_float,
    'lf': c_double,
    'Lf': c_longdouble,
    'e': c_float,
    'le': c_double,
    'Le': c_longdouble,
    'g': c_float,
    'lg': c_double,
    'Lg': c_longdouble,
    'a': c_float,
    'la': c_double,
    'La': c_longdouble,
    # character / string conversions: factories taking the buffer capacity
    'c': create_string_buffer,
    'lc': create_unicode_buffer,
    's': create_string_buffer,
    'ls': create_unicode_buffer,
    # scanset "%[...]" is keyed by its closing bracket; the wide variant
    # "%l[...]" is selected inside sscanf()
    ']': create_string_buffer,
    # %p -- pointer value
    'p': c_void_p,
    # %n -- number of characters consumed so far
    'n': c_int,
    'hhn': c_byte,
    'hn': c_short,
    'ln': c_long,
    'lln': c_longlong,
    'jn': c_longlong,
    'zn': c_size_t,
    'tn': c_ssize_t,
}
# One scanf directive: either the literal "%%" or a conversion specification
# consisting of an optional "*" (group 1), an optional field width (group 2),
# an optional length modifier (group 3) and the conversion itself (group 4):
# a single letter or a complete "[...]" scanset. A "]" directly after "[" or
# "[^" belongs to the scanset.
_SSCANF_DIRECTIVE = re.compile(
    r'%(?:%|(\*)?(\d+)?(hh|h|ll|l|j|z|t|L)?(\[\^?\]?[^\]]*\]|[A-Za-z]))')

# Upper bound used for the number of bytes one wide character may occupy
# after conversion to a multibyte sequence.
# NOTE(review): presumably matches glibc's MB_LEN_MAX -- confirm.
_SSCANF_MB_LEN_MAX = 16


def sscanf(fmt, s):
    """
    clib sscanf for Python 2.

    Parses ``s`` according to the scanf format ``fmt`` by calling the C
    library: ``swscanf`` when ``s`` is a ``unicode`` object, ``sscanf``
    otherwise. For unicode results use the l-versions of the string
    specifiers (%ls instead of %s).

    Parameters:
        fmt -- scanf format string
        s   -- input string to parse (``str`` or ``unicode``)

    Returns a list with the values of the filled specifiers in format
    order. Conversions that were not reached are left out; the list is
    empty when the C function reports EOF. A ``%n`` value is included only
    when a later conversion succeeded, because ``%n`` is not part of the
    count the C function returns and a trailing ``%n`` cannot be told apart
    from one that was never reached.

    Raises ValueError for a ``%`` that does not start a supported
    directive or for an unsupported length modifier / conversion pair.
    """
    wide = isinstance(s, unicode)
    # "+ 1" leaves room for the terminator written by %s and %[...];
    # multibyte output produced from wide input may need several bytes
    # per character.
    wchar_capacity = len(s) + 1
    if wide:
        char_capacity = len(s) * _SSCANF_MB_LEN_MAX + 1
    else:
        char_capacity = len(s) + 1

    # one (ctypes object, is_percent_n) pair per assigning directive
    slots = []
    pos = fmt.find('%')
    while pos != -1:
        match = _SSCANF_DIRECTIVE.match(fmt, pos)
        if match is None:
            # Never hand a directive to C that was not understood here:
            # it might consume a pointer argument that was not supplied.
            raise ValueError('cannot handle token "%s"' % fmt[pos:pos + 2])
        suppress, modifier, conv = match.group(1, 3, 4)
        spec = match.group(0)[1:]
        pos = fmt.find('%', match.end())
        if conv is None or suppress:
            # literal "%%" or assignment-suppressed "%*...": no argument
            continue
        modifier = modifier or ''
        if conv.startswith('['):
            # scanset: wide buffer for "%l[...]", byte buffer otherwise
            if modifier == 'l':
                obj = create_unicode_buffer(wchar_capacity)
            elif not modifier:
                obj = create_string_buffer(char_capacity)
            else:
                raise ValueError('cannot handle token "%%%s"' % spec)
        else:
            try:
                ctor = C_SCANF_TYPES[modifier + conv]
            except KeyError:
                raise ValueError('cannot handle token "%%%s"' % spec)
            if conv in ('c', 's'):
                # string types need an explicit capacity
                if modifier == 'l':
                    obj = ctor(wchar_capacity)
                else:
                    obj = ctor(char_capacity)
            else:
                obj = ctor()
        slots.append((obj, conv == 'n'))

    pointers = [byref(obj) for obj, _ in slots]
    if wide:
        filled = libc.swscanf(
            create_unicode_buffer(s), create_unicode_buffer(fmt), *pointers)
    else:
        filled = libc.sscanf(
            create_string_buffer(s), create_string_buffer(fmt), *pointers)

    # "filled" counts successful assignments without %n and is negative
    # on EOF.
    result = []
    remaining = filled
    for obj, is_count in slots:
        if remaining <= 0:
            break
        result.append(obj.value)
        if not is_count:
            remaining -= 1
    return result
if __name__ == '__main__':
# some tests
print sscanf('%s %s %%', 'abc defg')
print sscanf(u'%ls %ls %%', u'abc defg')
print sscanf(u'%ls', u'äüöß')
print sscanf('%5c %s - %d %f %x', 'ttttt abc - 123 -123.12345e-12 1b')
print sscanf(u'%5lc %ls - %d %f %x', u'ttttt abc - 123 -123.12345e-12 1b')
print sscanf('%*5c%s', 'tttttabc')
print sscanf(u'%*5lc%s', u'tttttabc')
print sscanf(u'%3l[ä]%*l[ä] %d', u'ääääääääääää 1')