#!/usr/bin/python
"""
check-utf8 [files]
Checks each given file for well-formed UTF-8. See RFC 2279.
"""
import sys
def check_file(fl):
    """Scan a byte stream and report the first malformed UTF-8 sequence.

    The check is purely structural: every lead byte must announce a
    sequence length (1 to 6 bytes, the range allowed by RFC 2279) and must
    be followed by exactly that many continuation bytes (bit pattern
    10xxxxxx).  Overlong encodings, surrogates and out-of-range code points
    are NOT rejected by this function.

    Args:
        fl: A file-like object whose ``read(n)`` returns byte strings.
            Real files should be opened in binary mode.  The stream is
            consumed in chunks, so its position after the call is
            unspecified.

    Returns:
        ``None`` if the whole stream is well-formed.  Otherwise a tuple
        ``(line, column)`` locating the problem: ``line`` is 1-based and
        advances on every ``"\\n"``; ``column`` is the number of complete
        characters already seen on that line before the offending sequence
        started.  A multi-byte sequence cut off by end-of-file is reported
        the same way.
    """
    chunk_size = 65536

    # (mask, expected value, number of continuation bytes that must follow)
    lead_patterns = (
        (0xE0, 0xC0, 1),  # 110xxxxx
        (0xF0, 0xE0, 2),  # 1110xxxx
        (0xF8, 0xF0, 3),  # 11110xxx
        (0xFC, 0xF8, 4),  # 111110xx  (RFC 2279 only)
        (0xFE, 0xFC, 5),  # 1111110x  (RFC 2279 only)
    )

    pending = 0  # continuation bytes still owed by the current sequence
    line = 1
    column = 0

    while True:
        chunk = fl.read(chunk_size)
        if not chunk:
            # End of input: an unfinished sequence is an error.
            if pending:
                return (line, column)
            return None

        # bytearray() yields integers on both Python 2 and Python 3.
        for byte in bytearray(chunk):
            if pending:
                if (byte & 0xC0) != 0x80:
                    return (line, column)
                pending -= 1
                if not pending:
                    # The multi-byte character is complete; it occupies
                    # one column just like an ASCII character does.
                    column += 1
            elif byte < 0x80:
                if byte == 0x0A:  # "\n"
                    line += 1
                    column = 0
                else:
                    column += 1
            else:
                for mask, value, count in lead_patterns:
                    if (byte & mask) == value:
                        pending = count
                        break
                else:
                    # Stray continuation byte, or 0xFE / 0xFF.
                    return (line, column)
def main(argv=None):
    """Check every file named on the command line and report failures.

    Args:
        argv: Full argument vector including the program name at index 0.
            Defaults to ``sys.argv``.

    For each file that cannot be opened or read, a message is written to
    standard error.  For each file that is not well-formed UTF-8, the
    position returned by ``check_file`` is printed to standard output.
    """
    if argv is None:
        argv = sys.argv

    # Skip argv[0]: it is the script's own path, not a file to check.
    for filename in argv[1:]:
        try:
            # Binary mode: the checker works on raw bytes, and the context
            # manager guarantees the handle is closed again.
            with open(filename, "rb") as fl:
                pos = check_file(fl)
        except IOError:
            sys.stderr.write(repr(filename) + ": not found.\n")
            continue
        if pos is not None:
            (lp, cp) = pos
            print(repr(filename) + ": failed at Line: " + str(lp) + ", Char: " + str(cp))


if __name__ == "__main__":
    main()