Here, as an example, is something that "obfuscates" a bit more simply: it just replaces every non-whitespace character inside string literals with a "redaction character".
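At its core that is nothing more than a regular-expression substitution on the text between the quotes. A minimal sketch of just that one step (the string here is only an example value):

Code: Select all
>>> import re
>>> re.sub(r"\S", "█", "Hello, World!")
'██████ ██████'

The complete script below additionally uses the tokenize module, so that only the contents of string literals are touched and everything else in the source is copied through unchanged: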
Code: Select all
#!/usr/bin/env python3
import io
import re
import sys
from tokenize import ENCODING, ERRORTOKEN, STRING, tokenize
STRING_RE = re.compile(
    r"""^([^'"]*('{3}|"{3}|'|"))(.*)(\2)$""", re.MULTILINE | re.DOTALL
)
"""
Regular expression that matches syntactically correct Python string literals,
including possible prefixes (f, r, u, …).
"""


def is_encodable(text, encoding):
    """
    Test if given `text` can be encoded with given `encoding`.
    >>> is_encodable("abc", "ascii")
    True
    >>> is_encodable("█", "ascii")
    False
    >>> is_encodable("█", "cp437")
    True
    >>> is_encodable("█", "utf-8")
    True
    """
    try:
        text.encode(encoding)
    except UnicodeEncodeError:
        return False
    return True


def redact_string(string_representation, redaction_character):
    r"""
    Redact the content of a given Python `string_representation` with the given
    `redaction_character`. Only non-whitespace characters are replaced with the
    character.
    >>> redact_string('"abc def"', "█")
    '"███ ███"'
    >>> redact_string("f'This is an f-string.'", "#")
    "f'#### ## ## #########'"
    >>> redact_string("'''multi\nline\nstring'''", "X")
    "'''XXXXX\nXXXX\nXXXXXX'''"
    """
    return STRING_RE.sub(
        lambda match: (
            match[1] + re.sub(r"\S", redaction_character, match[3]) + match[4]
        ),
        string_representation,
    )


def get_lines(lines, start_position, end_position=(None, None)):
    """
    Get lines from `start_position` to `end_position`, or `start_position` to
    the end of the text if `end_position` is not given.
    Start and end are given as tuples of line number and column number, and the
    first and last line are sliced at the column numbers.
    Line numbers start at 1 and column numbers at 0!
    >>> lines = ["first", "second", "third", "fourth"]
    >>> get_lines(lines, (2, 1), (4, 3))
    ['econd', 'third', 'fou']
    >>> get_lines(lines, (3, 2))
    ['ird', 'fourth']
    """
    start_line, start_column = start_position
    end_line, end_column = end_position
    result = lines[slice(start_line - 1, end_line)]
    if len(result) == 1:
        return [result[0][slice(start_column, end_column)]]
    else:
        result[0] = result[0][slice(start_column, None)]
        result[-1] = result[-1][slice(None, end_column)]
        return result


def analyze(source_bytes):
    """
    Get encoding and string tokens from given Python source.
    >>> analyze(b'print("Hello, World!")')
    ('utf-8', [TokenInfo(type=3 (STRING), string='"Hello, World!"', start=(1, 6), end=(1, 21), line='print("Hello, World!")')])
    """
    tokens = tokenize(io.BytesIO(source_bytes).readline)
    encoding_token = next(tokens)
    if encoding_token.type != ENCODING:
        raise ValueError(f"expected encoding, got {encoding_token!r}")
    encoding = encoding_token.string
    string_tokens = []
    for token in tokens:
        if token.type == ERRORTOKEN:
            raise SyntaxError(f"{token.string} in line {token.start[0]}")
        if token.type == STRING:
            string_tokens.append(token)
    return encoding, string_tokens


def redact_strings(source_bytes, encoding, string_tokens):
    r"""
    Redact the `string_tokens` in given `source_bytes`. `encoding` is used to
    decode the input and encode the output.
    >>> source = b'''\
    ... # coding: ascii
    ... print("Hello, World!")
    ... '''
    >>> encoding, string_tokens = analyze(source)
    >>> redact_strings(source, encoding, string_tokens)
    b'# coding: ascii\nprint("XXXXXX XXXXXX")\n'
    """
    redaction_character = "█" if is_encodable("█", encoding) else "X"
    source_lines = source_bytes.decode(encoding).splitlines(keepends=True)
    result = []
    start_position = 1, 0
    for token in string_tokens:
        end_position = token.start
        #
        # Copy source code before the current string token as is.
        #
        result.extend(get_lines(source_lines, start_position, end_position))
        result.append(redact_string(token.string, redaction_character))
        start_position = token.end
    #
    # Copy source code after last string token as is.
    #
    result.extend(get_lines(source_lines, start_position))
    return "".join(result).encode(encoding)


def main():
    source_bytes = sys.stdin.buffer.read()
    encoding, string_tokens = analyze(source_bytes)
    sys.stdout.buffer.write(
        redact_strings(source_bytes, encoding, string_tokens)
    )


if __name__ == "__main__":
    main()
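The script reads Python source from standard input and writes the redacted version to standard output, so it can be used as a simple filter; the doctests in the docstrings can be checked with python3 -m doctest. A small usage sketch, assuming the script has been saved as redact.py (the file name is just an example):

Code: Select all
$ echo 'print("Hello, World!")' | python3 redact.py
print("██████ ██████")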
And the whole thing applied to itself:
Code: Select all
#!/usr/bin/env python3
import io
import re
import sys
from tokenize import ENCODING, ERRORTOKEN, STRING, tokenize
STRING_RE = re.compile(
    r"""█████████████████████████████████""", re.MULTILINE | re.DOTALL
)
"""
███████ ██████████ ████ ███████ █████████████ ███████ ██████ ██████ █████████
█████████ ████████ ████████ ███ ██ ██ ███
"""


def is_encodable(text, encoding):
    """
    ████ ██ █████ ██████ ███ ██ ███████ ████ █████ ███████████
    ███ ███████████████████ ████████
    ████
    ███ █████████████████ ████████
    █████
    ███ █████████████████ ████████
    ████
    ███ █████████████████ ████████
    ████
    """
    try:
        text.encode(encoding)
    except UnicodeEncodeError:
        return False
    return True


def redact_string(string_representation, redaction_character):
    r"""
    ██████ ███ ███████ ██ █ █████ ██████ ███████████████████████ ████ ███ █████
    ██████████████████████ ████ ██████████████ ██████████ ███ ████████ ████ ███
    ██████████
    ███ ███████████████████ ██████ ████
    █████ █████
    ███ █████████████████████ ██ ██ ████████████ ████
    ███████ ██ ██ ███████████
    ███ ██████████████████████████████████████████ ████
    ███████████████████████████
    """
    return STRING_RE.sub(
        lambda match: (
            match[1] + re.sub(r"██", redaction_character, match[3]) + match[4]
        ),
        string_representation,
    )


def get_lines(lines, start_position, end_position=(None, None)):
    """
    ███ █████ ████ ████████████████ ██ ███████████████ ██ ████████████████ ██
    ███ ███ ██ ███ ████ ██ ██████████████ ██ ███ ██████
    █████ ███ ███ ███ █████ ██ ██████ ██ ████ ██████ ███ ██████ ███████ ███ ███
    █████ ███ ████ ████ ███ ██████ ██ ███ ██████ ████████
    ████ ███████ █████ ██ █ ███ ██████ ███████ ██ ██
    ███ █████ █ █████████ █████████ ████████ █████████
    ███ ████████████████ ███ ███ ███ ███
    █████████ ████████ ██████
    ███ ████████████████ ███ ███
    ███████ █████████
    """
    start_line, start_column = start_position
    end_line, end_column = end_position
    result = lines[slice(start_line - 1, end_line)]
    if len(result) == 1:
        return [result[0][slice(start_column, end_column)]]
    else:
        result[0] = result[0][slice(start_column, None)]
        result[-1] = result[-1][slice(None, end_column)]
        return result


def analyze(source_bytes):
    """
    ███ ████████ ███ ██████ ██████ ████ █████ ██████ ███████
    ███ ███████████████████████ ██████████
    █████████ █████████████████ █████████ ███████████████ █████████ █████████ ███ ███████ ████ ███████████████████ ████████████
    """
    tokens = tokenize(io.BytesIO(source_bytes).readline)
    encoding_token = next(tokens)
    if encoding_token.type != ENCODING:
        raise ValueError(f"████████ █████████ ███ ██████████████████")
    encoding = encoding_token.string
    string_tokens = []
    for token in tokens:
        if token.type == ERRORTOKEN:
            raise SyntaxError(f"██████████████ ██ ████ ████████████████")
        if token.type == STRING:
            string_tokens.append(token)
    return encoding, string_tokens


def redact_strings(source_bytes, encoding, string_tokens):
    r"""
    ██████ ███ ███████████████ ██ █████ ███████████████ ██████████ ██ ████ ██
    ██████ ███ █████ ███ ██████ ███ ███████
    ███ ██████ █ █████
    ███ █ ███████ █████
    ███ █████████████ ████████
    ███ ███
    ███ █████████ █████████████ █ ███████████████
    ███ ██████████████████████ █████████ ██████████████
    ███ ███████ ████████████████████ ███████████
    """
    redaction_character = "█" if is_encodable("█", encoding) else "█"
    source_lines = source_bytes.decode(encoding).splitlines(keepends=True)
    result = []
    start_position = 1, 0
    for token in string_tokens:
        end_position = token.start
        #
        # Copy source code before the current string token as is.
        #
        result.extend(get_lines(source_lines, start_position, end_position))
        result.append(redact_string(token.string, redaction_character))
        start_position = token.end
    #
    # Copy source code after last string token as is.
    #
    result.extend(get_lines(source_lines, start_position))
    return "".join(result).encode(encoding)


def main():
    source_bytes = sys.stdin.buffer.read()
    encoding, string_tokens = analyze(source_bytes)
    sys.stdout.buffer.write(
        redact_strings(source_bytes, encoding, string_tokens)
    )


if __name__ == "████████":
    main()