Sanitize string unpack by default the same way tw does it

Sadly, this probably comes with some hefty performance implications.
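
A minimal before/after sketch of the new default (byte values chosen for illustration, behaviour as covered by the tests below):

from twnet_parser.packer import Unpacker, NO_SANITIZE

u = Unpacker(b'foo\x01bar\x00')
assert u.get_str() == 'foo bar'  # \x01 is replaced with a space by default
u = Unpacker(b'foo\x01bar\x00')
assert u.get_str(NO_SANITIZE) == 'foo\x01bar'  # pass NO_SANITIZE to keep the raw bytes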
This commit is contained in:
ChillerDragon 2023-04-07 12:45:09 +02:00
parent 1a864d903c
commit a9ba5bd703
2 changed files with 131 additions and 4 deletions


@@ -1,4 +1,6 @@
from twnet_parser.packer import *
from twnet_parser.packer import Unpacker, pack_int, pack_str
from twnet_parser.packer import \
NO_SANITIZE, SANITIZE, SANITIZE_CC, SKIP_START_WHITESPACES
def test_unpack_ints_and_strings() -> None:
u = Unpacker(b'\x01\x02\x03\x01foo\x00bar\x00')
@@ -25,6 +27,101 @@ def test_non_ascii_repack() -> None:
u = Unpacker(data)
assert u.get_str() == '💩'
def test_string_sanitize_should_not_affect_plain_ascii():
u = Unpacker(b'\x41\x41\x00\x42\x42\x00\x42\x42\x00\x42\x42\x00')
assert u.get_str(NO_SANITIZE) == 'AA'
assert u.get_str(SANITIZE) == 'BB'
assert u.get_str(SANITIZE_CC) == 'BB'
assert u.get_str(SKIP_START_WHITESPACES) == 'BB'
assert u.get_str() == ''
def test_string_no_sanitize_keep_space():
u = Unpacker(b'\x20\x41\x41\x00')
assert u.get_str(NO_SANITIZE) == ' AA'
def test_string_no_sanitize_should_not_crash_on_invalid_utf8():
u = Unpacker(b'\x80\n\x41\x41\n\x00')
assert u.get_str(NO_SANITIZE) == '\nAA\n'
def test_string_no_sanitize_keep_newline():
u = Unpacker(b'\n\x41\x41\n\x00')
assert u.get_str(NO_SANITIZE) == '\nAA\n'
def test_string_no_sanitize_keep_r_and_t():
u = Unpacker(b'\r\x41\x41\t\x00')
assert u.get_str(NO_SANITIZE) == '\rAA\t'
def test_string_sanitize_keep_r_and_t():
u = Unpacker(b'\r\x41\x41\t\x00')
assert u.get_str(SANITIZE) == '\rAA\t'
def test_string_sanitize_by_default_keep_r_and_t():
u = Unpacker(b'\r\x41\x41\t\x00')
assert u.get_str() == '\rAA\t'
def test_string_sanitize_strip_x01_x02():
u = Unpacker(b'\x41\x01\x02\x41\x00')
assert u.get_str(SANITIZE) == 'A  A'  # one space per stripped byte
def test_string_sanitize_cc_strip_r_and_t():
u = Unpacker(b'\x41\x01\x02\x41\x00')
assert u.get_str(SANITIZE_CC) == 'A  A'  # one space per stripped byte
u = Unpacker(b'\r\x41\x41\t\x00')
assert u.get_str(SANITIZE_CC) == ' AA '
u = Unpacker(b'\r\t\r\r\r\x00')
assert u.get_str(SANITIZE_CC) == '     '  # five spaces, one per control byte
def test_string_sanitize_cc_strip_x01_x02():
u = Unpacker(b'\x01\x02\x00')
assert u.get_str(SANITIZE_CC) == '  '  # two spaces, one per control byte
u = Unpacker(b'foo\x01bar\x02\x01baz\x00')
assert u.get_str(SANITIZE_CC) == 'foo bar  baz'  # \x02\x01 become two spaces
# TODO: should we strip that? I do not think so, since tw is full of unicode
# only single bytes lower than 32 are stripped
# multi byte unicode where every byte is above 32 should be fine
# def test_string_sanitize_cc_strip_poop_emoji():
# u = Unpacker('💩'.encode('utf-8') + b'\x00')
# assert u.get_str(SANITIZE_CC) == len('💩'.encode('utf-8')) * ' '
# u = Unpacker(b'foo' + '💩'.encode('utf-8') + b'bar\x00')
# assert u.get_str(SANITIZE_CC) == f"foo{len('💩'.encode('utf-8')) * ' '}bar"
def test_string_sanitize_cc_keep_poop_emoji():
u = Unpacker('💩'.encode('utf-8') + b'\x00')
assert u.get_str(SANITIZE_CC) == '💩'
u = Unpacker(b'foo' + '💩'.encode('utf-8') + b'bar\x00')
assert u.get_str(SANITIZE_CC) == 'foo💩bar'
def test_string_sanitize_keep_poop_emoji():
u = Unpacker('💩'.encode('utf-8') + b'\x00')
assert u.get_str(SANITIZE) == '💩'
u = Unpacker(b'foo' + '💩'.encode('utf-8') + b'bar\x00')
assert u.get_str(SANITIZE) == 'foo💩bar'
def test_string_no_sanitize_keep_poop_emoji():
u = Unpacker('💩'.encode('utf-8') + b'\x00')
assert u.get_str(NO_SANITIZE) == '💩'
u = Unpacker(b'foo' + '💩'.encode('utf-8') + b'bar\x00')
assert u.get_str(NO_SANITIZE) == 'foo💩bar'
def test_string_skip_start_whitespaces_strip_leading_spaces():
u = Unpacker(b' \x00')
assert u.get_str(SKIP_START_WHITESPACES) == ''
u = Unpacker(b'\t \t \t foo bar \t baz\x00')
assert u.get_str(SKIP_START_WHITESPACES) == 'foo bar \t baz'
# TODO: check the tw code to see what this should do
# def test_string_skip_start_whitespaces_strip_x01_x02():
# """
# https://chillerdragon.github.io/teeworlds-protocol/07/fundamentals.html#string_packing
#
# UB in this spec :shrug:
# """
# u = Unpacker(b'\x01\x02\x00')
# assert u.get_str(SKIP_START_WHITESPACES) == ''
# u = Unpacker(b'\tfoo\x01bar\x02\x01baz\x00')
# assert u.get_str(SKIP_START_WHITESPACES) == 'foo bar baz'
def test_raw_repack_at_end() -> None:
data: bytes = b''
data += pack_int(1)


@@ -1,5 +1,7 @@
#!/usr/bin/env python
from typing import Literal, Final
# Before changing the current packer code to extend it
# Consider having two packers
#
@@ -13,6 +15,11 @@
# and is attached to a class instance that
# keeps track of a state
NO_SANITIZE: Final[int] = 0
SANITIZE: Final[int] = 1
SANITIZE_CC: Final[int] = 2
SKIP_START_WHITESPACES: Final[int] = 3
class Unpacker():
def __init__(self, data: bytes) -> None:
self._data = data
@@ -64,12 +71,35 @@ class Unpacker():
res ^= -sign
return res
def get_str(self) -> str:
# TODO: optimize performance
# I am highly confident iterating byte by byte is very
# expensive in python
# and something as common as byte filtering has to have
# a fast alternative
#
# If there is nothing from the python standard
# this might be worth writing in Cython
# external C or rust
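# One candidate from the python standard library (untested sketch,
# the table name _SANITIZE_TABLE is made up): bytes.translate with a
# 256 byte table built once via bytes.maketrans moves the filtering
# into C, e.g. for the SANITIZE case:
#   _CTRL = bytes(x for x in range(32) if x not in (9, 10, 13))
#   _SANITIZE_TABLE = bytes.maketrans(_CTRL, b' ' * len(_CTRL))
#   return res.translate(_SANITIZE_TABLE).decode('utf-8', 'ignore')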
def get_str(self, sanitize: Literal[0,1,2,3] = 1) -> str:
str_end: int = self.data().find(b'\x00')
res: bytes = self.data()[:str_end]
self.idx += str_end + 1
# TODO: add sanitize and sanitize cc
return res.decode('utf-8')
if sanitize == NO_SANITIZE:
return res.decode('utf-8', 'ignore')
elif sanitize == SANITIZE:
# keep \t, \n and \r but replace every other byte below 32 with a space
return bytes([x if x > 32 or x in (9, 10, 13) else 32 for x in res]).decode('utf-8', 'ignore')
elif sanitize == SANITIZE_CC:
# replace every byte below 32 with a space
return bytes([x if x > 32 else 32 for x in res]).decode('utf-8', 'ignore')
elif sanitize == SKIP_START_WHITESPACES:
return res.decode('utf-8').lstrip()
else:
raise ValueError(f"Error: invalid sanitize mode {sanitize}")
# TODO: optimize performance and benchmark in tests
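# e.g. an untested timeit sketch (payload made up for illustration):
#   import timeit
#   payload = b'A' * 64 + b'\x01' + b'B' * 64 + b'\x00'
#   print(timeit.timeit(lambda: Unpacker(payload).get_str(), number=100_000))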
def pack_int(num: int) -> bytes: