5072: Updated unicode script generation (fixes https://github.com/ddnet/ddnet/issues/5017) r=def- a=Chairn

I also updated the version to 15.0.0. However, confusables are still from version 14 as https://www.unicode.org/Public/security/15.0.0/ is empty.

CC `@heinrich5991` 

## Checklist

- [ ] Tested the change ingame
- [ ] Provided screenshots if it is a visual change
- [ ] Tested in combination with possibly related configuration options
- [ ] Written a unit test if it works standalone, system.c especially
- [ ] Considered possible null pointers and out of bounds array indexing
- [ ] Changed no physics that affect existing maps
- [ ] Tested the change with [ASan+UBSan or valgrind's memcheck](https://github.com/ddnet/ddnet/#using-addresssanitizer--undefinedbehavioursanitizer-or-valgrinds-memcheck) (optional)


Co-authored-by: Chairn <chairn.nq@hotmail.fr>
This commit is contained in:
bors[bot] 2022-05-15 11:17:15 +00:00 committed by GitHub
commit e346e3e186
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 25730 additions and 25299 deletions

View file

@ -1608,8 +1608,10 @@ set_src(BASE GLOB_RECURSE src/base
tl/sorted_array.h
tl/threading.h
unicode/confusables.cpp
unicode/confusables.h
unicode/confusables_data.h
unicode/tolower.cpp
unicode/tolower.h
unicode/tolower_data.h
vmath.h
)

View file

@ -6,7 +6,9 @@ os.chdir(os.path.dirname(__file__) + "/..")
# Source root, relative to the directory the script changes into at startup
# (the parent of the directory containing this script).
PATH = "src/"
# Files listed here are exceptions to whatever check this script performs --
# presumably they are skipped; confirm against the code that reads EXCEPTIONS.
# The unicode entries are generated headers.
EXCEPTIONS = [
"src/base/unicode/confusables.h",
"src/base/unicode/confusables_data.h",
"src/base/unicode/tolower.h",
"src/base/unicode/tolower_data.h",
"src/tools/config_common.h"
]

View file

@ -4,9 +4,11 @@
# - http://www.unicode.org/Public/security/<VERSION>/confusables.txt
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it will generate the contents of the file
# `src/base/unicode/confusables_data.h`.
# If executed as a script, it will generate the contents of the files
# `src/base/unicode/confusables.h` and `src/base/unicode/confusables_data.h`:
#   python3 scripts/generate_unicode_confusables_data.py header > src/base/unicode/confusables.h
#   python3 scripts/generate_unicode_confusables_data.py data > src/base/unicode/confusables_data.h
import sys
import unicode
def generate_decompositions():
@ -48,22 +50,7 @@ def generate_decompositions():
return {c: gen(c) for c in interesting}
def main():
decompositions = generate_decompositions()
# Deduplicate
decomposition_set = sorted(set(tuple(x) for x in decompositions.values()))
len_set = sorted(set(len(x) for x in decomposition_set))
if len(len_set) > 8:
raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")
cur_offset = 0
decomposition_offsets = []
for d in decomposition_set:
decomposition_offsets.append(cur_offset)
cur_offset += len(d)
def gen_header(decompositions, len_set):
print("""\
#include <stdint.h>
@ -80,19 +67,31 @@ struct DECOMP_SLICE
print("};")
print()
print("static const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
print("extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];")
print("extern const int32_t decomp_chars[NUM_DECOMPS];")
print("extern const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS];")
print("extern const int32_t decomp_data[];")
def gen_data(decompositions, decomposition_set, decomposition_offsets, len_set):
print("""\
#ifndef CONFUSABLES_DATA
#error "This file should only be included in `confusables.cpp`"
#endif
""")
print("const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
for l in len_set:
print("\t{},".format(l))
print("};")
print()
print("static const int32_t decomp_chars[NUM_DECOMPS] = {")
print("const int32_t decomp_chars[NUM_DECOMPS] = {")
for k in sorted(decompositions):
print("\t0x{:x},".format(k))
print("};")
print()
print("static const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
print("const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
for k in sorted(decompositions):
d = decompositions[k]
i = decomposition_set.index(tuple(d))
@ -101,11 +100,35 @@ struct DECOMP_SLICE
print("};")
print()
print("static const int32_t decomp_data[] = {")
print("const int32_t decomp_data[] = {")
for d in decomposition_set:
for c in d:
print("\t0x{:x},".format(c))
print("};")
def main():
    """Print the generated confusables header or data file to stdout.

    The output is selected by the presence of ``header`` or ``data`` among
    the command-line arguments; ``header`` takes precedence when both are
    given.

    Raises:
        SystemExit: with status 2 when neither mode is given. Without this
            check the script would print nothing and exit successfully,
            which silently truncates the target file when stdout is
            redirected.
        ValueError: when there are more than 8 distinct decomposition
            lengths, so a length index no longer fits in 3 bits.
    """
    header = "header" in sys.argv
    data = "data" in sys.argv
    if not (header or data):
        # Validate before doing any work so a typo fails fast and loudly.
        print(
            "usage: python3 scripts/generate_unicode_confusables_data.py header|data",
            file=sys.stderr,
        )
        sys.exit(2)

    decompositions = generate_decompositions()

    # Deduplicate
    decomposition_set = sorted({tuple(x) for x in decompositions.values()})
    len_set = sorted({len(x) for x in decomposition_set})

    if len(len_set) > 8:
        raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")

    # Start offset of each unique decomposition when all of them are laid out
    # back to back, in the order of decomposition_set.
    cur_offset = 0
    decomposition_offsets = []
    for d in decomposition_set:
        decomposition_offsets.append(cur_offset)
        cur_offset += len(d)

    if header:
        gen_header(decompositions, len_set)
    else:
        gen_data(decompositions, decomposition_set, decomposition_offsets, len_set)
if __name__ == '__main__':
main()

View file

@ -4,17 +4,17 @@
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it will generate the contents of the file
# `src/base/unicode/tolower_data.h`.
#   python3 scripts/generate_unicode_tolower.py header > src/base/unicode/tolower.h
#   python3 scripts/generate_unicode_tolower.py data > src/base/unicode/tolower_data.h
import sys
import unicode
def generate_cases():
    """Return a list of (upper, lower) pairs for records with a lowercase mapping.

    Every record from ``unicode.data()`` whose ``Simple_Lowercase_Mapping``
    field is non-empty contributes one tuple: its ``Value`` field and its
    ``Simple_Lowercase_Mapping`` field, each passed through
    ``unicode.unhex`` (presumably parsing a hex code point -- confirm in the
    ``unicode`` helper module). Input order is preserved.
    """
    pairs = []
    for record in unicode.data():
        if not record["Simple_Lowercase_Mapping"]:
            # No simple lowercase mapping for this record: nothing to emit.
            continue
        upper = unicode.unhex(record["Value"])
        lower = unicode.unhex(record["Simple_Lowercase_Mapping"])
        pairs.append((upper, lower))
    return pairs
def main():
cases = generate_cases()
def gen_header(cases):
print("""\
#include <stdint.h>
@ -26,13 +26,32 @@ struct UPPER_LOWER
enum
{{
\tNUM_TOLOWER={},
\tNUM_TOLOWER = {},
}};
static const struct UPPER_LOWER tolower[NUM_TOLOWER] = {{""".format(len(cases)))
extern const struct UPPER_LOWER tolowermap[];""".format(len(cases)))
def gen_data(cases):
    """Print the definition of the ``tolowermap`` table to stdout.

    Args:
        cases: iterable of ``(upper_code, lower_code)`` pairs; each becomes
            one ``{upper, lower},`` initializer row, in iteration order.

    The output begins with a guard that makes the compiler reject the file
    unless ``TOLOWER_DATA`` is defined by the including translation unit.
    """
    preamble = """\
#ifndef TOLOWER_DATA
#error "This file must only be included in `tolower.cpp`"
#endif
const struct UPPER_LOWER tolowermap[] = {"""
    rows = ["\t{{{}, {}}},".format(pair[0], pair[1]) for pair in cases]
    # One newline-separated write; print appends the final newline itself.
    print(preamble, *rows, "};", sep="\n")
def main():
    """Print the generated tolower header or data file to stdout.

    The output is selected by the presence of ``header`` or ``data`` among
    the command-line arguments; ``header`` takes precedence when both are
    given.

    Raises:
        SystemExit: with status 2 when neither mode is given. Without this
            check the script would print nothing and exit successfully,
            which silently truncates the target file when stdout is
            redirected.
    """
    header = "header" in sys.argv
    data = "data" in sys.argv
    if not (header or data):
        # Validate before loading the Unicode data so a typo fails fast.
        print(
            "usage: python3 scripts/generate_unicode_tolower.py header|data",
            file=sys.stderr,
        )
        sys.exit(2)

    cases = generate_cases()

    if header:
        gen_header(cases)
    else:
        gen_data(cases)
if __name__ == '__main__':
main()

View file

@ -1 +1 @@
12.0.0
15.0.0

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,18 @@
#include <stdint.h>
// Generated by `scripts/generate_unicode_confusables_data.py header`.
// Locates one decomposition, packed into 16 bits.
struct DECOMP_SLICE
{
// Presumably the start index of the decomposition in decomp_data -- confirm
// against the generator.
uint16_t offset : 13;
// 3 bits: the generator refuses to run with more than 8 distinct
// decomposition lengths. Presumably an index into decomp_lengths rather
// than the length itself -- confirm against the generator.
uint16_t length : 3;
};
enum
{
// Number of entries in decomp_lengths.
NUM_DECOMP_LENGTHS = 8,
// Number of entries in decomp_chars and decomp_slices.
NUM_DECOMPS = 9770,
};
// The tables below are defined in the generated data file, which is meant to
// be included only from `confusables.cpp`.
// Distinct decomposition lengths, in ascending order.
extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];
// Code points that have a decomposition, in ascending order.
extern const int32_t decomp_chars[NUM_DECOMPS];
// One slice per entry of decomp_chars, in the same order.
extern const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS];
// The unique decompositions, concatenated.
extern const int32_t decomp_data[];

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
#include <stdint.h>
// Generated by `scripts/generate_unicode_tolower.py header`.
// One simple lowercase mapping taken from UnicodeData.txt.
struct UPPER_LOWER
{
// Code point that has a simple lowercase mapping.
int32_t upper;
// The code point it maps to.
int32_t lower;
};
enum
{
// Number of mappings the generator found when this header was produced.
NUM_TOLOWER = 1433,
};
// Defined in the generated data file, which is meant to be included only
// from `tolower.cpp`.
extern const struct UPPER_LOWER tolowermap[];

File diff suppressed because it is too large Load diff