Updated unicode script generation

This commit is contained in:
Chairn 2022-05-06 20:31:24 +02:00
parent e14fc102a6
commit 5925181acc
9 changed files with 25702 additions and 25274 deletions

View file

@ -4,10 +4,12 @@
# - http://www.unicode.org/Public/security/<VERSION>/confusables.txt
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it will generate the contents of the file
# `src/base/unicode/confusables_data.h`.
# If executed as a script, it will generate the contents of the files
# python3 scripts/generate_unicode_confusables_data.py header > `src/base/unicode/confusables_data.h`,
# python3 scripts/generate_unicode_confusables_data.py source > `src/base/unicode/confusables_data.c`.
import unicode
import sys
def generate_decompositions():
ud = unicode.data()
@ -48,22 +50,7 @@ def generate_decompositions():
return {c: gen(c) for c in interesting}
def main():
decompositions = generate_decompositions()
# Deduplicate
decomposition_set = sorted(set(tuple(x) for x in decompositions.values()))
len_set = sorted(set(len(x) for x in decomposition_set))
if len(len_set) > 8:
raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")
cur_offset = 0
decomposition_offsets = []
for d in decomposition_set:
decomposition_offsets.append(cur_offset)
cur_offset += len(d)
def gen_header(decompositions, len_set):
print("""\
#include <stdint.h>
@ -80,19 +67,31 @@ struct DECOMP_SLICE
print("};")
print()
print("static const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
print("extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];")
print("extern const int32_t decomp_chars[NUM_DECOMPS];")
print("extern const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS];")
print("extern const int32_t decomp_data[];")
def gen_source(decompositions, decomposition_set, decomposition_offsets, len_set):
print("""\
#ifndef CONFUSABLES_DATA
#error "This file should only be included in `confusables.cpp`"
#endif
""")
print("const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
for l in len_set:
print("\t{},".format(l))
print("};")
print()
print("static const int32_t decomp_chars[NUM_DECOMPS] = {")
print("const int32_t decomp_chars[NUM_DECOMPS] = {")
for k in sorted(decompositions):
print("\t0x{:x},".format(k))
print("};")
print()
print("static const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
print("const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
for k in sorted(decompositions):
d = decompositions[k]
i = decomposition_set.index(tuple(d))
@ -101,11 +100,36 @@ struct DECOMP_SLICE
print("};")
print()
print("static const int32_t decomp_data[] = {")
print("const int32_t decomp_data[] = {")
for d in decomposition_set:
for c in d:
print("\t0x{:x},".format(c))
print("};")
def main():
    """Print the generated confusables header or source to stdout.

    The output is chosen by the presence of the word ``header`` or
    ``source`` among the command-line arguments; ``header`` takes
    precedence when both are present.

    Raises:
        SystemExit: with a usage message when neither mode is given.
        ValueError: when there are more than 8 distinct decomposition
            lengths.
    """
    header = "header" in sys.argv
    source = "source" in sys.argv
    if not (header or source):
        # Without a mode nothing would be printed, which leaves an empty
        # file behind when stdout is redirected. Fail loudly, and do so
        # before the (comparatively slow) data generation.
        sys.exit("usage: generate_unicode_confusables_data.py header|source")

    decompositions = generate_decompositions()

    # Deduplicate
    decomposition_set = sorted(set(tuple(x) for x in decompositions.values()))
    len_set = sorted(set(len(x) for x in decomposition_set))

    # Per the error message below, the length is presumably packed into
    # 3 bits, hence the cap of 8 distinct lengths.
    if len(len_set) > 8:
        raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")

    if header:
        gen_header(decompositions, len_set)
        return

    # Start position of each decomposition when all of them are laid out
    # back to back, in decomposition_set order.
    cur_offset = 0
    decomposition_offsets = []
    for d in decomposition_set:
        decomposition_offsets.append(cur_offset)
        cur_offset += len(d)

    gen_source(decompositions, decomposition_set, decomposition_offsets, len_set)
if __name__ == '__main__':
main()

View file

@ -4,17 +4,17 @@
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it will generate the contents of the file
# `src/base/unicode/tolower_data.h`.
# python3 scripts/generate_unicode_tolower.py header > `src/base/unicode/tolower_data.h`,
# python3 scripts/generate_unicode_tolower.py source > `src/base/unicode/tolower_data.c`.
import unicode
import sys
def generate_cases():
    """Collect the simple lowercase mappings from the Unicode data.

    Returns a list of ``(value, lowercase)`` tuples, one for every record
    of ``unicode.data()`` whose ``Simple_Lowercase_Mapping`` field is
    non-empty. Both fields are converted with ``unicode.unhex``.
    """
    pairs = []
    for record in unicode.data():
        # Records without a lowercase mapping are skipped.
        if not record["Simple_Lowercase_Mapping"]:
            continue
        upper_code = unicode.unhex(record["Value"])
        lower_code = unicode.unhex(record["Simple_Lowercase_Mapping"])
        pairs.append((upper_code, lower_code))
    return pairs
def main():
cases = generate_cases()
def gen_header(cases):
print("""\
#include <stdint.h>
@ -26,13 +26,32 @@ struct UPPER_LOWER
enum
{{
\tNUM_TOLOWER={},
\tNUM_TOLOWER = {},
}};
static const struct UPPER_LOWER tolower[NUM_TOLOWER] = {{""".format(len(cases)))
extern const struct UPPER_LOWER tolowermap[];""".format(len(cases)))
def gen_source(cases):
    """Print the C definition of the ``tolowermap`` table to stdout.

    Args:
        cases: iterable of ``(upper_code, lower_code)`` pairs; each pair
            becomes one ``{upper, lower},`` initializer row.
    """
    preamble = """\
#ifndef TOLOWER_DATA
#error "This file must only be included in `tolower.cpp`"
#endif
const struct UPPER_LOWER tolowermap[] = {"""
    rows = ["\t{{{}, {}}},".format(upper, lower) for upper, lower in cases]
    # Guard + array opening, one row per mapping, then the closing brace.
    print("\n".join([preamble, *rows, "};"]))
def main():
    """Print the generated tolower header or source to stdout.

    The output is chosen by the presence of the word ``header`` or
    ``source`` among the command-line arguments; ``header`` takes
    precedence when both are present.

    Raises:
        SystemExit: with a usage message when neither mode is given.
    """
    header = "header" in sys.argv
    source = "source" in sys.argv
    if not (header or source):
        # Without a mode nothing would be printed, which leaves an empty
        # file behind when stdout is redirected. Fail loudly, and do so
        # before reading the Unicode data.
        sys.exit("usage: generate_unicode_tolower.py header|source")

    cases = generate_cases()
    if header:
        gen_header(cases)
    else:
        gen_source(cases)
if __name__ == '__main__':
main()

View file

@ -1 +1 @@
12.0.0
15.0.0

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ struct DECOMP_SLICE
enum
{
NUM_DECOMP_LENGTHS = 8,
NUM_DECOMPS = 9606,
NUM_DECOMPS = 9770,
};
extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -8,7 +8,7 @@ struct UPPER_LOWER
enum
{
NUM_TOLOWER = 1390,
NUM_TOLOWER = 1433,
};
extern const struct UPPER_LOWER tolowermap[];