# ddnet/scripts/generate_unicode_confusables_data.py
# Needs UnicodeData.txt and confusables.txt in the current directory.
#
# Those can be obtained from unicode.org:
# - http://www.unicode.org/Public/security/<VERSION>/confusables.txt
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
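# If executed as a script, it prints generated C code to stdout. Redirect the
# output into the matching file:
#   python3 scripts/generate_unicode_confusables_data.py header > src/base/unicode/confusables.h
#   python3 scripts/generate_unicode_confusables_data.py source > src/base/unicode/confusables_data.h
# NOTE(review): the header target is presumably `confusables.h` (both lines
# previously named `confusables_data.h`) - confirm against the repository.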
import sys

import unicode
def generate_decompositions():
    """Map every relevant code point to its fully substituted replacement.

    The Unicode data is loaded through the project-local ``unicode`` module
    (``unicode.data()`` and ``unicode.confusables()``). Each code point is
    rewritten by applying the decomposition table and then the confusables
    table, over and over, until a pass changes nothing. Code points from the
    ignored categories are dropped from the final sequence.

    Returns:
        dict: code point -> list of replacement code points, with one entry
        for every code point that is ignored, has a decomposition, or has a
        confusable target.
    """
    ud = unicode.data()
    confusable_rows = unicode.confusables()

    # TODO: Is this correct? They changed the decompositioning format
    nfd = {unicode.unhex(u["Value"]): unicode.unhex_sequence(u["Decomposition_Type"]) for u in ud}
    # Keep only the code points that actually have a decomposition.
    nfd = {k: v for k, v in nfd.items() if v}
    con = {unicode.unhex(c["Value"]): unicode.unhex_sequence(c["Target"]) for c in confusable_rows}

    # C: Control
    # M: Combining
    # Z: Space
    # A tuple of prefixes collects all three categories in one pass over ud.
    ignore = {
        unicode.unhex(u["Value"])
        for u in ud
        if u["General_Category"].startswith(("C", "M", "Z"))
    }

    # Manual additions to the confusables mapping.
    con[0x006C] = [0x0069]  # LATIN SMALL LETTER L -> LATIN SMALL LETTER I
    con[0x00A1] = [0x0069]  # INVERTED EXCLAMATION MARK -> LATIN SMALL LETTER I

    # These map to nothing at all.
    con[0x2800] = []  # BRAILLE PATTERN BLANK
    con[0xFFFC] = []  # OBJECT REPLACEMENT CHARACTER

    interesting = ignore | set(nfd) | set(con)

    def apply(sequence, replacements):
        """Replace each code point that has an entry in ``replacements``."""
        return [d for c in sequence for d in replacements.get(c, [c])]

    def gen(c):
        """Substitute starting from ``[c]`` until the sequence is stable."""
        result = [c]
        while True:
            first = apply(result, nfd)
            second = apply(first, con)
            # Apply substitutions until convergence.
            if result == first and result == second:
                break
            result = second
        return [x for x in result if x not in ignore]

    return {c: gen(c) for c in interesting}
def gen_header(decompositions, len_set):
    """Print the C header part of the generated tables to stdout.

    The output declares ``struct DECOMP_SLICE``, the table-size enum and the
    ``extern`` declarations of the lookup tables.

    Args:
        decompositions: sized collection with one entry per code point; only
            its length is used (emitted as ``NUM_DECOMPS``).
        len_set: sized collection of the distinct decomposition lengths; only
            its length is used (emitted as ``NUM_DECOMP_LENGTHS``).
    """
    lines = [
        "#include <stdint.h>",
        "struct DECOMP_SLICE",
        "{",
        "\tuint16_t offset : 13;",
        "\tuint16_t length : 3;",
        "};",
        "",
        "enum",
        "{",
        "\tNUM_DECOMP_LENGTHS = {},".format(len(len_set)),
        "\tNUM_DECOMPS = {},".format(len(decompositions)),
        "};",
        "",
        "extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];",
        "extern const int32_t decomp_chars[NUM_DECOMPS];",
        "extern const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS];",
        "extern const int32_t decomp_data[];",
    ]
    # One print call; its trailing newline terminates the last declaration.
    print("\n".join(lines))
def gen_source(decompositions, decomposition_set, decomposition_offsets, len_set):
    """Print the C definitions of the lookup tables to stdout.

    Args:
        decompositions: mapping from code point to its decomposition (a
            sequence of code points).
        decomposition_set: sequence of the distinct decompositions as tuples,
            in the order they are written to ``decomp_data``.
        decomposition_offsets: for each entry of ``decomposition_set``, the
            index in ``decomp_data`` where it starts.
        len_set: sequence of the distinct decomposition lengths; each slice
            stores an index into this sequence, not the length itself.

    Raises:
        ValueError: if a decomposition is missing from ``decomposition_set``
            or its length is missing from ``len_set``.
    """
    # Lookup tables replace a linear ``list.index`` scan per code point.
    # ``setdefault`` keeps the first occurrence, which is what ``index`` finds.
    offset_of = {}
    for decomposition, offset in zip(decomposition_set, decomposition_offsets):
        offset_of.setdefault(decomposition, offset)
    length_index_of = {}
    for index, length in enumerate(len_set):
        length_index_of.setdefault(length, index)

    print("#ifndef CONFUSABLES_DATA")
    print('#error "This file should only be included in `confusables.cpp`"')
    print("#endif")
    print()

    print("const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
    for length in len_set:
        print("\t{},".format(length))
    print("};")
    print()

    code_points = sorted(decompositions)

    print("const int32_t decomp_chars[NUM_DECOMPS] = {")
    for k in code_points:
        print("\t0x{:x},".format(k))
    print("};")
    print()

    print("const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
    for k in code_points:
        d = tuple(decompositions[k])
        try:
            offset = offset_of[d]
            length_index = length_index_of[len(d)]
        except KeyError:
            # Same exception type that ``list.index`` raised here before.
            raise ValueError(
                "decomposition of 0x{:x} is missing from the lookup tables".format(k)
            ) from None
        print("\t{{{}, {}}},".format(offset, length_index))
    print("};")
    print()

    print("const int32_t decomp_data[] = {")
    for d in decomposition_set:
        for c in d:
            print("\t0x{:x},".format(c))
    print("};")
def main():
    """Generate the tables and print the part selected on the command line.

    Prints the header part when ``header`` appears in ``sys.argv``, otherwise
    the source part when ``source`` appears. Prints nothing if neither does.

    Raises:
        ValueError: if the tables cannot be packed into ``DECOMP_SLICE``,
            either because there are more than 8 distinct lengths (3-bit
            index) or because a start offset does not fit into 13 bits.
    """
    decompositions = generate_decompositions()

    # Deduplicate
    decomposition_set = sorted({tuple(x) for x in decompositions.values()})
    len_set = sorted({len(x) for x in decomposition_set})

    if len(len_set) > 8:
        raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")

    # Each decomposition starts where the previous one ended in decomp_data.
    cur_offset = 0
    decomposition_offsets = []
    for d in decomposition_set:
        decomposition_offsets.append(cur_offset)
        cur_offset += len(d)

    # Offsets only grow, so the last one is the largest. It has to fit the
    # 13-bit `offset` bit-field, or the generated table would be truncated.
    max_offset = (1 << 13) - 1
    if decomposition_offsets and decomposition_offsets[-1] > max_offset:
        raise ValueError(
            "Can't pack offset {} into 13 bit".format(decomposition_offsets[-1])
        )

    header = "header" in sys.argv
    source = "source" in sys.argv

    if header:
        gen_header(decompositions, len_set)
    elif source:
        gen_source(decompositions, decomposition_set, decomposition_offsets, len_set)
# Only generate output when run as a script, not when imported.
if __name__ == '__main__':
    main()