Merge pull request #1423 from ddnet/utf8_nocase

UTF8 nocase compare & use for chat TAB completion
This commit is contained in:
Dennis Felsing 2019-01-09 08:54:25 +01:00 committed by GitHub
commit 71ec8cc294
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 7252 additions and 5544 deletions

2
.gitignore vendored
View file

@ -40,7 +40,6 @@ DDNet-Server
DDNet-Server-Launcher
config_retrieve
config_store
confusables
crapnet
dilate
dummy_map
@ -58,6 +57,7 @@ tileset_borderfix
tileset_borderrem
tileset_borderset
twping
unicode_confusables
uuid
versionsrv

View file

@ -565,8 +565,6 @@ generate_source("src/game/generated/server_data.h" "server_content_header")
# Sources
set_glob(BASE GLOB_RECURSE src/base
color.h
confusables.c
confusables_data.h
detect.h
hash.c
hash.h
@ -584,6 +582,10 @@ set_glob(BASE GLOB_RECURSE src/base
tl/sorted_array.h
tl/string.h
tl/threading.h
unicode/confusables.c
unicode/confusables_data.h
unicode/tolower.c
unicode/tolower_data.h
vmath.h
)
set_glob(ENGINE_INTERFACE GLOB src/engine
@ -1094,7 +1096,6 @@ set_glob(TOOLS GLOB src/tools
config_common.h
config_retrieve.cpp
config_store.cpp
confusables.cpp
crapnet.cpp
dilate.cpp
dummy_map.cpp
@ -1108,6 +1109,7 @@ set_glob(TOOLS GLOB src/tools
tileset_borderfix.cpp
tileset_borderrem.cpp
tileset_borderset.cpp
unicode_confusables.cpp
uuid.cpp
)
foreach(ABS_T ${TOOLS})

View file

@ -6,7 +6,8 @@ os.chdir(os.path.dirname(__file__) + "/..")
PATH = "src/"
EXCEPTIONS = [
"src/base/confusables_data.h",
"src/base/unicode/confusables_data.h",
"src/base/unicode/tolower_data.h",
"src/tools/config_common.h"
]

View file

@ -5,51 +5,20 @@
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it will generate the contents of the file
# `src/base/confusables_data.h`.
# `src/base/unicode/confusables_data.h`.
import csv
def confusables():
with open('confusables.txt', encoding='utf-8-sig') as f:
# Filter comments
f = map(lambda line: line.split('#')[0], f)
return list(csv.DictReader(f, fieldnames=['Value', 'Target', 'Category'], delimiter=';'))
UNICODEDATA_FIELDS = (
"Value",
"Name",
"General_Category",
"Canonical_Combining_Class",
"Bidi_Class",
"Decomposition",
"Numeric",
"Bidi_Mirrored",
"Unicode_1_Name",
"ISO_Comment",
"Simple_Uppercase_Mapping",
"Simple_Lowercase_Mapping",
"Simple_Titlecase_Mapping",
)
def unicodedata():
with open('UnicodeData.txt') as f:
return list(csv.DictReader(f, fieldnames=UNICODEDATA_FIELDS, delimiter=';'))
def unhex(s):
return int(s, 16)
def unhex_sequence(s):
return [unhex(x) for x in s.split()] if '<' not in s else None
import unicode
def generate_decompositions():
ud = unicodedata()
con = confusables()
ud = unicode.data()
con = unicode.confusables()
category = lambda x: {unhex(u["Value"]) for u in ud if u["General_Category"].startswith(x)}
category = lambda x: {unicode.unhex(u["Value"]) for u in ud if u["General_Category"].startswith(x)}
nfd = {unhex(u["Value"]): unhex_sequence(u["Decomposition"]) for u in ud}
# TODO: Is this correct? They changed the decompositioning format
nfd = {unicode.unhex(u["Value"]): unicode.unhex_sequence(u["Decomposition_Type"]) for u in ud}
nfd = {k: v for k, v in nfd.items() if v}
con = {unhex(c["Value"]): unhex_sequence(c["Target"]) for c in con}
con = {unicode.unhex(c["Value"]): unicode.unhex_sequence(c["Target"]) for c in con}
# C: Control
# M: Combining

View file

@ -0,0 +1,38 @@
# Needs UnicodeData.txt in the current directory.
#
# It can be obtained from unicode.org:
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it will generate the contents of the file
# `src/base/unicode/tolower_data.h`.
import unicode
def generate_cases():
    """Collect (codepoint, lowercase codepoint) pairs from the Unicode data.

    Only records whose "Simple_Lowercase_Mapping" column is non-empty are
    included; both values are converted from hex strings to integers.
    """
    cases = []
    for record in unicode.data():
        lower = record["Simple_Lowercase_Mapping"]
        if lower:
            cases.append((unicode.unhex(record["Value"]), unicode.unhex(lower)))
    return cases
def main():
    """Print the generated C header (struct, count enum and table) to stdout."""
    mappings = generate_cases()
    # Doubled braces are literal braces for str.format; the single {} receives
    # the number of table entries.
    header = """\
#include <stdint.h>
struct UPPER_LOWER
{{
\tint32_t upper;
\tint32_t lower;
}};
enum
{{
\tNUM_TOLOWER={},
}};
static const struct UPPER_LOWER tolower[NUM_TOLOWER] = {{"""
    print(header.format(len(mappings)))
    # One initializer row per (upper, lower) pair, in the order produced by
    # generate_cases().
    for upper, lower in mappings:
        print("\t{{{}, {}}},".format(upper, lower))
    print("};")
if __name__ == '__main__':
main()

35
scripts/unicode.py Normal file
View file

@ -0,0 +1,35 @@
import csv
def confusables():
    """Read `confusables.txt` from the current directory.

    Everything after the first '#' on each line is dropped before parsing.
    Returns a list of dicts with the keys 'Value', 'Target' and 'Category',
    one per semicolon-separated record.
    """
    with open('confusables.txt', encoding='utf-8-sig') as source:
        # Strip trailing comments lazily; the reader must be consumed while
        # the file is still open.
        uncommented = (row.split('#')[0] for row in source)
        reader = csv.DictReader(uncommented, fieldnames=['Value', 'Target', 'Category'], delimiter=';')
        return list(reader)
# Column names applied, in order, to the semicolon-separated fields of each
# UnicodeData.txt record (passed as csv.DictReader fieldnames in data()).
# There are 15 names, so the three case-mapping columns are the last three.
#
# NOTE(review): UAX #44 appears to describe field 5 as the decomposition type
# tag and mapping combined, and fields 6-8 as three separate numeric fields.
# If that is right, the names given to columns 5-8 below do not describe what
# those columns hold (column 5 would contain the full decomposition, which is
# what the confusables generator reads via "Decomposition_Type"). Confirm
# against the UCD documentation before relying on columns 6-8.
UNICODEDATA_FIELDS = (
"Value",
"Name",
"General_Category",
"Canonical_Combining_Class",
"Bidi_Class",
"Decomposition_Type",
"Decomposition_Mapping",
"Numeric_Type",
"Numeric_Mapping",
"Bidi_Mirrored",
"Unicode_1_Name",
"ISO_Comment",
"Simple_Uppercase_Mapping",
"Simple_Lowercase_Mapping",
"Simple_Titlecase_Mapping",
)
def data():
    """Read `UnicodeData.txt` from the current directory.

    Returns a list of dicts, one per record, keyed by UNICODEDATA_FIELDS.
    """
    with open('UnicodeData.txt') as source:
        reader = csv.DictReader(source, fieldnames=UNICODEDATA_FIELDS, delimiter=';')
        return list(reader)
def unhex(s):
    """Parse the hexadecimal string `s` into an integer."""
    return int(s, base=16)
def unhex_sequence(s):
    """Parse a whitespace-separated list of hex values into a list of ints.

    Returns None when `s` contains a '<' anywhere; an empty or blank `s`
    yields an empty list.
    """
    if '<' in s:
        return None
    return [unhex(part) for part in s.split()]

View file

@ -2367,7 +2367,7 @@ int str_comp_nocase(const char *a, const char *b)
#endif
}
int str_comp_nocase_num(const char *a, const char *b, const int num)
int str_comp_nocase_num(const char *a, const char *b, int num)
{
#if defined(CONF_FAMILY_WINDOWS)
return _strnicmp(a, b, num);
@ -2381,7 +2381,7 @@ int str_comp(const char *a, const char *b)
return strcmp(a, b);
}
int str_comp_num(const char *a, const char *b, const int num)
int str_comp_num(const char *a, const char *b, int num)
{
	/* thin wrapper: byte-wise comparison of at most num characters via strncmp */
	const int order = strncmp(a, b, num);
	return order;
}
@ -2721,6 +2721,63 @@ int str_toint(const char *str) { return atoi(str); }
/* Parses str as an integer in the given base using strtol; the long result is narrowed to int. */
int str_toint_base(const char *str, int base)
{
	return strtol(str, NULL, base);
}
/* Parses str as a floating point number using atof; the double result is narrowed to float. */
float str_tofloat(const char *str)
{
	return atof(str);
}
int str_utf8_comp_nocase(const char *a, const char *b)
{
	/* walk both strings one decoded codepoint at a time */
	while(*a != '\0' && *b != '\0')
	{
		int lower_a = str_utf8_tolower(str_utf8_decode(&a));
		int lower_b = str_utf8_tolower(str_utf8_decode(&b));
		if(lower_a != lower_b)
			return lower_a - lower_b;
	}
	/* at least one string has ended: compare the bytes where the walk stopped,
	   as unsigned so that any remaining byte ranks above the terminator */
	return (unsigned char)*a - (unsigned char)*b;
}
int str_utf8_comp_nocase_num(const char *a, const char *b, int num)
{
	/*
		num is measured in bytes consumed from a, not in codepoints. A codepoint
		that starts before the limit is always compared in full.

		NOTE(review): the limit is only tested at the top of an iteration. If a
		ends at or after num bytes while b continues, the loop leaves through
		*a == 0 and the final byte comparison reports a difference even though
		the first num bytes matched. The existing unit test depends on this.
	*/
	const char *start = a;
	for(;;)
	{
		int lower_a;
		int lower_b;
		if(*a == '\0' || *b == '\0')
			break;
		if(a - start >= num)
			return 0;
		lower_a = str_utf8_tolower(str_utf8_decode(&a));
		lower_b = str_utf8_tolower(str_utf8_decode(&b));
		if(lower_a != lower_b)
			return lower_a - lower_b;
	}
	/* one string ended before the limit was detected: compare the stop bytes as unsigned */
	return (unsigned char)*a - (unsigned char)*b;
}
const char *str_utf8_find_nocase(const char *haystack, const char *needle)
{
	/* naive implementation: try to match the needle at every codepoint of the haystack */
	for(; *haystack; str_utf8_decode(&haystack))
	{
		const char *hay_pos = haystack;
		const char *needle_pos = needle;
		for(;;)
		{
			/* decode into look-ahead cursors so a mismatch leaves the positions untouched */
			const char *hay_next = hay_pos;
			const char *needle_next = needle_pos;
			if(!*hay_pos || !*needle_pos)
				break;
			if(str_utf8_tolower(str_utf8_decode(&hay_next)) != str_utf8_tolower(str_utf8_decode(&needle_next)))
				break;
			hay_pos = hay_next;
			needle_pos = needle_next;
		}
		/* the whole needle was consumed, so it matches at this haystack position */
		if(*needle_pos == 0)
			return haystack;
	}
	return 0;
}
int str_utf8_isspace(int code)
{

View file

@ -159,7 +159,7 @@ void mem_zero(void *block, unsigned size);
size - Size of the data to compare
Returns:
<0 - Block a is lesser than block b
<0 - Block a is less than block b
0 - Block a is equal to block b
>0 - Block a is greater than block b
*/
@ -676,7 +676,7 @@ int net_host_lookup(const char *hostname, NETADDR *addr, int types);
b - Address to compare to.
Returns:
<0 - Address a is lesser than address b
<0 - Address a is less than address b
0 - Address a is equal to address b
>0 - Address a is greater than address b
*/
@ -691,7 +691,7 @@ int net_addr_comp(const NETADDR *a, const NETADDR *b);
b - Address to compare to.
Returns:
<0 - Address a is lesser than address b
<0 - Address a is less than address b
0 - Address a is equal to address b
>0 - Address a is greater than address b
*/
@ -1114,26 +1114,26 @@ char *str_skip_whitespaces(char *str);
/*
Function: str_comp_nocase
Compares to strings case insensitive.
Compares two strings case insensitively.
Parameters:
a - String to compare.
b - String to compare.
Returns:
<0 - String a is lesser than string b
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
Remarks:
- Only garanted to work with a-z/A-Z.
- Only guaranteed to work with a-z/A-Z.
- The strings are treated as zero-terminated strings.
*/
int str_comp_nocase(const char *a, const char *b);
/*
Function: str_comp_nocase_num
Compares up to num characters of two strings case insensitive.
Compares up to num characters of two strings case insensitively.
Parameters:
a - String to compare.
@ -1141,15 +1141,16 @@ int str_comp_nocase(const char *a, const char *b);
num - Maximum characters to compare
Returns:
<0 - String a is lesser than string b
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
Remarks:
- Only garanted to work with a-z/A-Z.
- Only guaranteed to work with a-z/A-Z.
(use str_utf8_comp_nocase_num for unicode support)
- The strings are treated as zero-terminated strings.
*/
int str_comp_nocase_num(const char *a, const char *b, const int num);
int str_comp_nocase_num(const char *a, const char *b, int num);
/*
Function: str_comp
@ -1160,7 +1161,7 @@ int str_comp_nocase_num(const char *a, const char *b, const int num);
b - String to compare.
Returns:
<0 - String a is lesser than string b
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
@ -1179,14 +1180,14 @@ int str_comp(const char *a, const char *b);
num - Maximum characters to compare
Returns:
<0 - String a is lesser than string b
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
Remarks:
- The strings are treated as zero-terminated strings.
*/
int str_comp_num(const char *a, const char *b, const int num);
int str_comp_num(const char *a, const char *b, int num);
/*
Function: str_comp_filenames
@ -1197,7 +1198,7 @@ int str_comp_num(const char *a, const char *b, const int num);
b - String to compare.
Returns:
<0 - String a is lesser than string b
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
@ -1300,7 +1301,7 @@ int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int
/*
Function: str_find_nocase
Finds a string inside another string case insensitive.
Finds a string inside another string case insensitively.
Parameters:
haystack - String to search in
@ -1311,7 +1312,8 @@ int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int
Returns NULL if needle could not be found.
Remarks:
- Only garanted to work with a-z/A-Z.
- Only guaranteed to work with a-z/A-Z.
(use str_utf8_find_nocase for unicode support)
- The strings are treated as zero-terminated strings.
*/
const char *str_find_nocase(const char *haystack, const char *needle);
@ -1631,6 +1633,66 @@ int str_utf8_to_skeleton(const char *str, int *buf, int buf_len);
*/
int str_utf8_comp_confusable(const char *a, const char *b);
/*
Function: str_utf8_tolower
Converts the given Unicode codepoint to lowercase (locale insensitive).
Parameters:
code - Unicode codepoint to convert.
Returns:
Lowercase codepoint
*/
int str_utf8_tolower(int code);
/*
Function: str_utf8_comp_nocase
Compares two utf8 strings case insensitively.
Parameters:
a - String to compare.
b - String to compare.
Returns:
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
*/
int str_utf8_comp_nocase(const char *a, const char *b);
/*
Function: str_utf8_comp_nocase_num
Compares up to num bytes of two utf8 strings case insensitively.
Parameters:
a - String to compare.
b - String to compare.
num - Maximum bytes to compare
Returns:
<0 - String a is less than string b
0 - String a is equal to string b
>0 - String a is greater than string b
*/
int str_utf8_comp_nocase_num(const char *a, const char *b, int num);
/*
Function: str_utf8_find_nocase
Finds a utf8 string inside another utf8 string case insensitively.
Parameters:
haystack - String to search in
needle - String to search for
Returns:
A pointer into haystack where the needle was found.
Returns NULL if needle could not be found.
Remarks:
- The strings are treated as zero-terminated strings.
*/
const char *str_utf8_find_nocase(const char *haystack, const char *needle);
/*
Function: str_utf8_isspace
Checks whether the given Unicode codepoint renders as space.

1
src/base/unicode/VERSION Normal file
View file

@ -0,0 +1 @@
12.0.0

View file

@ -1,6 +1,6 @@
#include "confusables_data.h"
#include "system.h"
#include "../system.h"
#include <stddef.h>

View file

@ -0,0 +1,22 @@
#include <stdlib.h>
#include "tolower_data.h"
static int compul(const void *a, const void *b)
{
struct UPPER_LOWER *ul_a = (struct UPPER_LOWER *) a;
struct UPPER_LOWER *ul_b = (struct UPPER_LOWER *) b;
return ul_a->upper - ul_b->upper;
}
int str_utf8_tolower(int code)
{
struct UPPER_LOWER key;
struct UPPER_LOWER *res;
key.upper = code;
res = bsearch(&key, tolower, NUM_TOLOWER, sizeof(struct UPPER_LOWER), compul);
if(res == NULL)
return code;
return res->lower;
}

File diff suppressed because it is too large Load diff

View file

@ -327,11 +327,11 @@ bool CChat::OnInput(IInput::CEvent Event)
bool Found = false;
if(SearchType == 1)
{
if(str_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)) &&
str_find_nocase(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer))
if(str_utf8_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)) &&
str_utf8_find_nocase(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer))
Found = true;
}
else if(!str_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)))
else if(!str_utf8_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)))
Found = true;
if(Found)

0
src/in Normal file
View file

View file

@ -55,6 +55,35 @@ TEST(Str, Utf8CompConfusables)
EXPECT_TRUE(str_utf8_comp_confusable("aceiou", "ąçęįǫų") == 0);
}
// Exercises str_utf8_tolower and the case-insensitive UTF-8 helpers built on it
// (str_utf8_comp_nocase, str_utf8_comp_nocase_num, str_utf8_find_nocase).
TEST(Str, Utf8ToLower)
{
// Single codepoints: ASCII and non-ASCII letters.
EXPECT_TRUE(str_utf8_tolower('A') == 'a');
EXPECT_TRUE(str_utf8_tolower('z') == 'z');
EXPECT_TRUE(str_utf8_tolower(192) == 224); // À -> à
EXPECT_TRUE(str_utf8_tolower(7882) == 7883); // Ị -> ị
// Whole-string comparison.
EXPECT_TRUE(str_utf8_comp_nocase("ÖlÜ", "ölü") == 0);
EXPECT_TRUE(str_utf8_comp_nocase("ÜlÖ", "ölü") > 0); // ü > ö
EXPECT_TRUE(str_utf8_comp_nocase("ÖlÜ", "ölüa") < 0); // end of string (NUL) < a
EXPECT_TRUE(str_utf8_comp_nocase("ölüa", "ÖlÜ") > 0); // a > end of string (NUL)
// A string starting with a byte that has the high bit set versus the empty
// string: expected to rank above it in either argument order.
const char a[2] = {-128, 0};
const char b[2] = {0, 0};
EXPECT_TRUE(str_utf8_comp_nocase(a, b) > 0);
EXPECT_TRUE(str_utf8_comp_nocase(b, a) < 0);
// Length-limited comparison; the limit counts bytes of the first argument
// ("Ö" and "Ü" occupy two bytes each, see the offsets in the find checks below).
EXPECT_TRUE(str_utf8_comp_nocase_num("ÖlÜ", "ölüa", 3) == 0); // limit hit after "Öl"
EXPECT_TRUE(str_utf8_comp_nocase_num("ÖlÜ", "ölüa", 4) != 0); // first string ends before the limit is checked again
EXPECT_TRUE(str_utf8_comp_nocase_num("a", "z", 0) == 0);
EXPECT_TRUE(str_utf8_comp_nocase_num("a", "z", 1) != 0);
// Substring search returns a pointer to the start of the match inside the haystack.
const char str[] = "ÄÖÜ";
EXPECT_TRUE(str_utf8_find_nocase(str, "ä") == str);
EXPECT_TRUE(str_utf8_find_nocase(str, "ö") == str+2);
EXPECT_TRUE(str_utf8_find_nocase(str, "ü") == str+4);
EXPECT_TRUE(str_utf8_find_nocase(str, "z") == NULL);
}
TEST(Str, Startswith)
{
EXPECT_TRUE(str_startswith("abcdef", "abc"));

View file

@ -5,7 +5,7 @@ int main(int argc, const char **argv) // ignore_convention
dbg_logger_stdout();
if(argc < 1 + 2)
{
dbg_msg("usage", "%s STR1 STR2", argv[0] ? argv[0] : "confusables");
dbg_msg("usage", "%s STR1 STR2", argv[0] ? argv[0] : "unicode_confusables");
return -1;
}
dbg_msg("conf", "not_confusable=%d", str_utf8_comp_confusable(argv[1], argv[2]));