Merge pull request #1423 from ddnet/utf8_nocase

UTF8 nocase compare & use for chat TAB completion
2024-11-10 10:08:18 +00:00 · 2019-01-09 08:54:25 +01:00 · 2019-01-09 08:54:25 +01:00 · 71ec8cc294
parent f82ab4cb21 faa3cc195d
commit 71ec8cc294
17 changed files with 7252 additions and 5544 deletions
--- a/.gitignore
+++ b/.gitignore
@ -40,7 +40,6 @@ DDNet-Server
 DDNet-Server-Launcher
 config_retrieve
 config_store
 confusables
 crapnet
 dilate
 dummy_map
@ -58,6 +57,7 @@ tileset_borderfix
 tileset_borderrem
 tileset_borderset
 twping
 unicode_confusables
 uuid
 versionsrv
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -565,8 +565,6 @@ generate_source("src/game/generated/server_data.h" "server_content_header")
 # Sources
 set_glob(BASE GLOB_RECURSE src/base
  color.h
  confusables.c
  confusables_data.h
  detect.h
  hash.c
  hash.h
@ -584,6 +582,10 @@ set_glob(BASE GLOB_RECURSE src/base
  tl/sorted_array.h
  tl/string.h
  tl/threading.h
  unicode/confusables.c
  unicode/confusables_data.h
  unicode/tolower.c
  unicode/tolower_data.h
  vmath.h
 )
 set_glob(ENGINE_INTERFACE GLOB src/engine
@ -1094,7 +1096,6 @@ set_glob(TOOLS GLOB src/tools
  config_common.h
  config_retrieve.cpp
  config_store.cpp
  confusables.cpp
  crapnet.cpp
  dilate.cpp
  dummy_map.cpp
@ -1108,6 +1109,7 @@ set_glob(TOOLS GLOB src/tools
  tileset_borderfix.cpp
  tileset_borderrem.cpp
  tileset_borderset.cpp
  unicode_confusables.cpp
  uuid.cpp
 )
 foreach(ABS_T ${TOOLS})
--- a/scripts/check_header_guards.py
+++ b/scripts/check_header_guards.py
@ -6,7 +6,8 @@ os.chdir(os.path.dirname(__file__) + "/..")
 PATH = "src/"
 EXCEPTIONS = [
-	"src/base/confusables_data.h",
+	"src/base/unicode/confusables_data.h",
 	"src/base/unicode/tolower_data.h",
 	"src/tools/config_common.h"
 ]
--- a/scripts/generate_unicode_confusables_data.py
+++ b/scripts/generate_unicode_confusables_data.py
@ -5,51 +5,20 @@
 # - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
 #
 # If executed as a script, it will generate the contents of the file
-# `src/base/confusables_data.h`.
+# `src/base/unicode/confusables_data.h`.
-import csv
+import unicode
 def confusables():
    with open('confusables.txt', encoding='utf-8-sig') as f:
        # Filter comments
        f = map(lambda line: line.split('#')[0], f)
        return list(csv.DictReader(f, fieldnames=['Value', 'Target', 'Category'], delimiter=';'))
 UNICODEDATA_FIELDS = (
    "Value",
    "Name",
    "General_Category",
    "Canonical_Combining_Class",
    "Bidi_Class",
    "Decomposition",
    "Numeric",
    "Bidi_Mirrored",
    "Unicode_1_Name",
    "ISO_Comment",
    "Simple_Uppercase_Mapping",
    "Simple_Lowercase_Mapping",
    "Simple_Titlecase_Mapping",
 )
 def unicodedata():
    with open('UnicodeData.txt') as f:
        return list(csv.DictReader(f, fieldnames=UNICODEDATA_FIELDS, delimiter=';'))
 def unhex(s):
    return int(s, 16)
 def unhex_sequence(s):
    return [unhex(x) for x in s.split()] if '<' not in s else None
 def generate_decompositions():
-    ud = unicodedata()
+    ud = unicode.data()
-    con = confusables()
+    con = unicode.confusables()
-    category = lambda x: {unhex(u["Value"]) for u in ud if u["General_Category"].startswith(x)}
+    category = lambda x: {unicode.unhex(u["Value"]) for u in ud if u["General_Category"].startswith(x)}
-    nfd = {unhex(u["Value"]): unhex_sequence(u["Decomposition"]) for u in ud}
+    # TODO: Is this correct? They changed the decompositioning format
    nfd = {unicode.unhex(u["Value"]): unicode.unhex_sequence(u["Decomposition_Type"]) for u in ud}
    nfd = {k: v for k, v in nfd.items() if v}
-    con = {unhex(c["Value"]): unhex_sequence(c["Target"]) for c in con}
+    con = {unicode.unhex(c["Value"]): unicode.unhex_sequence(c["Target"]) for c in con}
    # C: Control
    # M: Combining
--- a/scripts/generate_unicode_tolower.py
+++ b/scripts/generate_unicode_tolower.py
@ -0,0 +1,38 @@
 # Needs UnicodeData.txt in the current directory.
 #
 # It can be obtained from unicode.org:
 # - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
 #
 # If executed as a script, it will generate the contents of the file
 # `src/base/unicode/tolower_data.h`.
 import unicode
 def generate_cases():
    """Return the simple lowercase mappings found in UnicodeData.txt.

    Returns:
        A list of ``(upper, lower)`` integer code point pairs, one for every
        record with a non-empty ``Simple_Lowercase_Mapping`` field, sorted by
        ``upper``.
    """
    cases = [
        (unicode.unhex(u["Value"]), unicode.unhex(u["Simple_Lowercase_Mapping"]))
        for u in unicode.data()
        if u["Simple_Lowercase_Mapping"]
    ]
    # The generated table is looked up with bsearch() keyed on `upper`, which
    # only works on a sorted array. Sort explicitly instead of relying on the
    # order of the input file; for input already in code point order this is
    # a no-op.
    cases.sort()
    return cases
 def main():
    """Print the contents of `src/base/unicode/tolower_data.h` to stdout.

    The output is the struct/enum preamble (with NUM_TOLOWER set to the number
    of mappings), one initializer row per (upper, lower) pair returned by
    generate_cases(), and the closing brace of the array.
    """
    table = generate_cases()
    # Doubled braces are literal braces for str.format(); the single `{}` is
    # the slot for the number of table entries.
    preamble = """\
 #include <stdint.h>
 struct UPPER_LOWER
 {{
 \tint32_t upper;
 \tint32_t lower;
 }};
 enum
 {{
 \tNUM_TOLOWER={},
 }};
 static const struct UPPER_LOWER tolower[NUM_TOLOWER] = {{"""
    print(preamble.format(len(table)))
    row = "\t{{{}, {}}},"
    for pair in table:
        print(row.format(*pair))
    print("};")
 if __name__ == '__main__':
    main()
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@ -0,0 +1,35 @@
 import csv
 def confusables():
    """Read `confusables.txt` from the current directory.

    Everything from the first '#' on each line is discarded before parsing.

    Returns:
        A list with one mapping per remaining ';'-separated record, keyed by
        'Value', 'Target' and 'Category'. Field values are not stripped.
    """
    columns = ['Value', 'Target', 'Category']
    # 'utf-8-sig' drops a leading byte order mark if the file has one.
    with open('confusables.txt', encoding='utf-8-sig') as handle:
        without_comments = (raw.split('#')[0] for raw in handle)
        reader = csv.DictReader(without_comments, fieldnames=columns, delimiter=';')
        return list(reader)
 # Column names applied, in order, to the ';'-separated fields of each
 # UnicodeData.txt record (used as csv.DictReader fieldnames).
 # NOTE(review): per UAX #44, field 5 carries the decomposition type tag and
 # the decomposition mapping together, and fields 6-8 are the three numeric
 # fields. The four labels from "Decomposition_Type" to "Numeric_Mapping"
 # therefore look mismatched with the data they name, although the total of
 # 15 columns keeps the later keys aligned -- confirm before relying on them.
 UNICODEDATA_FIELDS = (
    "Value",
    "Name",
    "General_Category",
    "Canonical_Combining_Class",
    "Bidi_Class",
    "Decomposition_Type",
    "Decomposition_Mapping",
    "Numeric_Type",
    "Numeric_Mapping",
    "Bidi_Mirrored",
    "Unicode_1_Name",
    "ISO_Comment",
    "Simple_Uppercase_Mapping",
    "Simple_Lowercase_Mapping",
    "Simple_Titlecase_Mapping",
 )
 def data():
    """Read `UnicodeData.txt` from the current directory.

    Returns:
        A list with one mapping per ';'-separated record, keyed by the names
        in UNICODEDATA_FIELDS.
    """
    with open('UnicodeData.txt') as handle:
        reader = csv.DictReader(handle, fieldnames=UNICODEDATA_FIELDS, delimiter=';')
        records = list(reader)
    return records
 def unhex(s):
    """Parse the hexadecimal string `s` and return it as an int."""
    return int(s, base=16)
 def unhex_sequence(s):
    """Parse whitespace-separated hexadecimal numbers into a list of ints.

    Returns None when `s` contains a '<' character; such fields are not
    treated as a plain sequence of code points.
    """
    if '<' in s:
        return None
    return [unhex(token) for token in s.split()]
--- a/src/base/system.c
+++ b/src/base/system.c
@ -2367,7 +2367,7 @@ int str_comp_nocase(const char *a, const char *b)
 #endif
 }
-int str_comp_nocase_num(const char *a, const char *b, const int num)
+int str_comp_nocase_num(const char *a, const char *b, int num)
 {
 #if defined(CONF_FAMILY_WINDOWS)
 	return _strnicmp(a, b, num);
@ -2381,7 +2381,7 @@ int str_comp(const char *a, const char *b)
 	return strcmp(a, b);
 }
-int str_comp_num(const char *a, const char *b, const int num)
+int str_comp_num(const char *a, const char *b, int num)
 {
 	return strncmp(a, b, num);
 }
@ -2721,6 +2721,63 @@ int str_toint(const char *str) { return atoi(str); }
 int str_toint_base(const char *str, int base) { return strtol(str, NULL, base); }
 float str_tofloat(const char *str) { return atof(str); }
 /* Case-insensitive comparison of two zero-terminated UTF-8 strings. */
 int str_utf8_comp_nocase(const char *a, const char *b)
 {
 	/* Walk both strings one decoded character at a time while neither has ended. */
 	while(*a != '\0' && *b != '\0')
 	{
 		const int lower_a = str_utf8_tolower(str_utf8_decode(&a));
 		const int lower_b = str_utf8_tolower(str_utf8_decode(&b));
 		const int diff = lower_a - lower_b;
 		if(diff != 0)
 			return diff;
 	}
 	/* At least one string ended; the remaining bytes decide the result. */
 	return (unsigned char)*a - (unsigned char)*b;
 }
 /*
 	Case-insensitive comparison of at most num bytes of two zero-terminated
 	UTF-8 strings. The limit is measured in bytes consumed from a; a
 	character that straddles the limit is still compared as a whole.

 	Returns 0 if the compared parts are equal, otherwise the difference of
 	the first differing lowercased codepoints (or of the bytes at the point
 	where one string ended before the limit was reached).
 */
 int str_utf8_comp_nocase_num(const char *a, const char *b, int num)
 {
 	int code_a;
 	int code_b;
 	const char *old_a = a;
 	/* Nothing to compare. */
 	if(num <= 0)
 		return 0;
 	while(*a && *b)
 	{
 		code_a = str_utf8_tolower(str_utf8_decode(&a));
 		code_b = str_utf8_tolower(str_utf8_decode(&b));
 		if(code_a != code_b)
 			return code_a - code_b;
 		/*
 			Check the limit right after a matching character. Checking only at
 			the top of the loop misses the case where b ends exactly at the
 			limit: the loop would exit and the terminator of b would be
 			compared against the next byte of a, so a prefix match such as
 			("abc", "ab", 2) was reported as a mismatch.
 		*/
 		if(a - old_a >= num)
 			return 0;
 	}
 	return (unsigned char)*a - (unsigned char)*b;
 }
 /*
 	Case-insensitive search for needle inside haystack (both zero-terminated
 	UTF-8). Returns a pointer to the first match in haystack, or 0 if there
 	is none.
 */
 const char *str_utf8_find_nocase(const char *haystack, const char *needle)
 {
 	const char *start;
 	/* Straightforward search: try every character position of haystack. */
 	for(start = haystack; *start; str_utf8_decode(&start))
 	{
 		const char *hay = start;
 		const char *ndl = needle;
 		for(;;)
 		{
 			const char *hay_next = hay;
 			const char *ndl_next = ndl;
 			if(!*hay || !*ndl)
 				break;
 			if(str_utf8_tolower(str_utf8_decode(&hay_next)) != str_utf8_tolower(str_utf8_decode(&ndl_next)))
 				break;
 			/* Characters match: advance both cursors past them. */
 			hay = hay_next;
 			ndl = ndl_next;
 		}
 		/* The whole needle was consumed, so it matches at this position. */
 		if(!*ndl)
 			return start;
 	}
 	return 0;
 }
 int str_utf8_isspace(int code)
 {
--- a/src/base/system.h
+++ b/src/base/system.h
@ -159,7 +159,7 @@ void mem_zero(void *block, unsigned size);
 		size - Size of the data to compare
 	Returns:
-		<0 - Block a is lesser than block b
+		<0 - Block a is less than block b
 		0 - Block a is equal to block b
 		>0 - Block a is greater than block b
 */
@ -676,7 +676,7 @@ int net_host_lookup(const char *hostname, NETADDR *addr, int types);
 		b - Address to compare to.
 	Returns:
-		<0 - Address a is lesser than address b
+		<0 - Address a is less than address b
 		0 - Address a is equal to address b
 		>0 - Address a is greater than address b
 */
@ -691,7 +691,7 @@ int net_addr_comp(const NETADDR *a, const NETADDR *b);
 		b - Address to compare to.
 	Returns:
-		<0 - Address a is lesser than address b
+		<0 - Address a is less than address b
 		0 - Address a is equal to address b
 		>0 - Address a is greater than address b
 */
@ -1114,26 +1114,26 @@ char *str_skip_whitespaces(char *str);
 /*
 	Function: str_comp_nocase
-		Compares to strings case insensitive.
+		Compares two strings case insensitively.
 	Parameters:
 		a - String to compare.
 		b - String to compare.
 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
 	Remarks:
-		- Only garanted to work with a-z/A-Z.
+		- Only guaranteed to work with a-z/A-Z.
 		- The strings are treated as zero-terminated strings.
 */
 int str_comp_nocase(const char *a, const char *b);
 /*
 	Function: str_comp_nocase_num
-		Compares up to num characters of two strings case insensitive.
+		Compares up to num characters of two strings case insensitively.
 	Parameters:
 		a - String to compare.
@ -1141,15 +1141,16 @@ int str_comp_nocase(const char *a, const char *b);
 		num - Maximum characters to compare
 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
 	Remarks:
-		- Only garanted to work with a-z/A-Z.
+		- Only guaranteed to work with a-z/A-Z.
 		  (use str_utf8_comp_nocase_num for unicode support)
 		- The strings are treated as zero-terminated strings.
 */
-int str_comp_nocase_num(const char *a, const char *b, const int num);
+int str_comp_nocase_num(const char *a, const char *b, int num);
 /*
 	Function: str_comp
@ -1160,7 +1161,7 @@ int str_comp_nocase_num(const char *a, const char *b, const int num);
 		b - String to compare.
 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
@ -1179,14 +1180,14 @@ int str_comp(const char *a, const char *b);
 		num - Maximum characters to compare
 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
 	Remarks:
 		- The strings are treated as zero-terminated strings.
 */
-int str_comp_num(const char *a, const char *b, const int num);
+int str_comp_num(const char *a, const char *b, int num);
 /*
 	Function: str_comp_filenames
@ -1197,7 +1198,7 @@ int str_comp_num(const char *a, const char *b, const int num);
 		b - String to compare.
 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
@ -1300,7 +1301,7 @@ int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int
 /*
 	Function: str_find_nocase
-		Finds a string inside another string case insensitive.
+		Finds a string inside another string case insensitively.
 	Parameters:
 		haystack - String to search in
@ -1311,7 +1312,8 @@ int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int
 		Returns NULL if needle could not be found.
 	Remarks:
-		- Only garanted to work with a-z/A-Z.
+		- Only guaranteed to work with a-z/A-Z.
 		  (use str_utf8_find_nocase for unicode support)
 		- The strings are treated as zero-terminated strings.
 */
 const char *str_find_nocase(const char *haystack, const char *needle);
@ -1631,6 +1633,66 @@ int str_utf8_to_skeleton(const char *str, int *buf, int buf_len);
 */
 int str_utf8_comp_confusable(const char *a, const char *b);
 /*
 	Function: str_utf8_tolower
 		Converts the given Unicode codepoint to lowercase (locale insensitive).
 	Parameters:
 		code - Unicode codepoint to convert.
 	Returns:
 		Lowercase codepoint
 */
 int str_utf8_tolower(int code);
 /*
 	Function: str_utf8_comp_nocase
 		Compares two utf8 strings case insensitively.
 	Parameters:
 		a - String to compare.
 		b - String to compare.
 	Returns:
 		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
 */
 int str_utf8_comp_nocase(const char *a, const char *b);
 /*
 	Function: str_utf8_comp_nocase_num
 		Compares up to num bytes of two utf8 strings case insensitively.
 	Parameters:
 		a - String to compare.
 		b - String to compare.
 		num - Maximum number of bytes to compare
 	Returns:
 		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b
 */
 int str_utf8_comp_nocase_num(const char *a, const char *b, int num);
 /*
 	Function: str_utf8_find_nocase
 		Finds a utf8 string inside another utf8 string case insensitively.
 	Parameters:
 		haystack - String to search in
 		needle - String to search for
 	Returns:
 		A pointer into haystack where the needle was found.
 		Returns NULL if needle could not be found.
 	Remarks:
 		- The strings are treated as zero-terminated strings.
 */
 const char *str_utf8_find_nocase(const char *haystack, const char *needle);
 /*
 	Function: str_utf8_isspace
 		Checks whether the given Unicode codepoint renders as space.
--- a/src/base/unicode/VERSION
+++ b/src/base/unicode/VERSION
@ -0,0 +1 @@
 12.0.0
--- a/src/base/unicode/confusables.c
+++ b/src/base/unicode/confusables.c
@ -1,6 +1,6 @@
 #include "confusables_data.h"
-#include "system.h"
+#include "../system.h"
 #include <stddef.h>
--- a/src/base/unicode/confusables_data.h
+++ b/src/base/unicode/confusables_data.h
--- a/src/base/unicode/tolower.c
+++ b/src/base/unicode/tolower.c
@ -0,0 +1,22 @@
 #include <stdlib.h>
 #include "tolower_data.h"
 static int compul(const void *a, const void *b)
 {
  struct UPPER_LOWER *ul_a = (struct UPPER_LOWER *) a;
  struct UPPER_LOWER *ul_b = (struct UPPER_LOWER *) b;
  return ul_a->upper - ul_b->upper;
 }
 int str_utf8_tolower(int code)
 {
  struct UPPER_LOWER key;
  struct UPPER_LOWER *res;
  key.upper = code;
  res = bsearch(&key, tolower, NUM_TOLOWER, sizeof(struct UPPER_LOWER), compul);
  if(res == NULL)
    return code;
  return res->lower;
 }
--- a/src/base/unicode/tolower_data.h
+++ b/src/base/unicode/tolower_data.h
--- a/src/game/client/components/chat.cpp
+++ b/src/game/client/components/chat.cpp
@ -327,11 +327,11 @@ bool CChat::OnInput(IInput::CEvent Event)
 			bool Found = false;
 			if(SearchType == 1)
 			{
-				if(str_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)) &&
+				if(str_utf8_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)) &&
-					str_find_nocase(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer))
+					str_utf8_find_nocase(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer))
 					Found = true;
 			}
-			else if(!str_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)))
+			else if(!str_utf8_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)))
 				Found = true;
 			if(Found)
--- a/src/in
+++ b/src/in
--- a/src/test/str.cpp
+++ b/src/test/str.cpp
@ -55,6 +55,35 @@ TEST(Str, Utf8CompConfusables)
 	EXPECT_TRUE(str_utf8_comp_confusable("aceiou", "ąçęįǫų") == 0);
 }
 // Covers str_utf8_tolower and the case-insensitive UTF-8 compare/find helpers.
 TEST(Str, Utf8ToLower)
 {
 	// Single codepoint mapping.
 	EXPECT_TRUE(str_utf8_tolower('A') == 'a');
 	EXPECT_TRUE(str_utf8_tolower('z') == 'z');
 	EXPECT_TRUE(str_utf8_tolower(192) == 224); // À -> à
 	EXPECT_TRUE(str_utf8_tolower(7882) == 7883); // Ị -> ị
 	// Whole-string comparison.
 	EXPECT_TRUE(str_utf8_comp_nocase("ÖlÜ", "ölü") == 0);
 	EXPECT_TRUE(str_utf8_comp_nocase("ÜlÖ", "ölü") > 0); // ü > ö
 	EXPECT_TRUE(str_utf8_comp_nocase("ÖlÜ", "ölüa") < 0); // NULL < a
 	EXPECT_TRUE(str_utf8_comp_nocase("ölüa", "ÖlÜ") > 0); // a > NULL
 	// -128 is the byte 0x80; compared against an empty string the result
 	// comes from the final unsigned-char difference.
 	const char a[2] = {-128, 0};
 	const char b[2] = {0, 0};
 	EXPECT_TRUE(str_utf8_comp_nocase(a, b) > 0);
 	EXPECT_TRUE(str_utf8_comp_nocase(b, a) < 0);
 	// Byte-limited comparison ("Ö" and "Ü" are two bytes each in UTF-8).
 	EXPECT_TRUE(str_utf8_comp_nocase_num("ÖlÜ", "ölüa", 3) == 0);
 	// NOTE(review): num=4 ends inside the two-byte "Ü". This expectation
 	// relies on the terminator of the first string being compared against
 	// 'a' after the limit has already been passed -- confirm that this is
 	// the intended contract for a limit that splits a character.
 	EXPECT_TRUE(str_utf8_comp_nocase_num("ÖlÜ", "ölüa", 4) != 0);
 	EXPECT_TRUE(str_utf8_comp_nocase_num("a", "z", 0) == 0);
 	EXPECT_TRUE(str_utf8_comp_nocase_num("a", "z", 1) != 0);
 	// Substring search; the offsets are byte offsets into str.
 	const char str[] = "ÄÖÜ";
 	EXPECT_TRUE(str_utf8_find_nocase(str, "ä") == str);
 	EXPECT_TRUE(str_utf8_find_nocase(str, "ö") == str+2);
 	EXPECT_TRUE(str_utf8_find_nocase(str, "ü") == str+4);
 	EXPECT_TRUE(str_utf8_find_nocase(str, "z") == NULL);
 }
 TEST(Str, Startswith)
 {
 	EXPECT_TRUE(str_startswith("abcdef", "abc"));
--- a/src/tools/unicode_confusables.cpp
+++ b/src/tools/unicode_confusables.cpp
@ -5,7 +5,7 @@ int main(int argc, const char **argv) // ignore_convention
 	dbg_logger_stdout();
 	if(argc < 1 + 2)
 	{
-		dbg_msg("usage", "%s STR1 STR2", argv[0] ? argv[0] : "confusables");
+		dbg_msg("usage", "%s STR1 STR2", argv[0] ? argv[0] : "unicode_confusables");
 		return -1;
 	}
 	dbg_msg("conf", "not_confusable=%d", str_utf8_comp_confusable(argv[1], argv[2]));