Merge pull request #1423 from ddnet/utf8_nocase

UTF8 nocase compare & use for chat TAB completion
2024-09-20 09:34:19 +00:00 · 2019-01-09 08:54:25 +01:00 · 2019-01-09 08:54:25 +01:00 · 71ec8cc294
parent f82ab4cb21 faa3cc195d
commit 71ec8cc294
17 changed files with 7252 additions and 5544 deletions
--- a/.gitignore
+++ b/.gitignore
@ -40,7 +40,6 @@ DDNet-Server
 DDNet-Server-Launcher
 config_retrieve
 config_store
-confusables
 crapnet
 dilate
 dummy_map
@ -58,6 +57,7 @@ tileset_borderfix
 tileset_borderrem
 tileset_borderset
 twping
+unicode_confusables
 uuid
 versionsrv

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -565,8 +565,6 @@ generate_source("src/game/generated/server_data.h" "server_content_header")
 # Sources
 set_glob(BASE GLOB_RECURSE src/base
  color.h
-  confusables.c
-  confusables_data.h
  detect.h
  hash.c
  hash.h
@ -584,6 +582,10 @@ set_glob(BASE GLOB_RECURSE src/base
  tl/sorted_array.h
  tl/string.h
  tl/threading.h
+  unicode/confusables.c
+  unicode/confusables_data.h
+  unicode/tolower.c
+  unicode/tolower_data.h
  vmath.h
 )
 set_glob(ENGINE_INTERFACE GLOB src/engine
@ -1094,7 +1096,6 @@ set_glob(TOOLS GLOB src/tools
  config_common.h
  config_retrieve.cpp
  config_store.cpp
-  confusables.cpp
  crapnet.cpp
  dilate.cpp
  dummy_map.cpp
@ -1108,6 +1109,7 @@ set_glob(TOOLS GLOB src/tools
  tileset_borderfix.cpp
  tileset_borderrem.cpp
  tileset_borderset.cpp
+  unicode_confusables.cpp
  uuid.cpp
 )
 foreach(ABS_T ${TOOLS})
--- a/scripts/check_header_guards.py
+++ b/scripts/check_header_guards.py
@ -6,7 +6,8 @@ os.chdir(os.path.dirname(__file__) + "/..")

 PATH = "src/"
 EXCEPTIONS = [
-	"src/base/confusables_data.h",
+	"src/base/unicode/confusables_data.h",
+	"src/base/unicode/tolower_data.h",
 	"src/tools/config_common.h"
 ]

--- a/scripts/generate_unicode_confusables_data.py
+++ b/scripts/generate_unicode_confusables_data.py
@ -5,51 +5,20 @@
 # - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
 #
 # If executed as a script, it will generate the contents of the file
-# `src/base/confusables_data.h`.
+# `src/base/unicode/confusables_data.h`.

-import csv
-
-def confusables():
-    with open('confusables.txt', encoding='utf-8-sig') as f:
-        # Filter comments
-        f = map(lambda line: line.split('#')[0], f)
-        return list(csv.DictReader(f, fieldnames=['Value', 'Target', 'Category'], delimiter=';'))
-
-UNICODEDATA_FIELDS = (
-    "Value",
-    "Name",
-    "General_Category",
-    "Canonical_Combining_Class",
-    "Bidi_Class",
-    "Decomposition",
-    "Numeric",
-    "Bidi_Mirrored",
-    "Unicode_1_Name",
-    "ISO_Comment",
-    "Simple_Uppercase_Mapping",
-    "Simple_Lowercase_Mapping",
-    "Simple_Titlecase_Mapping",
-)
-
-def unicodedata():
-    with open('UnicodeData.txt') as f:
-        return list(csv.DictReader(f, fieldnames=UNICODEDATA_FIELDS, delimiter=';'))
-
-def unhex(s):
-    return int(s, 16)
-
-def unhex_sequence(s):
-    return [unhex(x) for x in s.split()] if '<' not in s else None
+import unicode

 def generate_decompositions():
-    ud = unicodedata()
-    con = confusables()
+    ud = unicode.data()
+    con = unicode.confusables()

-    category = lambda x: {unhex(u["Value"]) for u in ud if u["General_Category"].startswith(x)}
+    category = lambda x: {unicode.unhex(u["Value"]) for u in ud if u["General_Category"].startswith(x)}

-    nfd = {unhex(u["Value"]): unhex_sequence(u["Decomposition"]) for u in ud}
+    # TODO: Is this correct? They changed the decompositioning format
+    nfd = {unicode.unhex(u["Value"]): unicode.unhex_sequence(u["Decomposition_Type"]) for u in ud}
    nfd = {k: v for k, v in nfd.items() if v}
-    con = {unhex(c["Value"]): unhex_sequence(c["Target"]) for c in con}
+    con = {unicode.unhex(c["Value"]): unicode.unhex_sequence(c["Target"]) for c in con}

    # C: Control
    # M: Combining
--- a/scripts/generate_unicode_tolower.py
+++ b/scripts/generate_unicode_tolower.py
@ -0,0 +1,38 @@
+# Needs UnicodeData.txt in the current directory.
+#
+# It can be obtained from unicode.org:
+# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
+#
+# If executed as a script, it will generate the contents of the file
+# `src/base/unicode/tolower_data.h`.
+
+import unicode
+
+def generate_cases():
+    # Build a list of (codepoint, lowercase codepoint) integer pairs, one for
+    # every UnicodeData.txt row whose Simple_Lowercase_Mapping field is
+    # non-empty.  The pairs keep the row order of the input file.
+    # NOTE(review): tolower.c looks entries up with bsearch() keyed on the
+    # first element of each pair, so this presumably relies on UnicodeData.txt
+    # being sorted by codepoint -- confirm.
+    ud = unicode.data()
+    return [(unicode.unhex(u["Value"]), unicode.unhex(u["Simple_Lowercase_Mapping"])) for u in ud if u["Simple_Lowercase_Mapping"]]
+
+def main():
+    # Print the generated C header (struct definition, entry count and the
+    # upper->lower table) to stdout.
+    cases = generate_cases()
+
+    # The header template goes through str.format(): "{{" / "}}" are escaped
+    # literal braces and the single "{}" receives the number of table entries.
+    print("""\
+#include <stdint.h>
+
+struct UPPER_LOWER
+{{
+\tint32_t upper;
+\tint32_t lower;
+}};
+
+enum
+{{
+\tNUM_TOLOWER={},
+}};
+
+static const struct UPPER_LOWER tolower[NUM_TOLOWER] = {{""".format(len(cases)))
+    # One "{upper, lower}," initializer per pair, in the order returned by
+    # generate_cases().
+    for upper_code, lower_code in cases:
+        print("\t{{{}, {}}},".format(upper_code, lower_code))
+    # Not passed through format(), so the closing brace is written unescaped.
+    print("};")
+
+if __name__ == '__main__':
+    main()
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@ -0,0 +1,35 @@
+import csv
+
+def confusables():
+    # Parse `confusables.txt` from the current directory and return a list of
+    # dicts with the keys 'Value', 'Target' and 'Category' (one per line).
+    # The 'utf-8-sig' codec drops a leading byte order mark if present.
+    with open('confusables.txt', encoding='utf-8-sig') as f:
+        # Filter comments
+        f = map(lambda line: line.split('#')[0], f)
+        return list(csv.DictReader(f, fieldnames=['Value', 'Target', 'Category'], delimiter=';'))
+
+# Column names applied, in order, to the ';'-separated fields of each
+# UnicodeData.txt row by data().
+# NOTE(review): UAX #44 appears to describe one combined decomposition column
+# followed by three numeric columns.  If so, "Decomposition_Type" holds the
+# whole decomposition field and "Decomposition_Mapping", "Numeric_Type" and
+# "Numeric_Mapping" label the three numeric columns.  The total of 15 names
+# keeps the case-mapping columns aligned -- confirm before relying on the
+# decomposition/numeric names.
+UNICODEDATA_FIELDS = (
+    "Value",
+    "Name",
+    "General_Category",
+    "Canonical_Combining_Class",
+    "Bidi_Class",
+    "Decomposition_Type",
+    "Decomposition_Mapping",
+    "Numeric_Type",
+    "Numeric_Mapping",
+    "Bidi_Mirrored",
+    "Unicode_1_Name",
+    "ISO_Comment",
+    "Simple_Uppercase_Mapping",
+    "Simple_Lowercase_Mapping",
+    "Simple_Titlecase_Mapping",
+)
+
+def data():
+    with open('UnicodeData.txt') as f:
+        return list(csv.DictReader(f, fieldnames=UNICODEDATA_FIELDS, delimiter=';'))
+
+def unhex(s):
+    # Convert a hexadecimal string (e.g. a codepoint column) to an int.
+    return int(s, 16)
+
+def unhex_sequence(s):
+    # Convert a whitespace-separated list of hexadecimal numbers to a list of
+    # ints.  Returns None when the string contains '<' (presumably a tagged
+    # decomposition such as "<compat> ..." -- confirm); an empty string yields
+    # an empty list.
+    return [unhex(x) for x in s.split()] if '<' not in s else None
--- a/src/base/system.c
+++ b/src/base/system.c
@ -2367,7 +2367,7 @@ int str_comp_nocase(const char *a, const char *b)
 #endif
 }

-int str_comp_nocase_num(const char *a, const char *b, const int num)
+int str_comp_nocase_num(const char *a, const char *b, int num)
 {
 #if defined(CONF_FAMILY_WINDOWS)
 	return _strnicmp(a, b, num);
@ -2381,7 +2381,7 @@ int str_comp(const char *a, const char *b)
 	return strcmp(a, b);
 }

-int str_comp_num(const char *a, const char *b, const int num)
+int str_comp_num(const char *a, const char *b, int num)
 {
 	return strncmp(a, b, num);
 }
@ -2721,6 +2721,63 @@ int str_toint(const char *str) { return atoi(str); }
 int str_toint_base(const char *str, int base) { return strtol(str, NULL, base); }
 float str_tofloat(const char *str) { return atof(str); }

+/* Compares two zero-terminated utf8 strings codepoint by codepoint after
+   lowercasing each codepoint with str_utf8_tolower. */
+int str_utf8_comp_nocase(const char *a, const char *b)
+{
+	int code_a;
+	int code_b;
+
+	/* walk both strings until either one reaches its terminator */
+	while(*a && *b)
+	{
+		/* str_utf8_decode receives the address of the pointer; the loop relies on it advancing a and b */
+		code_a = str_utf8_tolower(str_utf8_decode(&a));
+		code_b = str_utf8_tolower(str_utf8_decode(&b));
+
+		/* the first differing lowercased codepoint decides the ordering */
+		if(code_a != code_b)
+			return code_a - code_b;
+	}
+	/* at least one string ended: compare the current bytes as unsigned so a
+	   terminator (0) sorts before any remaining byte, including bytes >= 0x80 */
+	return (unsigned char)*a - (unsigned char)*b;
+}
+
+/* Like str_utf8_comp_nocase, but stops with 0 once at least num bytes of a
+   have been consumed.  The limit is measured in bytes of a, not in codepoints
+   and not in bytes of b. */
+int str_utf8_comp_nocase_num(const char *a, const char *b, int num)
+{
+	int code_a;
+	int code_b;
+	const char *old_a = a; /* start of a, used to measure consumed bytes */
+
+	while(*a && *b)
+	{
+		/* byte budget exhausted: everything compared so far was equal */
+		if(a - old_a >= num)
+			return 0;
+
+		code_a = str_utf8_tolower(str_utf8_decode(&a));
+		code_b = str_utf8_tolower(str_utf8_decode(&b));
+
+		if(code_a != code_b)
+			return code_a - code_b;
+	}
+
+	/* NOTE(review): the byte limit is only tested inside the loop, after the
+	   `*a && *b` check.  If either string ends exactly when num bytes of a have
+	   been consumed, this terminator comparison still runs.  A prefix check
+	   with num == length of b therefore returns non-zero whenever a is longer
+	   than b -- confirm whether that is intended. */
+	return (unsigned char)*a - (unsigned char)*b;
+}
+
+/* Returns a pointer to the first position in haystack at which needle matches
+   when both are compared codepoint by codepoint through str_utf8_tolower, or
+   0 if there is no such position.  An empty haystack always yields 0 because
+   the outer loop is never entered. */
+const char *str_utf8_find_nocase(const char *haystack, const char *needle)
+{
+	while(*haystack) /* naive implementation */
+	{
+		const char *a = haystack;
+		const char *b = needle;
+		/* look-ahead cursors: decoding advances these, and a/b are only moved
+		   forward once the decoded codepoints turned out to be equal */
+		const char *a_next = a;
+		const char *b_next = b;
+		/* the decode calls are only reached while both *a and *b are non-zero */
+		while(*a && *b && str_utf8_tolower(str_utf8_decode(&a_next)) == str_utf8_tolower(str_utf8_decode(&b_next)))
+		{
+			a = a_next;
+			b = b_next;
+		}
+		/* the whole needle was consumed, so it matches at this start position */
+		if(!(*b))
+			return haystack;
+		/* move the start position forward by one codepoint */
+		str_utf8_decode(&haystack);
+	}
+
+	return 0;
+}

 int str_utf8_isspace(int code)
 {
--- a/src/base/system.h
+++ b/src/base/system.h
@ -159,7 +159,7 @@ void mem_zero(void *block, unsigned size);
 		size - Size of the data to compare

 	Returns:
-		<0 - Block a is lesser than block b
+		<0 - Block a is less than block b
 		0 - Block a is equal to block b
 		>0 - Block a is greater than block b
 */
@ -676,7 +676,7 @@ int net_host_lookup(const char *hostname, NETADDR *addr, int types);
 		b - Address to compare to.

 	Returns:
-		<0 - Address a is lesser than address b
+		<0 - Address a is less than address b
 		0 - Address a is equal to address b
 		>0 - Address a is greater than address b
 */
@ -691,7 +691,7 @@ int net_addr_comp(const NETADDR *a, const NETADDR *b);
 		b - Address to compare to.

 	Returns:
-		<0 - Address a is lesser than address b
+		<0 - Address a is less than address b
 		0 - Address a is equal to address b
 		>0 - Address a is greater than address b
 */
@ -1114,26 +1114,26 @@ char *str_skip_whitespaces(char *str);

 /*
 	Function: str_comp_nocase
-		Compares to strings case insensitive.
+		Compares two strings case insensitively.

 	Parameters:
 		a - String to compare.
 		b - String to compare.

 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b

 	Remarks:
-		- Only garanted to work with a-z/A-Z.
+		- Only guaranteed to work with a-z/A-Z.
 		- The strings are treated as zero-terminated strings.
 */
 int str_comp_nocase(const char *a, const char *b);

 /*
 	Function: str_comp_nocase_num
-		Compares up to num characters of two strings case insensitive.
+		Compares up to num characters of two strings case insensitively.

 	Parameters:
 		a - String to compare.
@ -1141,15 +1141,16 @@ int str_comp_nocase(const char *a, const char *b);
 		num - Maximum characters to compare

 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b

 	Remarks:
-		- Only garanted to work with a-z/A-Z.
+		- Only guaranteed to work with a-z/A-Z.
+		  (use str_utf8_comp_nocase_num for unicode support)
 		- The strings are treated as zero-terminated strings.
 */
-int str_comp_nocase_num(const char *a, const char *b, const int num);
+int str_comp_nocase_num(const char *a, const char *b, int num);

 /*
 	Function: str_comp
@ -1160,7 +1161,7 @@ int str_comp_nocase_num(const char *a, const char *b, const int num);
 		b - String to compare.

 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b

@ -1179,14 +1180,14 @@ int str_comp(const char *a, const char *b);
 		num - Maximum characters to compare

 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b

 	Remarks:
 		- The strings are treated as zero-terminated strings.
 */
-int str_comp_num(const char *a, const char *b, const int num);
+int str_comp_num(const char *a, const char *b, int num);

 /*
 	Function: str_comp_filenames
@ -1197,7 +1198,7 @@ int str_comp_num(const char *a, const char *b, const int num);
 		b - String to compare.

 	Returns:
-		<0 - String a is lesser than string b
+		<0 - String a is less than string b
 		0 - String a is equal to string b
 		>0 - String a is greater than string b

@ -1300,7 +1301,7 @@ int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int

 /*
 	Function: str_find_nocase
-		Finds a string inside another string case insensitive.
+		Finds a string inside another string case insensitively.

 	Parameters:
 		haystack - String to search in
@ -1311,7 +1312,8 @@ int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int
 		Returns NULL of needle could not be found.

 	Remarks:
-		- Only garanted to work with a-z/A-Z.
+		- Only guaranteed to work with a-z/A-Z.
+		  (use str_utf8_find_nocase for unicode support)
 		- The strings are treated as zero-terminated strings.
 */
 const char *str_find_nocase(const char *haystack, const char *needle);
@ -1631,6 +1633,66 @@ int str_utf8_to_skeleton(const char *str, int *buf, int buf_len);
 */
 int str_utf8_comp_confusable(const char *a, const char *b);

+/*
+	Function: str_utf8_tolower
+		Converts the given Unicode codepoint to lowercase (locale insensitive).
+
+	Parameters:
+		code - Unicode codepoint to convert.
+
+	Returns:
+		Lowercase codepoint
+*/
+int str_utf8_tolower(int code);
+
+/*
+	Function: str_utf8_comp_nocase
+		Compares two utf8 strings case insensitively.
+
+	Parameters:
+		a - String to compare.
+		b - String to compare.
+
+	Returns:
+		<0 - String a is less than string b
+		0 - String a is equal to string b
+		>0 - String a is greater than string b
+*/
+int str_utf8_comp_nocase(const char *a, const char *b);
+
+/*
+	Function: str_utf8_comp_nocase_num
+		Compares up to num bytes of two utf8 strings case insensitively.
+
+	Parameters:
+		a - String to compare.
+		b - String to compare.
+		num - Maximum number of bytes to compare
+
+	Returns:
+		<0 - String a is less than string b
+		0 - String a is equal to string b
+		>0 - String a is greater than string b
+*/
+int str_utf8_comp_nocase_num(const char *a, const char *b, int num);
+
+/*
+	Function: str_utf8_find_nocase
+		Finds a utf8 string inside another utf8 string case insensitively.
+
+	Parameters:
+		haystack - String to search in
+		needle - String to search for
+
+	Returns:
+		A pointer into haystack where the needle was found.
+		Returns NULL if needle could not be found.
+
+	Remarks:
+		- The strings are treated as zero-terminated strings.
+*/
+const char *str_utf8_find_nocase(const char *haystack, const char *needle);
+
 /*
 	Function: str_utf8_isspace
 		Checks whether the given Unicode codepoint renders as space.
--- a/src/base/unicode/VERSION
+++ b/src/base/unicode/VERSION
@ -0,0 +1 @@
+12.0.0
--- a/src/base/unicode/confusables.c
+++ b/src/base/unicode/confusables.c
@ -1,6 +1,6 @@
 #include "confusables_data.h"

-#include "system.h"
+#include "../system.h"

 #include <stddef.h>

--- a/src/base/unicode/confusables_data.h
+++ b/src/base/unicode/confusables_data.h
--- a/src/base/unicode/tolower.c
+++ b/src/base/unicode/tolower.c
@ -0,0 +1,22 @@
+#include <stdlib.h>
+
+#include "tolower_data.h"
+
+/* bsearch() comparator: orders two UPPER_LOWER entries by their `upper`
+   codepoint.  Returns <0, 0 or >0 as required by bsearch(). */
+static int compul(const void *a, const void *b)
+{
+  /* keep the const qualifier of the arguments; the entries are only read */
+  const struct UPPER_LOWER *ul_a = (const struct UPPER_LOWER *) a;
+  const struct UPPER_LOWER *ul_b = (const struct UPPER_LOWER *) b;
+  return ul_a->upper - ul_b->upper;
+}
+
+/* Looks code up in the generated `tolower` table (tolower_data.h) and returns
+   the stored lowercase codepoint, or code itself when the table has no entry
+   for it. */
+int str_utf8_tolower(int code)
+{
+  struct UPPER_LOWER key;
+  struct UPPER_LOWER *res;
+  /* only `upper` is set: compul reads nothing else from the key */
+  key.upper = code;
+  /* binary search, so the table must be ordered by `upper` */
+  res = bsearch(&key, tolower, NUM_TOLOWER, sizeof(struct UPPER_LOWER), compul);
+
+  /* no mapping found: the codepoint is returned unchanged */
+  if(res == NULL)
+    return code;
+  return res->lower;
+}
--- a/src/base/unicode/tolower_data.h
+++ b/src/base/unicode/tolower_data.h
--- a/src/game/client/components/chat.cpp
+++ b/src/game/client/components/chat.cpp
@ -327,11 +327,11 @@ bool CChat::OnInput(IInput::CEvent Event)
 			bool Found = false;
 			if(SearchType == 1)
 			{
-				if(str_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)) &&
-					str_find_nocase(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer))
+				if(str_utf8_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)) &&
+					str_utf8_find_nocase(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer))
 					Found = true;
 			}
-			else if(!str_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)))
+			else if(!str_utf8_comp_nocase_num(m_pClient->m_aClients[Index2].m_aName, m_aCompletionBuffer, str_length(m_aCompletionBuffer)))
 				Found = true;

 			if(Found)
--- a/src/in
+++ b/src/in
--- a/src/test/str.cpp
+++ b/src/test/str.cpp
@ -55,6 +55,35 @@ TEST(Str, Utf8CompConfusables)
 	EXPECT_TRUE(str_utf8_comp_confusable("aceiou", "ąçęįǫų") == 0);
 }

+// Covers str_utf8_tolower and the case-insensitive utf8 compare/find helpers.
+TEST(Str, Utf8ToLower)
+{
+	EXPECT_TRUE(str_utf8_tolower('A') == 'a');
+	EXPECT_TRUE(str_utf8_tolower('z') == 'z');
+	EXPECT_TRUE(str_utf8_tolower(192) == 224); // À -> à
+	EXPECT_TRUE(str_utf8_tolower(7882) == 7883); // Ị -> ị
+
+	EXPECT_TRUE(str_utf8_comp_nocase("ÖlÜ", "ölü") == 0);
+	EXPECT_TRUE(str_utf8_comp_nocase("ÜlÖ", "ölü") > 0); // ü > ö
+	EXPECT_TRUE(str_utf8_comp_nocase("ÖlÜ", "ölüa") < 0); // NUL < a
+	EXPECT_TRUE(str_utf8_comp_nocase("ölüa", "ÖlÜ") > 0); // a > NUL
+
+	// A lone byte 0x80 against an empty string: the trailing byte comparison
+	// must treat it as 128 (unsigned), not as -128.
+	const char a[2] = {-128, 0};
+	const char b[2] = {0, 0};
+	EXPECT_TRUE(str_utf8_comp_nocase(a, b) > 0);
+	EXPECT_TRUE(str_utf8_comp_nocase(b, a) < 0);
+
+	// num counts bytes of the first string; Ö and Ü take two bytes each, so
+	// 3 bytes cover "Öl" only, while 4 bytes also pull in Ü, after which the
+	// first string ends and its terminator is compared against 'a'.
+	EXPECT_TRUE(str_utf8_comp_nocase_num("ÖlÜ", "ölüa", 3) == 0);
+	EXPECT_TRUE(str_utf8_comp_nocase_num("ÖlÜ", "ölüa", 4) != 0);
+	EXPECT_TRUE(str_utf8_comp_nocase_num("a", "z", 0) == 0);
+	EXPECT_TRUE(str_utf8_comp_nocase_num("a", "z", 1) != 0);
+
+	// Returned pointers are byte offsets into str (two bytes per letter here).
+	const char str[] = "ÄÖÜ";
+	EXPECT_TRUE(str_utf8_find_nocase(str, "ä") == str);
+	EXPECT_TRUE(str_utf8_find_nocase(str, "ö") == str+2);
+	EXPECT_TRUE(str_utf8_find_nocase(str, "ü") == str+4);
+	EXPECT_TRUE(str_utf8_find_nocase(str, "z") == NULL);
+}
+
 TEST(Str, Startswith)
 {
 	EXPECT_TRUE(str_startswith("abcdef", "abc"));
--- a/src/tools/unicode_confusables.cpp
+++ b/src/tools/unicode_confusables.cpp
@ -5,7 +5,7 @@ int main(int argc, const char **argv) // ignore_convention
 	dbg_logger_stdout();
 	if(argc < 1 + 2)
 	{
-		dbg_msg("usage", "%s STR1 STR2", argv[0] ? argv[0] : "confusables");
+		dbg_msg("usage", "%s STR1 STR2", argv[0] ? argv[0] : "unicode_confusables");
 		return -1;
 	}
 	dbg_msg("conf", "not_confusable=%d", str_utf8_comp_confusable(argv[1], argv[2]));