Add a function to fix UTF-8 truncation at the end of a string

This can be used after truncating at byte boundaries.
This commit is contained in:
heinrich5991 2021-12-20 02:05:07 +01:00
parent 107438bead
commit ae9944550c
3 changed files with 57 additions and 0 deletions

View file

@ -3276,6 +3276,23 @@ int str_utf8_rewind(const char *str, int cursor)
return cursor;
}
int str_utf8_fix_truncation(char *str)
{
	const int length = str_length(str);
	// An empty string has no last character to check.
	if(length <= 0)
		return length;

	// Step back from the end of the string to the index of its last
	// character and try to decode it from there.
	const int tail_index = str_utf8_rewind(str, length);
	const char *tail = str + tail_index;
	if(str_utf8_decode(&tail) != -1)
		return length;

	// The last character failed to decode (-1): cut it off by terminating
	// the string at its first byte and report the shortened length.
	str[tail_index] = 0;
	return tail_index;
}
int str_utf8_forward(const char *str, int cursor)
{
const char *buf = str + cursor;

View file

@ -2002,6 +2002,19 @@ void str_utf8_trim_right(char *str);
*/
int str_utf8_rewind(const char *str, int cursor);
/*
	Function: str_utf8_fix_truncation
		Fixes truncation of a Unicode character at the end of a UTF-8
		string. This can be used after truncating a string at a byte
		boundary.

	Returns:
		The new string length.

	Parameters:
		str - utf8 string, modified in place

	Remarks:
		- Only the last character of the string is checked. If it
		  cannot be decoded, the string is terminated at the first
		  byte of that character.
		- Invalid bytes earlier in the string are left untouched.
*/
int str_utf8_fix_truncation(char *str);
/*
Function: str_utf8_forward
Moves a cursor forwards in an utf8 string

View file

@ -95,6 +95,33 @@ TEST(Str, Utf8ToLower)
EXPECT_TRUE(str_utf8_find_nocase(str, "z") == NULL);
}
TEST(Str, Utf8FixTruncation)
{
	// Each case pairs a writable input buffer with the text that is expected
	// to remain after str_utf8_fix_truncation has been applied to it.
	struct STruncationCase
	{
		char m_aInput[32];
		const char *m_pExpected;
	};
	STruncationCase aCases[] = {
		{"", ""},
		{"\xff", ""},
		{"abc", "abc"},
		{"abc\xff", "abc"},
		// An invalid byte that is not at the end must be left alone.
		{"blub\xffxyz", "blub\xffxyz"},
		{"привет Наташа\xff", "привет Наташа"},
		{"до свидания\xffОлег", "до свидания\xffОлег"},
	};
	const unsigned NumCases = sizeof(aCases) / sizeof(aCases[0]);
	for(unsigned Index = 0; Index != NumCases; ++Index)
	{
		STruncationCase &Case = aCases[Index];
		// The returned length and the buffer contents must both match.
		EXPECT_EQ(str_utf8_fix_truncation(Case.m_aInput), str_length(Case.m_pExpected));
		EXPECT_STREQ(Case.m_aInput, Case.m_pExpected);
	}
}
TEST(Str, Startswith)
{
EXPECT_TRUE(str_startswith("abcdef", "abc"));