Add utility functions for converting between UTF-8 byte and character offsets

Add `str_utf8_offset_bytes_to_chars` and `str_utf8_offset_chars_to_bytes` functions to base system to convert between byte and UTF-8 character offsets in UTF-8 strings.

Previously, this was separately implemented in the textrender and in the lineinput helper.

These textrender functions are entirely replaced by the new functions:

- `ITextRender::SelectionToUTF8OffSets` (by `str_utf8_offset_chars_to_bytes`)
- `ITextRender::UTF8OffToDecodedOff` (by `str_utf8_offset_bytes_to_chars`)
- `ITextRender::DecodedOffToUTF8Off` (by `str_utf8_offset_chars_to_bytes`)

These lineinput helper functions are reimplemented using the new functions:

- `CLineInput::OffsetFromActualToDisplay` (uses `str_utf8_offset_bytes_to_chars`)
- `CLineInput::OffsetFromDisplayToActual` (uses `str_utf8_offset_chars_to_bytes`)
This commit is contained in:
Robert Müller 2023-08-15 21:32:56 +02:00
parent 107ac6705d
commit af3870a64d
7 changed files with 125 additions and 147 deletions

View file

@ -3953,6 +3953,34 @@ void str_utf8_stats(const char *str, size_t max_size, size_t max_count, size_t *
}
}
// Counts how many str_utf8_forward steps are needed to reach (or pass)
// byte_offset, starting from the beginning of str.
size_t str_utf8_offset_bytes_to_chars(const char *str, size_t byte_offset)
{
	size_t num_chars = 0;
	// num_chars is only bumped for steps that actually advanced the position.
	for(size_t pos = 0; pos < byte_offset; ++num_chars)
	{
		const size_t next_pos = str_utf8_forward(str, pos);
		// No progress: nothing more to step over (the unit tests expect the
		// result to clamp at the end of the string), so stop counting.
		if(next_pos == pos)
			break;
		pos = next_pos;
	}
	return num_chars;
}
// Advances char_offset times with str_utf8_forward from the beginning of str
// and reports the byte position that was reached.
size_t str_utf8_offset_chars_to_bytes(const char *str, size_t char_offset)
{
	size_t pos = 0;
	size_t remaining = char_offset;
	while(remaining > 0)
	{
		const size_t next_pos = str_utf8_forward(str, pos);
		// No progress: further steps cannot move either (the unit tests expect
		// the result to clamp at the end of the string).
		if(next_pos == pos)
			return pos;
		pos = next_pos;
		--remaining;
	}
	return pos;
}
unsigned str_quickhash(const char *str)
{
unsigned hash = 5381;

View file

@ -2491,6 +2491,32 @@ int str_utf8_check(const char *str);
*/
void str_utf8_stats(const char *str, size_t max_size, size_t max_count, size_t *size, size_t *count);
/**
* Converts a byte offset of a utf8 string to the utf8 character offset.
*
* @param str Pointer to the string.
* @param byte_offset Offset in bytes.
*
* @return Offset in utf8 characters. Clamped to the maximum length of the string in utf8 characters.
*
* @remark The string is treated as a zero-terminated utf8 string.
* @remark It's the user's responsibility to make sure the bounds are aligned,
*         i.e. that byte_offset falls on the start of a utf8 character.
*/
size_t str_utf8_offset_bytes_to_chars(const char *str, size_t byte_offset);
/**
* Converts a utf8 character offset of a utf8 string to the byte offset.
*
* @param str Pointer to the string.
* @param char_offset Offset in utf8 characters.
*
* @return Offset in bytes. Clamped to the maximum length of the string in bytes.
*
* @remark The string is treated as a zero-terminated utf8 string.
* @remark It's the user's responsibility to make sure the bounds are aligned.
*/
size_t str_utf8_offset_chars_to_bytes(const char *str, size_t char_offset);
/*
Function: str_next_token
Writes the next token after str into buf, returns the rest of the string.

View file

@ -2218,103 +2218,6 @@ public:
return WidthOfText;
}
// Translates a selection given as character indices (SelStart, SelEnd) into
// byte offsets relative to pText.
//
// OffUTF8Start / OffUTF8End are reset to -1 on entry and receive the byte
// offset of the character whose index equals SelStart / SelEnd. An index equal
// to the total number of decoded characters maps to the offset of the
// terminating zero.
//
// Returns true only if both offsets were found. Returns false as soon as
// str_utf8_decode yields -1; in that case an output that was already matched
// keeps its value while the other stays -1.
bool SelectionToUTF8OffSets(const char *pText, int SelStart, int SelEnd, int &OffUTF8Start, int &OffUTF8End) const override
{
const char *pIt = pText;
OffUTF8Start = -1;
OffUTF8End = -1;
int CharCount = 0;
while(*pIt)
{
// Decode on a copy of the iterator so pIt still points at the start of
// the current character when the offsets are recorded below.
const char *pTmp = pIt;
int Character = str_utf8_decode(&pTmp);
if(Character == -1)
return false;
if(CharCount == SelStart)
OffUTF8Start = (int)((std::intptr_t)(pIt - pText));
if(CharCount == SelEnd)
OffUTF8End = (int)((std::intptr_t)(pIt - pText));
pIt = pTmp;
++CharCount;
}
// The selection may start or end right behind the last character.
if(CharCount == SelStart)
OffUTF8Start = (int)((std::intptr_t)(pIt - pText));
if(CharCount == SelEnd)
OffUTF8End = (int)((std::intptr_t)(pIt - pText));
return OffUTF8Start != -1 && OffUTF8End != -1;
}
// Translates the byte offset UTF8Off (relative to pText) into a character
// index, written to DecodedOff.
//
// DecodedOff is reset to -1 on entry. Returns true and sets DecodedOff when
// UTF8Off equals the byte position of one of the positions visited while
// stepping through the string with str_utf8_decode, or the position of the
// terminating zero. Returns false (DecodedOff stays -1) if str_utf8_decode
// yields -1 before a match, or if no visited position equals UTF8Off.
bool UTF8OffToDecodedOff(const char *pText, int UTF8Off, int &DecodedOff) const override
{
const char *pIt = pText;
DecodedOff = -1;
int CharCount = 0;
while(*pIt)
{
// The match is tested before decoding, so an offset in front of a
// sequence that fails to decode is still reported as found.
if((int)(intptr_t)(pIt - pText) == UTF8Off)
{
DecodedOff = CharCount;
return true;
}
const char *pTmp = pIt;
int Character = str_utf8_decode(&pTmp);
if(Character == -1)
return false;
pIt = pTmp;
++CharCount;
}
// The offset may point at the terminating zero.
if((int)(std::intptr_t)(pIt - pText) == UTF8Off)
{
DecodedOff = CharCount;
return true;
}
return false;
}
// Translates the character index DecodedOff into a byte offset relative to
// pText, written to UTF8Off.
//
// UTF8Off is reset to -1 on entry. Returns true and sets UTF8Off to the byte
// position of the character with index DecodedOff; an index equal to the total
// number of decoded characters maps to the position of the terminating zero.
// Returns false (UTF8Off stays -1) if str_utf8_decode yields -1 before the
// index is reached, or if DecodedOff is not reached at all.
bool DecodedOffToUTF8Off(const char *pText, int DecodedOff, int &UTF8Off) const override
{
const char *pIt = pText;
UTF8Off = -1;
int CharCount = 0;
while(*pIt)
{
// The current character is decoded before the index is compared, so the
// character at DecodedOff itself has to decode successfully.
const char *pTmp = pIt;
int Character = str_utf8_decode(&pTmp);
if(Character == -1)
return false;
if(CharCount == DecodedOff)
{
UTF8Off = (int)((std::intptr_t)(pIt - pText));
return true;
}
pIt = pTmp;
++CharCount;
}
// The index may refer to the position right behind the last character.
if(CharCount == DecodedOff)
UTF8Off = (int)((std::intptr_t)(pIt - pText));
return UTF8Off != -1;
}
void OnPreWindowResize() override
{
for(auto *pTextContainer : m_vpTextContainers)

View file

@ -286,10 +286,6 @@ public:
virtual float GetGlyphOffsetX(int FontSize, char TextCharacter) const = 0;
virtual int CalculateTextWidth(const char *pText, int TextLength, int FontWidth, int FontSize) const = 0;
virtual bool SelectionToUTF8OffSets(const char *pText, int SelStart, int SelEnd, int &OffUTF8Start, int &OffUTF8End) const = 0;
virtual bool UTF8OffToDecodedOff(const char *pText, int UTF8Off, int &DecodedOff) const = 0;
virtual bool DecodedOffToUTF8Off(const char *pText, int DecodedOff, int &UTF8Off) const = 0;
// old foolish interface
virtual void TextColor(float r, float g, float b, float a) = 0;
virtual void TextColor(ColorRGBA rgb) = 0;

View file

@ -765,12 +765,9 @@ void CGameConsole::OnRender()
if(m_WantsSelectionCopy)
{
const bool HasNewLine = !SelectionString.empty();
int OffUTF8Start = 0;
int OffUTF8End = 0;
if(TextRender()->SelectionToUTF8OffSets(pEntry->m_aText, pConsole->m_CurSelStart, pConsole->m_CurSelEnd, OffUTF8Start, OffUTF8End))
{
SelectionString.insert(0, (std::string(&pEntry->m_aText[OffUTF8Start], OffUTF8End - OffUTF8Start) + (HasNewLine ? "\n" : "")));
}
const size_t OffUTF8Start = str_utf8_offset_chars_to_bytes(pEntry->m_aText, pConsole->m_CurSelStart);
const size_t OffUTF8End = str_utf8_offset_chars_to_bytes(pEntry->m_aText, pConsole->m_CurSelEnd);
SelectionString.insert(0, (std::string(&pEntry->m_aText[OffUTF8Start], OffUTF8End - OffUTF8Start) + (HasNewLine ? "\n" : "")));
}
pConsole->m_HasSelection = true;
}

View file

@ -170,32 +170,14 @@ size_t CLineInput::OffsetFromActualToDisplay(size_t ActualOffset) const
{
if(!IsHidden())
return ActualOffset;
size_t DisplayOffset = 0;
size_t CurrentOffset = 0;
while(CurrentOffset < ActualOffset)
{
const size_t PrevOffset = CurrentOffset;
CurrentOffset = str_utf8_forward(m_pStr, CurrentOffset);
if(CurrentOffset == PrevOffset)
break;
DisplayOffset++;
}
return DisplayOffset;
return str_utf8_offset_bytes_to_chars(m_pStr, ActualOffset);
}
size_t CLineInput::OffsetFromDisplayToActual(size_t DisplayOffset) const
{
if(!IsHidden())
return DisplayOffset;
size_t ActualOffset = 0;
for(size_t i = 0; i < DisplayOffset; i++)
{
const size_t PrevOffset = ActualOffset;
ActualOffset = str_utf8_forward(m_pStr, ActualOffset);
if(ActualOffset == PrevOffset)
break;
}
return ActualOffset;
return str_utf8_offset_chars_to_bytes(m_pStr, DisplayOffset);
}
bool CLineInput::ProcessInput(const IInput::CEvent &Event)
@ -462,11 +444,11 @@ STextBoundingBox CLineInput::Render(const CUIRect *pRect, float FontSize, int Al
m_LastCompositionCursorPos = CaretOffset;
const size_t DisplayCompositionEnd = DisplayCursorOffset + Input()->GetCompositionLength();
Cursor.m_CursorMode = TEXT_CURSOR_CURSOR_MODE_SET;
TextRender()->UTF8OffToDecodedOff(pDisplayStr, CaretOffset, Cursor.m_CursorCharacter);
Cursor.m_CursorCharacter = str_utf8_offset_bytes_to_chars(pDisplayStr, CaretOffset);
Cursor.m_CalculateSelectionMode = TEXT_CURSOR_SELECTION_MODE_SET;
Cursor.m_SelectionHeightFactor = 0.1f;
TextRender()->UTF8OffToDecodedOff(pDisplayStr, DisplayCursorOffset, Cursor.m_SelectionStart);
TextRender()->UTF8OffToDecodedOff(pDisplayStr, DisplayCompositionEnd, Cursor.m_SelectionEnd);
Cursor.m_SelectionStart = str_utf8_offset_bytes_to_chars(pDisplayStr, DisplayCursorOffset);
Cursor.m_SelectionEnd = str_utf8_offset_bytes_to_chars(pDisplayStr, DisplayCompositionEnd);
TextRender()->TextSelectionColor(1.0f, 1.0f, 1.0f, 0.8f);
TextRender()->TextEx(&Cursor, pDisplayStr);
TextRender()->TextSelectionColor(TextRender()->DefaultTextSelectionColor());
@ -476,38 +458,30 @@ STextBoundingBox CLineInput::Render(const CUIRect *pRect, float FontSize, int Al
const size_t Start = OffsetFromActualToDisplay(GetSelectionStart());
const size_t End = OffsetFromActualToDisplay(GetSelectionEnd());
Cursor.m_CursorMode = m_MouseSelection.m_Selecting ? TEXT_CURSOR_CURSOR_MODE_CALCULATE : TEXT_CURSOR_CURSOR_MODE_SET;
TextRender()->UTF8OffToDecodedOff(pDisplayStr, CaretOffset, Cursor.m_CursorCharacter);
Cursor.m_CursorCharacter = str_utf8_offset_bytes_to_chars(pDisplayStr, CaretOffset);
Cursor.m_CalculateSelectionMode = m_MouseSelection.m_Selecting ? TEXT_CURSOR_SELECTION_MODE_CALCULATE : TEXT_CURSOR_SELECTION_MODE_SET;
TextRender()->UTF8OffToDecodedOff(pDisplayStr, Start, Cursor.m_SelectionStart);
TextRender()->UTF8OffToDecodedOff(pDisplayStr, End, Cursor.m_SelectionEnd);
Cursor.m_SelectionStart = str_utf8_offset_bytes_to_chars(pDisplayStr, Start);
Cursor.m_SelectionEnd = str_utf8_offset_bytes_to_chars(pDisplayStr, End);
TextRender()->TextEx(&Cursor, pDisplayStr);
}
else
{
Cursor.m_CursorMode = m_MouseSelection.m_Selecting ? TEXT_CURSOR_CURSOR_MODE_CALCULATE : TEXT_CURSOR_CURSOR_MODE_SET;
TextRender()->UTF8OffToDecodedOff(pDisplayStr, CaretOffset, Cursor.m_CursorCharacter);
Cursor.m_CursorCharacter = str_utf8_offset_bytes_to_chars(pDisplayStr, CaretOffset);
Cursor.m_CalculateSelectionMode = m_MouseSelection.m_Selecting ? TEXT_CURSOR_SELECTION_MODE_CALCULATE : TEXT_CURSOR_SELECTION_MODE_NONE;
TextRender()->TextEx(&Cursor, pDisplayStr);
}
if(Cursor.m_CursorMode == TEXT_CURSOR_CURSOR_MODE_CALCULATE)
{
int NewCursorOffset;
TextRender()->DecodedOffToUTF8Off(pDisplayStr, Cursor.m_CursorCharacter, NewCursorOffset);
if(NewCursorOffset >= 0)
{
SetCursorOffset(OffsetFromDisplayToActual(NewCursorOffset));
}
const size_t NewCursorOffset = str_utf8_offset_chars_to_bytes(pDisplayStr, Cursor.m_CursorCharacter);
SetCursorOffset(OffsetFromDisplayToActual(NewCursorOffset));
}
if(Cursor.m_CalculateSelectionMode == TEXT_CURSOR_SELECTION_MODE_CALCULATE)
{
int NewSelectionStart, NewSelectionEnd;
TextRender()->DecodedOffToUTF8Off(pDisplayStr, Cursor.m_SelectionStart, NewSelectionStart);
TextRender()->DecodedOffToUTF8Off(pDisplayStr, Cursor.m_SelectionEnd, NewSelectionEnd);
if(NewSelectionStart >= 0 && NewSelectionEnd >= 0)
{
SetSelection(OffsetFromDisplayToActual(NewSelectionStart), OffsetFromDisplayToActual(NewSelectionEnd));
}
const size_t NewSelectionStart = str_utf8_offset_chars_to_bytes(pDisplayStr, Cursor.m_SelectionStart);
const size_t NewSelectionEnd = str_utf8_offset_chars_to_bytes(pDisplayStr, Cursor.m_SelectionEnd);
SetSelection(OffsetFromDisplayToActual(NewSelectionStart), OffsetFromDisplayToActual(NewSelectionEnd));
}
m_CaretPosition = Cursor.m_CursorRenderedPosition;

View file

@ -696,6 +696,60 @@ TEST(Str, Utf8Stats)
EXPECT_EQ(Count, 3);
}
// Checks str_utf8_offset_bytes_to_chars for aligned byte offsets and for
// offsets past the end of the string, which are expected to clamp.
TEST(Str, Utf8OffsetBytesToChars)
{
// Empty string: every offset maps to character 0.
EXPECT_EQ(str_utf8_offset_bytes_to_chars("", 0), 0);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("", 100), 0);
// ASCII only: one byte per character.
EXPECT_EQ(str_utf8_offset_bytes_to_chars("abc", 0), 0);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("abc", 1), 1);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("abc", 2), 2);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("abc", 3), 3);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("abc", 100), 3);
// Cyrillic: two bytes per character (6 characters, 12 bytes).
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 0), 0);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 2), 1);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 4), 2);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 6), 3);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 8), 4);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 10), 5);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 12), 6);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("любовь", 100), 6);
// Mixed: 5 ASCII characters followed by 3 three-byte characters (14 bytes).
EXPECT_EQ(str_utf8_offset_bytes_to_chars("DDNet最好了", 5), 5);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("DDNet最好了", 8), 6);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("DDNet最好了", 11), 7);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("DDNet最好了", 14), 8);
EXPECT_EQ(str_utf8_offset_bytes_to_chars("DDNet最好了", 100), 8);
}
// Checks str_utf8_offset_chars_to_bytes for character offsets inside the
// string and for offsets past the end, which are expected to clamp.
TEST(Str, Utf8OffsetCharsToBytes)
{
// Empty string: every offset maps to byte 0.
EXPECT_EQ(str_utf8_offset_chars_to_bytes("", 0), 0);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("", 100), 0);
// ASCII only: one byte per character.
EXPECT_EQ(str_utf8_offset_chars_to_bytes("abc", 0), 0);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("abc", 1), 1);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("abc", 2), 2);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("abc", 3), 3);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("abc", 100), 3);
// Cyrillic: two bytes per character (6 characters, 12 bytes).
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 0), 0);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 1), 2);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 2), 4);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 3), 6);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 4), 8);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 5), 10);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 6), 12);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("любовь", 100), 12);
// Mixed: 5 ASCII characters followed by 3 three-byte characters (14 bytes).
EXPECT_EQ(str_utf8_offset_chars_to_bytes("DDNet最好了", 5), 5);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("DDNet最好了", 6), 8);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("DDNet最好了", 7), 11);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("DDNet最好了", 8), 14);
EXPECT_EQ(str_utf8_offset_chars_to_bytes("DDNet最好了", 100), 14);
}
TEST(Str, Time)
{
char aBuf[32] = "foobar";