Add a flag to skip a UTF-8 BOM when opening files for reading

https://en.wikipedia.org/w/index.php?title=Byte_order_mark&oldid=1059783325#UTF-8

This allows text files created by Windows users to be read transparently,
even if they start with a UTF-8 BOM.
This commit is contained in:
heinrich5991 2021-12-17 21:56:31 +01:00
parent 625ce302dd
commit 49c861372a
4 changed files with 85 additions and 5 deletions

View file

@ -2263,6 +2263,7 @@ if(GTEST_FOUND OR DOWNLOAD_GTEST)
fs.cpp
git_revision.cpp
hash.cpp
io.cpp
jobs.cpp
json.cpp
mapbugs.cpp

View file

@ -311,13 +311,13 @@ void mem_zero(void *block, unsigned size)
memset(block, 0, size);
}
IOHANDLE io_open(const char *filename, int flags)
IOHANDLE io_open_impl(const char *filename, int flags)
{
dbg_assert(flags == IOFLAG_READ || flags == IOFLAG_WRITE || flags == IOFLAG_APPEND, "flags must be read, write or append");
dbg_assert(flags == (IOFLAG_READ | IOFLAG_SKIP_BOM) || flags == IOFLAG_READ || flags == IOFLAG_WRITE || flags == IOFLAG_APPEND, "flags must be read, read+skipbom, write or append");
#if defined(CONF_FAMILY_WINDOWS)
WCHAR wBuffer[IO_MAX_PATH_LENGTH];
MultiByteToWideChar(CP_UTF8, 0, filename, -1, wBuffer, sizeof(wBuffer) / sizeof(WCHAR));
if(flags == IOFLAG_READ)
if((flags & IOFLAG_READ) != 0)
return (IOHANDLE)_wfsopen(wBuffer, L"rb", _SH_DENYNO);
if(flags == IOFLAG_WRITE)
return (IOHANDLE)_wfsopen(wBuffer, L"wb", _SH_DENYNO);
@ -325,7 +325,7 @@ IOHANDLE io_open(const char *filename, int flags)
return (IOHANDLE)_wfsopen(wBuffer, L"ab", _SH_DENYNO);
return 0x0;
#else
if(flags == IOFLAG_READ)
if((flags & IOFLAG_READ) != 0)
return (IOHANDLE)fopen(filename, "rb");
if(flags == IOFLAG_WRITE)
return (IOHANDLE)fopen(filename, "wb");
@ -335,6 +335,21 @@ IOHANDLE io_open(const char *filename, int flags)
#endif
}
/*
	Opens a file through io_open_impl and, when IOFLAG_SKIP_BOM is set,
	positions the handle just past a leading UTF-8 byte order mark.

	Parameters:
		filename - File to open.
		flags - Passed unchanged to io_open_impl; IOFLAG_SKIP_BOM
			additionally enables the BOM check below.

	Returns:
		The handle produced by io_open_impl (0 if opening failed).
*/
IOHANDLE io_open(const char *filename, int flags)
{
	IOHANDLE file = io_open_impl(filename, flags);
	if(file && (flags & IOFLAG_SKIP_BOM) != 0)
	{
		unsigned char header[3];
		/* The length check comes first so the bytes are only inspected
		   when all three were actually read. */
		int has_bom = io_read(file, header, sizeof(header)) == 3 &&
			      header[0] == 0xef && header[1] == 0xbb && header[2] == 0xbf;
		if(!has_bom)
		{
			/* Not a BOM (or file too short): rewind so the caller
			   sees the file from its first byte. */
			io_seek(file, 0, IOSEEK_START);
		}
	}
	return file;
}
unsigned io_read(IOHANDLE io, void *buffer, unsigned size)
{
return fread(buffer, 1, size, (FILE *)io);

View file

@ -171,6 +171,7 @@ enum
IOFLAG_READ = 1,
IOFLAG_WRITE = 2,
IOFLAG_APPEND = 4,
IOFLAG_SKIP_BOM = 8,
IOSEEK_START = 0,
IOSEEK_CUR = 1,
@ -187,7 +188,7 @@ typedef struct IOINTERNAL *IOHANDLE;
Parameters:
filename - File to open.
flags - A set of flags. IOFLAG_READ, IOFLAG_WRITE, IOFLAG_APPEND.
flags - A set of flags. IOFLAG_READ, IOFLAG_WRITE, IOFLAG_APPEND, IOFLAG_SKIP_BOM.
Returns:
Returns a handle to the file on success and 0 on failure.

63
src/test/io.cpp Normal file
View file

@ -0,0 +1,63 @@
#include "test.h"
#include <gtest/gtest.h>
#include <base/system.h>
// Round-trip helper for the io tests: writes pWritten to the file named by
// Info.m_aFilename, reopens that file for reading (adding IOFLAG_SKIP_BOM
// when SkipBom is true) and checks that what is read back equals pRead.
//
// pWritten - NUL-terminated bytes to write; the length is taken with str_length.
// SkipBom  - whether IOFLAG_SKIP_BOM is OR-ed into the read flags.
// pRead    - NUL-terminated bytes expected from the read; the length is taken
//            with str_length.
void TestFileRead(const char *pWritten, bool SkipBom, const char *pRead)
{
CTestInfo Info;
// Zero-initialized read buffer; a single io_read of up to sizeof(aBuf) bytes
// is issued below, so the length check only works for inputs shorter than that.
char aBuf[512] = {0};
IOHANDLE File = io_open(Info.m_aFilename, IOFLAG_WRITE);
ASSERT_TRUE(File);
// The whole input must be written in one call.
EXPECT_EQ(io_write(File, pWritten, str_length(pWritten)), str_length(pWritten));
// The test expects io_close to return a falsy value here.
EXPECT_FALSE(io_close(File));
File = io_open(Info.m_aFilename, IOFLAG_READ | (SkipBom ? IOFLAG_SKIP_BOM : 0));
// NOTE(review): ASSERT_TRUE returns from this function on failure, so the
// fs_remove at the end is not reached if the reopen fails.
ASSERT_TRUE(File);
// The number of bytes read must equal the expected length exactly.
EXPECT_EQ(io_read(File, aBuf, sizeof(aBuf)), str_length(pRead));
// Only the first str_length(pRead) bytes of the buffer are compared.
EXPECT_TRUE(mem_comp(aBuf, pRead, str_length(pRead)) == 0);
EXPECT_FALSE(io_close(File));
fs_remove(Info.m_aFilename);
}
// Read1-Read4: without IOFLAG_SKIP_BOM the file content is read back unchanged.

// Empty file, no BOM skipping.
TEST(Io, Read1)
{
TestFileRead("", false, "");
}
// Plain content, no BOM skipping.
TEST(Io, Read2)
{
TestFileRead("abc", false, "abc");
}
// A file consisting only of the bytes EF BB BF is returned as-is.
TEST(Io, Read3)
{
TestFileRead("\xef\xbb\xbf", false, "\xef\xbb\xbf");
}
// Leading EF BB BF followed by content is returned as-is.
TEST(Io, Read4)
{
TestFileRead("\xef\xbb\xbfxyz", false, "\xef\xbb\xbfxyz");
}

// ReadBom1-ReadBom6: with IOFLAG_SKIP_BOM a single leading EF BB BF is dropped.

// Empty file with BOM skipping enabled still reads as empty.
TEST(Io, ReadBom1)
{
TestFileRead("", true, "");
}
// Content without a BOM is returned unchanged.
TEST(Io, ReadBom2)
{
TestFileRead("abc", true, "abc");
}
// A file consisting only of a BOM reads as empty.
TEST(Io, ReadBom3)
{
TestFileRead("\xef\xbb\xbf", true, "");
}
// The BOM is stripped and the remaining content is returned.
TEST(Io, ReadBom4)
{
TestFileRead("\xef\xbb\xbfxyz", true, "xyz");
}
// Only the first of two consecutive BOM sequences is skipped.
TEST(Io, ReadBom5)
{
TestFileRead("\xef\xbb\xbf\xef\xbb\xbf", true, "\xef\xbb\xbf");
}
// A BOM sequence that is not at the start of the file is left in place.
TEST(Io, ReadBom6)
{
TestFileRead("\xef\xbb\xbfxyz\xef\xbb\xbf", true, "xyz\xef\xbb\xbf");
}