From 77b6f439f7d4d53106725d3745a81e563a7d2f18 Mon Sep 17 00:00:00 2001 From: yyc12345 Date: Tue, 18 Jun 2024 11:03:48 +0800 Subject: [PATCH] feat: add encoding conversion functions between utf8 and utf16, utf32. - add bidirectional conversion between utf8 and utf16, utf32 on every platform. - add testbench for the newly added functions. --- src/EncodingHelper.cpp | 138 ++++++++++++++++++++++++++++++++++++++++- src/EncodingHelper.hpp | 19 +++++- testbench/main.cpp | 117 +++++++++++++++++++++++++++++----- 3 files changed, 255 insertions(+), 19 deletions(-) diff --git a/src/EncodingHelper.cpp b/src/EncodingHelper.cpp index 75d54e1..76f6cef 100644 --- a/src/EncodingHelper.cpp +++ b/src/EncodingHelper.cpp @@ -1,8 +1,11 @@ #include "EncodingHelper.hpp" -#if YYCC_OS == YYCC_OS_WINDOWS + +#include namespace YYCC::EncodingHelper { +#if YYCC_OS == YYCC_OS_WINDOWS + bool WcharToChar(const wchar_t* src, std::string& dest, UINT codepage) { int count, write_result; @@ -65,6 +68,137 @@ namespace YYCC::EncodingHelper { return ret; } +#endif + + template || std::is_same_v<_TChar, char32_t>, int> = 0> + bool UTF8ToUTFOther(const char* src, std::basic_string<_TChar>& dest) { + // Reference: + // https://zh.cppreference.com/w/cpp/string/multibyte/mbrtoc32 + // https://zh.cppreference.com/w/cpp/string/multibyte/mbrtoc16 + // https://learn.microsoft.com/zh-cn/cpp/c-runtime-library/reference/mbrtoc16-mbrtoc323?view=msvc-170 + // + // Due to the same reason introduced in UTFOtherToUTF8, + // we use these function as convertion function. 
+ + // init src string + if (src == nullptr) return false; + std::string src_string(src); + // init result string + dest.clear(); + + // init essential cvt variables + std::mbstate_t state {}; + _TChar c1632; + const char* ptr = src_string.c_str(); + const char* end = src_string.c_str() + src_string.size() + 1; + + // start convertion + while (true) { + // do convertion + size_t rc; + if constexpr (std::is_same_v<_TChar, char16_t>) { + rc = std::mbrtoc16(&c1632, ptr, end - ptr, &state); + } else { + rc = std::mbrtoc32(&c1632, ptr, end - ptr, &state); + } + if (!rc) break; + + // check result + if (rc == static_cast(-1)) { + // encoding error, return false + return false; + } else if (rc == static_cast(-2)) { + // insufficient sequence, return false + return false; + } else if (rc == static_cast(-3)) { + // UTF16 pair case (usually is emoji, one emoji is represented by 2 UTF16) + // + // only push result char but do not increase pointer + // because this char is output from state. + dest.push_back(c1632); + } else { + // normal case + // append to result + dest.push_back(c1632); + // inc ptr + ptr += rc; + } + } + + return true; + } + + bool UTF8ToUTF16(const char* src, std::u16string& dest) { + return UTF8ToUTFOther(src, dest); + } + std::u16string UTF8ToUTF16(const char* src) { + std::u16string ret; + if (!UTF8ToUTF16(src, ret)) ret.clear(); + return ret; + } + bool UTF8ToUTF32(const char* src, std::u32string& dest) { + return UTF8ToUTFOther(src, dest); + } + std::u32string UTF8ToUTF32(const char* src) { + std::u32string ret; + if (!UTF8ToUTF32(src, ret)) ret.clear(); + return ret; + } + + template || std::is_same_v<_TChar, char32_t>, int> = 0> + bool UTFOtherToUTF8(const _TChar* src, std::string& dest) { + // Reference: + // https://zh.cppreference.com/w/cpp/string/multibyte/c32rtomb + // https://zh.cppreference.com/w/cpp/string/multibyte/c16rtomb + // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/c16rtomb-c32rtomb1?view=msvc-170 + // + // Due 
to Microsoft implementation, c16rtomb and c32rtomb + // always convert UTF32 and UTF16 string into UTF8 string no matter current c locale. + // At the same time, most Linux use UTF8 as their locale. + // So using c16rtomb and c32rtomb do the convertion from UTF32 or UTF16 to UTF8 is reasonable. + + // initialize src string + if (src == nullptr) return false; + std::basic_string<_TChar> src_string(src); + // init result string + dest.clear(); + + // init essential cvt variables + std::mbstate_t state {}; + char out[MB_LEN_MAX] {}; + for (_TChar c : src_string) { + // do convertion + std::size_t rc; + if constexpr (std::is_same_v<_TChar, char16_t>) { + rc = std::c16rtomb(out, c, &state); + } else { + rc = std::c32rtomb(out, c, &state); + } + // convertion failed + if (rc == static_cast(-1)) return false; + // otherwise append result + dest.append(out, rc); + } + + return true; + } + + bool UTF16ToUTF8(const char16_t* src, std::string& dest) { + return UTFOtherToUTF8(src, dest); + } + std::string UTF16ToUTF8(const char16_t* src) { + std::string ret; + if (!UTF16ToUTF8(src, ret)) ret.clear(); + return ret; + } + bool UTF32ToUTF8(const char32_t* src, std::string& dest) { + return UTFOtherToUTF8(src, dest); + } + std::string UTF32ToUTF8(const char32_t* src) { + std::string ret; + if (!UTF32ToUTF8(src, ret)) ret.clear(); + return ret; + } + } -#endif diff --git a/src/EncodingHelper.hpp b/src/EncodingHelper.hpp index 9457e23..6fa7a59 100644 --- a/src/EncodingHelper.hpp +++ b/src/EncodingHelper.hpp @@ -1,15 +1,18 @@ #pragma once #include "YYCCInternal.hpp" -#if YYCC_OS == YYCC_OS_WINDOWS #include +#if YYCC_OS == YYCC_OS_WINDOWS #include "WinImportPrefix.hpp" #include #include "WinImportSuffix.hpp" +#endif namespace YYCC::EncodingHelper { +#if YYCC_OS == YYCC_OS_WINDOWS + bool WcharToChar(const wchar_t* src, std::string& dest, UINT codepage); bool WcharToUTF8(const wchar_t* src, std::string& dest); std::string WcharToChar(const wchar_t* src, UINT codepage); @@ -23,6 +26,16 
@@ namespace YYCC::EncodingHelper { bool CharToChar(const char* src, std::string& dest, UINT src_codepage, UINT dest_codepage); std::string CharToChar(const char* src, UINT src_codepage, UINT dest_codepage); -} +#endif -#endif \ No newline at end of file + bool UTF8ToUTF16(const char* src, std::u16string& dest); + std::u16string UTF8ToUTF16(const char* src); + bool UTF8ToUTF32(const char* src, std::u32string& dest); + std::u32string UTF8ToUTF32(const char* src); + + bool UTF16ToUTF8(const char16_t* src, std::string& dest); + std::string UTF16ToUTF8(const char16_t* src); + bool UTF32ToUTF8(const char32_t* src, std::string& dest); + std::string UTF32ToUTF8(const char32_t* src); + +} diff --git a/testbench/main.cpp b/testbench/main.cpp index a188875..fc00331 100644 --- a/testbench/main.cpp +++ b/testbench/main.cpp @@ -4,23 +4,80 @@ namespace Console = YYCC::ConsoleHelper; namespace YYCCTestbench { +#pragma region UNICODE Test Data - // UTF8 Test String Table + // UNICODE Test Strings // Ref: https://stackoverflow.com/questions/478201/how-to-test-an-application-for-correct-encoding-e-g-utf-8 +#define TEST_UNICODE_STR_JAPAN "\u30E6\u30FC\u30B6\u30FC\u5225\u30B5\u30A4\u30C8" +#define TEST_UNICODE_STR_CHINA "\u7B80\u4F53\u4E2D\u6587" +#define TEST_UNICODE_STR_KOREA "\uD06C\uB85C\uC2A4 \uD50C\uB7AB\uD3FC\uC73C\uB85C" +#define TEST_UNICODE_STR_ISRAEL "\u05DE\u05D3\u05D5\u05E8\u05D9\u05DD \u05DE\u05D1\u05D5\u05E7\u05E9\u05D9\u05DD" +#define TEST_UNICODE_STR_EGYPT "\u0623\u0641\u0636\u0644 \u0627\u0644\u0628\u062D\u0648\u062B" +#define TEST_UNICODE_STR_GREECE "\u03A3\u1F72 \u03B3\u03BD\u03C9\u03C1\u03AF\u03B6\u03C9 \u1F00\u03C0\u1F78" +#define TEST_UNICODE_STR_RUSSIA "\u0414\u0435\u0441\u044F\u0442\u0443\u044E \u041C\u0435\u0436\u0434\u0443\u043D\u0430\u0440\u043E\u0434\u043D\u0443\u044E" +#define TEST_UNICODE_STR_THAILAND 
"\u0E41\u0E1C\u0E48\u0E19\u0E14\u0E34\u0E19\u0E2E\u0E31\u0E48\u0E19\u0E40\u0E2A\u0E37\u0E48\u0E2D\u0E21\u0E42\u0E17\u0E23\u0E21\u0E41\u0E2A\u0E19\u0E2A\u0E31\u0E07\u0E40\u0E27\u0E0A" +#define TEST_UNICODE_STR_FRANCE "fran\u00E7ais langue \u00E9trang\u00E8re" +#define TEST_UNICODE_STR_SPAIN "ma\u00F1ana ol\u00E9" +#define TEST_UNICODE_STR_MATHMATICS "\u222E E\u22C5da = Q, n \u2192 \u221E, \u2211 f(i) = \u220F g(i)" +#define TEST_UNICODE_STR_EMOJI "\U0001F363 \u2716 \U0001F37A" // sushi x beer mug + +#define CONCAT(prefix, strl) prefix ## strl +#define CPP_U8_LITERAL(strl) strl +#define CPP_U16_LITERAL(strl) CONCAT(u, strl) +#define CPP_U32_LITERAL(strl) CONCAT(U, strl) + static std::vector c_UTF8TestStrTable { - "\u30E6\u30FC\u30B6\u30FC\u5225\u30B5\u30A4\u30C8", // JAPAN - "\u7B80\u4F53\u4E2D\u6587", // CHINA - "\uD06C\uB85C\uC2A4 \uD50C\uB7AB\uD3FC\uC73C\uB85C", // KOREA - "\u05DE\u05D3\u05D5\u05E8\u05D9\u05DD \u05DE\u05D1\u05D5\u05E7\u05E9\u05D9\u05DD", // ISRAEL - "\u0623\u0641\u0636\u0644 \u0627\u0644\u0628\u062D\u0648\u062B", // EGYPT - "\u03A3\u1F72 \u03B3\u03BD\u03C9\u03C1\u03AF\u03B6\u03C9 \u1F00\u03C0\u1F78", // GREECE - "\u0414\u0435\u0441\u044F\u0442\u0443\u044E \u041C\u0435\u0436\u0434\u0443\u043D\u0430\u0440\u043E\u0434\u043D\u0443\u044E", // RUSSIA - "\u0E41\u0E1C\u0E48\u0E19\u0E14\u0E34\u0E19\u0E2E\u0E31\u0E48\u0E19\u0E40\u0E2A\u0E37\u0E48\u0E2D\u0E21\u0E42\u0E17\u0E23\u0E21\u0E41\u0E2A\u0E19\u0E2A\u0E31\u0E07\u0E40\u0E27\u0E0A", // THAILAND - "fran\u00E7ais langue \u00E9trang\u00E8re", // FRANCE - "ma\u00F1ana ol\u00E9", // SPAIN - "\u222E E\u22C5da = Q, n \u2192 \u221E, \u2211 f(i) = \u220F g(i)", // MATHMATICS - "\xF0\x9F\x8D\xA3 \xE2\x9C\x96 \xF0\x9F\x8D\xBA", // EMOJI + CPP_U8_LITERAL(TEST_UNICODE_STR_JAPAN), + CPP_U8_LITERAL(TEST_UNICODE_STR_CHINA), + CPP_U8_LITERAL(TEST_UNICODE_STR_KOREA), + CPP_U8_LITERAL(TEST_UNICODE_STR_ISRAEL), + CPP_U8_LITERAL(TEST_UNICODE_STR_EGYPT), + CPP_U8_LITERAL(TEST_UNICODE_STR_GREECE), + 
CPP_U8_LITERAL(TEST_UNICODE_STR_RUSSIA), + CPP_U8_LITERAL(TEST_UNICODE_STR_THAILAND), + CPP_U8_LITERAL(TEST_UNICODE_STR_FRANCE), + CPP_U8_LITERAL(TEST_UNICODE_STR_SPAIN), + CPP_U8_LITERAL(TEST_UNICODE_STR_MATHMATICS), + CPP_U8_LITERAL(TEST_UNICODE_STR_EMOJI), }; + static std::vector c_UTF16TestStrTable { + CPP_U16_LITERAL(TEST_UNICODE_STR_JAPAN), + CPP_U16_LITERAL(TEST_UNICODE_STR_CHINA), + CPP_U16_LITERAL(TEST_UNICODE_STR_KOREA), + CPP_U16_LITERAL(TEST_UNICODE_STR_ISRAEL), + CPP_U16_LITERAL(TEST_UNICODE_STR_EGYPT), + CPP_U16_LITERAL(TEST_UNICODE_STR_GREECE), + CPP_U16_LITERAL(TEST_UNICODE_STR_RUSSIA), + CPP_U16_LITERAL(TEST_UNICODE_STR_THAILAND), + CPP_U16_LITERAL(TEST_UNICODE_STR_FRANCE), + CPP_U16_LITERAL(TEST_UNICODE_STR_SPAIN), + CPP_U16_LITERAL(TEST_UNICODE_STR_MATHMATICS), + CPP_U16_LITERAL(TEST_UNICODE_STR_EMOJI), + }; + static std::vector c_UTF32TestStrTable { + CPP_U32_LITERAL(TEST_UNICODE_STR_JAPAN), + CPP_U32_LITERAL(TEST_UNICODE_STR_CHINA), + CPP_U32_LITERAL(TEST_UNICODE_STR_KOREA), + CPP_U32_LITERAL(TEST_UNICODE_STR_ISRAEL), + CPP_U32_LITERAL(TEST_UNICODE_STR_EGYPT), + CPP_U32_LITERAL(TEST_UNICODE_STR_GREECE), + CPP_U32_LITERAL(TEST_UNICODE_STR_RUSSIA), + CPP_U32_LITERAL(TEST_UNICODE_STR_THAILAND), + CPP_U32_LITERAL(TEST_UNICODE_STR_FRANCE), + CPP_U32_LITERAL(TEST_UNICODE_STR_SPAIN), + CPP_U32_LITERAL(TEST_UNICODE_STR_MATHMATICS), + CPP_U32_LITERAL(TEST_UNICODE_STR_EMOJI), + }; + +#undef CPP_U32_LITERAL +#undef CPP_U16_LITERAL +#undef CPP_U8_LITERAL +#undef CONCAT + +#pragma endregion + + + static void Assert(bool condition, const char* description) { if (condition) { @@ -68,6 +125,37 @@ namespace YYCCTestbench { } + static void EncodingTestbench() { + + // check the convertion between given string + size_t count = c_UTF8TestStrTable.size(); + for (size_t i = 0u; i < count; ++i) { + // get item + const auto& u8str = c_UTF8TestStrTable[i]; + const auto& u16str = c_UTF16TestStrTable[i]; + const auto& u32str = c_UTF32TestStrTable[i]; + + // create cache 
variables + std::string u8cache; + std::u16string u16cache; + std::u32string u32cache; + + // do convertion check + Assert(YYCC::EncodingHelper::UTF8ToUTF16(u8str.c_str(), u16cache), "YYCC::EncodingHelper::UTF8ToUTF16"); + Assert(u16cache == u16str, "YYCC::EncodingHelper::UTF8ToUTF16"); + + Assert(YYCC::EncodingHelper::UTF8ToUTF32(u8str.c_str(), u32cache), "YYCC::EncodingHelper::UTF8ToUTF32"); + Assert(u32cache == u32str, "YYCC::EncodingHelper::UTF8ToUTF32"); + + Assert(YYCC::EncodingHelper::UTF16ToUTF8(u16str.c_str(), u8cache), "YYCC::EncodingHelper::UTF16ToUTF8"); + Assert(u8cache == u8str, "YYCC::EncodingHelper::UTF16ToUTF8"); + + Assert(YYCC::EncodingHelper::UTF32ToUTF8(u32str.c_str(), u8cache), "YYCC::EncodingHelper::UTF32ToUTF8"); + Assert(u8cache == u8str, "YYCC::EncodingHelper::UTF32ToUTF8"); + } + + } + static void StringTestbench() { // Test Printf auto test_printf = YYCC::StringHelper::Printf("%s == %s", "Hello World", "Hello, world"); @@ -267,10 +355,11 @@ namespace YYCCTestbench { int main(int argc, char** args) { //YYCCTestbench::ConsoleTestbench(); + YYCCTestbench::EncodingTestbench(); //YYCCTestbench::StringTestbench(); //YYCCTestbench::ParserTestbench(); //YYCCTestbench::DialogTestbench(); - YYCCTestbench::ExceptionTestbench(); + //YYCCTestbench::ExceptionTestbench(); //YYCCTestbench::WinFctTestbench(); //YYCCTestbench::FsPathPatch(); }