feat: add encoding conversion functions between utf8 and utf16, utf32.

- add bidirectional conversion between utf8 and utf16, utf32 on every platform. - add testbench for newly added functions.
2024-06-18 11:03:48 +08:00
parent 0319be7e19
commit 77b6f439f7
3 changed files with 255 additions and 19 deletions
--- a/src/EncodingHelper.cpp
+++ b/src/EncodingHelper.cpp
@@ -1,8 +1,11 @@
 #include "EncodingHelper.hpp"
-#if YYCC_OS == YYCC_OS_WINDOWS
+
+#include <cuchar>

 namespace YYCC::EncodingHelper {

+#if YYCC_OS == YYCC_OS_WINDOWS
+
 	bool WcharToChar(const wchar_t* src, std::string& dest, UINT codepage) {
 		int count, write_result;

@@ -65,6 +68,137 @@ namespace YYCC::EncodingHelper {
 		return ret;
 	}

+#endif
+
+	template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
+	bool UTF8ToUTFOther(const char* src, std::basic_string<_TChar>& dest) {
+		// Convert the NUL-terminated string src into dest via mbrtoc16 / mbrtoc32. Reference:
+		// https://zh.cppreference.com/w/cpp/string/multibyte/mbrtoc32
+		// https://zh.cppreference.com/w/cpp/string/multibyte/mbrtoc16
+		// https://learn.microsoft.com/zh-cn/cpp/c-runtime-library/reference/mbrtoc16-mbrtoc323?view=msvc-170
+		// Returns false if src is nullptr or the input is invalid / incomplete (dest may then be partial).
+		// For the same reason explained in UTFOtherToUTF8, we use these functions for the conversion.
+		// NOTE(review): they decode per the current C locale - presumably a UTF8 locale must be active; confirm.
+		
+		// reject a null source, then copy it into an owned string
+		if (src == nullptr) return false;
+		std::string src_string(src);
+		// discard any previous content of dest
+		dest.clear();
+
+		// conversion state; end points one past the terminating NUL so the NUL itself gets converted
+		std::mbstate_t state {};
+		_TChar c1632;
+		const char* ptr = src_string.c_str();
+		const char* end = src_string.c_str() + src_string.size() + 1;
+
+		// start conversion
+		while (true) {
+			// convert the next code unit from the remaining input
+			size_t rc;
+			if constexpr (std::is_same_v<_TChar, char16_t>) {
+				rc = std::mbrtoc16(&c1632, ptr, end - ptr, &state);
+			} else {
+				rc = std::mbrtoc32(&c1632, ptr, end - ptr, &state);
+			}
+			if (!rc) break; // rc == 0 means the terminating NUL was converted: done
+
+			// check result
+			if (rc == static_cast<size_t>(-1)) {
+				// encoding error, return false
+				return false;
+			} else if (rc == static_cast<size_t>(-2)) {
+				// incomplete sequence, return false
+				return false;
+			} else if (rc == static_cast<size_t>(-3)) {
+				// UTF16 surrogate pair case (e.g. an emoji that is represented by 2 UTF16 code units)
+				// 
+				// only push the result char but do not advance the pointer
+				// because this char is output from the conversion state, not from new input.
+				dest.push_back(c1632);
+			} else {
+				// normal case
+				// append to result
+				dest.push_back(c1632);
+				// advance past the rc input bytes that were consumed
+				ptr += rc;
+			}
+		}
+
+		return true;
+	}
+
+	bool UTF8ToUTF16(const char* src, std::u16string& dest) { // writes the result into dest; returns false on failure
+		return UTF8ToUTFOther<char16_t>(src, dest);
+	}
+	std::u16string UTF8ToUTF16(const char* src) { // convenience overload returning the result by value
+		std::u16string ret;
+		if (!UTF8ToUTF16(src, ret)) ret.clear(); // on failure return an empty string
+		return ret;
+	}
+	bool UTF8ToUTF32(const char* src, std::u32string& dest) { // writes the result into dest; returns false on failure
+		return UTF8ToUTFOther<char32_t>(src, dest);
+	}
+	std::u32string UTF8ToUTF32(const char* src) { // convenience overload returning the result by value
+		std::u32string ret;
+		if (!UTF8ToUTF32(src, ret)) ret.clear(); // on failure return an empty string
+		return ret;
+	}
+
+	template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
+	bool UTFOtherToUTF8(const _TChar* src, std::string& dest) {
+		// Convert the NUL-terminated string src into dest via c16rtomb / c32rtomb. Reference:
+		// https://zh.cppreference.com/w/cpp/string/multibyte/c32rtomb
+		// https://zh.cppreference.com/w/cpp/string/multibyte/c16rtomb
+		// https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/c16rtomb-c32rtomb1?view=msvc-170
+		// Returns false if src is nullptr or a code unit can not be converted (dest may then be partial).
+		// In the Microsoft implementation, c16rtomb and c32rtomb always convert
+		// UTF16 and UTF32 strings into UTF8 strings regardless of the current C locale.
+		// At the same time, most Linux systems use UTF8 as their locale.
+		// So using c16rtomb and c32rtomb to convert UTF16 or UTF32 to UTF8 is reasonable.
+
+		// reject a null source, then copy it into an owned string
+		if (src == nullptr) return false;
+		std::basic_string<_TChar> src_string(src);
+		// discard any previous content of dest
+		dest.clear();
+
+		// conversion state and scratch buffer receiving the bytes of one converted character
+		std::mbstate_t state {};
+		char out[MB_LEN_MAX] {};
+		for (_TChar c : src_string) {
+			// convert one code unit
+			std::size_t rc;
+			if constexpr (std::is_same_v<_TChar, char16_t>) {
+				rc = std::c16rtomb(out, c, &state);
+			} else {
+				rc = std::c32rtomb(out, c, &state);
+			}
+			// conversion failed
+			if (rc == static_cast<size_t>(-1)) return false;
+			// otherwise append the rc bytes written to out
+			dest.append(out, rc);
+		}
+
+		return true;
+	}
+
+	bool UTF16ToUTF8(const char16_t* src, std::string& dest) { // writes the result into dest; returns false on failure
+		return UTFOtherToUTF8<char16_t>(src, dest);
+	}
+	std::string UTF16ToUTF8(const char16_t* src) { // convenience overload returning the result by value
+		std::string ret;
+		if (!UTF16ToUTF8(src, ret)) ret.clear(); // on failure return an empty string
+		return ret;
+	}
+	bool UTF32ToUTF8(const char32_t* src, std::string& dest) { // writes the result into dest; returns false on failure
+		return UTFOtherToUTF8<char32_t>(src, dest);
+	}
+	std::string UTF32ToUTF8(const char32_t* src) { // convenience overload returning the result by value
+		std::string ret;
+		if (!UTF32ToUTF8(src, ret)) ret.clear(); // on failure return an empty string
+		return ret;
+	}
+
 }

-#endif
--- a/src/EncodingHelper.hpp
+++ b/src/EncodingHelper.hpp
@@ -1,15 +1,18 @@
 #pragma once
 #include "YYCCInternal.hpp"
-#if YYCC_OS == YYCC_OS_WINDOWS

 #include <string>

+#if YYCC_OS == YYCC_OS_WINDOWS
 #include "WinImportPrefix.hpp"
 #include <Windows.h>
 #include "WinImportSuffix.hpp"
+#endif

 namespace YYCC::EncodingHelper {

+#if YYCC_OS == YYCC_OS_WINDOWS
+
 	bool WcharToChar(const wchar_t* src, std::string& dest, UINT codepage);
 	bool WcharToUTF8(const wchar_t* src, std::string& dest);
 	std::string WcharToChar(const wchar_t* src, UINT codepage);
@@ -23,6 +26,16 @@ namespace YYCC::EncodingHelper {
 	bool CharToChar(const char* src, std::string& dest, UINT src_codepage, UINT dest_codepage);
 	std::string CharToChar(const char* src, UINT src_codepage, UINT dest_codepage);

-}
-
 #endif
+
+	bool UTF8ToUTF16(const char* src, std::u16string& dest); // UTF8 -> UTF16 into dest; false on failure
+	std::u16string UTF8ToUTF16(const char* src); // UTF8 -> UTF16 by value; empty string on failure
+	bool UTF8ToUTF32(const char* src, std::u32string& dest); // UTF8 -> UTF32 into dest; false on failure
+	std::u32string UTF8ToUTF32(const char* src); // UTF8 -> UTF32 by value; empty string on failure
+
+	bool UTF16ToUTF8(const char16_t* src, std::string& dest); // UTF16 -> UTF8 into dest; false on failure
+	std::string  UTF16ToUTF8(const char16_t* src); // UTF16 -> UTF8 by value; empty string on failure
+	bool UTF32ToUTF8(const char32_t* src, std::string& dest); // UTF32 -> UTF8 into dest; false on failure
+	std::string  UTF32ToUTF8(const char32_t* src); // UTF32 -> UTF8 by value; empty string on failure
+
+}
--- a/testbench/main.cpp
+++ b/testbench/main.cpp
@@ -4,23 +4,80 @@
 namespace Console = YYCC::ConsoleHelper;

 namespace YYCCTestbench {
+#pragma region UNICODE Test Data

-	// UTF8 Test String Table
+	// UNICODE Test Strings
 	// Ref: https://stackoverflow.com/questions/478201/how-to-test-an-application-for-correct-encoding-e-g-utf-8
+#define TEST_UNICODE_STR_JAPAN "\u30E6\u30FC\u30B6\u30FC\u5225\u30B5\u30A4\u30C8"
+#define TEST_UNICODE_STR_CHINA "\u7B80\u4F53\u4E2D\u6587"
+#define TEST_UNICODE_STR_KOREA "\uD06C\uB85C\uC2A4 \uD50C\uB7AB\uD3FC\uC73C\uB85C"
+#define TEST_UNICODE_STR_ISRAEL "\u05DE\u05D3\u05D5\u05E8\u05D9\u05DD \u05DE\u05D1\u05D5\u05E7\u05E9\u05D9\u05DD"
+#define TEST_UNICODE_STR_EGYPT "\u0623\u0641\u0636\u0644 \u0627\u0644\u0628\u062D\u0648\u062B"
+#define TEST_UNICODE_STR_GREECE "\u03A3\u1F72 \u03B3\u03BD\u03C9\u03C1\u03AF\u03B6\u03C9 \u1F00\u03C0\u1F78"
+#define TEST_UNICODE_STR_RUSSIA "\u0414\u0435\u0441\u044F\u0442\u0443\u044E \u041C\u0435\u0436\u0434\u0443\u043D\u0430\u0440\u043E\u0434\u043D\u0443\u044E"
+#define TEST_UNICODE_STR_THAILAND "\u0E41\u0E1C\u0E48\u0E19\u0E14\u0E34\u0E19\u0E2E\u0E31\u0E48\u0E19\u0E40\u0E2A\u0E37\u0E48\u0E2D\u0E21\u0E42\u0E17\u0E23\u0E21\u0E41\u0E2A\u0E19\u0E2A\u0E31\u0E07\u0E40\u0E27\u0E0A"
+#define TEST_UNICODE_STR_FRANCE "fran\u00E7ais langue \u00E9trang\u00E8re"
+#define TEST_UNICODE_STR_SPAIN "ma\u00F1ana ol\u00E9"
+#define TEST_UNICODE_STR_MATHMATICS "\u222E E\u22C5da = Q,  n \u2192 \u221E, \u2211 f(i) = \u220F g(i)"
+#define TEST_UNICODE_STR_EMOJI "\U0001F363 \u2716 \U0001F37A" // sushi x beer mug
+
+#define CONCAT(prefix, strl) prefix ## strl // paste an encoding prefix onto a string literal
+#define CPP_U8_LITERAL(strl) strl // ordinary (unprefixed) literal, used for c_UTF8TestStrTable
+#define CPP_U16_LITERAL(strl) CONCAT(u, strl) // u"..." char16_t literal
+#define CPP_U32_LITERAL(strl) CONCAT(U, strl) // U"..." char32_t literal
+
 	static std::vector<std::string> c_UTF8TestStrTable {
-		"\u30E6\u30FC\u30B6\u30FC\u5225\u30B5\u30A4\u30C8", // JAPAN
-		"\u7B80\u4F53\u4E2D\u6587", // CHINA
-		"\uD06C\uB85C\uC2A4 \uD50C\uB7AB\uD3FC\uC73C\uB85C", // KOREA
-		"\u05DE\u05D3\u05D5\u05E8\u05D9\u05DD \u05DE\u05D1\u05D5\u05E7\u05E9\u05D9\u05DD", // ISRAEL
-		"\u0623\u0641\u0636\u0644 \u0627\u0644\u0628\u062D\u0648\u062B", // EGYPT
-		"\u03A3\u1F72 \u03B3\u03BD\u03C9\u03C1\u03AF\u03B6\u03C9 \u1F00\u03C0\u1F78", // GREECE
-		"\u0414\u0435\u0441\u044F\u0442\u0443\u044E \u041C\u0435\u0436\u0434\u0443\u043D\u0430\u0440\u043E\u0434\u043D\u0443\u044E", // RUSSIA
-		"\u0E41\u0E1C\u0E48\u0E19\u0E14\u0E34\u0E19\u0E2E\u0E31\u0E48\u0E19\u0E40\u0E2A\u0E37\u0E48\u0E2D\u0E21\u0E42\u0E17\u0E23\u0E21\u0E41\u0E2A\u0E19\u0E2A\u0E31\u0E07\u0E40\u0E27\u0E0A", // THAILAND
-		"fran\u00E7ais langue \u00E9trang\u00E8re", // FRANCE
-		"ma\u00F1ana ol\u00E9", // SPAIN
-		"\u222E E\u22C5da = Q,  n \u2192 \u221E, \u2211 f(i) = \u220F g(i)", // MATHMATICS
-		"\xF0\x9F\x8D\xA3 \xE2\x9C\x96 \xF0\x9F\x8D\xBA", // EMOJI
+		CPP_U8_LITERAL(TEST_UNICODE_STR_JAPAN),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_CHINA),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_KOREA),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_ISRAEL),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_EGYPT),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_GREECE),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_RUSSIA),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_THAILAND),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_FRANCE),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_SPAIN),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_MATHMATICS),
+		CPP_U8_LITERAL(TEST_UNICODE_STR_EMOJI),
 	};
+	static std::vector<std::u16string> c_UTF16TestStrTable { // char16_t versions of the test strings, in the same order as c_UTF8TestStrTable
+		CPP_U16_LITERAL(TEST_UNICODE_STR_JAPAN),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_CHINA),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_KOREA),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_ISRAEL),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_EGYPT),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_GREECE),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_RUSSIA),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_THAILAND),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_FRANCE),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_SPAIN),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_MATHMATICS),
+		CPP_U16_LITERAL(TEST_UNICODE_STR_EMOJI),
+	};
+	static std::vector<std::u32string> c_UTF32TestStrTable { // char32_t versions of the test strings, in the same order as c_UTF8TestStrTable
+		CPP_U32_LITERAL(TEST_UNICODE_STR_JAPAN),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_CHINA),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_KOREA),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_ISRAEL),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_EGYPT),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_GREECE),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_RUSSIA),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_THAILAND),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_FRANCE),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_SPAIN),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_MATHMATICS),
+		CPP_U32_LITERAL(TEST_UNICODE_STR_EMOJI),
+	};
+
+#undef CPP_U32_LITERAL
+#undef CPP_U16_LITERAL
+#undef CPP_U8_LITERAL
+#undef CONCAT
+
+#pragma endregion
+
+
+

 	static void Assert(bool condition, const char* description) {
 		if (condition) {
@@ -68,6 +125,37 @@ namespace YYCCTestbench {

 	}

+	static void EncodingTestbench() { // checks the UTF8 <-> UTF16 / UTF32 conversions against the test string tables
+
+		// check the conversions between the given strings
+		size_t count = c_UTF8TestStrTable.size();
+		for (size_t i = 0u; i < count; ++i) {
+			// get the same test string in all three encodings
+			const auto& u8str = c_UTF8TestStrTable[i];
+			const auto& u16str = c_UTF16TestStrTable[i];
+			const auto& u32str = c_UTF32TestStrTable[i];
+
+			// create cache variables receiving the conversion results
+			std::string u8cache;
+			std::u16string u16cache;
+			std::u32string u32cache;
+
+			// do conversion checks: each call must succeed and its output must equal the expected string
+			Assert(YYCC::EncodingHelper::UTF8ToUTF16(u8str.c_str(), u16cache), "YYCC::EncodingHelper::UTF8ToUTF16");
+			Assert(u16cache == u16str, "YYCC::EncodingHelper::UTF8ToUTF16");
+
+			Assert(YYCC::EncodingHelper::UTF8ToUTF32(u8str.c_str(), u32cache), "YYCC::EncodingHelper::UTF8ToUTF32");
+			Assert(u32cache == u32str, "YYCC::EncodingHelper::UTF8ToUTF32");
+
+			Assert(YYCC::EncodingHelper::UTF16ToUTF8(u16str.c_str(), u8cache), "YYCC::EncodingHelper::UTF16ToUTF8");
+			Assert(u8cache == u8str, "YYCC::EncodingHelper::UTF16ToUTF8");
+
+			Assert(YYCC::EncodingHelper::UTF32ToUTF8(u32str.c_str(), u8cache), "YYCC::EncodingHelper::UTF32ToUTF8");
+			Assert(u8cache == u8str, "YYCC::EncodingHelper::UTF32ToUTF8");
+		}
+
+	}
+
 	static void StringTestbench() {
 		// Test Printf
 		auto test_printf = YYCC::StringHelper::Printf("%s == %s", "Hello World", "Hello, world");
@@ -267,10 +355,11 @@ namespace YYCCTestbench {

 int main(int argc, char** args) {
 	//YYCCTestbench::ConsoleTestbench();
+	YYCCTestbench::EncodingTestbench();
 	//YYCCTestbench::StringTestbench();
 	//YYCCTestbench::ParserTestbench();
 	//YYCCTestbench::DialogTestbench();
-	YYCCTestbench::ExceptionTestbench();
+	//YYCCTestbench::ExceptionTestbench();
 	//YYCCTestbench::WinFctTestbench();
 	//YYCCTestbench::FsPathPatch();
 }