fix: use a new method to do the conversion among UTF8, UTF16 and UTF32.

- use std::codecvt as the conversion method among UTF8, UTF16 and UTF32. - fix the issue that COM Guard was accidentally dropped by the compiler because there was no reference to it.
2024-06-20 10:16:13 +08:00
parent 1fd132f0c9
commit 3fa05b43d9
5 changed files with 97 additions and 87 deletions
--- a/src/COMHelper.cpp
+++ b/src/COMHelper.cpp
@ -21,6 +21,10 @@ namespace YYCC::COMHelper {
 			}
 		}

+		bool IsInitialized() const {
+			return m_HasInit;
+		}
+
 	protected:
 		bool m_HasInit;
 	};
@ -33,7 +37,11 @@ namespace YYCC::COMHelper {
 	 * So we use a static instance in here.
 	 * And make it be const so no one can change it.
 	*/
-	static const ComGuard c_ComGuard;
+	static const ComGuard c_ComGuard {};
+
+	bool IsInitialized() {
+		return c_ComGuard.IsInitialized();
+	}

 }

--- a/src/COMHelper.hpp
+++ b/src/COMHelper.hpp
@ -64,6 +64,17 @@ namespace YYCC::COMHelper {

 	using SmartLPWSTR = std::unique_ptr<std::remove_pointer_t<LPWSTR>, CoTaskMemDeleter>;

+	/**
+	 * @brief Check whether COM environment has been initialized.
+	 * @return True if it is, otherwise false.
+	 * @remarks
+	 * This function calls the corresponding function of COM Guard.
+	 * Do not remove this function, and keep at least one reference to it in the final program.
+	 * Some compilers drop COM Guard from the final program if nothing references it, which causes the initialization of the COM environment to fail.
+	 * That is why the requirements above must be followed.
+	*/
+	bool IsInitialized();
+
 }

 #endif
--- a/src/ConsoleHelper.cpp
+++ b/src/ConsoleHelper.cpp
@ -245,12 +245,12 @@ namespace YYCC::ConsoleHelper {
 	}

 	void Write(const char* u8_strl) {
-		va_list empty;
+		va_list empty{};
 		RawWrite<false, false, false>(u8_strl, empty);
 	}
 	
 	void WriteLine(const char* u8_strl) {
-		va_list empty;
+		va_list empty{};
 		RawWrite<false, false, true>(u8_strl, empty);
 	}
 	
@ -269,12 +269,12 @@ namespace YYCC::ConsoleHelper {
 	}

 	void ErrWrite(const char* u8_strl) {
-		va_list empty;
+		va_list empty{};
 		RawWrite<false, true, false>(u8_strl, empty);
 	}
 	
 	void ErrWriteLine(const char* u8_strl) {
-		va_list empty;
+		va_list empty{};
 		RawWrite<false, true, true>(u8_strl, empty);
 	}

--- a/src/DialogHelper.cpp
+++ b/src/DialogHelper.cpp
@ -173,6 +173,9 @@ namespace YYCC::DialogHelper {
 		// prepare result variable
 		HRESULT hr;

+		// check whether COM environment has been initialized
+		if (!COMHelper::IsInitialized()) return false;
+
 		// create file dialog instance
 		// fetch dialog CLSID first
 		CLSID dialog_clsid;
--- a/src/EncodingHelper.cpp
+++ b/src/EncodingHelper.cpp
@ -1,7 +1,6 @@
 #include "EncodingHelper.hpp"

-#include <cuchar>
-#include <climits>
+#include <locale>

 namespace YYCC::EncodingHelper {

@ -71,61 +70,49 @@ namespace YYCC::EncodingHelper {

 #endif

+#if defined(__cpp_char8_t)
+	using CodecvtUTF8Char_t = char8_t;
+#else
+	using CodecvtUTF8Char_t = char;
+#endif
 	template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
-	static bool UTF8ToUTFOther(const char* src, std::basic_string<_TChar>& dest) {
+	using CodecvtFacet_t = std::codecvt<_TChar, CodecvtUTF8Char_t, std::mbstate_t>;
+
+	template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
+	static bool UTF8ToUTFOther(const char* _src, std::basic_string<_TChar>& dest) {
 		// Reference:
-		// https://zh.cppreference.com/w/cpp/string/multibyte/mbrtoc32
-		// https://zh.cppreference.com/w/cpp/string/multibyte/mbrtoc16
-		// https://learn.microsoft.com/zh-cn/cpp/c-runtime-library/reference/mbrtoc16-mbrtoc323?view=msvc-170
-		// 
-		// Due to the same reason introduced in UTFOtherToUTF8,
-		// we use these function as convertion function.
+		// https://zh.cppreference.com/w/cpp/locale/codecvt/in
 		
 		// init src string
-		if (src == nullptr) return false;
-		std::string src_string(src);
-		// init result string
-		dest.clear();
+		if (_src == nullptr) return false;
+		std::string src(_src);

-		// init essential cvt variables
-		std::mbstate_t state {};
-		_TChar c1632;
-		const char* ptr = src_string.c_str();
-		const char* end = src_string.c_str() + src_string.size() + 1;
+		// init locale and get codecvt facet
+		// keep a reference to the locale for the same reason as in UTFOtherToUTF8
+		const auto& this_locale = std::locale::classic();
+		const auto& this_codecvt = std::use_facet<CodecvtFacet_t<_TChar>>(this_locale);
 		
-		// start convertion
-		while (true) {
+		// convertion preparation
+		std::mbstate_t mb{};
+		dest.resize(src.size());
+		const CodecvtUTF8Char_t* intern_from = reinterpret_cast<const CodecvtUTF8Char_t*>(src.c_str()),
+			*intern_from_end = reinterpret_cast<const CodecvtUTF8Char_t*>(src.c_str() + src.size()),
+			*intern_from_next = nullptr;
+		_TChar* extern_to = dest.data(),
+			*extern_to_end = dest.data() + dest.size(),
+			*extern_to_next = nullptr;
 		// do convertion
-			size_t rc;
-			if constexpr (std::is_same_v<_TChar, char16_t>) {
-				rc = std::mbrtoc16(&c1632, ptr, end - ptr, &state);
-			} else {
-				rc = std::mbrtoc32(&c1632, ptr, end - ptr, &state);
-			}
-			if (!rc) break;
+		auto result = this_codecvt.in(
+			mb,
+			intern_from, intern_from_end, intern_from_next,
+			extern_to, extern_to_end, extern_to_next
+		);

 		// check result
-			if (rc == static_cast<size_t>(-1)) {
-				// encoding error, return false
+		if (result != CodecvtFacet_t<_TChar>::ok)
 			return false;
-			} else if (rc == static_cast<size_t>(-2)) {
-				// insufficient sequence, return false
-				return false;
-			} else if (rc == static_cast<size_t>(-3)) {
-				// UTF16 pair case (usually is emoji, one emoji is represented by 2 UTF16)
-				// 
-				// only push result char but do not increase pointer
-				// because this char is output from state.
-				dest.push_back(c1632);
-			} else {
-				// normal case
-				// append to result
-				dest.push_back(c1632);
-				// inc ptr
-				ptr += rc;
-			}
-		}
-
+		// resize result and return
+		dest.resize(extern_to_next - dest.data());
 		return true;
 	}

@ -147,40 +134,41 @@ namespace YYCC::EncodingHelper {
 	}

 	template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
-	static bool UTFOtherToUTF8(const _TChar* src, std::string& dest) {
+	static bool UTFOtherToUTF8(const _TChar* _src, std::string& dest) {
 		// Reference:
-		// https://zh.cppreference.com/w/cpp/string/multibyte/c32rtomb
-		// https://zh.cppreference.com/w/cpp/string/multibyte/c16rtomb
-		// https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/c16rtomb-c32rtomb1?view=msvc-170
-		// 
-		// Due to Microsoft implementation, c16rtomb and c32rtomb 
-		// always convert UTF32 and UTF16 string into UTF8 string no matter current c locale.
-		// At the same time, most Linux use UTF8 as their locale.
-		// So using c16rtomb and c32rtomb do the convertion from UTF32 or UTF16 to UTF8 is reasonable.
+		// https://zh.cppreference.com/w/cpp/locale/codecvt/out
 		
 		// initialize src string
-		if (src == nullptr) return false;
-		std::basic_string<_TChar> src_string(src);
-		// init result string
-		dest.clear();
+		if (_src == nullptr) return false;
+		std::basic_string<_TChar> src(_src);

-		// init essential cvt variables
-		std::mbstate_t state {};
-		char out[MB_LEN_MAX] {};
-		for (_TChar c : src_string) {
+		// init locale and get codecvt facet
+		// the reference to the locale must be preserved until the conversion is done,
+		// because the lifetime of the codecvt facet is equal to that of the reference to the locale.
+		const auto& this_locale = std::locale::classic();
+		const auto& this_codecvt = std::use_facet<CodecvtFacet_t<_TChar>>(this_locale);
+
+		// do convertion preparation
+		std::mbstate_t mb{};
+		dest.resize(src.size() * this_codecvt.max_length());
+		const _TChar* intern_from = src.c_str(),
+			*intern_from_end = src.c_str() + src.size(),
+			*intern_from_next = nullptr;
+		CodecvtUTF8Char_t* extern_to = reinterpret_cast<CodecvtUTF8Char_t*>(dest.data()),
+			*extern_to_end = reinterpret_cast<CodecvtUTF8Char_t*>(dest.data() + dest.size()),
+			*extern_to_next = nullptr;
 		// do convertion
-			std::size_t rc;
-			if constexpr (std::is_same_v<_TChar, char16_t>) {
-				rc = std::c16rtomb(out, c, &state);
-			} else {
-				rc = std::c32rtomb(out, c, &state);
-			}
-			// convertion failed
-			if (rc == static_cast<size_t>(-1)) return false;
-			// otherwise append result
-			dest.append(out, rc);
-		}
+		auto result = this_codecvt.out(
+			mb,
+			intern_from, intern_from_end, intern_from_next,
+			extern_to, extern_to_end, extern_to_next
+		);

+		// check result
+		if (result != CodecvtFacet_t<_TChar>::ok)
+			return false;
+		// resize result and return
+		dest.resize(extern_to_next - reinterpret_cast<CodecvtUTF8Char_t*>(dest.data()));
 		return true;
 	}