From c15b57d055367444035d30862b661ffb8c0aa756 Mon Sep 17 00:00:00 2001 From: yyc12345 Date: Wed, 26 Jun 2024 21:04:56 +0800 Subject: [PATCH] refactor: bring char8_t to this library. - add yycc_char8_t and yycc_u8string in code to indicate explicit utf8 char type and string. it also has a polyfill if compiler and library do not support utf8 char type. - refactor the whole encoding helper. allow converting string with embedded NUL. but not tested. --- .../{nightly.yml => nightly.yml.disabled} | 0 src/EncodingHelper.cpp | 304 +++++++++++++----- src/EncodingHelper.hpp | 68 ++-- src/YYCCInternal.hpp | 31 +- 4 files changed, 286 insertions(+), 117 deletions(-) rename .github/workflows/{nightly.yml => nightly.yml.disabled} (100%) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml.disabled similarity index 100% rename from .github/workflows/nightly.yml rename to .github/workflows/nightly.yml.disabled diff --git a/src/EncodingHelper.cpp b/src/EncodingHelper.cpp index 3915cef..9bd0be8 100644 --- a/src/EncodingHelper.cpp +++ b/src/EncodingHelper.cpp @@ -4,89 +4,189 @@ namespace YYCC::EncodingHelper { + /* Define some assistant macros for easy writing. */ + +#define CONVFCT_TYPE2(fct_name, src_char_type, dst_char_type, ...) if (src == nullptr) return false; \ +std::basic_string cache(src); \ +return fct_name(cache, dst, ##__VA_ARGS__); + +#define CONVFCT_TYPE3(fct_name, src_char_type, dst_char_type, ...) std::basic_string ret; \ +if (!fct_name(src, ret, ##__VA_ARGS__)) ret.clear(); \ +return ret; + +#define CONVFCT_TYPE4(fct_name, src_char_type, dst_char_type, ...) std::basic_string ret; \ +if (!fct_name(src, ret, ##__VA_ARGS__)) ret.clear(); \ +return ret; + + #if YYCC_OS == YYCC_OS_WINDOWS - bool WcharToChar(const wchar_t* src, std::string& dest, UINT codepage) { - int count, write_result; +#pragma region WcharToChar + + bool WcharToChar(const std::wstring& src, std::string& dst, UINT code_page) { + // if src is empty, direct output + if (src.empty()) { + dst.clear(); + return true; + } - //converter to CHAR - count = WideCharToMultiByte(codepage, 0, reinterpret_cast(src), -1, NULL, 0, NULL, NULL); - if (count <= 0) return false; + // init WideCharToMultiByte used variables + // setup src pointer + LPCWCH lpWideCharStr = reinterpret_cast(src.c_str()); + // check whether source string is too large. + size_t cSrcSize = src.size(); + if (cSrcSize > std::numeric_limits::max()) return false; + int cchWideChar = static_cast(src.size()); - dest.resize(count - 1); - write_result = WideCharToMultiByte(codepage, 0, reinterpret_cast(src), -1, reinterpret_cast(dest.data()), count, NULL, NULL); + // do convertion + // do a dry-run first to fetch desired size. + int desired_size = WideCharToMultiByte(code_page, 0, lpWideCharStr, cchWideChar, NULL, 0, NULL, NULL); + if (desired_size <= 0) return false; + // resize dest for receiving result + dst.resize(static_cast(desired_size)); + // do real convertion + int write_result = WideCharToMultiByte(code_page, 0, lpWideCharStr, cchWideChar, reinterpret_cast(dst.data()), desired_size, NULL, NULL); if (write_result <= 0) return false; return true; } - bool WcharToUTF8(const wchar_t* src, std::string& dest) { - return WcharToChar(src, dest, CP_UTF8); + bool WcharToChar(const wchar_t* src, std::string& dst, UINT code_page) { + CONVFCT_TYPE2(WcharToChar, wchar_t, char, code_page); } - std::string WcharToChar(const wchar_t* src, UINT codepage) { - std::string ret; - if (!WcharToChar(src, ret, codepage)) ret.clear(); - return ret; + std::string WcharToChar(const std::wstring& src, UINT code_page) { + CONVFCT_TYPE3(WcharToChar, wchar_t, char, code_page); } - std::string WcharToUTF8(const wchar_t* src) { - return WcharToChar(src, CP_UTF8); + std::string WcharToChar(const wchar_t* src, UINT code_page) { + CONVFCT_TYPE4(WcharToChar, wchar_t, char, code_page); } - bool CharToWchar(const char* src, std::wstring& dest, UINT codepage) { - int wcount, write_result; +#pragma endregion + +#pragma region CharToWchar - // convert to WCHAR - wcount = MultiByteToWideChar(codepage, 0, reinterpret_cast(src), -1, NULL, 0); - if (wcount <= 0) return false; + bool CharToWchar(const std::string& src, std::wstring& dst, UINT code_page) { + // if src is empty, direct output + if (src.empty()) { + dst.clear(); + return true; + } - dest.resize(wcount - 1); - write_result = MultiByteToWideChar(codepage, 0, reinterpret_cast(src), -1, reinterpret_cast(dest.data()), wcount); + // init WideCharToMultiByte used variables + // setup src pointer + LPCCH lpMultiByteStr = reinterpret_cast(src.c_str()); + // check whether source string is too large. + size_t cSrcSize = src.size(); + if (cSrcSize > std::numeric_limits::max()) return false; + int cbMultiByte = static_cast(src.size()); + + // do convertion + // do a dry-run first to fetch desired size. + int desired_size = MultiByteToWideChar(code_page, 0, lpMultiByteStr, cbMultiByte, NULL, 0); + if (desired_size <= 0) return false; + // resize dest for receiving result + dst.resize(static_cast(desired_size)); + // do real convertion + int write_result = MultiByteToWideChar(code_page, 0, lpMultiByteStr, cbMultiByte, reinterpret_cast(dst.data()), desired_size); if (write_result <= 0) return false; return true; } - bool UTF8ToWchar(const char* src, std::wstring& dest) { - return CharToWchar(src, dest, CP_UTF8); + bool CharToWchar(const char* src, std::wstring& dst, UINT code_page) { + CONVFCT_TYPE2(CharToWchar, char, wchar_t, code_page); } - std::wstring CharToWchar(const char* src, UINT codepage) { - std::wstring ret; - if (!CharToWchar(src, ret, codepage)) ret.clear(); - return ret; + std::wstring CharToWchar(const std::string& src, UINT code_page) { + CONVFCT_TYPE3(CharToWchar, char, wchar_t, code_page); } - std::wstring UTF8ToWchar(const char* src) { - return CharToWchar(src, CP_UTF8); + std::wstring CharToWchar(const char* src, UINT code_page) { + CONVFCT_TYPE4(CharToWchar, char, wchar_t, code_page); } - bool CharToChar(const char* src, std::string& dest, UINT src_codepage, UINT dest_codepage) { +#pragma endregion + +#pragma region CharToChar + + bool CharToChar(const std::string& src, std::string& dst, UINT src_code_page, UINT dst_code_page) { std::wstring intermediary; - if (!CharToWchar(src, intermediary, src_codepage)) return false; - if (!WcharToChar(intermediary.c_str(), dest, dest_codepage)) return false; + if (!CharToWchar(src, intermediary, src_code_page)) return false; + if (!WcharToChar(intermediary.c_str(), dst, dst_code_page)) return false; return true; } - std::string CharToChar(const char* src, UINT src_codepage, UINT dest_codepage) { - std::string ret; - if (!CharToChar(src, ret, src_codepage, dest_codepage)) ret.clear(); - return ret; + bool CharToChar(const char* src, std::string& dst, UINT src_code_page, UINT dst_code_page) { + CONVFCT_TYPE2(CharToChar, char, char, src_code_page, dst_code_page); + } + std::string CharToChar(const std::string& src, UINT src_code_page, UINT dst_code_page) { + CONVFCT_TYPE3(CharToChar, char, char, src_code_page, dst_code_page); + } + std::string CharToChar(const char* src, UINT src_code_page, UINT dst_code_page) { + CONVFCT_TYPE4(CharToChar, char, char, src_code_page, dst_code_page); } +#pragma endregion + +#pragma region WcharToUTF8 + + bool WcharToUTF8(const std::wstring& src, yycc_u8string& dst) { + std::string adapted_dst; + bool ret = WcharToChar(src, adapted_dst, CP_UTF8); + if (ret) dst.assign(reinterpret_cast(adapted_dst.c_str()), adapted_dst.size()); + return ret; + } + bool WcharToUTF8(const wchar_t* src, yycc_u8string& dst) { + CONVFCT_TYPE2(WcharToUTF8, wchar_t, yycc_char8_t); + } + yycc_u8string WcharToUTF8(const std::wstring& src) { + CONVFCT_TYPE3(WcharToUTF8, wchar_t, yycc_char8_t); + } + yycc_u8string WcharToUTF8(const wchar_t* src) { + CONVFCT_TYPE4(WcharToUTF8, wchar_t, yycc_char8_t); + } + +#pragma endregion + +#pragma region UTF8ToWchar + + bool UTF8ToWchar(const yycc_u8string& src, std::wstring& dst) { + std::string adapted_src(reinterpret_cast(src.c_str()), src.size()); + return CharToWchar(adapted_src, dst, CP_UTF8); + } + bool UTF8ToWchar(const yycc_char8_t* src, std::wstring& dst) { + CONVFCT_TYPE2(UTF8ToWchar, yycc_char8_t, wchar_t); + } + std::wstring UTF8ToWchar(const yycc_u8string& src) { + CONVFCT_TYPE3(UTF8ToWchar, yycc_char8_t, wchar_t); + } + std::wstring UTF8ToWchar(const yycc_char8_t* src) { + CONVFCT_TYPE4(UTF8ToWchar, yycc_char8_t, wchar_t); + } + +#pragma endregion + #endif + +#pragma region UTF8 UTF16 UTF32 Help Funcs + + /* + According to the documentation introduced in CppReference. + The standard library is guaranteed to provide several specific specializations of \c std::codecvt. + The UTF8 char type in UTF8 related specializations of \c std::codecvt is different. + It is also independend from we defined \c yycc_char8_t. + So it is essential define a type which can correctly trigger specific specializations of \c std::codecv in there. + */ #if defined(__cpp_char8_t) using CodecvtUTF8Char_t = char8_t; #else using CodecvtUTF8Char_t = char; #endif + template || std::is_same_v<_TChar, char32_t>, int> = 0> using CodecvtFacet_t = std::codecvt<_TChar, CodecvtUTF8Char_t, std::mbstate_t>; template || std::is_same_v<_TChar, char32_t>, int> = 0> - static bool UTF8ToUTFOther(const char* _src, std::basic_string<_TChar>& dest) { + static bool UTF8ToUTFOther(const yycc_u8string& src, std::basic_string<_TChar>& dst) { // Reference: // https://zh.cppreference.com/w/cpp/locale/codecvt/in - // init src string - if (_src == nullptr) return false; - std::string src(_src); - // init locale and get codecvt facet // same reason in UTFOtherToUTF8 to keeping reference to locale const auto& this_locale = std::locale::classic(); @@ -94,12 +194,12 @@ namespace YYCC::EncodingHelper { // convertion preparation std::mbstate_t mb{}; - dest.resize(src.size()); + dst.resize(src.size()); const CodecvtUTF8Char_t* intern_from = reinterpret_cast(src.c_str()), *intern_from_end = reinterpret_cast(src.c_str() + src.size()), *intern_from_next = nullptr; - _TChar* extern_to = dest.data(), - *extern_to_end = dest.data() + dest.size(), + _TChar* extern_to = dst.data(), + *extern_to_end = dst.data() + dst.size(), *extern_to_next = nullptr; // do convertion auto result = this_codecvt.in( @@ -112,36 +212,15 @@ namespace YYCC::EncodingHelper { if (result != CodecvtFacet_t<_TChar>::ok) return false; // resize result and return - dest.resize(extern_to_next - dest.data()); + dst.resize(extern_to_next - dst.data()); return true; } - bool UTF8ToUTF16(const char* src, std::u16string& dest) { - return UTF8ToUTFOther(src, dest); - } - std::u16string UTF8ToUTF16(const char* src) { - std::u16string ret; - if (!UTF8ToUTF16(src, ret)) ret.clear(); - return ret; - } - bool UTF8ToUTF32(const char* src, std::u32string& dest) { - return UTF8ToUTFOther(src, dest); - } - std::u32string UTF8ToUTF32(const char* src) { - std::u32string ret; - if (!UTF8ToUTF32(src, ret)) ret.clear(); - return ret; - } - template || std::is_same_v<_TChar, char32_t>, int> = 0> - static bool UTFOtherToUTF8(const _TChar* _src, std::string& dest) { + static bool UTFOtherToUTF8(const std::basic_string<_TChar>& src, yycc_u8string& dst) { // Reference: // https://zh.cppreference.com/w/cpp/locale/codecvt/out - // initialize src string - if (_src == nullptr) return false; - std::basic_string<_TChar> src(_src); - // init locale and get codecvt facet // the reference to locale must be preserved until convertion done. // because the life time of codecvt facet is equal to the reference to locale. @@ -150,12 +229,12 @@ namespace YYCC::EncodingHelper { // do convertion preparation std::mbstate_t mb{}; - dest.resize(src.size() * this_codecvt.max_length()); + dst.resize(src.size() * this_codecvt.max_length()); const _TChar* intern_from = src.c_str(), *intern_from_end = src.c_str() + src.size(), *intern_from_next = nullptr; - CodecvtUTF8Char_t* extern_to = reinterpret_cast(dest.data()), - *extern_to_end = reinterpret_cast(dest.data() + dest.size()), + CodecvtUTF8Char_t* extern_to = reinterpret_cast(dst.data()), + *extern_to_end = reinterpret_cast(dst.data() + dst.size()), *extern_to_next = nullptr; // do convertion auto result = this_codecvt.out( @@ -168,26 +247,83 @@ namespace YYCC::EncodingHelper { if (result != CodecvtFacet_t<_TChar>::ok) return false; // resize result and retuen - dest.resize(extern_to_next - reinterpret_cast(dest.data())); + dst.resize(extern_to_next - reinterpret_cast(dst.data())); return true; } - bool UTF16ToUTF8(const char16_t* src, std::string& dest) { - return UTFOtherToUTF8(src, dest); +#pragma endregion + +#pragma region UTF8ToUTF16 + + bool UTF8ToUTF16(const yycc_u8string& src, std::u16string& dst) { + return UTF8ToUTFOther(src, dst); } - std::string UTF16ToUTF8(const char16_t* src) { - std::string ret; - if (!UTF16ToUTF8(src, ret)) ret.clear(); - return ret; + bool UTF8ToUTF16(const yycc_char8_t* src, std::u16string& dst) { + CONVFCT_TYPE2(UTF8ToUTF16, yycc_char8_t, char16_t); } - bool UTF32ToUTF8(const char32_t* src, std::string& dest) { - return UTFOtherToUTF8(src, dest); + std::u16string UTF8ToUTF16(const yycc_u8string& src) { + CONVFCT_TYPE3(UTF8ToUTF16, yycc_char8_t, char16_t); } - std::string UTF32ToUTF8(const char32_t* src) { - std::string ret; - if (!UTF32ToUTF8(src, ret)) ret.clear(); - return ret; + std::u16string UTF8ToUTF16(const yycc_char8_t* src) { + CONVFCT_TYPE4(UTF8ToUTF16, yycc_char8_t, char16_t); } +#pragma endregion + +#pragma region UTF16ToUTF8 + + bool UTF16ToUTF8(const std::u16string& src, yycc_u8string& dst) { + return UTFOtherToUTF8(src, dst); + } + bool UTF16ToUTF8(const char16_t* src, yycc_u8string& dst) { + CONVFCT_TYPE2(UTF16ToUTF8, char16_t, yycc_char8_t); + } + yycc_u8string UTF16ToUTF8(const std::u16string& src) { + CONVFCT_TYPE3(UTF16ToUTF8, char16_t, yycc_char8_t); + } + yycc_u8string UTF16ToUTF8(const char16_t* src) { + CONVFCT_TYPE4(UTF16ToUTF8, char16_t, yycc_char8_t); + } + +#pragma endregion + +#pragma region UTF8ToUTF32 + + bool UTF8ToUTF32(const yycc_u8string& src, std::u32string& dst) { + return UTF8ToUTFOther(src, dst); + } + bool UTF8ToUTF32(const yycc_char8_t* src, std::u32string& dst) { + CONVFCT_TYPE2(UTF8ToUTF32, yycc_char8_t, char32_t); + } + std::u32string UTF8ToUTF32(const yycc_u8string& src) { + CONVFCT_TYPE3(UTF8ToUTF32, yycc_char8_t, char32_t); + } + std::u32string UTF8ToUTF32(const yycc_char8_t* src) { + CONVFCT_TYPE4(UTF8ToUTF32, yycc_char8_t, char32_t); + } + +#pragma endregion + +#pragma region UTF32ToUTF8 + + bool UTF32ToUTF8(const std::u32string& src, yycc_u8string& dst) { + return UTFOtherToUTF8(src, dst); + } + bool UTF32ToUTF8(const char32_t* src, yycc_u8string& dst) { + CONVFCT_TYPE2(UTF32ToUTF8, char32_t, yycc_char8_t); + } + yycc_u8string UTF32ToUTF8(const std::u32string& src) { + CONVFCT_TYPE3(UTF32ToUTF8, char32_t, yycc_char8_t); + } + yycc_u8string UTF32ToUTF8(const char32_t* src) { + CONVFCT_TYPE4(UTF32ToUTF8, char32_t, yycc_char8_t); + } + +#pragma endregion + +#undef CONVFCT_TYPE2 +#undef CONVFCT_TYPE3 +#undef CONVFCT_TYPE4 + } diff --git a/src/EncodingHelper.hpp b/src/EncodingHelper.hpp index fb3ffc1..5b6fa4b 100644 --- a/src/EncodingHelper.hpp +++ b/src/EncodingHelper.hpp @@ -21,20 +21,20 @@ * \li \c UTF8: UTF8 string. * \li \c Wchar: wchar_t string. * \par - * For example: \c WcharToUTF8 will perform the convertion from wchar_t to UTF8, + * For example: \c WcharToUTF8 will perform the convertion from wchar_t to UTF8, * and \c CharToChar will perform the convertion between 2 code-page-based string and caller can specify individual code page for these 2 string. * \par * These functions are Windows specific and are unavailable on other platforms. * Becasue Windows use wchar_t string as its function arguments for globalization, and this library use UTF8 everywhere. * So it should have a bidirectional way to do convertion between wchar_t string and UTF8 string. - * + * * \par UTF32, UTF16 and UTF8 Convertion * This namespace also provide the convertion among UTF32, UTF16 and UTF8. * These convertion functions are suit for all platforms, not Windows oriented. * \par * Due to implementation, this library assume all non-Windows system use UTF8 as their C locale. * Otherwise these functions will produce wrong result. - * + * * \par Function Parameters * We provide these encoding convertion functions with following 2 types: * \li Function returns \c bool and its parameter order source string pointer and a corresponding \c std::basic_string container for receiving result. @@ -46,35 +46,59 @@ * First declaration will return false to indicate there is an error when doing convertion. Please note that the content of string container passing in may still be changed! * Last declaration will return empty string to indicate error. Please note if you pass empty string in, they still will output empty string but it doesn't mean an error. * So last declaration is used in the scenario that we don't care whether the convertion success did. For example, output something to console. - * + * */ namespace YYCC::EncodingHelper { #if YYCC_OS == YYCC_OS_WINDOWS - bool WcharToChar(const wchar_t* src, std::string& dest, UINT codepage); - bool WcharToUTF8(const wchar_t* src, std::string& dest); - std::string WcharToChar(const wchar_t* src, UINT codepage); - std::string WcharToUTF8(const wchar_t* src); + bool WcharToChar(const std::wstring& src, std::string& dst, UINT code_page); + bool WcharToChar(const wchar_t* src, std::string& dst, UINT code_page); + std::string WcharToChar(const std::wstring& src, UINT code_page); + std::string WcharToChar(const wchar_t* src, UINT code_page); - bool CharToWchar(const char* src, std::wstring& dest, UINT codepage); - bool UTF8ToWchar(const char* src, std::wstring& dest); - std::wstring CharToWchar(const char* src, UINT codepage); - std::wstring UTF8ToWchar(const char* src); + bool CharToWchar(const std::string& src, std::wstring& dst, UINT code_page); + bool CharToWchar(const char* src, std::wstring& dst, UINT code_page); + std::wstring CharToWchar(const std::string& src, UINT code_page); + std::wstring CharToWchar(const char* src, UINT code_page); - bool CharToChar(const char* src, std::string& dest, UINT src_codepage, UINT dest_codepage); - std::string CharToChar(const char* src, UINT src_codepage, UINT dest_codepage); + bool CharToChar(const std::string& src, std::string& dst, UINT src_code_page, UINT dst_code_page); + bool CharToChar(const char* src, std::string& dst, UINT src_code_page, UINT dst_code_page); + std::string CharToChar(const std::string& src, UINT src_code_page, UINT dst_code_page); + std::string CharToChar(const char* src, UINT src_code_page, UINT dst_code_page); + + + bool WcharToUTF8(const std::wstring& src, yycc_u8string& dst); + bool WcharToUTF8(const wchar_t* src, yycc_u8string& dst); + yycc_u8string WcharToUTF8(const std::wstring& src); + yycc_u8string WcharToUTF8(const wchar_t* src); + + bool UTF8ToWchar(const yycc_u8string& src, std::wstring& dst); + bool UTF8ToWchar(const yycc_char8_t* src, std::wstring& dst); + std::wstring UTF8ToWchar(const yycc_u8string& src); + std::wstring UTF8ToWchar(const yycc_char8_t* src); #endif - bool UTF8ToUTF16(const char* src, std::u16string& dest); - std::u16string UTF8ToUTF16(const char* src); - bool UTF8ToUTF32(const char* src, std::u32string& dest); - std::u32string UTF8ToUTF32(const char* src); + bool UTF8ToUTF16(const yycc_u8string& src, std::u16string& dst); + bool UTF8ToUTF16(const yycc_char8_t* src, std::u16string& dst); + std::u16string UTF8ToUTF16(const yycc_u8string& src); + std::u16string UTF8ToUTF16(const yycc_char8_t* src); - bool UTF16ToUTF8(const char16_t* src, std::string& dest); - std::string UTF16ToUTF8(const char16_t* src); - bool UTF32ToUTF8(const char32_t* src, std::string& dest); - std::string UTF32ToUTF8(const char32_t* src); + bool UTF16ToUTF8(const std::u16string& src, yycc_u8string& dst); + bool UTF16ToUTF8(const char16_t* src, yycc_u8string& dst); + yycc_u8string UTF16ToUTF8(const std::u16string& src); + yycc_u8string UTF16ToUTF8(const char16_t* src); + + + bool UTF8ToUTF32(const yycc_u8string& src, std::u32string& dst); + bool UTF8ToUTF32(const yycc_char8_t* src, std::u32string& dst); + std::u32string UTF8ToUTF32(const yycc_u8string& src); + std::u32string UTF8ToUTF32(const yycc_char8_t* src); + + bool UTF32ToUTF8(const std::u32string& src, yycc_u8string& dst); + bool UTF32ToUTF8(const char32_t* src, yycc_u8string& dst); + yycc_u8string UTF32ToUTF8(const std::u32string& src); + yycc_u8string UTF32ToUTF8(const char32_t* src); } diff --git a/src/YYCCInternal.hpp b/src/YYCCInternal.hpp index 3cdc8b3..94a7451 100644 --- a/src/YYCCInternal.hpp +++ b/src/YYCCInternal.hpp @@ -24,14 +24,23 @@ #endif -//// Decide the char type we used -//#include -//namespace YYCC { -//#if defined(__cpp_char8_t) -// using u8char = char8_t; -// using u8string = std::std::string -//#else -// using u8char = char; -// using u8string = std::string; -//#endif -//} +// Define the UTF8 char type we used. +// Also define an universal macro to create UTF8 string literal. +// And do a polyfill if no embedded char8_t type. +#include +namespace YYCC { +#if defined(__cpp_char8_t) + using yycc_char8_t = char8_t; + using yycc_u8string = std::u8string; + +#define _YYCC_U8(strl) u8 ## strl +#define YYCC_U8(strl) (_YYCC_U8(strl)) +#else + using yycc_char8_t = unsigned char; + using yycc_u8string = std::basic_string; + +#define _YYCC_U8(strl) u8 ## strl +#define YYCC_U8(strl) (reinterpret_cast(_YYCC_U8(strl))) +#endif +} +