diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8d3ba79..869d99f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,10 @@ PRIVATE yycc/string/op.cpp yycc/rust/panic.cpp yycc/patch/path.cpp + yycc/encoding/utf.cpp + yycc/encoding/windows.cpp + yycc/encoding/iconv.cpp + yycc/encoding/united_codec.cpp # YYCC/COMHelper.cpp # YYCC/ArgParser.cpp # YYCC/ConfigManager.cpp @@ -61,6 +65,10 @@ FILES yycc/patch/path.hpp yycc/patch/contains.hpp yycc/patch/starts_ends_with.hpp + yycc/encoding/utf.hpp + yycc/encoding/windows.hpp + yycc/encoding/iconv.hpp + yycc/encoding/united_codec.hpp # # Headers # # Common headers diff --git a/src/YYCCLegacy/EncodingHelper.cpp b/src/YYCCLegacy/EncodingHelper.cpp index bb5a992..8938938 100644 --- a/src/YYCCLegacy/EncodingHelper.cpp +++ b/src/YYCCLegacy/EncodingHelper.cpp @@ -4,36 +4,6 @@ namespace YYCC::EncodingHelper { -#pragma region UTF8 Ordinary Convertion - - const yycc_char8_t* ToUTF8(const char* src) { - return reinterpret_cast(src); - } - yycc_char8_t* ToUTF8(char* src) { - return reinterpret_cast(src); - } - yycc_u8string ToUTF8(const std::string_view& src) { - return yycc_u8string(reinterpret_cast(src.data()), src.size()); - } - yycc_u8string_view ToUTF8View(const std::string_view& src) { - return yycc_u8string_view(reinterpret_cast(src.data()), src.size()); - } - - const char* ToOrdinary(const yycc_char8_t* src) { - return reinterpret_cast(src); - } - char* ToOrdinary(yycc_char8_t* src) { - return reinterpret_cast(src); - } - std::string ToOrdinary(const yycc_u8string_view& src) { - return std::string(reinterpret_cast(src.data()), src.size()); - } - std::string_view ToOrdinaryView(const yycc_u8string_view& src) { - return std::string_view(reinterpret_cast(src.data()), src.size()); - } - -#pragma endregion - /* Define some assistant macros for easy writing. */ #define CONVFCT_TYPE2(fct_name, src_char_type, dst_char_type, ...) 
if (src == nullptr) return false; \ @@ -231,176 +201,6 @@ return ret; #endif - -#pragma region UTF8 UTF16 UTF32 Help Funcs - - /* - According to the documentation introduced in CppReference. - The standard library is guaranteed to provide several specific specializations of \c std::codecvt. - The UTF8 char type in UTF8 related specializations of \c std::codecvt is different. - It is also independend from we defined \c yycc_char8_t. - So it is essential define a type which can correctly trigger specific specializations of \c std::codecv in there. - */ -#if defined(__cpp_char8_t) - using CodecvtUTF8Char_t = char8_t; -#else - using CodecvtUTF8Char_t = char; -#endif - - template || std::is_same_v<_TChar, char32_t>, int> = 0> - using CodecvtFacet_t = std::codecvt<_TChar, CodecvtUTF8Char_t, std::mbstate_t>; - - template || std::is_same_v<_TChar, char32_t>, int> = 0> - static bool UTF8ToUTFOther(const yycc_u8string_view& src, std::basic_string<_TChar>& dst) { - // Reference: - // https://zh.cppreference.com/w/cpp/locale/codecvt/in - - // if src is empty, return directly - if (src.empty()) { - dst.clear(); - return true; - } - - // init locale and get codecvt facet - // same reason in UTFOtherToUTF8 to keeping reference to locale - const auto& this_locale = std::locale::classic(); - const auto& this_codecvt = std::use_facet>(this_locale); - - // convertion preparation - std::mbstate_t mb{}; - dst.resize(src.size()); - const CodecvtUTF8Char_t* intern_from = reinterpret_cast(src.data()), - *intern_from_end = reinterpret_cast(src.data() + src.size()), - *intern_from_next = nullptr; - _TChar* extern_to = dst.data(), - *extern_to_end = dst.data() + dst.size(), - *extern_to_next = nullptr; - // do convertion - auto result = this_codecvt.in( - mb, - intern_from, intern_from_end, intern_from_next, - extern_to, extern_to_end, extern_to_next - ); - - // check result - if (result != CodecvtFacet_t<_TChar>::ok) - return false; - // resize result and return - dst.resize(extern_to_next 
- dst.data()); - return true; - } - - template || std::is_same_v<_TChar, char32_t>, int> = 0> - static bool UTFOtherToUTF8(const std::basic_string_view<_TChar>& src, yycc_u8string& dst) { - // Reference: - // https://zh.cppreference.com/w/cpp/locale/codecvt/out - - // if src is empty, return directly - if (src.empty()) { - dst.clear(); - return true; - } - - // init locale and get codecvt facet - // the reference to locale must be preserved until convertion done. - // because the life time of codecvt facet is equal to the reference to locale. - const auto& this_locale = std::locale::classic(); - const auto& this_codecvt = std::use_facet>(this_locale); - - // do convertion preparation - std::mbstate_t mb{}; - dst.resize(src.size() * this_codecvt.max_length()); - const _TChar* intern_from = src.data(), - *intern_from_end = src.data() + src.size(), - *intern_from_next = nullptr; - CodecvtUTF8Char_t* extern_to = reinterpret_cast(dst.data()), - *extern_to_end = reinterpret_cast(dst.data() + dst.size()), - *extern_to_next = nullptr; - // do convertion - auto result = this_codecvt.out( - mb, - intern_from, intern_from_end, intern_from_next, - extern_to, extern_to_end, extern_to_next - ); - - // check result - if (result != CodecvtFacet_t<_TChar>::ok) - return false; - // resize result and retuen - dst.resize(extern_to_next - reinterpret_cast(dst.data())); - return true; - } - -#pragma endregion - -#pragma region UTF8ToUTF16 - - bool UTF8ToUTF16(const yycc_u8string_view& src, std::u16string& dst) { - return UTF8ToUTFOther(src, dst); - } - bool UTF8ToUTF16(const yycc_char8_t* src, std::u16string& dst) { - CONVFCT_TYPE2(UTF8ToUTF16, yycc_char8_t, char16_t); - } - std::u16string UTF8ToUTF16(const yycc_u8string_view& src) { - CONVFCT_TYPE3(UTF8ToUTF16, yycc_char8_t, char16_t); - } - std::u16string UTF8ToUTF16(const yycc_char8_t* src) { - CONVFCT_TYPE4(UTF8ToUTF16, yycc_char8_t, char16_t); - } - -#pragma endregion - -#pragma region UTF16ToUTF8 - - bool UTF16ToUTF8(const 
std::u16string_view& src, yycc_u8string& dst) { - return UTFOtherToUTF8(src, dst); - } - bool UTF16ToUTF8(const char16_t* src, yycc_u8string& dst) { - CONVFCT_TYPE2(UTF16ToUTF8, char16_t, yycc_char8_t); - } - yycc_u8string UTF16ToUTF8(const std::u16string_view& src) { - CONVFCT_TYPE3(UTF16ToUTF8, char16_t, yycc_char8_t); - } - yycc_u8string UTF16ToUTF8(const char16_t* src) { - CONVFCT_TYPE4(UTF16ToUTF8, char16_t, yycc_char8_t); - } - -#pragma endregion - -#pragma region UTF8ToUTF32 - - bool UTF8ToUTF32(const yycc_u8string_view& src, std::u32string& dst) { - return UTF8ToUTFOther(src, dst); - } - bool UTF8ToUTF32(const yycc_char8_t* src, std::u32string& dst) { - CONVFCT_TYPE2(UTF8ToUTF32, yycc_char8_t, char32_t); - } - std::u32string UTF8ToUTF32(const yycc_u8string_view& src) { - CONVFCT_TYPE3(UTF8ToUTF32, yycc_char8_t, char32_t); - } - std::u32string UTF8ToUTF32(const yycc_char8_t* src) { - CONVFCT_TYPE4(UTF8ToUTF32, yycc_char8_t, char32_t); - } - -#pragma endregion - -#pragma region UTF32ToUTF8 - - bool UTF32ToUTF8(const std::u32string_view& src, yycc_u8string& dst) { - return UTFOtherToUTF8(src, dst); - } - bool UTF32ToUTF8(const char32_t* src, yycc_u8string& dst) { - CONVFCT_TYPE2(UTF32ToUTF8, char32_t, yycc_char8_t); - } - yycc_u8string UTF32ToUTF8(const std::u32string_view& src) { - CONVFCT_TYPE3(UTF32ToUTF8, char32_t, yycc_char8_t); - } - yycc_u8string UTF32ToUTF8(const char32_t* src) { - CONVFCT_TYPE4(UTF32ToUTF8, char32_t, yycc_char8_t); - } - -#pragma endregion - #undef CONVFCT_TYPE2 #undef CONVFCT_TYPE3 #undef CONVFCT_TYPE4 diff --git a/src/YYCCLegacy/EncodingHelper.hpp b/src/YYCCLegacy/EncodingHelper.hpp index 4e05d9d..7c95f31 100644 --- a/src/YYCCLegacy/EncodingHelper.hpp +++ b/src/YYCCLegacy/EncodingHelper.hpp @@ -17,20 +17,6 @@ */ namespace YYCC::EncodingHelper { -#define _YYCC_U8(strl) u8 ## strl ///< The assistant macro for YYCC_U8. -#define YYCC_U8(strl) (reinterpret_cast(_YYCC_U8(strl))) ///< The macro for creating UTF8 string literal. 
See \ref library_encoding. -#define YYCC_U8_CHAR(chr) (static_cast(chr)) ///< The macro for casting ordinary char type into YYCC UTF8 char type. - - const yycc_char8_t* ToUTF8(const char* src); - yycc_char8_t* ToUTF8(char* src); - yycc_u8string ToUTF8(const std::string_view& src); - yycc_u8string_view ToUTF8View(const std::string_view& src); - - const char* ToOrdinary(const yycc_char8_t* src); - char* ToOrdinary(yycc_char8_t* src); - std::string ToOrdinary(const yycc_u8string_view& src); - std::string_view ToOrdinaryView(const yycc_u8string_view& src); - #if YYCC_OS == YYCC_OS_WINDOWS bool WcharToChar(const std::wstring_view& src, std::string& dst, UINT code_page); @@ -71,25 +57,4 @@ namespace YYCC::EncodingHelper { #endif - bool UTF8ToUTF16(const yycc_u8string_view& src, std::u16string& dst); - bool UTF8ToUTF16(const yycc_char8_t* src, std::u16string& dst); - std::u16string UTF8ToUTF16(const yycc_u8string_view& src); - std::u16string UTF8ToUTF16(const yycc_char8_t* src); - - bool UTF16ToUTF8(const std::u16string_view& src, yycc_u8string& dst); - bool UTF16ToUTF8(const char16_t* src, yycc_u8string& dst); - yycc_u8string UTF16ToUTF8(const std::u16string_view& src); - yycc_u8string UTF16ToUTF8(const char16_t* src); - - - bool UTF8ToUTF32(const yycc_u8string_view& src, std::u32string& dst); - bool UTF8ToUTF32(const yycc_char8_t* src, std::u32string& dst); - std::u32string UTF8ToUTF32(const yycc_u8string_view& src); - std::u32string UTF8ToUTF32(const yycc_char8_t* src); - - bool UTF32ToUTF8(const std::u32string_view& src, yycc_u8string& dst); - bool UTF32ToUTF8(const char32_t* src, yycc_u8string& dst); - yycc_u8string UTF32ToUTF8(const std::u32string_view& src); - yycc_u8string UTF32ToUTF8(const char32_t* src); - } diff --git a/src/yycc/encoding/utf.cpp b/src/yycc/encoding/utf.cpp new file mode 100644 index 0000000..06f7804 --- /dev/null +++ b/src/yycc/encoding/utf.cpp @@ -0,0 +1,201 @@ +#include "utf.hpp" +#include "../macro/feature_probe.hpp" +#include + +#define 
NS_YYCC_STRING ::yycc::string + +namespace yycc::encoding::utf { + +#pragma region Generic Converter + + /* + * NOTE: + * According to the documentation introduced in CppReference. + * The standard library is guaranteed to provide several specific specializations of \c std::codecvt. + * The UTF8 char type in UTF8 related specializations of \c std::codecvt is different. + * It is also independent of the char type we define. + * So it is essential to define a type which can correctly trigger the specific specializations of \c std::codecvt here. + */ + +#if defined(YYCC_CPPFEAT_UTF8) + using CodecvtUtf8Char = char8_t; +#else + using CodecvtUtf8Char = char; +#endif + + template || std::is_same_v, int> + = 0> + using CodecvtFacet = std::codecvt; + + template || std::is_same_v, int> + = 0> + static ConvResult> generic_to_utf_other( + const NS_YYCC_STRING::u8string_view& src) { + // Reference: + // https://en.cppreference.com/w/cpp/locale/codecvt/in + + // prepare return value + std::basic_string dst; + + // if src is empty, return directly + if (src.empty()) { + return dst; + } + + // init locale and get codecvt facet + // same reason as in generic_to_utf8: keep a reference to the locale + const auto& this_locale = std::locale::classic(); + const auto& this_codecvt = std::use_facet>(this_locale); + + // conversion preparation + std::mbstate_t mb{}; + dst.resize(src.size()); + const CodecvtUtf8Char *intern_from = reinterpret_cast(src.data()), + *intern_from_end = reinterpret_cast( + src.data() + src.size()), + *intern_from_next = nullptr; + TChar *extern_to = dst.data(), *extern_to_end = dst.data() + dst.size(), + *extern_to_next = nullptr; + // do conversion + auto result = this_codecvt.in(mb, + intern_from, + intern_from_end, + intern_from_next, + extern_to, + extern_to_end, + extern_to_next); + + // check result + if (result != CodecvtFacet::ok) return ConvError(); + // resize result and return + dst.resize(extern_to_next - dst.data()); + return dst; + } + + template ||
std::is_same_v, int> + = 0> + static ConvResult generic_to_utf8( + const std::basic_string_view& src) { + // Reference: + // https://en.cppreference.com/w/cpp/locale/codecvt/out + + // prepare return value + NS_YYCC_STRING::u8string dst; + + // if src is empty, return directly + if (src.empty()) { + return dst; + } + + // init locale and get codecvt facet + // the reference to the locale must be preserved until the conversion is done, + // because the lifetime of the codecvt facet is equal to that of the reference to the locale. + const auto& this_locale = std::locale::classic(); + const auto& this_codecvt = std::use_facet>(this_locale); + + // do conversion preparation + std::mbstate_t mb{}; + dst.resize(src.size() * this_codecvt.max_length()); + const TChar *intern_from = src.data(), *intern_from_end = src.data() + src.size(), + *intern_from_next = nullptr; + CodecvtUtf8Char *extern_to = reinterpret_cast(dst.data()), + *extern_to_end = reinterpret_cast(dst.data() + dst.size()), + *extern_to_next = nullptr; + // do conversion + auto result = this_codecvt.out(mb, + intern_from, + intern_from_end, + intern_from_next, + extern_to, + extern_to_end, + extern_to_next); + + // check result + if (result != CodecvtFacet::ok) return ConvError(); + // resize result and return + dst.resize(extern_to_next - reinterpret_cast(dst.data())); + return dst; + } + +#pragma endregion + +#pragma region Help Macros + +#define CONVFN_TYPE1(fct_name, src_char_type, dst_char_type) \ + auto rv = priv_##fct_name(src); \ + if (const auto* ptr = std::get_if>(&rv)) { \ + dst = std::move(*ptr); \ + return true; \ + } else if (const auto* ptr = std::get_if(&rv)) { \ + return false; \ + } else { \ + throw std::runtime_error("unreachable code"); \ + } + +#define CONVFN_TYPE2(fct_name, src_char_type, dst_char_type) \ + std::basic_string rv; \ + if (fct_name(src, rv)) return rv; \ + else throw std::runtime_error("fail to convert utf string"); + +#pragma endregion + +#pragma region UTF8 -> UTF16 + + ConvResult
priv_to_utf16(const NS_YYCC_STRING::u8string_view& src) { + return generic_to_utf_other(src); + } + bool to_utf16(const NS_YYCC_STRING::u8string_view& src, std::u16string& dst) { + CONVFN_TYPE1(to_utf16, NS_YYCC_STRING::u8char, char16_t); + } + std::u16string to_utf16(const NS_YYCC_STRING::u8string_view& src) { + CONVFN_TYPE2(to_utf16, NS_YYCC_STRING::u8char, char16_t); + } + +#pragma endregion + +#pragma region UTF16 -> UTF8 + + ConvResult priv_to_utf8(const std::u16string_view& src) { + return generic_to_utf8(src); + } + bool to_utf8(const std::u16string_view& src, NS_YYCC_STRING::u8string& dst) { + CONVFN_TYPE1(to_utf8, char16_t, NS_YYCC_STRING::u8char); + } + NS_YYCC_STRING::u8string to_utf8(const std::u16string_view& src) { + CONVFN_TYPE2(to_utf8, char16_t, NS_YYCC_STRING::u8char); + } + +#pragma endregion + +#pragma region UTF8 -> UTF32 + + ConvResult priv_to_utf32(const NS_YYCC_STRING::u8string_view& src) { + return generic_to_utf_other(src); + } + bool to_utf32(const NS_YYCC_STRING::u8string_view& src, std::u32string& dst) { + CONVFN_TYPE1(to_utf32, NS_YYCC_STRING::u8char, char32_t); + } + std::u32string to_utf32(const NS_YYCC_STRING::u8string_view& src) { + CONVFN_TYPE2(to_utf32, NS_YYCC_STRING::u8char, char32_t); + } + +#pragma endregion + +#pragma region UTF32 -> UTF8 + + ConvResult priv_to_utf8(const std::u32string_view& src) { + return generic_to_utf8(src); + } + bool to_utf8(const std::u32string_view& src, NS_YYCC_STRING::u8string& dst) { + CONVFN_TYPE1(to_utf8, char32_t, NS_YYCC_STRING::u8char); + } + NS_YYCC_STRING::u8string to_utf8(const std::u32string_view& src) { + CONVFN_TYPE2(to_utf8, char32_t, NS_YYCC_STRING::u8char); + } + +#pragma endregion + +} // namespace yycc::encoding::utf diff --git a/src/yycc/encoding/utf.hpp b/src/yycc/encoding/utf.hpp index e69de29..892919a 100644 --- a/src/yycc/encoding/utf.hpp +++ b/src/yycc/encoding/utf.hpp @@ -0,0 +1,43 @@ +#pragma once +#include +#include +#include + +#define NS_YYCC_STRING ::yycc::string + 
+namespace yycc::encoding::utf { + + /// @private + struct ConvError {}; + + /// @private + template, int> = 0> + using ConvResult = std::variant; + + // UTF8 -> UTF16 + + ConvResult priv_to_utf16(const NS_YYCC_STRING::u8string_view& src); + bool to_utf16(const NS_YYCC_STRING::u8string_view& src, std::u16string& dst); + std::u16string to_utf16(const NS_YYCC_STRING::u8string_view& src); + + // UTF16 -> UTF8 + + ConvResult priv_to_utf8(const std::u16string_view& src); + bool to_utf8(const std::u16string_view& src, NS_YYCC_STRING::u8string& dst); + NS_YYCC_STRING::u8string to_utf8(const std::u16string_view& src); + + // UTF8 -> UTF32 + + ConvResult priv_to_utf32(const NS_YYCC_STRING::u8string_view& src); + bool to_utf32(const NS_YYCC_STRING::u8string_view& src, std::u32string& dst); + std::u32string to_utf32(const NS_YYCC_STRING::u8string_view& src); + + // UTF32 -> UTF8 + + ConvResult priv_to_utf8(const std::u32string_view& src); + bool to_utf8(const std::u32string_view& src, NS_YYCC_STRING::u8string& dst); + NS_YYCC_STRING::u8string to_utf8(const std::u32string_view& src); + +} + +#undef NS_YYCC_STRING diff --git a/src/yycc/macro/feature_probe.hpp b/src/yycc/macro/feature_probe.hpp index b5cae23..12dded6 100644 --- a/src/yycc/macro/feature_probe.hpp +++ b/src/yycc/macro/feature_probe.hpp @@ -18,6 +18,11 @@ // ===== C++ Features ===== +// Check whether there is support of UTF8 string system. +#if defined(__cpp_char8_t) || defined(YYCC_CPPFEAT_GE_CPP20) + #define YYCC_CPPFEAT_UTF8 +#endif + // Check whether there is support of `contains` for `set` and `map` including their varients. #if defined(YYCC_CPPFEAT_GE_CPP20) #define YYCC_CPPFEAT_CONTAINS diff --git a/src/yycc/string.hpp b/src/yycc/string.hpp index 6118f07..64f0d13 100644 --- a/src/yycc/string.hpp +++ b/src/yycc/string.hpp @@ -3,6 +3,7 @@ // Define the UTF8 char type we used. // And do a polyfill if no embedded char8_t type. 
+#include "macro/feature_probe.hpp" #include #include @@ -29,7 +30,7 @@ namespace yycc::string { It is equal to \c std::u8string_view if your current C++ standard support it. */ -#if defined(__cpp_char8_t) +#if defined(YYCC_CPPFEAT_UTF8) using u8char = char8_t; using u8string = std::u8string; using u8string_view = std::u8string_view;