refactor: add utf convertion namespace

This commit is contained in:
2025-07-02 10:36:33 +08:00
parent 6e884d865d
commit cec6091996
7 changed files with 259 additions and 236 deletions

View File

@ -15,6 +15,10 @@ PRIVATE
yycc/string/op.cpp
yycc/rust/panic.cpp
yycc/patch/path.cpp
yycc/encoding/utf.cpp
yycc/encoding/windows.cpp
yycc/encoding/iconv.cpp
yycc/encoding/united_codec.cpp
# YYCC/COMHelper.cpp
# YYCC/ArgParser.cpp
# YYCC/ConfigManager.cpp
@ -61,6 +65,10 @@ FILES
yycc/patch/path.hpp
yycc/patch/contains.hpp
yycc/patch/starts_ends_with.hpp
yycc/encoding/utf.hpp
yycc/encoding/windows.hpp
yycc/encoding/iconv.hpp
yycc/encoding/united_codec.hpp
# # Headers
# # Common headers

View File

@ -4,36 +4,6 @@
namespace YYCC::EncodingHelper {
#pragma region UTF8 Ordinary Convertion
const yycc_char8_t* ToUTF8(const char* src) {
return reinterpret_cast<const yycc_char8_t*>(src);
}
yycc_char8_t* ToUTF8(char* src) {
return reinterpret_cast<yycc_char8_t*>(src);
}
yycc_u8string ToUTF8(const std::string_view& src) {
return yycc_u8string(reinterpret_cast<const yycc_char8_t*>(src.data()), src.size());
}
yycc_u8string_view ToUTF8View(const std::string_view& src) {
return yycc_u8string_view(reinterpret_cast<const yycc_char8_t*>(src.data()), src.size());
}
const char* ToOrdinary(const yycc_char8_t* src) {
return reinterpret_cast<const char*>(src);
}
char* ToOrdinary(yycc_char8_t* src) {
return reinterpret_cast<char*>(src);
}
std::string ToOrdinary(const yycc_u8string_view& src) {
return std::string(reinterpret_cast<const char*>(src.data()), src.size());
}
std::string_view ToOrdinaryView(const yycc_u8string_view& src) {
return std::string_view(reinterpret_cast<const char*>(src.data()), src.size());
}
#pragma endregion
/* Define some assistant macros for easy writing. */
#define CONVFCT_TYPE2(fct_name, src_char_type, dst_char_type, ...) if (src == nullptr) return false; \
@ -231,176 +201,6 @@ return ret;
#endif
#pragma region UTF8 UTF16 UTF32 Help Funcs
/*
According to the documentation introduced in CppReference.
The standard library is guaranteed to provide several specific specializations of \c std::codecvt.
The UTF8 char type in UTF8 related specializations of \c std::codecvt is different.
It is also independend from we defined \c yycc_char8_t.
So it is essential define a type which can correctly trigger specific specializations of \c std::codecv in there.
*/
#if defined(__cpp_char8_t)
using CodecvtUTF8Char_t = char8_t;
#else
using CodecvtUTF8Char_t = char;
#endif
template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
using CodecvtFacet_t = std::codecvt<_TChar, CodecvtUTF8Char_t, std::mbstate_t>;
template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
static bool UTF8ToUTFOther(const yycc_u8string_view& src, std::basic_string<_TChar>& dst) {
// Reference:
// https://zh.cppreference.com/w/cpp/locale/codecvt/in
// if src is empty, return directly
if (src.empty()) {
dst.clear();
return true;
}
// init locale and get codecvt facet
// same reason in UTFOtherToUTF8 to keeping reference to locale
const auto& this_locale = std::locale::classic();
const auto& this_codecvt = std::use_facet<CodecvtFacet_t<_TChar>>(this_locale);
// convertion preparation
std::mbstate_t mb{};
dst.resize(src.size());
const CodecvtUTF8Char_t* intern_from = reinterpret_cast<const CodecvtUTF8Char_t*>(src.data()),
*intern_from_end = reinterpret_cast<const CodecvtUTF8Char_t*>(src.data() + src.size()),
*intern_from_next = nullptr;
_TChar* extern_to = dst.data(),
*extern_to_end = dst.data() + dst.size(),
*extern_to_next = nullptr;
// do convertion
auto result = this_codecvt.in(
mb,
intern_from, intern_from_end, intern_from_next,
extern_to, extern_to_end, extern_to_next
);
// check result
if (result != CodecvtFacet_t<_TChar>::ok)
return false;
// resize result and return
dst.resize(extern_to_next - dst.data());
return true;
}
template<typename _TChar, std::enable_if_t<std::is_same_v<_TChar, char16_t> || std::is_same_v<_TChar, char32_t>, int> = 0>
static bool UTFOtherToUTF8(const std::basic_string_view<_TChar>& src, yycc_u8string& dst) {
// Reference:
// https://zh.cppreference.com/w/cpp/locale/codecvt/out
// if src is empty, return directly
if (src.empty()) {
dst.clear();
return true;
}
// init locale and get codecvt facet
// the reference to locale must be preserved until convertion done.
// because the life time of codecvt facet is equal to the reference to locale.
const auto& this_locale = std::locale::classic();
const auto& this_codecvt = std::use_facet<CodecvtFacet_t<_TChar>>(this_locale);
// do convertion preparation
std::mbstate_t mb{};
dst.resize(src.size() * this_codecvt.max_length());
const _TChar* intern_from = src.data(),
*intern_from_end = src.data() + src.size(),
*intern_from_next = nullptr;
CodecvtUTF8Char_t* extern_to = reinterpret_cast<CodecvtUTF8Char_t*>(dst.data()),
*extern_to_end = reinterpret_cast<CodecvtUTF8Char_t*>(dst.data() + dst.size()),
*extern_to_next = nullptr;
// do convertion
auto result = this_codecvt.out(
mb,
intern_from, intern_from_end, intern_from_next,
extern_to, extern_to_end, extern_to_next
);
// check result
if (result != CodecvtFacet_t<_TChar>::ok)
return false;
// resize result and retuen
dst.resize(extern_to_next - reinterpret_cast<CodecvtUTF8Char_t*>(dst.data()));
return true;
}
#pragma endregion
#pragma region UTF8ToUTF16
bool UTF8ToUTF16(const yycc_u8string_view& src, std::u16string& dst) {
return UTF8ToUTFOther<char16_t>(src, dst);
}
bool UTF8ToUTF16(const yycc_char8_t* src, std::u16string& dst) {
CONVFCT_TYPE2(UTF8ToUTF16, yycc_char8_t, char16_t);
}
std::u16string UTF8ToUTF16(const yycc_u8string_view& src) {
CONVFCT_TYPE3(UTF8ToUTF16, yycc_char8_t, char16_t);
}
std::u16string UTF8ToUTF16(const yycc_char8_t* src) {
CONVFCT_TYPE4(UTF8ToUTF16, yycc_char8_t, char16_t);
}
#pragma endregion
#pragma region UTF16ToUTF8
bool UTF16ToUTF8(const std::u16string_view& src, yycc_u8string& dst) {
return UTFOtherToUTF8<char16_t>(src, dst);
}
bool UTF16ToUTF8(const char16_t* src, yycc_u8string& dst) {
CONVFCT_TYPE2(UTF16ToUTF8, char16_t, yycc_char8_t);
}
yycc_u8string UTF16ToUTF8(const std::u16string_view& src) {
CONVFCT_TYPE3(UTF16ToUTF8, char16_t, yycc_char8_t);
}
yycc_u8string UTF16ToUTF8(const char16_t* src) {
CONVFCT_TYPE4(UTF16ToUTF8, char16_t, yycc_char8_t);
}
#pragma endregion
#pragma region UTF8ToUTF32
bool UTF8ToUTF32(const yycc_u8string_view& src, std::u32string& dst) {
return UTF8ToUTFOther<char32_t>(src, dst);
}
bool UTF8ToUTF32(const yycc_char8_t* src, std::u32string& dst) {
CONVFCT_TYPE2(UTF8ToUTF32, yycc_char8_t, char32_t);
}
std::u32string UTF8ToUTF32(const yycc_u8string_view& src) {
CONVFCT_TYPE3(UTF8ToUTF32, yycc_char8_t, char32_t);
}
std::u32string UTF8ToUTF32(const yycc_char8_t* src) {
CONVFCT_TYPE4(UTF8ToUTF32, yycc_char8_t, char32_t);
}
#pragma endregion
#pragma region UTF32ToUTF8
bool UTF32ToUTF8(const std::u32string_view& src, yycc_u8string& dst) {
return UTFOtherToUTF8<char32_t>(src, dst);
}
bool UTF32ToUTF8(const char32_t* src, yycc_u8string& dst) {
CONVFCT_TYPE2(UTF32ToUTF8, char32_t, yycc_char8_t);
}
yycc_u8string UTF32ToUTF8(const std::u32string_view& src) {
CONVFCT_TYPE3(UTF32ToUTF8, char32_t, yycc_char8_t);
}
yycc_u8string UTF32ToUTF8(const char32_t* src) {
CONVFCT_TYPE4(UTF32ToUTF8, char32_t, yycc_char8_t);
}
#pragma endregion
#undef CONVFCT_TYPE2
#undef CONVFCT_TYPE3
#undef CONVFCT_TYPE4

View File

@ -17,20 +17,6 @@
*/
namespace YYCC::EncodingHelper {
#define _YYCC_U8(strl) u8 ## strl ///< The assistant macro for YYCC_U8.
#define YYCC_U8(strl) (reinterpret_cast<const ::YYCC::yycc_char8_t*>(_YYCC_U8(strl))) ///< The macro for creating UTF8 string literal. See \ref library_encoding.
#define YYCC_U8_CHAR(chr) (static_cast<YYCC::yycc_char8_t>(chr)) ///< The macro for casting ordinary char type into YYCC UTF8 char type.
const yycc_char8_t* ToUTF8(const char* src);
yycc_char8_t* ToUTF8(char* src);
yycc_u8string ToUTF8(const std::string_view& src);
yycc_u8string_view ToUTF8View(const std::string_view& src);
const char* ToOrdinary(const yycc_char8_t* src);
char* ToOrdinary(yycc_char8_t* src);
std::string ToOrdinary(const yycc_u8string_view& src);
std::string_view ToOrdinaryView(const yycc_u8string_view& src);
#if YYCC_OS == YYCC_OS_WINDOWS
bool WcharToChar(const std::wstring_view& src, std::string& dst, UINT code_page);
@ -71,25 +57,4 @@ namespace YYCC::EncodingHelper {
#endif
bool UTF8ToUTF16(const yycc_u8string_view& src, std::u16string& dst);
bool UTF8ToUTF16(const yycc_char8_t* src, std::u16string& dst);
std::u16string UTF8ToUTF16(const yycc_u8string_view& src);
std::u16string UTF8ToUTF16(const yycc_char8_t* src);
bool UTF16ToUTF8(const std::u16string_view& src, yycc_u8string& dst);
bool UTF16ToUTF8(const char16_t* src, yycc_u8string& dst);
yycc_u8string UTF16ToUTF8(const std::u16string_view& src);
yycc_u8string UTF16ToUTF8(const char16_t* src);
bool UTF8ToUTF32(const yycc_u8string_view& src, std::u32string& dst);
bool UTF8ToUTF32(const yycc_char8_t* src, std::u32string& dst);
std::u32string UTF8ToUTF32(const yycc_u8string_view& src);
std::u32string UTF8ToUTF32(const yycc_char8_t* src);
bool UTF32ToUTF8(const std::u32string_view& src, yycc_u8string& dst);
bool UTF32ToUTF8(const char32_t* src, yycc_u8string& dst);
yycc_u8string UTF32ToUTF8(const std::u32string_view& src);
yycc_u8string UTF32ToUTF8(const char32_t* src);
}

201
src/yycc/encoding/utf.cpp Normal file
View File

@ -0,0 +1,201 @@
#include "utf.hpp"
#include "../macro/feature_probe.hpp"
#include <locale>
#define NS_YYCC_STRING ::yycc::string
namespace yycc::encoding::utf {
#pragma region Generic Converter
/*
* NOTE:
* According to the documentation introduced in CppReference.
* The standard library is guaranteed to provide several specific specializations of \c std::codecvt.
* The UTF8 char type in UTF8 related specializations of \c std::codecvt is different.
* It is also independend from we defined char type.
* So it is essential define a type which can correctly trigger specific specializations of \c std::codecv in there.
*/
#if defined(YYCC_CPPFEAT_UTF8)
using CodecvtUtf8Char = char8_t;
#else
using CodecvtUtf8Char = char;
#endif
template<typename TChar,
std::enable_if_t<std::is_same_v<TChar, char16_t> || std::is_same_v<TChar, char32_t>, int>
= 0>
using CodecvtFacet = std::codecvt<TChar, CodecvtUtf8Char, std::mbstate_t>;
template<typename TChar,
std::enable_if_t<std::is_same_v<TChar, char16_t> || std::is_same_v<TChar, char32_t>, int>
= 0>
static ConvResult<std::basic_string<TChar>> generic_to_utf_other(
const NS_YYCC_STRING::u8string_view& src) {
// Reference:
// https://en.cppreference.com/w/cpp/locale/codecvt/in
// prepare return value
std::basic_string<TChar> dst;
// if src is empty, return directly
if (src.empty()) {
return dst;
}
// init locale and get codecvt facet
// same reason in UTFOtherToUTF8 to keeping reference to locale
const auto& this_locale = std::locale::classic();
const auto& this_codecvt = std::use_facet<CodecvtFacet<TChar>>(this_locale);
// convertion preparation
std::mbstate_t mb{};
dst.resize(src.size());
const CodecvtUtf8Char *intern_from = reinterpret_cast<const CodecvtUtf8Char*>(src.data()),
*intern_from_end = reinterpret_cast<const CodecvtUtf8Char*>(
src.data() + src.size()),
*intern_from_next = nullptr;
TChar *extern_to = dst.data(), *extern_to_end = dst.data() + dst.size(),
*extern_to_next = nullptr;
// do convertion
auto result = this_codecvt.in(mb,
intern_from,
intern_from_end,
intern_from_next,
extern_to,
extern_to_end,
extern_to_next);
// check result
if (result != CodecvtFacet<TChar>::ok) return ConvError();
// resize result and return
dst.resize(extern_to_next - dst.data());
return dst;
}
template<typename TChar,
std::enable_if_t<std::is_same_v<TChar, char16_t> || std::is_same_v<TChar, char32_t>, int>
= 0>
static ConvResult<NS_YYCC_STRING::u8string> generic_to_utf8(
const std::basic_string_view<TChar>& src) {
// Reference:
// https://en.cppreference.com/w/cpp/locale/codecvt/out
// prepare return value
NS_YYCC_STRING::u8string dst;
// if src is empty, return directly
if (src.empty()) {
return dst;
}
// init locale and get codecvt facet
// the reference to locale must be preserved until convertion done.
// because the life time of codecvt facet is equal to the reference to locale.
const auto& this_locale = std::locale::classic();
const auto& this_codecvt = std::use_facet<CodecvtFacet<TChar>>(this_locale);
// do convertion preparation
std::mbstate_t mb{};
dst.resize(src.size() * this_codecvt.max_length());
const TChar *intern_from = src.data(), *intern_from_end = src.data() + src.size(),
*intern_from_next = nullptr;
CodecvtUtf8Char *extern_to = reinterpret_cast<CodecvtUtf8Char*>(dst.data()),
*extern_to_end = reinterpret_cast<CodecvtUtf8Char*>(dst.data() + dst.size()),
*extern_to_next = nullptr;
// do convertion
auto result = this_codecvt.out(mb,
intern_from,
intern_from_end,
intern_from_next,
extern_to,
extern_to_end,
extern_to_next);
// check result
if (result != CodecvtFacet<TChar>::ok) return ConvError();
// resize result and retuen
dst.resize(extern_to_next - reinterpret_cast<CodecvtUtf8Char*>(dst.data()));
return dst;
}
#pragma endregion
#pragma region Help Macros
#define CONVFN_TYPE1(fct_name, src_char_type, dst_char_type) \
auto rv = priv_##fct_name(src); \
if (const auto* ptr = std::get_if<std::basic_string<dst_char_type>>(&rv)) { \
dst = std::move(*ptr); \
return true; \
} else if (const auto* ptr = std::get_if<ConvError>(&rv)) { \
return false; \
} else { \
throw std::runtime_error("unreachable code"); \
}
#define CONVFN_TYPE2(fct_name, src_char_type, dst_char_type) \
std::basic_string<dst_char_type> rv; \
if (fct_name(src, rv)) return rv; \
else throw std::runtime_error("fail to convert utf string");
#pragma endregion
#pragma region UTF8 -> UTF16
ConvResult<std::u16string> priv_to_utf16(const NS_YYCC_STRING::u8string_view& src) {
return generic_to_utf_other<char16_t>(src);
}
bool to_utf16(const NS_YYCC_STRING::u8string_view& src, std::u16string& dst) {
CONVFN_TYPE1(to_utf16, NS_YYCC_STRING::u8char, char16_t);
}
std::u16string to_utf16(const NS_YYCC_STRING::u8string_view& src) {
CONVFN_TYPE2(to_utf16, NS_YYCC_STRING::u8char, char16_t);
}
#pragma endregion
#pragma region UTF16 -> UTF8
ConvResult<NS_YYCC_STRING::u8string> priv_to_utf8(const std::u16string_view& src) {
return generic_to_utf8<char16_t>(src);
}
bool to_utf8(const std::u16string_view& src, NS_YYCC_STRING::u8string& dst) {
CONVFN_TYPE1(to_utf8, char16_t, NS_YYCC_STRING::u8char);
}
NS_YYCC_STRING::u8string to_utf8(const std::u16string_view& src) {
CONVFN_TYPE2(to_utf8, char16_t, NS_YYCC_STRING::u8char);
}
#pragma endregion
#pragma region UTF8 -> UTF32
ConvResult<std::u32string> priv_to_utf32(const NS_YYCC_STRING::u8string_view& src) {
return generic_to_utf_other<char32_t>(src);
}
bool to_utf32(const NS_YYCC_STRING::u8string_view& src, std::u32string& dst) {
CONVFN_TYPE1(to_utf32, NS_YYCC_STRING::u8char, char32_t);
}
std::u32string to_utf32(const NS_YYCC_STRING::u8string_view& src) {
CONVFN_TYPE2(to_utf32, NS_YYCC_STRING::u8char, char32_t);
}
#pragma endregion
#pragma region UTF32 -> UTF8
ConvResult<NS_YYCC_STRING::u8string> priv_to_utf8(const std::u32string_view& src) {
return generic_to_utf8<char32_t>(src);
}
bool to_utf8(const std::u32string_view& src, NS_YYCC_STRING::u8string& dst) {
CONVFN_TYPE1(to_utf8, char32_t, NS_YYCC_STRING::u8char);
}
NS_YYCC_STRING::u8string to_utf8(const std::u32string_view& src) {
CONVFN_TYPE2(to_utf8, char32_t, NS_YYCC_STRING::u8char);
}
#pragma endregion
} // namespace yycc::encoding::utf

View File

@ -0,0 +1,43 @@
#pragma once
#include <yycc/string.hpp>
#include <type_traits>
#include <variant>
#define NS_YYCC_STRING ::yycc::string
namespace yycc::encoding::utf {
/// @private
struct ConvError {};
/// @private
template<typename T, std::enable_if_t<!std::is_same_v<T, ConvError>, int> = 0>
using ConvResult = std::variant<T, ConvError>;
// UTF8 -> UTF16
ConvResult<std::u16string> priv_to_utf16(const NS_YYCC_STRING::u8string_view& src);
bool to_utf16(const NS_YYCC_STRING::u8string_view& src, std::u16string& dst);
std::u16string to_utf16(const NS_YYCC_STRING::u8string_view& src);
// UTF16 -> UTF8
ConvResult<NS_YYCC_STRING::u8string> priv_to_utf8(const std::u16string_view& src);
bool to_utf8(const std::u16string_view& src, NS_YYCC_STRING::u8string& dst);
NS_YYCC_STRING::u8string to_utf8(const std::u16string_view& src);
// UTF8 -> UTF32
ConvResult<std::u32string> priv_to_utf32(const NS_YYCC_STRING::u8string_view& src);
bool to_utf32(const NS_YYCC_STRING::u8string_view& src, std::u32string& dst);
std::u32string to_utf32(const NS_YYCC_STRING::u8string_view& src);
// UTF32 -> UTF8
ConvResult<NS_YYCC_STRING::u8string> priv_to_utf8(const std::u32string_view& src);
bool to_utf8(const std::u32string_view& src, NS_YYCC_STRING::u8string& dst);
NS_YYCC_STRING::u8string to_utf8(const std::u32string_view& src);
}
#undef NS_YYCC_STRING

View File

@ -18,6 +18,11 @@
// ===== C++ Features =====
// Check whether there is support of UTF8 string system.
#if defined(__cpp_char8_t) || defined(YYCC_CPPFEAT_GE_CPP20)
#define YYCC_CPPFEAT_UTF8
#endif
// Check whether there is support of `contains` for `set` and `map` including their varients.
#if defined(YYCC_CPPFEAT_GE_CPP20)
#define YYCC_CPPFEAT_CONTAINS

View File

@ -3,6 +3,7 @@
// Define the UTF8 char type we used.
// And do a polyfill if no embedded char8_t type.
#include "macro/feature_probe.hpp"
#include <string>
#include <string_view>
@ -29,7 +30,7 @@ namespace yycc::string {
It is equal to \c std::u8string_view if your current C++ standard support it.
*/
#if defined(__cpp_char8_t)
#if defined(YYCC_CPPFEAT_UTF8)
using u8char = char8_t;
using u8string = std::u8string;
using u8string_view = std::u8string_view;