2025-07-14 15:06:33 +08:00
|
|
|
#include "windows.hpp"
|
|
|
|
|
2025-07-23 10:18:01 +08:00
|
|
|
#if defined(YYCC_OS_WINDOWS)
|
2025-07-14 15:06:33 +08:00
|
|
|
|
|
|
|
#include "../string/reinterpret.hpp"
|
|
|
|
#include <limits>
|
|
|
|
#include <stdexcept>
|
2025-07-22 21:52:09 +08:00
|
|
|
#include <cuchar>
|
2025-07-14 15:06:33 +08:00
|
|
|
|
|
|
|
#include "../windows/import_guard_head.hpp"
|
|
|
|
#include <Windows.h>
|
2025-07-15 16:17:59 +08:00
|
|
|
#include "../windows/import_guard_tail.hpp"
|
2025-07-14 15:06:33 +08:00
|
|
|
|
|
|
|
#define NS_YYCC_STRING_REINTERPRET ::yycc::string::reinterpret
|
|
|
|
|
|
|
|
namespace yycc::encoding::windows {
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
#pragma region WideCharToMultiByte and MultiByteToWideChar stuff
|
2025-07-14 15:06:33 +08:00
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// WChar -> Char
|
|
|
|
// Converts a wide-character string into a multi-byte string in the given code page
// by calling WideCharToMultiByte twice (size query, then the real conversion).
// Returns the converted string, or a ConvError when the input is too long for the
// Win32 API or when either Win32 call reports failure.
ConvResult<std::string> to_char(const std::wstring_view& src, CodePage code_page) {
    std::string result;

    // Nothing to convert: an empty input yields an empty output without calling Win32.
    if (src.empty()) return result;

    // WideCharToMultiByte receives the source length as an int, so reject longer inputs.
    const size_t src_length = src.size();
    if (src_length > std::numeric_limits<int>::max()) {
        return std::unexpected(ConvError::TooLargeLength);
    }
    const int wide_count = static_cast<int>(src_length);
    const LPCWCH wide_ptr = reinterpret_cast<LPCWCH>(src.data());

    // First pass: no output buffer is given, so the call only reports the required size.
    const int required = WideCharToMultiByte(code_page, 0, wide_ptr, wide_count, nullptr, 0, nullptr, nullptr);
    if (required <= 0) {
        return std::unexpected(ConvError::NoDesiredSize);
    }

    // Second pass: convert into a buffer of exactly the reported size.
    result.resize(static_cast<size_t>(required));
    const int written = WideCharToMultiByte(
        code_page, 0, wide_ptr, wide_count, reinterpret_cast<LPSTR>(result.data()), required, nullptr, nullptr);
    if (written <= 0) {
        return std::unexpected(ConvError::BadWrittenSize);
    }

    return result;
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// Char -> WChar
|
|
|
|
// Converts a multi-byte string in the given code page into a wide-character string
// by calling MultiByteToWideChar twice (size query, then the real conversion).
// Returns the converted string, or a ConvError when the input is too long for the
// Win32 API or when either Win32 call reports failure.
ConvResult<std::wstring> to_wchar(const std::string_view& src, CodePage code_page) {
    std::wstring result;

    // Nothing to convert: an empty input yields an empty output without calling Win32.
    if (src.empty()) return result;

    // MultiByteToWideChar receives the source length as an int, so reject longer inputs.
    const size_t src_length = src.size();
    if (src_length > std::numeric_limits<int>::max()) {
        return std::unexpected(ConvError::TooLargeLength);
    }
    const int byte_count = static_cast<int>(src_length);
    const LPCCH byte_ptr = reinterpret_cast<LPCCH>(src.data());

    // First pass: no output buffer is given, so the call only reports the required size.
    const int required = MultiByteToWideChar(code_page, 0, byte_ptr, byte_count, nullptr, 0);
    if (required <= 0) {
        return std::unexpected(ConvError::NoDesiredSize);
    }

    // Second pass: convert into a buffer of exactly the reported size.
    result.resize(static_cast<size_t>(required));
    const int written = MultiByteToWideChar(
        code_page, 0, byte_ptr, byte_count, reinterpret_cast<LPWSTR>(result.data()), required);
    if (written <= 0) {
        return std::unexpected(ConvError::BadWrittenSize);
    }

    return result;
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// Char -> Char
|
|
|
|
// Re-encodes a multi-byte string from one code page into another.
// The text is first widened with src_code_page and then narrowed with dst_code_page;
// an error from the first step is returned without attempting the second.
ConvResult<std::string> to_char(const std::string_view& src, CodePage src_code_page, CodePage dst_code_page) {
    auto widened = to_wchar(src, src_code_page);
    return widened.and_then([dst_code_page](const std::wstring& wide) {
        return to_char(wide, dst_code_page);
    });
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// WChar -> UTF8
|
|
|
|
// Converts a wide-character string into UTF-8.
// The conversion runs through to_char with CP_UTF8, and the resulting bytes are
// then handed to as_utf8 to obtain the UTF-8 string type.
ConvResult<std::u8string> to_utf8(const std::wstring_view& src) {
    auto narrowed = to_char(src, CP_UTF8);
    return narrowed.transform([](const std::string& bytes) {
        return NS_YYCC_STRING_REINTERPRET::as_utf8(bytes);
    });
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// UTF8 -> WChar
|
|
|
|
// Converts a UTF-8 string into a wide-character string.
// The UTF-8 view is passed through as_ordinary_view and then converted with CP_UTF8.
ConvResult<std::wstring> to_wchar(const std::u8string_view& src) {
    const auto ordinary = NS_YYCC_STRING_REINTERPRET::as_ordinary_view(src);
    return to_wchar(ordinary, CP_UTF8);
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// Char -> UTF8
|
|
|
|
// Converts a multi-byte string in the given code page into UTF-8.
// The text is re-encoded from code_page to CP_UTF8, and the resulting bytes are
// then handed to as_utf8 to obtain the UTF-8 string type.
ConvResult<std::u8string> to_utf8(const std::string_view& src, CodePage code_page) {
    auto reencoded = to_char(src, code_page, CP_UTF8);
    return reencoded.transform([](const std::string& bytes) {
        return NS_YYCC_STRING_REINTERPRET::as_utf8(bytes);
    });
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// UTF8 -> Char
|
|
|
|
// Converts a UTF-8 string into a multi-byte string in the given code page.
// The UTF-8 view is passed through as_ordinary_view and then re-encoded from
// CP_UTF8 to code_page.
ConvResult<std::string> to_char(const std::u8string_view& src, CodePage code_page) {
    const auto ordinary = NS_YYCC_STRING_REINTERPRET::as_ordinary_view(src);
    return to_char(ordinary, CP_UTF8, code_page);
}
|
|
|
|
|
|
|
|
#pragma endregion
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
#pragma region UTF stuff
|
2025-07-14 15:06:33 +08:00
|
|
|
|
2025-07-21 20:36:26 +08:00
|
|
|
// YYC MARK:
|
|
|
|
// The conversion between UTF encodings is implemented with c16rtomb, c32rtomb, mbrtoc16 and mbrtoc32.
|
|
|
|
// These functions are locale-dependent in the C++ standard, but in Microsoft STL they only handle UTF-8.
|
|
|
|
// So we can use them safely in Win32 environment.
|
2025-07-23 16:05:53 +08:00
|
|
|
// Reference:
|
|
|
|
// * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/c16rtomb-c32rtomb1?view=msvc-170
|
|
|
|
// * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/mbrtoc16-mbrtoc323?view=msvc-170
|
2025-07-21 20:36:26 +08:00
|
|
|
|
2025-07-31 22:25:14 +08:00
|
|
|
#if defined(YYCC_STL_MSSTL)
|
|
|
|
|
2025-07-22 21:52:09 +08:00
|
|
|
// 1 UTF32 unit can produce 4 UTF8 units or 2 UTF16 units in theory.
|
|
|
|
// So we pre-allocate memory for the result to prevent allocating memory multiple times.
|
|
|
|
// Worst-case number of output code units produced per input code unit.
// These factors are only used to size reserve() calls before a conversion.

// Each UTF-8 byte yields at most one UTF-16 unit (ASCII is 1 byte -> 1 unit).
constexpr size_t MULTIPLE_UTF8_TO_UTF16 = 1u;
// A single UTF-16 unit in U+0800..U+FFFF encodes to 3 UTF-8 bytes.
// A surrogate pair (2 units) encodes to 4 bytes, which is only 2 bytes per unit,
// so 3 is the true worst case.
constexpr size_t MULTIPLE_UTF16_TO_UTF8 = 3u;
// Each UTF-8 byte yields at most one UTF-32 unit (ASCII is 1 byte -> 1 unit).
constexpr size_t MULTIPLE_UTF8_TO_UTF32 = 1u;
// One UTF-32 unit encodes to at most 4 UTF-8 bytes.
constexpr size_t MULTIPLE_UTF32_TO_UTF8 = 4u;
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// UTF8 -> UTF16
|
|
|
|
// Converts a UTF-8 string into UTF-16 using std::mbrtoc16.
// Returns the converted string, ConvError::EncodeUtf8 when mbrtoc16 reports an
// encoding error, or ConvError::IncompleteUtf8 when the input ends in the middle
// of a multi-byte sequence.
// NUL bytes embedded in src are converted to U+0000 instead of ending the conversion.
ConvResult<std::u16string> to_utf16(const std::u8string_view& src) {
    std::u16string dst;
    dst.reserve(src.size() * MULTIPLE_UTF8_TO_UTF16);

    std::mbstate_t state{}; // zero-initialized to initial state
    char16_t c16 = u'\0';
    const char* ptr = reinterpret_cast<const char*>(src.data());
    const char* const end = ptr + src.size();

    // YYC MARK:
    // mbrtoc16 emits the second half of a surrogate pair only on the call AFTER
    // the one that consumed the multi-byte sequence, and it requires the length
    // argument to be at least 1. A string view has no null terminator to feed to
    // that extra call, so once the real input is exhausted we pass a fake
    // one-byte terminator instead. Without it, a character needing a surrogate
    // pair at the very end of the string would lose its low surrogate.
    static const char NULL_TERMINAL = '\0';
    while (true) {
        const bool not_tail = ptr < end;
        const char* new_ptr = not_tail ? ptr : &NULL_TERMINAL;
        const size_t new_size = not_tail ? static_cast<size_t>(end - ptr) : sizeof(NULL_TERMINAL);
        const size_t rc = std::mbrtoc16(&c16, new_ptr, new_size, &state);

        if (rc == 0) {
            // Zero means a null character was converted.
            // At the tail this is our fake terminator: the conversion is complete.
            if (!not_tail) break;
            // Otherwise it is a NUL byte inside the input: keep it and step over
            // the single byte it occupies so the rest of the string is converted.
            dst.push_back(u'\0');
            ptr += 1;
        } else if (rc == static_cast<size_t>(-1)) {
            return std::unexpected(ConvError::EncodeUtf8);
        } else if (rc == static_cast<size_t>(-2)) {
            return std::unexpected(ConvError::IncompleteUtf8);
        } else if (rc == static_cast<size_t>(-3)) {
            // Low surrogate of the previously consumed character; no input is consumed.
            dst.push_back(c16);
        } else {
            dst.push_back(c16);
            ptr += rc;
        }
    }
    return dst;
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// UTF16 -> UTF8
|
|
|
|
// Converts a UTF-16 string into UTF-8 using std::c16rtomb.
// Returns the converted string, or ConvError::InvalidUtf16 when c16rtomb reports
// an error or when the input ends right after a unit that produced no output
// (i.e. in the middle of a surrogate pair).
ConvResult<std::u8string> to_utf8(const std::u16string_view& src) {
    std::u8string result;
    result.reserve(src.size() * MULTIPLE_UTF16_TO_UTF8);

    std::mbstate_t state{};
    char buffer[MB_LEN_MAX]{};

    // True when the most recent unit produced zero bytes, which c16rtomb does while
    // it waits for the second half of a surrogate pair. It starts as false so that
    // an empty input is not mistaken for a dangling surrogate.
    bool awaiting_low_surrogate = false;
    for (auto it = src.begin(); it != src.end(); ++it) {
        const size_t produced = std::c16rtomb(buffer, *it, &state);
        if (produced == static_cast<size_t>(-1)) {
            return std::unexpected(ConvError::InvalidUtf16);
        }
        result.append(reinterpret_cast<char8_t*>(buffer), produced);
        awaiting_low_surrogate = (produced == 0);
    }

    // YYC MARK:
    // Running out of input while still waiting for the second half of a
    // surrogate pair means the UTF-16 sequence was cut off; report it.
    if (awaiting_low_surrogate) {
        return std::unexpected(ConvError::InvalidUtf16);
    }

    return result;
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// UTF8 -> UTF32
|
|
|
|
// Converts a UTF-8 string into UTF-32 using std::mbrtoc32.
// Returns the converted string, ConvError::EncodeUtf8 when mbrtoc32 reports an
// encoding error, or ConvError::IncompleteUtf8 when the input ends in the middle
// of a multi-byte sequence.
// Throws std::runtime_error if mbrtoc32 returns its "no input consumed" code
// (size_t)-3, which is not expected for UTF-32 output.
// NUL bytes embedded in src are converted to U+0000.
ConvResult<std::u32string> to_utf32(const std::u8string_view& src) {
    std::u32string dst;
    dst.reserve(src.size() * MULTIPLE_UTF8_TO_UTF32);

    std::mbstate_t state{};
    char32_t c32 = U'\0';
    const char* ptr = reinterpret_cast<const char*>(src.data());
    const char* const end = ptr + src.size();

    // YYC MARK:
    // UTF-32 has no surrogate pairs, so unlike the UTF-8 -> UTF-16 conversion
    // there is no trailing unit to flush and no fake terminator is needed.
    while (ptr < end) {
        const size_t rc = std::mbrtoc32(&c32, ptr, static_cast<size_t>(end - ptr), &state);

        if (rc == static_cast<size_t>(-1)) return std::unexpected(ConvError::EncodeUtf8);
        if (rc == static_cast<size_t>(-2)) return std::unexpected(ConvError::IncompleteUtf8);
        if (rc == static_cast<size_t>(-3)) throw std::runtime_error("no surrogates in UTF-32");

        dst.push_back(c32);
        // mbrtoc32 returns 0 (not 1) when the converted character is NUL.
        // That NUL still occupies one input byte, so advance by one; adding the
        // raw return value would never move forward and the loop would not end.
        ptr += (rc == 0) ? 1 : rc;
    }
    return dst;
}
|
|
|
|
|
2025-07-25 10:49:07 +08:00
|
|
|
// UTF32 -> UTF8
|
|
|
|
// Converts a UTF-32 string into UTF-8 using std::c32rtomb.
// Returns the converted string, or ConvError::InvalidUtf32 when c32rtomb reports
// an error for any unit.
ConvResult<std::u8string> to_utf8(const std::u32string_view& src) {
    std::u8string result;
    result.reserve(src.size() * MULTIPLE_UTF32_TO_UTF8);

    std::mbstate_t state{};
    char buffer[MB_LEN_MAX]{};
    for (auto it = src.begin(); it != src.end(); ++it) {
        const size_t produced = std::c32rtomb(buffer, *it, &state);
        if (produced == static_cast<size_t>(-1)) {
            return std::unexpected(ConvError::InvalidUtf32);
        }
        result.append(reinterpret_cast<char8_t*>(buffer), produced);
    }

    // YYC MARK:
    // UTF-32 has no surrogate pairs, so there is no "input ended mid-pair" check
    // here as there is in the UTF-16 -> UTF-8 conversion; the result is returned as is.
    return result;
}
|
|
|
|
|
2025-07-31 22:25:14 +08:00
|
|
|
#endif
|
|
|
|
|
2025-07-22 21:52:09 +08:00
|
|
|
#pragma endregion
|
|
|
|
|
2025-07-14 15:06:33 +08:00
|
|
|
} // namespace yycc::encoding::windows
|
|
|
|
|
|
|
|
#endif
|