feat: add new package wcwidth
- add wcwidth in carton. - tell clang-format not to format some generated content.
This commit is contained in:
@ -24,6 +24,7 @@ PRIVATE
|
||||
yycc/encoding/iconv.cpp
|
||||
|
||||
yycc/carton/pycodec.cpp
|
||||
yycc/carton/wcwidth.cpp
|
||||
)
|
||||
target_sources(YYCCommonplace
|
||||
PUBLIC
|
||||
@ -67,6 +68,7 @@ FILES
|
||||
yycc/encoding/iconv.hpp
|
||||
|
||||
yycc/carton/pycodec.hpp
|
||||
yycc/carton/wcwidth.hpp
|
||||
)
|
||||
# Set up header information
|
||||
target_include_directories(YYCCommonplace
|
||||
|
@ -19,6 +19,7 @@ namespace yycc::carton::pycodec {
|
||||
|
||||
#pragma region Encoding Name
|
||||
|
||||
// clang-format off
|
||||
static const std::map<std::u8string_view, std::u8string_view> ALIAS_MAP{
|
||||
{u8"646"sv, u8"ascii"sv},
|
||||
{u8"us-ascii"sv, u8"ascii"sv},
|
||||
@ -224,6 +225,7 @@ namespace yycc::carton::pycodec {
|
||||
{u8"utf-8"sv, u8"utf_8"sv},
|
||||
{u8"cp65001"sv, u8"utf_8"sv},
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
/**
|
||||
* @brief Resolve encoding name alias and fetch real encoding name.
|
||||
@ -248,40 +250,74 @@ namespace yycc::carton::pycodec {
|
||||
|
||||
using CodePage = YYCC_PYCODEC_BACKEND_NS::CodePage;
|
||||
|
||||
static const std::map<std::u8string_view, CodePage> WINCP_MAP{
|
||||
{u8"ascii"sv, static_cast<CodePage>(437u)}, {u8"big5"sv, static_cast<CodePage>(950u)},
|
||||
{u8"cp037"sv, static_cast<CodePage>(037u)}, {u8"cp437"sv, static_cast<CodePage>(437u)},
|
||||
{u8"cp500"sv, static_cast<CodePage>(500u)}, {u8"cp720"sv, static_cast<CodePage>(720u)},
|
||||
{u8"cp737"sv, static_cast<CodePage>(737u)}, {u8"cp775"sv, static_cast<CodePage>(775u)},
|
||||
{u8"cp850"sv, static_cast<CodePage>(850u)}, {u8"cp852"sv, static_cast<CodePage>(852u)},
|
||||
{u8"cp855"sv, static_cast<CodePage>(855u)}, {u8"cp857"sv, static_cast<CodePage>(857u)},
|
||||
{u8"cp858"sv, static_cast<CodePage>(858u)}, {u8"cp860"sv, static_cast<CodePage>(860u)},
|
||||
{u8"cp861"sv, static_cast<CodePage>(861u)}, {u8"cp862"sv, static_cast<CodePage>(862u)},
|
||||
{u8"cp863"sv, static_cast<CodePage>(863u)}, {u8"cp864"sv, static_cast<CodePage>(864u)},
|
||||
{u8"cp865"sv, static_cast<CodePage>(865u)}, {u8"cp866"sv, static_cast<CodePage>(866u)},
|
||||
{u8"cp869"sv, static_cast<CodePage>(869u)}, {u8"cp874"sv, static_cast<CodePage>(874u)},
|
||||
{u8"cp875"sv, static_cast<CodePage>(875u)}, {u8"cp932"sv, static_cast<CodePage>(932u)},
|
||||
{u8"cp949"sv, static_cast<CodePage>(949u)}, {u8"cp950"sv, static_cast<CodePage>(950u)},
|
||||
{u8"cp1026"sv, static_cast<CodePage>(1026u)}, {u8"cp1140"sv, static_cast<CodePage>(1140u)},
|
||||
{u8"cp1250"sv, static_cast<CodePage>(1250u)}, {u8"cp1251"sv, static_cast<CodePage>(1251u)},
|
||||
{u8"cp1252"sv, static_cast<CodePage>(1252u)}, {u8"cp1253"sv, static_cast<CodePage>(1253u)},
|
||||
{u8"cp1254"sv, static_cast<CodePage>(1254u)}, {u8"cp1255"sv, static_cast<CodePage>(1255u)},
|
||||
{u8"cp1256"sv, static_cast<CodePage>(1256u)}, {u8"cp1257"sv, static_cast<CodePage>(1257u)},
|
||||
{u8"cp1258"sv, static_cast<CodePage>(1258u)}, {u8"euc_jp"sv, static_cast<CodePage>(20932u)},
|
||||
{u8"euc_kr"sv, static_cast<CodePage>(51949u)}, {u8"gb2312"sv, static_cast<CodePage>(936u)},
|
||||
{u8"gbk"sv, static_cast<CodePage>(936u)}, {u8"gb18030"sv, static_cast<CodePage>(54936u)},
|
||||
{u8"hz"sv, static_cast<CodePage>(52936u)}, {u8"iso2022_jp"sv, static_cast<CodePage>(50220u)},
|
||||
{u8"iso2022_kr"sv, static_cast<CodePage>(50225u)}, {u8"latin_1"sv, static_cast<CodePage>(28591u)},
|
||||
{u8"iso8859_2"sv, static_cast<CodePage>(28592u)}, {u8"iso8859_3"sv, static_cast<CodePage>(28593u)},
|
||||
{u8"iso8859_4"sv, static_cast<CodePage>(28594u)}, {u8"iso8859_5"sv, static_cast<CodePage>(28595u)},
|
||||
{u8"iso8859_6"sv, static_cast<CodePage>(28596u)}, {u8"iso8859_7"sv, static_cast<CodePage>(28597u)},
|
||||
{u8"iso8859_8"sv, static_cast<CodePage>(28598u)}, {u8"iso8859_9"sv, static_cast<CodePage>(28599u)},
|
||||
{u8"iso8859_13"sv, static_cast<CodePage>(28603u)}, {u8"iso8859_15"sv, static_cast<CodePage>(28605u)},
|
||||
{u8"johab"sv, static_cast<CodePage>(1361u)}, {u8"mac_cyrillic"sv, static_cast<CodePage>(10007u)},
|
||||
{u8"mac_greek"sv, static_cast<CodePage>(10006u)}, {u8"mac_iceland"sv, static_cast<CodePage>(10079u)},
|
||||
{u8"mac_turkish"sv, static_cast<CodePage>(10081u)}, {u8"shift_jis"sv, static_cast<CodePage>(932u)},
|
||||
{u8"utf_7"sv, static_cast<CodePage>(65000u)}, {u8"utf_8"sv, static_cast<CodePage>(65001u)},
|
||||
// clang-format off
|
||||
static const std::map<std::u8string_view, CodePage> WINCP_MAP {
|
||||
{ u8"ascii"sv, static_cast<CodePage>(437u) },
|
||||
{ u8"big5"sv, static_cast<CodePage>(950u) },
|
||||
{ u8"cp037"sv, static_cast<CodePage>(037u) },
|
||||
{ u8"cp437"sv, static_cast<CodePage>(437u) },
|
||||
{ u8"cp500"sv, static_cast<CodePage>(500u) },
|
||||
{ u8"cp720"sv, static_cast<CodePage>(720u) },
|
||||
{ u8"cp737"sv, static_cast<CodePage>(737u) },
|
||||
{ u8"cp775"sv, static_cast<CodePage>(775u) },
|
||||
{ u8"cp850"sv, static_cast<CodePage>(850u) },
|
||||
{ u8"cp852"sv, static_cast<CodePage>(852u) },
|
||||
{ u8"cp855"sv, static_cast<CodePage>(855u) },
|
||||
{ u8"cp857"sv, static_cast<CodePage>(857u) },
|
||||
{ u8"cp858"sv, static_cast<CodePage>(858u) },
|
||||
{ u8"cp860"sv, static_cast<CodePage>(860u) },
|
||||
{ u8"cp861"sv, static_cast<CodePage>(861u) },
|
||||
{ u8"cp862"sv, static_cast<CodePage>(862u) },
|
||||
{ u8"cp863"sv, static_cast<CodePage>(863u) },
|
||||
{ u8"cp864"sv, static_cast<CodePage>(864u) },
|
||||
{ u8"cp865"sv, static_cast<CodePage>(865u) },
|
||||
{ u8"cp866"sv, static_cast<CodePage>(866u) },
|
||||
{ u8"cp869"sv, static_cast<CodePage>(869u) },
|
||||
{ u8"cp874"sv, static_cast<CodePage>(874u) },
|
||||
{ u8"cp875"sv, static_cast<CodePage>(875u) },
|
||||
{ u8"cp932"sv, static_cast<CodePage>(932u) },
|
||||
{ u8"cp949"sv, static_cast<CodePage>(949u) },
|
||||
{ u8"cp950"sv, static_cast<CodePage>(950u) },
|
||||
{ u8"cp1026"sv, static_cast<CodePage>(1026u) },
|
||||
{ u8"cp1140"sv, static_cast<CodePage>(1140u) },
|
||||
{ u8"cp1250"sv, static_cast<CodePage>(1250u) },
|
||||
{ u8"cp1251"sv, static_cast<CodePage>(1251u) },
|
||||
{ u8"cp1252"sv, static_cast<CodePage>(1252u) },
|
||||
{ u8"cp1253"sv, static_cast<CodePage>(1253u) },
|
||||
{ u8"cp1254"sv, static_cast<CodePage>(1254u) },
|
||||
{ u8"cp1255"sv, static_cast<CodePage>(1255u) },
|
||||
{ u8"cp1256"sv, static_cast<CodePage>(1256u) },
|
||||
{ u8"cp1257"sv, static_cast<CodePage>(1257u) },
|
||||
{ u8"cp1258"sv, static_cast<CodePage>(1258u) },
|
||||
{ u8"euc_jp"sv, static_cast<CodePage>(20932u) },
|
||||
{ u8"euc_kr"sv, static_cast<CodePage>(51949u) },
|
||||
{ u8"gb2312"sv, static_cast<CodePage>(936u) },
|
||||
{ u8"gbk"sv, static_cast<CodePage>(936u) },
|
||||
{ u8"gb18030"sv, static_cast<CodePage>(54936u) },
|
||||
{ u8"hz"sv, static_cast<CodePage>(52936u) },
|
||||
{ u8"iso2022_jp"sv, static_cast<CodePage>(50220u) },
|
||||
{ u8"iso2022_kr"sv, static_cast<CodePage>(50225u) },
|
||||
{ u8"latin_1"sv, static_cast<CodePage>(28591u) },
|
||||
{ u8"iso8859_2"sv, static_cast<CodePage>(28592u) },
|
||||
{ u8"iso8859_3"sv, static_cast<CodePage>(28593u) },
|
||||
{ u8"iso8859_4"sv, static_cast<CodePage>(28594u) },
|
||||
{ u8"iso8859_5"sv, static_cast<CodePage>(28595u) },
|
||||
{ u8"iso8859_6"sv, static_cast<CodePage>(28596u) },
|
||||
{ u8"iso8859_7"sv, static_cast<CodePage>(28597u) },
|
||||
{ u8"iso8859_8"sv, static_cast<CodePage>(28598u) },
|
||||
{ u8"iso8859_9"sv, static_cast<CodePage>(28599u) },
|
||||
{ u8"iso8859_13"sv, static_cast<CodePage>(28603u) },
|
||||
{ u8"iso8859_15"sv, static_cast<CodePage>(28605u) },
|
||||
{ u8"johab"sv, static_cast<CodePage>(1361u) },
|
||||
{ u8"mac_cyrillic"sv, static_cast<CodePage>(10007u) },
|
||||
{ u8"mac_greek"sv, static_cast<CodePage>(10006u) },
|
||||
{ u8"mac_iceland"sv, static_cast<CodePage>(10079u) },
|
||||
{ u8"mac_turkish"sv, static_cast<CodePage>(10081u) },
|
||||
{ u8"shift_jis"sv, static_cast<CodePage>(932u) },
|
||||
{ u8"utf_7"sv, static_cast<CodePage>(65000u) },
|
||||
{ u8"utf_8"sv, static_cast<CodePage>(65001u) },
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
static FetchResult<CodePage> fetch_code_page(const std::u8string_view& enc_name) {
|
||||
// resolve alias
|
||||
@ -296,6 +332,7 @@ namespace yycc::carton::pycodec {
|
||||
|
||||
#else
|
||||
|
||||
// clang-format off
|
||||
static const std::map<std::u8string_view, std::string_view> ICONV_MAP{
|
||||
{u8"ascii"sv, "ASCII"sv},
|
||||
{u8"big5"sv, "BIG5"sv},
|
||||
@ -359,6 +396,7 @@ namespace yycc::carton::pycodec {
|
||||
{u8"utf_7"sv, "UTF-7"sv},
|
||||
{u8"utf_8"sv, "UTF-8"sv},
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
static FetchResult<std::string_view> fetch_iconv_name(const std::u8string_view& enc_name) {
|
||||
// resolve alias
|
||||
|
478
src/yycc/carton/wcwidth.cpp
Normal file
478
src/yycc/carton/wcwidth.cpp
Normal file
@ -0,0 +1,478 @@
|
||||
#include "wcwidth.hpp"
|
||||
#include "../encoding/stl.hpp"
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <optional>
|
||||
|
||||
#define ENC ::yycc::encoding::stl
|
||||
|
||||
namespace yycc::carton::wcwidth {
|
||||
|
||||
using Boundary = std::pair<char32_t, char32_t>;
|
||||
using BoundaryVector = std::vector<Boundary>;
|
||||
|
||||
// YYC MARK:
|
||||
// Following table and code are copied from Python package "wcwidth".
|
||||
// (The code of that package is itself derived from the original "wcwidth" C implementation.)
|
||||
//
|
||||
// I do not need so much exact measurement.
|
||||
// I just want an "it works" wcwidth on all platforms.
|
||||
// So these tables are coming from the table with lowest UNICODE version
|
||||
// (original package provides different tables for different UNICODE versions).
|
||||
|
||||
// clang-format off
|
||||
static const BoundaryVector ZERO_WIDTH{
|
||||
{U'\x00000', U'\x00000'}, // (nil)
|
||||
{U'\x000ad', U'\x000ad'}, // Soft Hyphen
|
||||
{U'\x00300', U'\x0036f'}, // Combining Grave Accent ..Combining Latin Small Le
|
||||
{U'\x00483', U'\x00486'}, // Combining Cyrillic Titlo..Combining Cyrillic Psili
|
||||
{U'\x00488', U'\x00489'}, // Combining Cyrillic Hundr..Combining Cyrillic Milli
|
||||
{U'\x00591', U'\x005b9'}, // Hebrew Accent Etnahta ..Hebrew Point Holam
|
||||
{U'\x005bb', U'\x005bd'}, // Hebrew Point Qubuts ..Hebrew Point Meteg
|
||||
{U'\x005bf', U'\x005bf'}, // Hebrew Point Rafe
|
||||
{U'\x005c1', U'\x005c2'}, // Hebrew Point Shin Dot ..Hebrew Point Sin Dot
|
||||
{U'\x005c4', U'\x005c5'}, // Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot
|
||||
{U'\x005c7', U'\x005c7'}, // Hebrew Point Qamats Qatan
|
||||
{U'\x00600', U'\x00603'}, // Arabic Number Sign ..Arabic Sign Safha
|
||||
{U'\x00610', U'\x00615'}, // Arabic Sign Sallallahou ..Arabic Small High Tah
|
||||
{U'\x0064b', U'\x0065e'}, // Arabic Fathatan ..Arabic Fatha With Two Do
|
||||
{U'\x00670', U'\x00670'}, // Arabic Letter Superscript Alef
|
||||
{U'\x006d6', U'\x006e4'}, // Arabic Small High Ligatu..Arabic Small High Madda
|
||||
{U'\x006e7', U'\x006e8'}, // Arabic Small High Yeh ..Arabic Small High Noon
|
||||
{U'\x006ea', U'\x006ed'}, // Arabic Empty Centre Low ..Arabic Small Low Meem
|
||||
{U'\x0070f', U'\x0070f'}, // Syriac Abbreviation Mark
|
||||
{U'\x00711', U'\x00711'}, // Syriac Letter Superscript Alaph
|
||||
{U'\x00730', U'\x0074a'}, // Syriac Pthaha Above ..Syriac Barrekh
|
||||
{U'\x007a6', U'\x007b0'}, // Thaana Abafili ..Thaana Sukun
|
||||
{U'\x00901', U'\x00903'}, // Devanagari Sign Candrabi..Devanagari Sign Visarga
|
||||
{U'\x0093c', U'\x0093c'}, // Devanagari Sign Nukta
|
||||
{U'\x0093e', U'\x0094d'}, // Devanagari Vowel Sign Aa..Devanagari Sign Virama
|
||||
{U'\x00951', U'\x00954'}, // Devanagari Stress Sign U..Devanagari Acute Accent
|
||||
{U'\x00962', U'\x00963'}, // Devanagari Vowel Sign Vo..Devanagari Vowel Sign Vo
|
||||
{U'\x00981', U'\x00983'}, // Bengali Sign Candrabindu..Bengali Sign Visarga
|
||||
{U'\x009bc', U'\x009bc'}, // Bengali Sign Nukta
|
||||
{U'\x009be', U'\x009c4'}, // Bengali Vowel Sign Aa ..Bengali Vowel Sign Vocal
|
||||
{U'\x009c7', U'\x009c8'}, // Bengali Vowel Sign E ..Bengali Vowel Sign Ai
|
||||
{U'\x009cb', U'\x009cd'}, // Bengali Vowel Sign O ..Bengali Sign Virama
|
||||
{U'\x009d7', U'\x009d7'}, // Bengali Au Length Mark
|
||||
{U'\x009e2', U'\x009e3'}, // Bengali Vowel Sign Vocal..Bengali Vowel Sign Vocal
|
||||
{U'\x00a01', U'\x00a03'}, // Gurmukhi Sign Adak Bindi..Gurmukhi Sign Visarga
|
||||
{U'\x00a3c', U'\x00a3c'}, // Gurmukhi Sign Nukta
|
||||
{U'\x00a3e', U'\x00a42'}, // Gurmukhi Vowel Sign Aa ..Gurmukhi Vowel Sign Uu
|
||||
{U'\x00a47', U'\x00a48'}, // Gurmukhi Vowel Sign Ee ..Gurmukhi Vowel Sign Ai
|
||||
{U'\x00a4b', U'\x00a4d'}, // Gurmukhi Vowel Sign Oo ..Gurmukhi Sign Virama
|
||||
{U'\x00a70', U'\x00a71'}, // Gurmukhi Tippi ..Gurmukhi Addak
|
||||
{U'\x00a81', U'\x00a83'}, // Gujarati Sign Candrabind..Gujarati Sign Visarga
|
||||
{U'\x00abc', U'\x00abc'}, // Gujarati Sign Nukta
|
||||
{U'\x00abe', U'\x00ac5'}, // Gujarati Vowel Sign Aa ..Gujarati Vowel Sign Cand
|
||||
{U'\x00ac7', U'\x00ac9'}, // Gujarati Vowel Sign E ..Gujarati Vowel Sign Cand
|
||||
{U'\x00acb', U'\x00acd'}, // Gujarati Vowel Sign O ..Gujarati Sign Virama
|
||||
{U'\x00ae2', U'\x00ae3'}, // Gujarati Vowel Sign Voca..Gujarati Vowel Sign Voca
|
||||
{U'\x00b01', U'\x00b03'}, // Oriya Sign Candrabindu ..Oriya Sign Visarga
|
||||
{U'\x00b3c', U'\x00b3c'}, // Oriya Sign Nukta
|
||||
{U'\x00b3e', U'\x00b43'}, // Oriya Vowel Sign Aa ..Oriya Vowel Sign Vocalic
|
||||
{U'\x00b47', U'\x00b48'}, // Oriya Vowel Sign E ..Oriya Vowel Sign Ai
|
||||
{U'\x00b4b', U'\x00b4d'}, // Oriya Vowel Sign O ..Oriya Sign Virama
|
||||
{U'\x00b56', U'\x00b57'}, // Oriya Ai Length Mark ..Oriya Au Length Mark
|
||||
{U'\x00b82', U'\x00b82'}, // Tamil Sign Anusvara
|
||||
{U'\x00bbe', U'\x00bc2'}, // Tamil Vowel Sign Aa ..Tamil Vowel Sign Uu
|
||||
{U'\x00bc6', U'\x00bc8'}, // Tamil Vowel Sign E ..Tamil Vowel Sign Ai
|
||||
{U'\x00bca', U'\x00bcd'}, // Tamil Vowel Sign O ..Tamil Sign Virama
|
||||
{U'\x00bd7', U'\x00bd7'}, // Tamil Au Length Mark
|
||||
{U'\x00c01', U'\x00c03'}, // Telugu Sign Candrabindu ..Telugu Sign Visarga
|
||||
{U'\x00c3e', U'\x00c44'}, // Telugu Vowel Sign Aa ..Telugu Vowel Sign Vocali
|
||||
{U'\x00c46', U'\x00c48'}, // Telugu Vowel Sign E ..Telugu Vowel Sign Ai
|
||||
{U'\x00c4a', U'\x00c4d'}, // Telugu Vowel Sign O ..Telugu Sign Virama
|
||||
{U'\x00c55', U'\x00c56'}, // Telugu Length Mark ..Telugu Ai Length Mark
|
||||
{U'\x00c82', U'\x00c83'}, // Kannada Sign Anusvara ..Kannada Sign Visarga
|
||||
{U'\x00cbc', U'\x00cbc'}, // Kannada Sign Nukta
|
||||
{U'\x00cbe', U'\x00cc4'}, // Kannada Vowel Sign Aa ..Kannada Vowel Sign Vocal
|
||||
{U'\x00cc6', U'\x00cc8'}, // Kannada Vowel Sign E ..Kannada Vowel Sign Ai
|
||||
{U'\x00cca', U'\x00ccd'}, // Kannada Vowel Sign O ..Kannada Sign Virama
|
||||
{U'\x00cd5', U'\x00cd6'}, // Kannada Length Mark ..Kannada Ai Length Mark
|
||||
{U'\x00d02', U'\x00d03'}, // Malayalam Sign Anusvara ..Malayalam Sign Visarga
|
||||
{U'\x00d3e', U'\x00d43'}, // Malayalam Vowel Sign Aa ..Malayalam Vowel Sign Voc
|
||||
{U'\x00d46', U'\x00d48'}, // Malayalam Vowel Sign E ..Malayalam Vowel Sign Ai
|
||||
{U'\x00d4a', U'\x00d4d'}, // Malayalam Vowel Sign O ..Malayalam Sign Virama
|
||||
{U'\x00d57', U'\x00d57'}, // Malayalam Au Length Mark
|
||||
{U'\x00d82', U'\x00d83'}, // Sinhala Sign Anusvaraya ..Sinhala Sign Visargaya
|
||||
{U'\x00dca', U'\x00dca'}, // Sinhala Sign Al-lakuna
|
||||
{U'\x00dcf', U'\x00dd4'}, // Sinhala Vowel Sign Aela-..Sinhala Vowel Sign Ketti
|
||||
{U'\x00dd6', U'\x00dd6'}, // Sinhala Vowel Sign Diga Paa-pilla
|
||||
{U'\x00dd8', U'\x00ddf'}, // Sinhala Vowel Sign Gaett..Sinhala Vowel Sign Gayan
|
||||
{U'\x00df2', U'\x00df3'}, // Sinhala Vowel Sign Diga ..Sinhala Vowel Sign Diga
|
||||
{U'\x00e31', U'\x00e31'}, // Thai Character Mai Han-akat
|
||||
{U'\x00e34', U'\x00e3a'}, // Thai Character Sara I ..Thai Character Phinthu
|
||||
{U'\x00e47', U'\x00e4e'}, // Thai Character Maitaikhu..Thai Character Yamakkan
|
||||
{U'\x00eb1', U'\x00eb1'}, // Lao Vowel Sign Mai Kan
|
||||
{U'\x00eb4', U'\x00eb9'}, // Lao Vowel Sign I ..Lao Vowel Sign Uu
|
||||
{U'\x00ebb', U'\x00ebc'}, // Lao Vowel Sign Mai Kon ..Lao Semivowel Sign Lo
|
||||
{U'\x00ec8', U'\x00ecd'}, // Lao Tone Mai Ek ..Lao Niggahita
|
||||
{U'\x00f18', U'\x00f19'}, // Tibetan Astrological Sig..Tibetan Astrological Sig
|
||||
{U'\x00f35', U'\x00f35'}, // Tibetan Mark Ngas Bzung Nyi Zla
|
||||
{U'\x00f37', U'\x00f37'}, // Tibetan Mark Ngas Bzung Sgor Rtags
|
||||
{U'\x00f39', U'\x00f39'}, // Tibetan Mark Tsa -phru
|
||||
{U'\x00f3e', U'\x00f3f'}, // Tibetan Sign Yar Tshes ..Tibetan Sign Mar Tshes
|
||||
{U'\x00f71', U'\x00f84'}, // Tibetan Vowel Sign Aa ..Tibetan Mark Halanta
|
||||
{U'\x00f86', U'\x00f87'}, // Tibetan Sign Lci Rtags ..Tibetan Sign Yang Rtags
|
||||
{U'\x00f90', U'\x00f97'}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter
|
||||
{U'\x00f99', U'\x00fbc'}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter
|
||||
{U'\x00fc6', U'\x00fc6'}, // Tibetan Symbol Padma Gdan
|
||||
{U'\x0102c', U'\x01032'}, // Myanmar Vowel Sign Aa ..Myanmar Vowel Sign Ai
|
||||
{U'\x01036', U'\x01039'}, // Myanmar Sign Anusvara ..Myanmar Sign Virama
|
||||
{U'\x01056', U'\x01059'}, // Myanmar Vowel Sign Vocal..Myanmar Vowel Sign Vocal
|
||||
{U'\x01160', U'\x011ff'}, // Hangul Jungseong Filler ..Hangul Jongseong Ssangni
|
||||
{U'\x0135f', U'\x0135f'}, // Ethiopic Combining Gemination Mark
|
||||
{U'\x01712', U'\x01714'}, // Tagalog Vowel Sign I ..Tagalog Sign Virama
|
||||
{U'\x01732', U'\x01734'}, // Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod
|
||||
{U'\x01752', U'\x01753'}, // Buhid Vowel Sign I ..Buhid Vowel Sign U
|
||||
{U'\x01772', U'\x01773'}, // Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U
|
||||
{U'\x017b4', U'\x017d3'}, // Khmer Vowel Inherent Aq ..Khmer Sign Bathamasat
|
||||
{U'\x017dd', U'\x017dd'}, // Khmer Sign Atthacan
|
||||
{U'\x0180b', U'\x0180d'}, // Mongolian Free Variation..Mongolian Free Variation
|
||||
{U'\x018a9', U'\x018a9'}, // Mongolian Letter Ali Gali Dagalga
|
||||
{U'\x01920', U'\x0192b'}, // Limbu Vowel Sign A ..Limbu Subjoined Letter W
|
||||
{U'\x01930', U'\x0193b'}, // Limbu Small Letter Ka ..Limbu Sign Sa-i
|
||||
{U'\x019b0', U'\x019c0'}, // New Tai Lue Vowel Sign V..New Tai Lue Vowel Sign I
|
||||
{U'\x019c8', U'\x019c9'}, // New Tai Lue Tone Mark-1 ..New Tai Lue Tone Mark-2
|
||||
{U'\x01a17', U'\x01a1b'}, // Buginese Vowel Sign I ..Buginese Vowel Sign Ae
|
||||
{U'\x01dc0', U'\x01dc3'}, // Combining Dotted Grave A..Combining Suspension Mar
|
||||
{U'\x0200b', U'\x0200f'}, // Zero Width Space ..Right-to-left Mark
|
||||
{U'\x02028', U'\x0202e'}, // Line Separator ..Right-to-left Override
|
||||
{U'\x02060', U'\x02063'}, // Word Joiner ..Invisible Separator
|
||||
{U'\x0206a', U'\x0206f'}, // Inhibit Symmetric Swappi..Nominal Digit Shapes
|
||||
{U'\x020d0', U'\x020eb'}, // Combining Left Harpoon A..Combining Long Double So
|
||||
{U'\x0302a', U'\x0302f'}, // Ideographic Level Tone M..Hangul Double Dot Tone M
|
||||
{U'\x03099', U'\x0309a'}, // Combining Katakana-hirag..Combining Katakana-hirag
|
||||
{U'\x0a802', U'\x0a802'}, // Syloti Nagri Sign Dvisvara
|
||||
{U'\x0a806', U'\x0a806'}, // Syloti Nagri Sign Hasanta
|
||||
{U'\x0a80b', U'\x0a80b'}, // Syloti Nagri Sign Anusvara
|
||||
{U'\x0a823', U'\x0a827'}, // Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign
|
||||
{U'\x0d7b0', U'\x0d7ff'}, // Hangul Jungseong O-yeo ..(nil)
|
||||
{U'\x0fb1e', U'\x0fb1e'}, // Hebrew Point Judeo-spanish Varika
|
||||
{U'\x0fe00', U'\x0fe0f'}, // Variation Selector-1 ..Variation Selector-16
|
||||
{U'\x0fe20', U'\x0fe23'}, // Combining Ligature Left ..Combining Double Tilde R
|
||||
{U'\x0feff', U'\x0feff'}, // Zero Width No-break Space
|
||||
{U'\x0fff9', U'\x0fffb'}, // Interlinear Annotation A..Interlinear Annotation T
|
||||
{U'\x10a01', U'\x10a03'}, // Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo
|
||||
{U'\x10a05', U'\x10a06'}, // Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O
|
||||
{U'\x10a0c', U'\x10a0f'}, // Kharoshthi Vowel Length ..Kharoshthi Sign Visarga
|
||||
{U'\x10a38', U'\x10a3a'}, // Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo
|
||||
{U'\x10a3f', U'\x10a3f'}, // Kharoshthi Virama
|
||||
{U'\x1d165', U'\x1d169'}, // Musical Symbol Combining..Musical Symbol Combining
|
||||
{U'\x1d16d', U'\x1d182'}, // Musical Symbol Combining..Musical Symbol Combining
|
||||
{U'\x1d185', U'\x1d18b'}, // Musical Symbol Combining..Musical Symbol Combining
|
||||
{U'\x1d1aa', U'\x1d1ad'}, // Musical Symbol Combining..Musical Symbol Combining
|
||||
{U'\x1d242', U'\x1d244'}, // Combining Greek Musical ..Combining Greek Musical
|
||||
{U'\xe0001', U'\xe0001'}, // Language Tag
|
||||
{U'\xe0020', U'\xe007f'}, // Tag Space ..Cancel Tag
|
||||
{U'\xe0100', U'\xe01ef'}, // Variation Selector-17 ..Variation Selector-256
|
||||
};
|
||||
|
||||
static const BoundaryVector WIDE_EAST_ASIAN{
|
||||
{U'\x01100', U'\x01159'}, // Hangul Choseong Kiyeok ..Hangul Choseong Yeorinhi
|
||||
{U'\x0115f', U'\x0115f'}, // Hangul Choseong Filler
|
||||
{U'\x02329', U'\x0232a'}, // Left-pointing Angle Brac..Right-pointing Angle Bra
|
||||
{U'\x02e80', U'\x02e99'}, // Cjk Radical Repeat ..Cjk Radical Rap
|
||||
{U'\x02e9b', U'\x02ef3'}, // Cjk Radical Choke ..Cjk Radical C-simplified
|
||||
{U'\x02f00', U'\x02fd5'}, // Kangxi Radical One ..Kangxi Radical Flute
|
||||
{U'\x02ff0', U'\x02ffb'}, // Ideographic Description ..Ideographic Description
|
||||
{U'\x03000', U'\x03029'}, // Ideographic Space ..Hangzhou Numeral Nine
|
||||
{U'\x03030', U'\x0303e'}, // Wavy Dash ..Ideographic Variation In
|
||||
{U'\x03041', U'\x03096'}, // Hiragana Letter Small A ..Hiragana Letter Small Ke
|
||||
{U'\x0309b', U'\x030ff'}, // Katakana-hiragana Voiced..Katakana Digraph Koto
|
||||
{U'\x03105', U'\x0312c'}, // Bopomofo Letter B ..Bopomofo Letter Gn
|
||||
{U'\x03131', U'\x0318e'}, // Hangul Letter Kiyeok ..Hangul Letter Araeae
|
||||
{U'\x03190', U'\x031b7'}, // Ideographic Annotation L..Bopomofo Final Letter H
|
||||
{U'\x031c0', U'\x031cf'}, // Cjk Stroke T ..Cjk Stroke N
|
||||
{U'\x031f0', U'\x0321e'}, // Katakana Letter Small Ku..Parenthesized Korean Cha
|
||||
{U'\x03220', U'\x03243'}, // Parenthesized Ideograph ..Parenthesized Ideograph
|
||||
{U'\x03250', U'\x032fe'}, // Partnership Sign ..Circled Katakana Wo
|
||||
{U'\x03300', U'\x04db5'}, // Square Apaato ..Cjk Unified Ideograph-4d
|
||||
{U'\x04e00', U'\x09fbb'}, // Cjk Unified Ideograph-4e..Cjk Unified Ideograph-9f
|
||||
{U'\x0a000', U'\x0a48c'}, // Yi Syllable It ..Yi Syllable Yyr
|
||||
{U'\x0a490', U'\x0a4c6'}, // Yi Radical Qot ..Yi Radical Ke
|
||||
{U'\x0ac00', U'\x0d7a3'}, // Hangul Syllable Ga ..Hangul Syllable Hih
|
||||
{U'\x0f900', U'\x0fa2d'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
|
||||
{U'\x0fa30', U'\x0fa6a'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
|
||||
{U'\x0fa70', U'\x0fad9'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
|
||||
{U'\x0fe10', U'\x0fe19'}, // Presentation Form For Ve..Presentation Form For Ve
|
||||
{U'\x0fe30', U'\x0fe52'}, // Presentation Form For Ve..Small Full Stop
|
||||
{U'\x0fe54', U'\x0fe66'}, // Small Semicolon ..Small Equals Sign
|
||||
{U'\x0fe68', U'\x0fe6b'}, // Small Reverse Solidus ..Small Commercial At
|
||||
{U'\x0ff01', U'\x0ff60'}, // Fullwidth Exclamation Ma..Fullwidth Right White Pa
|
||||
{U'\x0ffe0', U'\x0ffe6'}, // Fullwidth Cent Sign ..Fullwidth Won Sign
|
||||
{U'\x20000', U'\x2fffd'}, // Cjk Unified Ideograph-20..(nil)
|
||||
{U'\x30000', U'\x3fffd'}, // Cjk Unified Ideograph-30..(nil)
|
||||
};
|
||||
|
||||
static const BoundaryVector VS16_NARROW_TO_WIDE{
|
||||
{U'\x00023', U'\x00023'}, // Number Sign
|
||||
{U'\x0002a', U'\x0002a'}, // Asterisk
|
||||
{U'\x00030', U'\x00039'}, // Digit Zero ..Digit Nine
|
||||
{U'\x000a9', U'\x000a9'}, // Copyright Sign
|
||||
{U'\x000ae', U'\x000ae'}, // Registered Sign
|
||||
{U'\x0203c', U'\x0203c'}, // Double Exclamation Mark
|
||||
{U'\x02049', U'\x02049'}, // Exclamation Question Mark
|
||||
{U'\x02122', U'\x02122'}, // Trade Mark Sign
|
||||
{U'\x02139', U'\x02139'}, // Information Source
|
||||
{U'\x02194', U'\x02199'}, // Left Right Arrow ..South West Arrow
|
||||
{U'\x021a9', U'\x021aa'}, // Leftwards Arrow With Hoo..Rightwards Arrow With Ho
|
||||
{U'\x02328', U'\x02328'}, // Keyboard
|
||||
{U'\x023cf', U'\x023cf'}, // Eject Symbol
|
||||
{U'\x023ed', U'\x023ef'}, // Black Right-pointing Dou..Black Right-pointing Tri
|
||||
{U'\x023f1', U'\x023f2'}, // Stopwatch ..Timer Clock
|
||||
{U'\x023f8', U'\x023fa'}, // Double Vertical Bar ..Black Circle For Record
|
||||
{U'\x024c2', U'\x024c2'}, // Circled Latin Capital Letter M
|
||||
{U'\x025aa', U'\x025ab'}, // Black Small Square ..White Small Square
|
||||
{U'\x025b6', U'\x025b6'}, // Black Right-pointing Triangle
|
||||
{U'\x025c0', U'\x025c0'}, // Black Left-pointing Triangle
|
||||
{U'\x025fb', U'\x025fc'}, // White Medium Square ..Black Medium Square
|
||||
{U'\x02600', U'\x02604'}, // Black Sun With Rays ..Comet
|
||||
{U'\x0260e', U'\x0260e'}, // Black Telephone
|
||||
{U'\x02611', U'\x02611'}, // Ballot Box With Check
|
||||
{U'\x02618', U'\x02618'}, // Shamrock
|
||||
{U'\x0261d', U'\x0261d'}, // White Up Pointing Index
|
||||
{U'\x02620', U'\x02620'}, // Skull And Crossbones
|
||||
{U'\x02622', U'\x02623'}, // Radioactive Sign ..Biohazard Sign
|
||||
{U'\x02626', U'\x02626'}, // Orthodox Cross
|
||||
{U'\x0262a', U'\x0262a'}, // Star And Crescent
|
||||
{U'\x0262e', U'\x0262f'}, // Peace Symbol ..Yin Yang
|
||||
{U'\x02638', U'\x0263a'}, // Wheel Of Dharma ..White Smiling Face
|
||||
{U'\x02640', U'\x02640'}, // Female Sign
|
||||
{U'\x02642', U'\x02642'}, // Male Sign
|
||||
{U'\x0265f', U'\x02660'}, // Black Chess Pawn ..Black Spade Suit
|
||||
{U'\x02663', U'\x02663'}, // Black Club Suit
|
||||
{U'\x02665', U'\x02666'}, // Black Heart Suit ..Black Diamond Suit
|
||||
{U'\x02668', U'\x02668'}, // Hot Springs
|
||||
{U'\x0267b', U'\x0267b'}, // Black Universal Recycling Symbol
|
||||
{U'\x0267e', U'\x0267e'}, // Permanent Paper Sign
|
||||
{U'\x02692', U'\x02692'}, // Hammer And Pick
|
||||
{U'\x02694', U'\x02697'}, // Crossed Swords ..Alembic
|
||||
{U'\x02699', U'\x02699'}, // Gear
|
||||
{U'\x0269b', U'\x0269c'}, // Atom Symbol ..Fleur-de-lis
|
||||
{U'\x026a0', U'\x026a0'}, // Warning Sign
|
||||
{U'\x026a7', U'\x026a7'}, // Male With Stroke And Male And Female Sign
|
||||
{U'\x026b0', U'\x026b1'}, // Coffin ..Funeral Urn
|
||||
{U'\x026c8', U'\x026c8'}, // Thunder Cloud And Rain
|
||||
{U'\x026cf', U'\x026cf'}, // Pick
|
||||
{U'\x026d1', U'\x026d1'}, // Helmet With White Cross
|
||||
{U'\x026d3', U'\x026d3'}, // Chains
|
||||
{U'\x026e9', U'\x026e9'}, // Shinto Shrine
|
||||
{U'\x026f0', U'\x026f1'}, // Mountain ..Umbrella On Ground
|
||||
{U'\x026f4', U'\x026f4'}, // Ferry
|
||||
{U'\x026f7', U'\x026f9'}, // Skier ..Person With Ball
|
||||
{U'\x02702', U'\x02702'}, // Black Scissors
|
||||
{U'\x02708', U'\x02709'}, // Airplane ..Envelope
|
||||
{U'\x0270c', U'\x0270d'}, // Victory Hand ..Writing Hand
|
||||
{U'\x0270f', U'\x0270f'}, // Pencil
|
||||
{U'\x02712', U'\x02712'}, // Black Nib
|
||||
{U'\x02714', U'\x02714'}, // Heavy Check Mark
|
||||
{U'\x02716', U'\x02716'}, // Heavy Multiplication X
|
||||
{U'\x0271d', U'\x0271d'}, // Latin Cross
|
||||
{U'\x02721', U'\x02721'}, // Star Of David
|
||||
{U'\x02733', U'\x02734'}, // Eight Spoked Asterisk ..Eight Pointed Black Star
|
||||
{U'\x02744', U'\x02744'}, // Snowflake
|
||||
{U'\x02747', U'\x02747'}, // Sparkle
|
||||
{U'\x02763', U'\x02764'}, // Heavy Heart Exclamation ..Heavy Black Heart
|
||||
{U'\x027a1', U'\x027a1'}, // Black Rightwards Arrow
|
||||
{U'\x02934', U'\x02935'}, // Arrow Pointing Rightward..Arrow Pointing Rightward
|
||||
{U'\x02b05', U'\x02b07'}, // Leftwards Black Arrow ..Downwards Black Arrow
|
||||
{U'\x1f170', U'\x1f171'}, // Negative Squared Latin C..Negative Squared Latin C
|
||||
{U'\x1f17e', U'\x1f17f'}, // Negative Squared Latin C..Negative Squared Latin C
|
||||
{U'\x1f321', U'\x1f321'}, // Thermometer
|
||||
{U'\x1f324', U'\x1f32c'}, // White Sun With Small Clo..Wind Blowing Face
|
||||
{U'\x1f336', U'\x1f336'}, // Hot Pepper
|
||||
{U'\x1f37d', U'\x1f37d'}, // Fork And Knife With Plate
|
||||
{U'\x1f396', U'\x1f397'}, // Military Medal ..Reminder Ribbon
|
||||
{U'\x1f399', U'\x1f39b'}, // Studio Microphone ..Control Knobs
|
||||
{U'\x1f39e', U'\x1f39f'}, // Film Frames ..Admission Tickets
|
||||
{U'\x1f3cb', U'\x1f3ce'}, // Weight Lifter ..Racing Car
|
||||
{U'\x1f3d4', U'\x1f3df'}, // Snow Capped Mountain ..Stadium
|
||||
{U'\x1f3f3', U'\x1f3f3'}, // Waving White Flag
|
||||
{U'\x1f3f5', U'\x1f3f5'}, // Rosette
|
||||
{U'\x1f3f7', U'\x1f3f7'}, // Label
|
||||
{U'\x1f43f', U'\x1f43f'}, // Chipmunk
|
||||
{U'\x1f441', U'\x1f441'}, // Eye
|
||||
{U'\x1f4fd', U'\x1f4fd'}, // Film Projector
|
||||
{U'\x1f549', U'\x1f54a'}, // Om Symbol ..Dove Of Peace
|
||||
{U'\x1f56f', U'\x1f570'}, // Candle ..Mantelpiece Clock
|
||||
{U'\x1f573', U'\x1f579'}, // Hole ..Joystick
|
||||
{U'\x1f587', U'\x1f587'}, // Linked Paperclips
|
||||
{U'\x1f58a', U'\x1f58d'}, // Lower Left Ballpoint Pen..Lower Left Crayon
|
||||
{U'\x1f590', U'\x1f590'}, // Raised Hand With Fingers Splayed
|
||||
{U'\x1f5a5', U'\x1f5a5'}, // Desktop Computer
|
||||
{U'\x1f5a8', U'\x1f5a8'}, // Printer
|
||||
{U'\x1f5b1', U'\x1f5b2'}, // Three Button Mouse ..Trackball
|
||||
{U'\x1f5bc', U'\x1f5bc'}, // Frame With Picture
|
||||
{U'\x1f5c2', U'\x1f5c4'}, // Card Index Dividers ..File Cabinet
|
||||
{U'\x1f5d1', U'\x1f5d3'}, // Wastebasket ..Spiral Calendar Pad
|
||||
{U'\x1f5dc', U'\x1f5de'}, // Compression ..Rolled-up Newspaper
|
||||
{U'\x1f5e1', U'\x1f5e1'}, // Dagger Knife
|
||||
{U'\x1f5e3', U'\x1f5e3'}, // Speaking Head In Silhouette
|
||||
{U'\x1f5e8', U'\x1f5e8'}, // Left Speech Bubble
|
||||
{U'\x1f5ef', U'\x1f5ef'}, // Right Anger Bubble
|
||||
{U'\x1f5f3', U'\x1f5f3'}, // Ballot Box With Ballot
|
||||
{U'\x1f5fa', U'\x1f5fa'}, // World Map
|
||||
{U'\x1f6cb', U'\x1f6cb'}, // Couch And Lamp
|
||||
{U'\x1f6cd', U'\x1f6cf'}, // Shopping Bags ..Bed
|
||||
{U'\x1f6e0', U'\x1f6e5'}, // Hammer And Wrench ..Motor Boat
|
||||
{U'\x1f6e9', U'\x1f6e9'}, // Small Airplane
|
||||
{U'\x1f6f0', U'\x1f6f0'}, // Satellite
|
||||
{U'\x1f6f3', U'\x1f6f3'}, // Passenger Ship
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
static size_t bisearch(char32_t ucs, const BoundaryVector& table) {
|
||||
// TODO: Use STD algorithm to optimize this function
|
||||
|
||||
// YYC MARK:
|
||||
// Do not change this "int" to "size_t" casually,
|
||||
// because the result of arithmetic operation may be negative.
|
||||
// Do not change this type before using new algorithm.
|
||||
int lbound = 0, ubound = table.size() - 1;
|
||||
|
||||
if (ucs < table.front().first || ucs > table.back().second) return 0;
|
||||
|
||||
while (ubound >= lbound) {
|
||||
int mid = (lbound + ubound) / 2;
|
||||
if (ucs > table[mid].second) lbound = mid + 1;
|
||||
else if (ucs < table[mid].first) ubound = mid - 1;
|
||||
else return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t wcwidth(char32_t wc) {
|
||||
// TODO: Add lru_cache(maxsize=1000) for this function
|
||||
|
||||
// Small optimize for ASCII
|
||||
if (U'\x20' <= wc && wc < U'\x7F') [[likely]]
|
||||
return 1;
|
||||
|
||||
// C0/C1 control char
|
||||
// NOTE: Not vanilla implementation. Return 0 instead of 1.
|
||||
if ((wc && wc < L'\x20') || (L'\x7F' <= wc && wc < L'\xA0')) return 0;
|
||||
|
||||
// Zero-width char
|
||||
if (bisearch(wc, ZERO_WIDTH)) return 0;
|
||||
|
||||
// Width 1 or 2
|
||||
return 1 + bisearch(wc, WIDE_EAST_ASIAN);
|
||||
}
|
||||
|
||||
/// State of the scanner used by wcswidth while it walks through a string.
/// The scanner switches on this state for every char it reads.
enum class WcswidthState {
|
||||
/// Normal character.
|
||||
Normal,
|
||||
/// Under ZWJ control char (entered after reading U+200D).
|
||||
/// Ignore the width of the next char, then go back to Normal.
|
||||
ZeroWidthJoiner,
|
||||
/// Under ANSI Escape Sequence (entered after reading U+001B).
|
||||
/// Following chars should be treated as escape chars.
|
||||
AnsiEscape,
|
||||
/// Under CSI control sequence, a part of ANSI Escape Sequence.
|
||||
/// No width is accumulated before the terminal char.
|
||||
AnsiCsiEscape,
|
||||
};
|
||||
|
||||
struct WcswidthContext {
|
||||
/// Current state.
|
||||
WcswidthState state;
|
||||
/// Tract the last computed char.
|
||||
/// It will be used for VS16 char.
|
||||
std::optional<char32_t> last_measured_char;
|
||||
};
|
||||
|
||||
/**
 * @brief Compute how many terminal columns the given UTF-32 string occupies.
 * @details
 * Besides summing up the width of every char, this function also handles:
 * \li ZWJ (U+200D): the joiner itself and the char right after it add no width.
 * \li VS16 (U+FE0F): adds one more column if the previously measured char
 * is listed in the narrow-to-wide table.
 * \li ANSI Escape Sequence (including CSI sequence): the whole sequence adds no width.
 *
 * An escape sequence which is still unfinished when the string ends is not reported as an error.
 * @param[in] rhs The string to be measured.
 * @return The accumulated width, or an error if a bad char is found inside an escape sequence.
 */
Result<size_t> wcswidth(const std::u32string_view& rhs) {
    WcswidthContext ctx{WcswidthState::Normal, std::nullopt};
    size_t width = 0;

    for (char32_t chr : rhs) {
        // Dispatch on the current state first, then on the char value.
        switch (ctx.state) {
            case WcswidthState::Normal: {
                switch (chr) {
                    case U'\x200D': {
                        // ZWJ control char
                        ctx.state = WcswidthState::ZeroWidthJoiner;
                        break;
                    }
                    case U'\xFE0F': {
                        // VS16 control char
                        // If we have a char which was acknowledged and has width,
                        // analyse it instead of this control char.
                        // The record is cleared afterwards so that
                        // one char is widened by VS16 at most once.
                        if (ctx.last_measured_char.has_value()) {
                            width += bisearch(ctx.last_measured_char.value(), VS16_NARROW_TO_WIDE);
                            ctx.last_measured_char = std::nullopt;
                        }
                        break;
                    }
                    case U'\x1B': {
                        // ANSI escape sequence
                        ctx.state = WcswidthState::AnsiEscape;
                        break;
                    }
                    default: {
                        // Fetch width for normal char
                        const size_t wcw = wcwidth(chr);
                        // Track the final non-zero char for VS16 control char.
                        // What we record is the char itself, not its width,
                        // because the VS16 lookup is keyed by code point.
                        if (wcw > 0) ctx.last_measured_char = chr;
                        // Accumulate width
                        width += wcw;
                        break;
                    }
                }
                break;
            }
            case WcswidthState::ZeroWidthJoiner: {
                // Eat this char and back to normal state.
                // This is what ZWJ does.
                ctx.state = WcswidthState::Normal;
                break;
            }
            case WcswidthState::AnsiEscape: {
                // Check the second char of escape sequence.
                // If it is '[', we enter CSI state,
                // otherwise we eat it and back to normal state.
                // Additionally, there is a range requirement for this char (0x40-0x5F).
                if (chr == U'[') {
                    ctx.state = WcswidthState::AnsiCsiEscape;
                } else if (chr >= U'\x40' && chr <= U'\x5F') {
                    ctx.state = WcswidthState::Normal;
                } else {
                    return std::unexpected(Error::BadAnsiEscSeq);
                }
                break;
            }
            case WcswidthState::AnsiCsiEscape: {
                // CSI sequence consists of Parameter Chars and Middle Chars
                // (the count of both can be zero) and exactly one Final Char.
                // So we eat all chars until we reach the terminal char.
                if (chr >= U'\x40' && chr <= U'\x7E') {
                    // Final Char. Back to normal state.
                    ctx.state = WcswidthState::Normal;
                } else if (chr >= U'\x30' && chr <= U'\x3F') {
                    ; // Parameter Char. Do nothing
                } else if (chr >= U'\x20' && chr <= U'\x2F') {
                    ; // Middle Char. Do nothing
                } else {
                    return std::unexpected(Error::BadCsiSeq);
                }
                break;
            }
        }
    }

    return width;
}
|
||||
|
||||
/**
 * @brief Compute how many terminal columns the given UTF-8 string occupies.
 * @details
 * The string is converted to UTF-32 first and then measured by the UTF-32 overload.
 * @param[in] rhs The UTF-8 string to be measured.
 * @return The width reported by the UTF-32 overload (including its errors),
 * or Error::BadEncoding if the conversion to UTF-32 failed.
 */
Result<size_t> wcswidth(const std::u8string_view& rhs) {
    // Measuring works on UTF-32, so convert the input first.
    auto decoded = ENC::to_utf32(rhs);
    // Hand the converted text over to the UTF-32 overload,
    // or give up if there is no converted text.
    if (decoded.has_value()) return wcswidth(decoded.value());
    else return std::unexpected(Error::BadEncoding);
}
|
||||
|
||||
} // namespace yycc::carton::wcwidth
|
47
src/yycc/carton/wcwidth.hpp
Normal file
47
src/yycc/carton/wcwidth.hpp
Normal file
@ -0,0 +1,47 @@
|
||||
#pragma once
|
||||
#include <string_view>
|
||||
#include <expected>
|
||||
|
||||
/**
 * @brief The namespace replicating the Linux-specialized function, "wcswidth", on all platforms.
 * @details
 * "wcswidth" is a specialized function in Linux.
 * It was not included in POSIX standard and only provided on Linux.
 * This function can fetch how many spaces the given string occupies in terminal.
 * This is an essential and useful function in our library.
 * So I create this namespace to make "wcswidth" available on all platforms.
 *
 * "wcswidth" is based on \c wchar_t. In Linux, \c wchar_t is 4-bytes length.
 * It can represent any characters without surrogate pair.
 * However, in Windows, \c wchar_t is 2-bytes length.
 * There is possible surrogate pair within \c wchar_t string, which is inconvenient for our programming.
 * So in this homebrew namespace, I forcibly use \c char32_t as the basic char type.
 *
 * Due to the requirements of mine, this implementation is slightly different from the original one.
 * These differences are listed below:
 *
 * \li We do not return negative value for Control Char in "wcwidth",
 * because we need to support the analysis of ANSI Escape Sequence.
 * \li Due to the previous change, the type of return value of "wcwidth" and "wcswidth"
 * is changed from \c int to \c size_t because there is no negative return value.
 * \li "wcswidth" now supports ANSI Escape Sequence (e.g. terminal color).
 * So it can analyse colorful output with correct space.
 */
namespace yycc::carton::wcwidth {

    /// @brief Error occurs in this module
    enum class Error {
        /// Given string can not be converted into UTF-32.
        BadEncoding,
        /// Bad char when processing ANSI Escape Sequence
        BadAnsiEscSeq,
        /// Bad char when processing CSI Sequence.
        BadCsiSeq,
    };

    /// @brief Result type for this module
    template<typename T>
    using Result = std::expected<T, Error>;

    /**
     * @brief Get how many terminal columns the given char occupies.
     * @param[in] wc The char to be measured.
     * @return The width of given char. Control Char yields zero rather than a negative value.
     */
    size_t wcwidth(char32_t wc);

    /**
     * @brief Get how many terminal columns the given UTF-32 string occupies.
     * @param[in] rhs The string to be measured. ANSI Escape Sequence is allowed in it.
     * @return The width of given string, or the error occurs when measuring.
     */
    Result<size_t> wcswidth(const std::u32string_view& rhs);

    /**
     * @brief Get how many terminal columns the given UTF-8 string occupies.
     * @param[in] rhs The string to be measured. ANSI Escape Sequence is allowed in it.
     * @return The width of given string, or the error occurs when measuring.
     */
    Result<size_t> wcswidth(const std::u8string_view& rhs);

} // namespace yycc::carton::wcwidth
|
@ -28,6 +28,7 @@ PRIVATE
|
||||
yycc/windows/winfct.cpp
|
||||
|
||||
yycc/carton/pycodec.cpp
|
||||
yycc/carton/wcwidth.cpp
|
||||
)
|
||||
target_sources(YYCCTestbench
|
||||
PRIVATE
|
||||
|
52
testbench/yycc/carton/wcwidth.cpp
Normal file
52
testbench/yycc/carton/wcwidth.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <yycc.hpp>
|
||||
#include <yycc/carton/wcwidth.hpp>
|
||||
|
||||
#define WCWDITH ::yycc::carton::wcwidth
|
||||
|
||||
namespace yycctest::carton::wcwidth {
|
||||
|
||||
#define TEST_SUCCESS(strl, len) \
|
||||
{ \
|
||||
auto rv = WCWDITH::wcswidth(strl); \
|
||||
ASSERT_TRUE(rv.has_value()); \
|
||||
EXPECT_EQ(rv.value(), len); \
|
||||
}
|
||||
|
||||
#define TEST_FAIL(strl) \
|
||||
{ \
|
||||
auto rv = WCWDITH::wcswidth(strl); \
|
||||
EXPECT_FALSE(rv.has_value()); \
|
||||
}
|
||||
|
||||
TEST(CartonWcwdith, BadAnsi) {
|
||||
TEST_FAIL(u8"\033?");
|
||||
}
|
||||
|
||||
TEST(CartonWcwdith, BadCsi) {
|
||||
TEST_FAIL(u8"\033[\t");
|
||||
}
|
||||
|
||||
TEST(CartonWcwdith, English) {
|
||||
TEST_SUCCESS(u8"abc", 3);
|
||||
}
|
||||
|
||||
TEST(CartonWcwdith, Chinese) {
|
||||
TEST_SUCCESS(u8"中文", 4);
|
||||
TEST_SUCCESS(u8"中a文", 5);
|
||||
}
|
||||
|
||||
TEST(CartonWcwdith, Japanese) {
|
||||
TEST_SUCCESS(u8"ありがとう", 10);
|
||||
TEST_SUCCESS(u8"アリガトウ", 10);
|
||||
TEST_SUCCESS(u8"アリガトウ", 6);
|
||||
}
|
||||
|
||||
TEST(CartonWcwdith, Termcolor) {
|
||||
// TODO: Fix this after finish "termcolor".
|
||||
// assert_eq!(wcswidth(&colored("abc", Color::Red, Default::default(), Default::default())), 3);
|
||||
// assert_eq!(wcswidth(&colored("中文", Color::Red, Default::default(), Default::default())), 4);
|
||||
// assert_eq!(wcswidth(&colored("ありがとう", Color::Red, Default::default(), Default::default())), 10);
|
||||
}
|
||||
|
||||
} // namespace yycctest::carton::wcwidth
|
Reference in New Issue
Block a user