feat: add new package wcwidth

- add wcwidth in carton.
- tell clang-format not to reformat some generated content.
This commit is contained in:
2025-08-15 16:42:28 +08:00
parent 00c8f09907
commit 2f11ba6023
6 changed files with 651 additions and 33 deletions

View File

@ -24,6 +24,7 @@ PRIVATE
yycc/encoding/iconv.cpp
yycc/carton/pycodec.cpp
yycc/carton/wcwidth.cpp
)
target_sources(YYCCommonplace
PUBLIC
@ -67,6 +68,7 @@ FILES
yycc/encoding/iconv.hpp
yycc/carton/pycodec.hpp
yycc/carton/wcwidth.hpp
)
# Setup header information
target_include_directories(YYCCommonplace

View File

@ -19,6 +19,7 @@ namespace yycc::carton::pycodec {
#pragma region Encoding Name
// clang-format off
static const std::map<std::u8string_view, std::u8string_view> ALIAS_MAP{
{u8"646"sv, u8"ascii"sv},
{u8"us-ascii"sv, u8"ascii"sv},
@ -224,6 +225,7 @@ namespace yycc::carton::pycodec {
{u8"utf-8"sv, u8"utf_8"sv},
{u8"cp65001"sv, u8"utf_8"sv},
};
// clang-format on
/**
* @brief Resolve encoding name alias and fetch real encoding name.
@ -248,40 +250,74 @@ namespace yycc::carton::pycodec {
using CodePage = YYCC_PYCODEC_BACKEND_NS::CodePage;
static const std::map<std::u8string_view, CodePage> WINCP_MAP{
{u8"ascii"sv, static_cast<CodePage>(437u)}, {u8"big5"sv, static_cast<CodePage>(950u)},
{u8"cp037"sv, static_cast<CodePage>(037u)}, {u8"cp437"sv, static_cast<CodePage>(437u)},
{u8"cp500"sv, static_cast<CodePage>(500u)}, {u8"cp720"sv, static_cast<CodePage>(720u)},
{u8"cp737"sv, static_cast<CodePage>(737u)}, {u8"cp775"sv, static_cast<CodePage>(775u)},
{u8"cp850"sv, static_cast<CodePage>(850u)}, {u8"cp852"sv, static_cast<CodePage>(852u)},
{u8"cp855"sv, static_cast<CodePage>(855u)}, {u8"cp857"sv, static_cast<CodePage>(857u)},
{u8"cp858"sv, static_cast<CodePage>(858u)}, {u8"cp860"sv, static_cast<CodePage>(860u)},
{u8"cp861"sv, static_cast<CodePage>(861u)}, {u8"cp862"sv, static_cast<CodePage>(862u)},
{u8"cp863"sv, static_cast<CodePage>(863u)}, {u8"cp864"sv, static_cast<CodePage>(864u)},
{u8"cp865"sv, static_cast<CodePage>(865u)}, {u8"cp866"sv, static_cast<CodePage>(866u)},
{u8"cp869"sv, static_cast<CodePage>(869u)}, {u8"cp874"sv, static_cast<CodePage>(874u)},
{u8"cp875"sv, static_cast<CodePage>(875u)}, {u8"cp932"sv, static_cast<CodePage>(932u)},
{u8"cp949"sv, static_cast<CodePage>(949u)}, {u8"cp950"sv, static_cast<CodePage>(950u)},
{u8"cp1026"sv, static_cast<CodePage>(1026u)}, {u8"cp1140"sv, static_cast<CodePage>(1140u)},
{u8"cp1250"sv, static_cast<CodePage>(1250u)}, {u8"cp1251"sv, static_cast<CodePage>(1251u)},
{u8"cp1252"sv, static_cast<CodePage>(1252u)}, {u8"cp1253"sv, static_cast<CodePage>(1253u)},
{u8"cp1254"sv, static_cast<CodePage>(1254u)}, {u8"cp1255"sv, static_cast<CodePage>(1255u)},
{u8"cp1256"sv, static_cast<CodePage>(1256u)}, {u8"cp1257"sv, static_cast<CodePage>(1257u)},
{u8"cp1258"sv, static_cast<CodePage>(1258u)}, {u8"euc_jp"sv, static_cast<CodePage>(20932u)},
{u8"euc_kr"sv, static_cast<CodePage>(51949u)}, {u8"gb2312"sv, static_cast<CodePage>(936u)},
{u8"gbk"sv, static_cast<CodePage>(936u)}, {u8"gb18030"sv, static_cast<CodePage>(54936u)},
{u8"hz"sv, static_cast<CodePage>(52936u)}, {u8"iso2022_jp"sv, static_cast<CodePage>(50220u)},
{u8"iso2022_kr"sv, static_cast<CodePage>(50225u)}, {u8"latin_1"sv, static_cast<CodePage>(28591u)},
{u8"iso8859_2"sv, static_cast<CodePage>(28592u)}, {u8"iso8859_3"sv, static_cast<CodePage>(28593u)},
{u8"iso8859_4"sv, static_cast<CodePage>(28594u)}, {u8"iso8859_5"sv, static_cast<CodePage>(28595u)},
{u8"iso8859_6"sv, static_cast<CodePage>(28596u)}, {u8"iso8859_7"sv, static_cast<CodePage>(28597u)},
{u8"iso8859_8"sv, static_cast<CodePage>(28598u)}, {u8"iso8859_9"sv, static_cast<CodePage>(28599u)},
{u8"iso8859_13"sv, static_cast<CodePage>(28603u)}, {u8"iso8859_15"sv, static_cast<CodePage>(28605u)},
{u8"johab"sv, static_cast<CodePage>(1361u)}, {u8"mac_cyrillic"sv, static_cast<CodePage>(10007u)},
{u8"mac_greek"sv, static_cast<CodePage>(10006u)}, {u8"mac_iceland"sv, static_cast<CodePage>(10079u)},
{u8"mac_turkish"sv, static_cast<CodePage>(10081u)}, {u8"shift_jis"sv, static_cast<CodePage>(932u)},
{u8"utf_7"sv, static_cast<CodePage>(65000u)}, {u8"utf_8"sv, static_cast<CodePage>(65001u)},
// clang-format off
/// Maps a resolved encoding name to the Windows code page number used for it.
/// NOTE: every number is written in decimal on purpose.
/// A leading zero (e.g. "037u") would turn the literal into octal and yield a different code page.
static const std::map<std::u8string_view, CodePage> WINCP_MAP {
{ u8"ascii"sv, static_cast<CodePage>(437u) },
{ u8"big5"sv, static_cast<CodePage>(950u) },
{ u8"cp037"sv, static_cast<CodePage>(37u) },
{ u8"cp437"sv, static_cast<CodePage>(437u) },
{ u8"cp500"sv, static_cast<CodePage>(500u) },
{ u8"cp720"sv, static_cast<CodePage>(720u) },
{ u8"cp737"sv, static_cast<CodePage>(737u) },
{ u8"cp775"sv, static_cast<CodePage>(775u) },
{ u8"cp850"sv, static_cast<CodePage>(850u) },
{ u8"cp852"sv, static_cast<CodePage>(852u) },
{ u8"cp855"sv, static_cast<CodePage>(855u) },
{ u8"cp857"sv, static_cast<CodePage>(857u) },
{ u8"cp858"sv, static_cast<CodePage>(858u) },
{ u8"cp860"sv, static_cast<CodePage>(860u) },
{ u8"cp861"sv, static_cast<CodePage>(861u) },
{ u8"cp862"sv, static_cast<CodePage>(862u) },
{ u8"cp863"sv, static_cast<CodePage>(863u) },
{ u8"cp864"sv, static_cast<CodePage>(864u) },
{ u8"cp865"sv, static_cast<CodePage>(865u) },
{ u8"cp866"sv, static_cast<CodePage>(866u) },
{ u8"cp869"sv, static_cast<CodePage>(869u) },
{ u8"cp874"sv, static_cast<CodePage>(874u) },
{ u8"cp875"sv, static_cast<CodePage>(875u) },
{ u8"cp932"sv, static_cast<CodePage>(932u) },
{ u8"cp949"sv, static_cast<CodePage>(949u) },
{ u8"cp950"sv, static_cast<CodePage>(950u) },
{ u8"cp1026"sv, static_cast<CodePage>(1026u) },
{ u8"cp1140"sv, static_cast<CodePage>(1140u) },
{ u8"cp1250"sv, static_cast<CodePage>(1250u) },
{ u8"cp1251"sv, static_cast<CodePage>(1251u) },
{ u8"cp1252"sv, static_cast<CodePage>(1252u) },
{ u8"cp1253"sv, static_cast<CodePage>(1253u) },
{ u8"cp1254"sv, static_cast<CodePage>(1254u) },
{ u8"cp1255"sv, static_cast<CodePage>(1255u) },
{ u8"cp1256"sv, static_cast<CodePage>(1256u) },
{ u8"cp1257"sv, static_cast<CodePage>(1257u) },
{ u8"cp1258"sv, static_cast<CodePage>(1258u) },
{ u8"euc_jp"sv, static_cast<CodePage>(20932u) },
{ u8"euc_kr"sv, static_cast<CodePage>(51949u) },
{ u8"gb2312"sv, static_cast<CodePage>(936u) },
{ u8"gbk"sv, static_cast<CodePage>(936u) },
{ u8"gb18030"sv, static_cast<CodePage>(54936u) },
{ u8"hz"sv, static_cast<CodePage>(52936u) },
{ u8"iso2022_jp"sv, static_cast<CodePage>(50220u) },
{ u8"iso2022_kr"sv, static_cast<CodePage>(50225u) },
{ u8"latin_1"sv, static_cast<CodePage>(28591u) },
{ u8"iso8859_2"sv, static_cast<CodePage>(28592u) },
{ u8"iso8859_3"sv, static_cast<CodePage>(28593u) },
{ u8"iso8859_4"sv, static_cast<CodePage>(28594u) },
{ u8"iso8859_5"sv, static_cast<CodePage>(28595u) },
{ u8"iso8859_6"sv, static_cast<CodePage>(28596u) },
{ u8"iso8859_7"sv, static_cast<CodePage>(28597u) },
{ u8"iso8859_8"sv, static_cast<CodePage>(28598u) },
{ u8"iso8859_9"sv, static_cast<CodePage>(28599u) },
{ u8"iso8859_13"sv, static_cast<CodePage>(28603u) },
{ u8"iso8859_15"sv, static_cast<CodePage>(28605u) },
{ u8"johab"sv, static_cast<CodePage>(1361u) },
{ u8"mac_cyrillic"sv, static_cast<CodePage>(10007u) },
{ u8"mac_greek"sv, static_cast<CodePage>(10006u) },
{ u8"mac_iceland"sv, static_cast<CodePage>(10079u) },
{ u8"mac_turkish"sv, static_cast<CodePage>(10081u) },
{ u8"shift_jis"sv, static_cast<CodePage>(932u) },
{ u8"utf_7"sv, static_cast<CodePage>(65000u) },
{ u8"utf_8"sv, static_cast<CodePage>(65001u) },
};
// clang-format on
static FetchResult<CodePage> fetch_code_page(const std::u8string_view& enc_name) {
// resolve alias
@ -296,6 +332,7 @@ namespace yycc::carton::pycodec {
#else
// clang-format off
static const std::map<std::u8string_view, std::string_view> ICONV_MAP{
{u8"ascii"sv, "ASCII"sv},
{u8"big5"sv, "BIG5"sv},
@ -359,6 +396,7 @@ namespace yycc::carton::pycodec {
{u8"utf_7"sv, "UTF-7"sv},
{u8"utf_8"sv, "UTF-8"sv},
};
// clang-format on
static FetchResult<std::string_view> fetch_iconv_name(const std::u8string_view& enc_name) {
// resolve alias

478
src/yycc/carton/wcwidth.cpp Normal file
View File

@ -0,0 +1,478 @@
#include "wcwidth.hpp"
#include "../encoding/stl.hpp"
#include <algorithm>
#include <optional>
#include <utility>
#include <vector>
#define ENC ::yycc::encoding::stl
namespace yycc::carton::wcwidth {
/// An inclusive range of code points, stored as (first, last).
using Boundary = std::pair<char32_t, char32_t>;
/// A lookup table of code point ranges, searched by bisearch().
using BoundaryVector = std::vector<Boundary>;
// YYC MARK:
// The following tables and code are copied from the Python package "wcwidth",
// whose code was in turn copied from the original "wcwidth" C implementation.
//
// I do not need a very exact measurement.
// I just want an "it works" wcwidth on all platforms.
// So these tables come from the table set with the lowest UNICODE version
// (the original package provides different tables for different UNICODE versions).
// clang-format off
/// Code point ranges which wcwidth() reports as occupying no column (width 0).
/// Each entry is an inclusive (first, last) pair and entries are listed in ascending order,
/// which the binary search in bisearch() relies on.
static const BoundaryVector ZERO_WIDTH{
{U'\x00000', U'\x00000'}, // (nil)
{U'\x000ad', U'\x000ad'}, // Soft Hyphen
{U'\x00300', U'\x0036f'}, // Combining Grave Accent ..Combining Latin Small Le
{U'\x00483', U'\x00486'}, // Combining Cyrillic Titlo..Combining Cyrillic Psili
{U'\x00488', U'\x00489'}, // Combining Cyrillic Hundr..Combining Cyrillic Milli
{U'\x00591', U'\x005b9'}, // Hebrew Accent Etnahta ..Hebrew Point Holam
{U'\x005bb', U'\x005bd'}, // Hebrew Point Qubuts ..Hebrew Point Meteg
{U'\x005bf', U'\x005bf'}, // Hebrew Point Rafe
{U'\x005c1', U'\x005c2'}, // Hebrew Point Shin Dot ..Hebrew Point Sin Dot
{U'\x005c4', U'\x005c5'}, // Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot
{U'\x005c7', U'\x005c7'}, // Hebrew Point Qamats Qatan
{U'\x00600', U'\x00603'}, // Arabic Number Sign ..Arabic Sign Safha
{U'\x00610', U'\x00615'}, // Arabic Sign Sallallahou ..Arabic Small High Tah
{U'\x0064b', U'\x0065e'}, // Arabic Fathatan ..Arabic Fatha With Two Do
{U'\x00670', U'\x00670'}, // Arabic Letter Superscript Alef
{U'\x006d6', U'\x006e4'}, // Arabic Small High Ligatu..Arabic Small High Madda
{U'\x006e7', U'\x006e8'}, // Arabic Small High Yeh ..Arabic Small High Noon
{U'\x006ea', U'\x006ed'}, // Arabic Empty Centre Low ..Arabic Small Low Meem
{U'\x0070f', U'\x0070f'}, // Syriac Abbreviation Mark
{U'\x00711', U'\x00711'}, // Syriac Letter Superscript Alaph
{U'\x00730', U'\x0074a'}, // Syriac Pthaha Above ..Syriac Barrekh
{U'\x007a6', U'\x007b0'}, // Thaana Abafili ..Thaana Sukun
{U'\x00901', U'\x00903'}, // Devanagari Sign Candrabi..Devanagari Sign Visarga
{U'\x0093c', U'\x0093c'}, // Devanagari Sign Nukta
{U'\x0093e', U'\x0094d'}, // Devanagari Vowel Sign Aa..Devanagari Sign Virama
{U'\x00951', U'\x00954'}, // Devanagari Stress Sign U..Devanagari Acute Accent
{U'\x00962', U'\x00963'}, // Devanagari Vowel Sign Vo..Devanagari Vowel Sign Vo
{U'\x00981', U'\x00983'}, // Bengali Sign Candrabindu..Bengali Sign Visarga
{U'\x009bc', U'\x009bc'}, // Bengali Sign Nukta
{U'\x009be', U'\x009c4'}, // Bengali Vowel Sign Aa ..Bengali Vowel Sign Vocal
{U'\x009c7', U'\x009c8'}, // Bengali Vowel Sign E ..Bengali Vowel Sign Ai
{U'\x009cb', U'\x009cd'}, // Bengali Vowel Sign O ..Bengali Sign Virama
{U'\x009d7', U'\x009d7'}, // Bengali Au Length Mark
{U'\x009e2', U'\x009e3'}, // Bengali Vowel Sign Vocal..Bengali Vowel Sign Vocal
{U'\x00a01', U'\x00a03'}, // Gurmukhi Sign Adak Bindi..Gurmukhi Sign Visarga
{U'\x00a3c', U'\x00a3c'}, // Gurmukhi Sign Nukta
{U'\x00a3e', U'\x00a42'}, // Gurmukhi Vowel Sign Aa ..Gurmukhi Vowel Sign Uu
{U'\x00a47', U'\x00a48'}, // Gurmukhi Vowel Sign Ee ..Gurmukhi Vowel Sign Ai
{U'\x00a4b', U'\x00a4d'}, // Gurmukhi Vowel Sign Oo ..Gurmukhi Sign Virama
{U'\x00a70', U'\x00a71'}, // Gurmukhi Tippi ..Gurmukhi Addak
{U'\x00a81', U'\x00a83'}, // Gujarati Sign Candrabind..Gujarati Sign Visarga
{U'\x00abc', U'\x00abc'}, // Gujarati Sign Nukta
{U'\x00abe', U'\x00ac5'}, // Gujarati Vowel Sign Aa ..Gujarati Vowel Sign Cand
{U'\x00ac7', U'\x00ac9'}, // Gujarati Vowel Sign E ..Gujarati Vowel Sign Cand
{U'\x00acb', U'\x00acd'}, // Gujarati Vowel Sign O ..Gujarati Sign Virama
{U'\x00ae2', U'\x00ae3'}, // Gujarati Vowel Sign Voca..Gujarati Vowel Sign Voca
{U'\x00b01', U'\x00b03'}, // Oriya Sign Candrabindu ..Oriya Sign Visarga
{U'\x00b3c', U'\x00b3c'}, // Oriya Sign Nukta
{U'\x00b3e', U'\x00b43'}, // Oriya Vowel Sign Aa ..Oriya Vowel Sign Vocalic
{U'\x00b47', U'\x00b48'}, // Oriya Vowel Sign E ..Oriya Vowel Sign Ai
{U'\x00b4b', U'\x00b4d'}, // Oriya Vowel Sign O ..Oriya Sign Virama
{U'\x00b56', U'\x00b57'}, // Oriya Ai Length Mark ..Oriya Au Length Mark
{U'\x00b82', U'\x00b82'}, // Tamil Sign Anusvara
{U'\x00bbe', U'\x00bc2'}, // Tamil Vowel Sign Aa ..Tamil Vowel Sign Uu
{U'\x00bc6', U'\x00bc8'}, // Tamil Vowel Sign E ..Tamil Vowel Sign Ai
{U'\x00bca', U'\x00bcd'}, // Tamil Vowel Sign O ..Tamil Sign Virama
{U'\x00bd7', U'\x00bd7'}, // Tamil Au Length Mark
{U'\x00c01', U'\x00c03'}, // Telugu Sign Candrabindu ..Telugu Sign Visarga
{U'\x00c3e', U'\x00c44'}, // Telugu Vowel Sign Aa ..Telugu Vowel Sign Vocali
{U'\x00c46', U'\x00c48'}, // Telugu Vowel Sign E ..Telugu Vowel Sign Ai
{U'\x00c4a', U'\x00c4d'}, // Telugu Vowel Sign O ..Telugu Sign Virama
{U'\x00c55', U'\x00c56'}, // Telugu Length Mark ..Telugu Ai Length Mark
{U'\x00c82', U'\x00c83'}, // Kannada Sign Anusvara ..Kannada Sign Visarga
{U'\x00cbc', U'\x00cbc'}, // Kannada Sign Nukta
{U'\x00cbe', U'\x00cc4'}, // Kannada Vowel Sign Aa ..Kannada Vowel Sign Vocal
{U'\x00cc6', U'\x00cc8'}, // Kannada Vowel Sign E ..Kannada Vowel Sign Ai
{U'\x00cca', U'\x00ccd'}, // Kannada Vowel Sign O ..Kannada Sign Virama
{U'\x00cd5', U'\x00cd6'}, // Kannada Length Mark ..Kannada Ai Length Mark
{U'\x00d02', U'\x00d03'}, // Malayalam Sign Anusvara ..Malayalam Sign Visarga
{U'\x00d3e', U'\x00d43'}, // Malayalam Vowel Sign Aa ..Malayalam Vowel Sign Voc
{U'\x00d46', U'\x00d48'}, // Malayalam Vowel Sign E ..Malayalam Vowel Sign Ai
{U'\x00d4a', U'\x00d4d'}, // Malayalam Vowel Sign O ..Malayalam Sign Virama
{U'\x00d57', U'\x00d57'}, // Malayalam Au Length Mark
{U'\x00d82', U'\x00d83'}, // Sinhala Sign Anusvaraya ..Sinhala Sign Visargaya
{U'\x00dca', U'\x00dca'}, // Sinhala Sign Al-lakuna
{U'\x00dcf', U'\x00dd4'}, // Sinhala Vowel Sign Aela-..Sinhala Vowel Sign Ketti
{U'\x00dd6', U'\x00dd6'}, // Sinhala Vowel Sign Diga Paa-pilla
{U'\x00dd8', U'\x00ddf'}, // Sinhala Vowel Sign Gaett..Sinhala Vowel Sign Gayan
{U'\x00df2', U'\x00df3'}, // Sinhala Vowel Sign Diga ..Sinhala Vowel Sign Diga
{U'\x00e31', U'\x00e31'}, // Thai Character Mai Han-akat
{U'\x00e34', U'\x00e3a'}, // Thai Character Sara I ..Thai Character Phinthu
{U'\x00e47', U'\x00e4e'}, // Thai Character Maitaikhu..Thai Character Yamakkan
{U'\x00eb1', U'\x00eb1'}, // Lao Vowel Sign Mai Kan
{U'\x00eb4', U'\x00eb9'}, // Lao Vowel Sign I ..Lao Vowel Sign Uu
{U'\x00ebb', U'\x00ebc'}, // Lao Vowel Sign Mai Kon ..Lao Semivowel Sign Lo
{U'\x00ec8', U'\x00ecd'}, // Lao Tone Mai Ek ..Lao Niggahita
{U'\x00f18', U'\x00f19'}, // Tibetan Astrological Sig..Tibetan Astrological Sig
{U'\x00f35', U'\x00f35'}, // Tibetan Mark Ngas Bzung Nyi Zla
{U'\x00f37', U'\x00f37'}, // Tibetan Mark Ngas Bzung Sgor Rtags
{U'\x00f39', U'\x00f39'}, // Tibetan Mark Tsa -phru
{U'\x00f3e', U'\x00f3f'}, // Tibetan Sign Yar Tshes ..Tibetan Sign Mar Tshes
{U'\x00f71', U'\x00f84'}, // Tibetan Vowel Sign Aa ..Tibetan Mark Halanta
{U'\x00f86', U'\x00f87'}, // Tibetan Sign Lci Rtags ..Tibetan Sign Yang Rtags
{U'\x00f90', U'\x00f97'}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter
{U'\x00f99', U'\x00fbc'}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter
{U'\x00fc6', U'\x00fc6'}, // Tibetan Symbol Padma Gdan
{U'\x0102c', U'\x01032'}, // Myanmar Vowel Sign Aa ..Myanmar Vowel Sign Ai
{U'\x01036', U'\x01039'}, // Myanmar Sign Anusvara ..Myanmar Sign Virama
{U'\x01056', U'\x01059'}, // Myanmar Vowel Sign Vocal..Myanmar Vowel Sign Vocal
{U'\x01160', U'\x011ff'}, // Hangul Jungseong Filler ..Hangul Jongseong Ssangni
{U'\x0135f', U'\x0135f'}, // Ethiopic Combining Gemination Mark
{U'\x01712', U'\x01714'}, // Tagalog Vowel Sign I ..Tagalog Sign Virama
{U'\x01732', U'\x01734'}, // Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod
{U'\x01752', U'\x01753'}, // Buhid Vowel Sign I ..Buhid Vowel Sign U
{U'\x01772', U'\x01773'}, // Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U
{U'\x017b4', U'\x017d3'}, // Khmer Vowel Inherent Aq ..Khmer Sign Bathamasat
{U'\x017dd', U'\x017dd'}, // Khmer Sign Atthacan
{U'\x0180b', U'\x0180d'}, // Mongolian Free Variation..Mongolian Free Variation
{U'\x018a9', U'\x018a9'}, // Mongolian Letter Ali Gali Dagalga
{U'\x01920', U'\x0192b'}, // Limbu Vowel Sign A ..Limbu Subjoined Letter W
{U'\x01930', U'\x0193b'}, // Limbu Small Letter Ka ..Limbu Sign Sa-i
{U'\x019b0', U'\x019c0'}, // New Tai Lue Vowel Sign V..New Tai Lue Vowel Sign I
{U'\x019c8', U'\x019c9'}, // New Tai Lue Tone Mark-1 ..New Tai Lue Tone Mark-2
{U'\x01a17', U'\x01a1b'}, // Buginese Vowel Sign I ..Buginese Vowel Sign Ae
{U'\x01dc0', U'\x01dc3'}, // Combining Dotted Grave A..Combining Suspension Mar
{U'\x0200b', U'\x0200f'}, // Zero Width Space ..Right-to-left Mark
{U'\x02028', U'\x0202e'}, // Line Separator ..Right-to-left Override
{U'\x02060', U'\x02063'}, // Word Joiner ..Invisible Separator
{U'\x0206a', U'\x0206f'}, // Inhibit Symmetric Swappi..Nominal Digit Shapes
{U'\x020d0', U'\x020eb'}, // Combining Left Harpoon A..Combining Long Double So
{U'\x0302a', U'\x0302f'}, // Ideographic Level Tone M..Hangul Double Dot Tone M
{U'\x03099', U'\x0309a'}, // Combining Katakana-hirag..Combining Katakana-hirag
{U'\x0a802', U'\x0a802'}, // Syloti Nagri Sign Dvisvara
{U'\x0a806', U'\x0a806'}, // Syloti Nagri Sign Hasanta
{U'\x0a80b', U'\x0a80b'}, // Syloti Nagri Sign Anusvara
{U'\x0a823', U'\x0a827'}, // Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign
{U'\x0d7b0', U'\x0d7ff'}, // Hangul Jungseong O-yeo ..(nil)
{U'\x0fb1e', U'\x0fb1e'}, // Hebrew Point Judeo-spanish Varika
{U'\x0fe00', U'\x0fe0f'}, // Variation Selector-1 ..Variation Selector-16
{U'\x0fe20', U'\x0fe23'}, // Combining Ligature Left ..Combining Double Tilde R
{U'\x0feff', U'\x0feff'}, // Zero Width No-break Space
{U'\x0fff9', U'\x0fffb'}, // Interlinear Annotation A..Interlinear Annotation T
{U'\x10a01', U'\x10a03'}, // Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo
{U'\x10a05', U'\x10a06'}, // Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O
{U'\x10a0c', U'\x10a0f'}, // Kharoshthi Vowel Length ..Kharoshthi Sign Visarga
{U'\x10a38', U'\x10a3a'}, // Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo
{U'\x10a3f', U'\x10a3f'}, // Kharoshthi Virama
{U'\x1d165', U'\x1d169'}, // Musical Symbol Combining..Musical Symbol Combining
{U'\x1d16d', U'\x1d182'}, // Musical Symbol Combining..Musical Symbol Combining
{U'\x1d185', U'\x1d18b'}, // Musical Symbol Combining..Musical Symbol Combining
{U'\x1d1aa', U'\x1d1ad'}, // Musical Symbol Combining..Musical Symbol Combining
{U'\x1d242', U'\x1d244'}, // Combining Greek Musical ..Combining Greek Musical
{U'\xe0001', U'\xe0001'}, // Language Tag
{U'\xe0020', U'\xe007f'}, // Tag Space ..Cancel Tag
{U'\xe0100', U'\xe01ef'}, // Variation Selector-17 ..Variation Selector-256
};
/// Code point ranges which wcwidth() reports as occupying two columns (width 2).
/// Each entry is an inclusive (first, last) pair and entries are listed in ascending order,
/// which the binary search in bisearch() relies on.
static const BoundaryVector WIDE_EAST_ASIAN{
{U'\x01100', U'\x01159'}, // Hangul Choseong Kiyeok ..Hangul Choseong Yeorinhi
{U'\x0115f', U'\x0115f'}, // Hangul Choseong Filler
{U'\x02329', U'\x0232a'}, // Left-pointing Angle Brac..Right-pointing Angle Bra
{U'\x02e80', U'\x02e99'}, // Cjk Radical Repeat ..Cjk Radical Rap
{U'\x02e9b', U'\x02ef3'}, // Cjk Radical Choke ..Cjk Radical C-simplified
{U'\x02f00', U'\x02fd5'}, // Kangxi Radical One ..Kangxi Radical Flute
{U'\x02ff0', U'\x02ffb'}, // Ideographic Description ..Ideographic Description
{U'\x03000', U'\x03029'}, // Ideographic Space ..Hangzhou Numeral Nine
{U'\x03030', U'\x0303e'}, // Wavy Dash ..Ideographic Variation In
{U'\x03041', U'\x03096'}, // Hiragana Letter Small A ..Hiragana Letter Small Ke
{U'\x0309b', U'\x030ff'}, // Katakana-hiragana Voiced..Katakana Digraph Koto
{U'\x03105', U'\x0312c'}, // Bopomofo Letter B ..Bopomofo Letter Gn
{U'\x03131', U'\x0318e'}, // Hangul Letter Kiyeok ..Hangul Letter Araeae
{U'\x03190', U'\x031b7'}, // Ideographic Annotation L..Bopomofo Final Letter H
{U'\x031c0', U'\x031cf'}, // Cjk Stroke T ..Cjk Stroke N
{U'\x031f0', U'\x0321e'}, // Katakana Letter Small Ku..Parenthesized Korean Cha
{U'\x03220', U'\x03243'}, // Parenthesized Ideograph ..Parenthesized Ideograph
{U'\x03250', U'\x032fe'}, // Partnership Sign ..Circled Katakana Wo
{U'\x03300', U'\x04db5'}, // Square Apaato ..Cjk Unified Ideograph-4d
{U'\x04e00', U'\x09fbb'}, // Cjk Unified Ideograph-4e..Cjk Unified Ideograph-9f
{U'\x0a000', U'\x0a48c'}, // Yi Syllable It ..Yi Syllable Yyr
{U'\x0a490', U'\x0a4c6'}, // Yi Radical Qot ..Yi Radical Ke
{U'\x0ac00', U'\x0d7a3'}, // Hangul Syllable Ga ..Hangul Syllable Hih
{U'\x0f900', U'\x0fa2d'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
{U'\x0fa30', U'\x0fa6a'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
{U'\x0fa70', U'\x0fad9'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
{U'\x0fe10', U'\x0fe19'}, // Presentation Form For Ve..Presentation Form For Ve
{U'\x0fe30', U'\x0fe52'}, // Presentation Form For Ve..Small Full Stop
{U'\x0fe54', U'\x0fe66'}, // Small Semicolon ..Small Equals Sign
{U'\x0fe68', U'\x0fe6b'}, // Small Reverse Solidus ..Small Commercial At
{U'\x0ff01', U'\x0ff60'}, // Fullwidth Exclamation Ma..Fullwidth Right White Pa
{U'\x0ffe0', U'\x0ffe6'}, // Fullwidth Cent Sign ..Fullwidth Won Sign
{U'\x20000', U'\x2fffd'}, // Cjk Unified Ideograph-20..(nil)
{U'\x30000', U'\x3fffd'}, // Cjk Unified Ideograph-30..(nil)
};
/// Code point ranges looked up by wcswidth() when a VS16 (U+FE0F) char is met:
/// a preceding char found in this table contributes one extra column.
/// Each entry is an inclusive (first, last) pair and entries are listed in ascending order,
/// which the binary search in bisearch() relies on.
static const BoundaryVector VS16_NARROW_TO_WIDE{
{U'\x00023', U'\x00023'}, // Number Sign
{U'\x0002a', U'\x0002a'}, // Asterisk
{U'\x00030', U'\x00039'}, // Digit Zero ..Digit Nine
{U'\x000a9', U'\x000a9'}, // Copyright Sign
{U'\x000ae', U'\x000ae'}, // Registered Sign
{U'\x0203c', U'\x0203c'}, // Double Exclamation Mark
{U'\x02049', U'\x02049'}, // Exclamation Question Mark
{U'\x02122', U'\x02122'}, // Trade Mark Sign
{U'\x02139', U'\x02139'}, // Information Source
{U'\x02194', U'\x02199'}, // Left Right Arrow ..South West Arrow
{U'\x021a9', U'\x021aa'}, // Leftwards Arrow With Hoo..Rightwards Arrow With Ho
{U'\x02328', U'\x02328'}, // Keyboard
{U'\x023cf', U'\x023cf'}, // Eject Symbol
{U'\x023ed', U'\x023ef'}, // Black Right-pointing Dou..Black Right-pointing Tri
{U'\x023f1', U'\x023f2'}, // Stopwatch ..Timer Clock
{U'\x023f8', U'\x023fa'}, // Double Vertical Bar ..Black Circle For Record
{U'\x024c2', U'\x024c2'}, // Circled Latin Capital Letter M
{U'\x025aa', U'\x025ab'}, // Black Small Square ..White Small Square
{U'\x025b6', U'\x025b6'}, // Black Right-pointing Triangle
{U'\x025c0', U'\x025c0'}, // Black Left-pointing Triangle
{U'\x025fb', U'\x025fc'}, // White Medium Square ..Black Medium Square
{U'\x02600', U'\x02604'}, // Black Sun With Rays ..Comet
{U'\x0260e', U'\x0260e'}, // Black Telephone
{U'\x02611', U'\x02611'}, // Ballot Box With Check
{U'\x02618', U'\x02618'}, // Shamrock
{U'\x0261d', U'\x0261d'}, // White Up Pointing Index
{U'\x02620', U'\x02620'}, // Skull And Crossbones
{U'\x02622', U'\x02623'}, // Radioactive Sign ..Biohazard Sign
{U'\x02626', U'\x02626'}, // Orthodox Cross
{U'\x0262a', U'\x0262a'}, // Star And Crescent
{U'\x0262e', U'\x0262f'}, // Peace Symbol ..Yin Yang
{U'\x02638', U'\x0263a'}, // Wheel Of Dharma ..White Smiling Face
{U'\x02640', U'\x02640'}, // Female Sign
{U'\x02642', U'\x02642'}, // Male Sign
{U'\x0265f', U'\x02660'}, // Black Chess Pawn ..Black Spade Suit
{U'\x02663', U'\x02663'}, // Black Club Suit
{U'\x02665', U'\x02666'}, // Black Heart Suit ..Black Diamond Suit
{U'\x02668', U'\x02668'}, // Hot Springs
{U'\x0267b', U'\x0267b'}, // Black Universal Recycling Symbol
{U'\x0267e', U'\x0267e'}, // Permanent Paper Sign
{U'\x02692', U'\x02692'}, // Hammer And Pick
{U'\x02694', U'\x02697'}, // Crossed Swords ..Alembic
{U'\x02699', U'\x02699'}, // Gear
{U'\x0269b', U'\x0269c'}, // Atom Symbol ..Fleur-de-lis
{U'\x026a0', U'\x026a0'}, // Warning Sign
{U'\x026a7', U'\x026a7'}, // Male With Stroke And Male And Female Sign
{U'\x026b0', U'\x026b1'}, // Coffin ..Funeral Urn
{U'\x026c8', U'\x026c8'}, // Thunder Cloud And Rain
{U'\x026cf', U'\x026cf'}, // Pick
{U'\x026d1', U'\x026d1'}, // Helmet With White Cross
{U'\x026d3', U'\x026d3'}, // Chains
{U'\x026e9', U'\x026e9'}, // Shinto Shrine
{U'\x026f0', U'\x026f1'}, // Mountain ..Umbrella On Ground
{U'\x026f4', U'\x026f4'}, // Ferry
{U'\x026f7', U'\x026f9'}, // Skier ..Person With Ball
{U'\x02702', U'\x02702'}, // Black Scissors
{U'\x02708', U'\x02709'}, // Airplane ..Envelope
{U'\x0270c', U'\x0270d'}, // Victory Hand ..Writing Hand
{U'\x0270f', U'\x0270f'}, // Pencil
{U'\x02712', U'\x02712'}, // Black Nib
{U'\x02714', U'\x02714'}, // Heavy Check Mark
{U'\x02716', U'\x02716'}, // Heavy Multiplication X
{U'\x0271d', U'\x0271d'}, // Latin Cross
{U'\x02721', U'\x02721'}, // Star Of David
{U'\x02733', U'\x02734'}, // Eight Spoked Asterisk ..Eight Pointed Black Star
{U'\x02744', U'\x02744'}, // Snowflake
{U'\x02747', U'\x02747'}, // Sparkle
{U'\x02763', U'\x02764'}, // Heavy Heart Exclamation ..Heavy Black Heart
{U'\x027a1', U'\x027a1'}, // Black Rightwards Arrow
{U'\x02934', U'\x02935'}, // Arrow Pointing Rightward..Arrow Pointing Rightward
{U'\x02b05', U'\x02b07'}, // Leftwards Black Arrow ..Downwards Black Arrow
{U'\x1f170', U'\x1f171'}, // Negative Squared Latin C..Negative Squared Latin C
{U'\x1f17e', U'\x1f17f'}, // Negative Squared Latin C..Negative Squared Latin C
{U'\x1f321', U'\x1f321'}, // Thermometer
{U'\x1f324', U'\x1f32c'}, // White Sun With Small Clo..Wind Blowing Face
{U'\x1f336', U'\x1f336'}, // Hot Pepper
{U'\x1f37d', U'\x1f37d'}, // Fork And Knife With Plate
{U'\x1f396', U'\x1f397'}, // Military Medal ..Reminder Ribbon
{U'\x1f399', U'\x1f39b'}, // Studio Microphone ..Control Knobs
{U'\x1f39e', U'\x1f39f'}, // Film Frames ..Admission Tickets
{U'\x1f3cb', U'\x1f3ce'}, // Weight Lifter ..Racing Car
{U'\x1f3d4', U'\x1f3df'}, // Snow Capped Mountain ..Stadium
{U'\x1f3f3', U'\x1f3f3'}, // Waving White Flag
{U'\x1f3f5', U'\x1f3f5'}, // Rosette
{U'\x1f3f7', U'\x1f3f7'}, // Label
{U'\x1f43f', U'\x1f43f'}, // Chipmunk
{U'\x1f441', U'\x1f441'}, // Eye
{U'\x1f4fd', U'\x1f4fd'}, // Film Projector
{U'\x1f549', U'\x1f54a'}, // Om Symbol ..Dove Of Peace
{U'\x1f56f', U'\x1f570'}, // Candle ..Mantelpiece Clock
{U'\x1f573', U'\x1f579'}, // Hole ..Joystick
{U'\x1f587', U'\x1f587'}, // Linked Paperclips
{U'\x1f58a', U'\x1f58d'}, // Lower Left Ballpoint Pen..Lower Left Crayon
{U'\x1f590', U'\x1f590'}, // Raised Hand With Fingers Splayed
{U'\x1f5a5', U'\x1f5a5'}, // Desktop Computer
{U'\x1f5a8', U'\x1f5a8'}, // Printer
{U'\x1f5b1', U'\x1f5b2'}, // Three Button Mouse ..Trackball
{U'\x1f5bc', U'\x1f5bc'}, // Frame With Picture
{U'\x1f5c2', U'\x1f5c4'}, // Card Index Dividers ..File Cabinet
{U'\x1f5d1', U'\x1f5d3'}, // Wastebasket ..Spiral Calendar Pad
{U'\x1f5dc', U'\x1f5de'}, // Compression ..Rolled-up Newspaper
{U'\x1f5e1', U'\x1f5e1'}, // Dagger Knife
{U'\x1f5e3', U'\x1f5e3'}, // Speaking Head In Silhouette
{U'\x1f5e8', U'\x1f5e8'}, // Left Speech Bubble
{U'\x1f5ef', U'\x1f5ef'}, // Right Anger Bubble
{U'\x1f5f3', U'\x1f5f3'}, // Ballot Box With Ballot
{U'\x1f5fa', U'\x1f5fa'}, // World Map
{U'\x1f6cb', U'\x1f6cb'}, // Couch And Lamp
{U'\x1f6cd', U'\x1f6cf'}, // Shopping Bags ..Bed
{U'\x1f6e0', U'\x1f6e5'}, // Hammer And Wrench ..Motor Boat
{U'\x1f6e9', U'\x1f6e9'}, // Small Airplane
{U'\x1f6f0', U'\x1f6f0'}, // Satellite
{U'\x1f6f3', U'\x1f6f3'}, // Passenger Ship
};
// clang-format on
static size_t bisearch(char32_t ucs, const BoundaryVector& table) {
// TODO: Use STD algorithm to optimize this function
// YYC MARK:
// Do not change this "int" to "size_t" casually,
// because the result of arithmetic operation may be negative.
// Do not change this type before using new algorithm.
int lbound = 0, ubound = table.size() - 1;
if (ucs < table.front().first || ucs > table.back().second) return 0;
while (ubound >= lbound) {
int mid = (lbound + ubound) / 2;
if (ucs > table[mid].second) lbound = mid + 1;
else if (ucs < table[mid].first) ubound = mid - 1;
else return 1;
}
return 0;
}
/**
 * @brief Measure how many terminal columns a single code point occupies.
 * @param[in] wc The code point to measure.
 * @return 0 for C0/C1 control chars and for chars listed in ZERO_WIDTH,
 * 2 for chars listed in WIDE_EAST_ASIAN, otherwise 1.
 */
size_t wcwidth(char32_t wc) {
    // TODO: Add lru_cache(maxsize=1000) for this function
    // Fast path for printable ASCII
    if (U'\x20' <= wc && wc < U'\x7F') [[likely]]
        return 1;
    // C0/C1 control char (NUL is excluded here; the ZERO_WIDTH lookup below covers it).
    // NOTE: Not vanilla implementation. Return 0 instead of -1,
    // so that the return value never becomes negative.
    if ((wc != U'\0' && wc < U'\x20') || (U'\x7F' <= wc && wc < U'\xA0')) return 0;
    // Zero-width char
    if (bisearch(wc, ZERO_WIDTH) != 0) return 0;
    // Width 1 or 2: bisearch yields 1 for wide chars, 0 otherwise
    return 1 + bisearch(wc, WIDE_EAST_ASIAN);
}
/// The state of the scanner used by wcswidth() while it walks through a string.
enum class WcswidthState {
/// Normal character.
/// Each char is either measured or recognised as one of the special control chars.
Normal,
/// Under ZWJ (U+200D) control char.
/// The next char is consumed without adding any width.
ZeroWidthJoiner,
/// Under ANSI Escape Sequence: an ESC (U+001B) was just seen.
/// The next char decides whether this is a CSI sequence or a two-char escape.
AnsiEscape,
/// Under CSI control sequence ("ESC ["), a part of ANSI Escape Sequence.
/// No width is accumulated until the terminal (final) char has been consumed.
AnsiCsiEscape,
};
/// The mutable scanning context carried through one wcswidth() call.
struct WcswidthContext {
/// Current state.
WcswidthState state;
/// Intended to track the last measured char which has non-zero width.
/// It is consulted, then cleared, when a VS16 (U+FE0F) char is met.
std::optional<char32_t> last_measured_char;
};
/**
 * @brief Measure how many terminal columns a UTF-32 string occupies.
 * @details
 * Besides summing up the width of each char, this function understands:
 * \li ZWJ (U+200D): the char following it adds no width.
 * \li VS16 (U+FE0F): the last measured char gains one extra column
 * if it is listed in VS16_NARROW_TO_WIDE.
 * \li ANSI Escape Sequence (ESC ...), including CSI sequence (ESC [ ...): they add no width.
 *
 * An escape sequence which is still unfinished when the string ends is not reported as an error.
 * @param[in] rhs The string to measure.
 * @return The total width, or Error::BadAnsiEscSeq / Error::BadCsiSeq
 * if an unexpected char is met inside an escape sequence.
 */
Result<size_t> wcswidth(const std::u32string_view& rhs) {
    WcswidthContext ctx{WcswidthState::Normal, std::nullopt};
    size_t width = 0;
    for (char32_t chr : rhs) {
        // Dispatch on the current scanner state
        switch (ctx.state) {
            case WcswidthState::Normal: {
                switch (chr) {
                    case U'\x200D': {
                        // ZWJ control char
                        ctx.state = WcswidthState::ZeroWidthJoiner;
                        break;
                    }
                    case U'\xFE0F': {
                        // VS16 control char
                        // If we have a char which was acknowledged and has width,
                        // analyse it instead of this control char.
                        if (ctx.last_measured_char.has_value()) {
                            width += bisearch(ctx.last_measured_char.value(), VS16_NARROW_TO_WIDE);
                            // One char can only be widened once
                            ctx.last_measured_char = std::nullopt;
                        }
                        break;
                    }
                    case U'\x1B': {
                        // ANSI escape sequence
                        ctx.state = WcswidthState::AnsiEscape;
                        break;
                    }
                    default: {
                        // Fetch width for normal char
                        const size_t wcw = wcwidth(chr);
                        // Track the final non-zero-width char for VS16 control char.
                        // The char itself must be stored (not its width),
                        // because VS16 looks this char up in VS16_NARROW_TO_WIDE.
                        if (wcw > 0) ctx.last_measured_char = chr;
                        // Accumulate width
                        width += wcw;
                        break;
                    }
                }
                break;
            }
            case WcswidthState::ZeroWidthJoiner: {
                // Eat this char and back to normal state.
                // This is what ZWJ does.
                ctx.state = WcswidthState::Normal;
                break;
            }
            case WcswidthState::AnsiEscape: {
                // Check the second char of escape sequence.
                // If it is '[', we enter CSI state,
                // otherwise we eat it and back to normal state.
                // Additionally, there is a range requirement for this char (0x40-0x5F).
                if (chr == U'[') {
                    ctx.state = WcswidthState::AnsiCsiEscape;
                } else if (chr >= U'\x40' && chr <= U'\x5F') {
                    ctx.state = WcswidthState::Normal;
                } else {
                    return std::unexpected(Error::BadAnsiEscSeq);
                }
                break;
            }
            case WcswidthState::AnsiCsiEscape: {
                // CSI sequence consists of any number of Parameter Chars (0x30-0x3F)
                // and Middle Chars (0x20-0x2F), both possibly absent,
                // followed by exactly one Final Char (0x40-0x7E).
                // So we eat all chars until we reach the terminal char.
                if (chr >= U'\x40' && chr <= U'\x7E') {
                    // Final Char. Back to normal state.
                    ctx.state = WcswidthState::Normal;
                } else if (chr >= U'\x30' && chr <= U'\x3F') {
                    ; // Parameter Char. Do nothing
                } else if (chr >= U'\x20' && chr <= U'\x2F') {
                    ; // Middle Char. Do nothing
                } else {
                    return std::unexpected(Error::BadCsiSeq);
                }
                break;
            }
        }
    }
    return width;
}
/**
 * @brief Measure how many terminal columns a UTF-8 string occupies.
 * @details The string is converted to UTF-32 and handed to the UTF-32 overload.
 * @param[in] rhs The string to measure.
 * @return The total width, Error::BadEncoding if the conversion to UTF-32 fails,
 * or any error reported by the UTF-32 overload.
 */
Result<size_t> wcswidth(const std::u8string_view& rhs) {
    if (auto decoded = ENC::to_utf32(rhs); decoded.has_value()) {
        // Conversion succeeded: measure the UTF-32 form
        return wcswidth(decoded.value());
    } else {
        // Conversion failed
        return std::unexpected(Error::BadEncoding);
    }
}
} // namespace yycc::carton::wcwidth

View File

@ -0,0 +1,47 @@
#pragma once
#include <string_view>
#include <expected>
/**
* @brief The namespace replicating the "wcswidth" function, commonly found on Linux, on all platforms.
* @details
* "wcswidth" is not part of the ISO C/C++ standard library,
* so it is not available on every platform (for example, Windows).
* This function can fetch how many columns the given string occupies in a terminal.
* This is an essential and useful function in our library.
* So I create this namespace to make "wcswidth" available on all platforms.
*
* "wcswidth" is based on \c wchar_t. In Linux, \c wchar_t is 4 bytes long.
* It can represent any character without surrogate pair.
* However, in Windows, \c wchar_t is 2 bytes long.
* There may be surrogate pairs within a \c wchar_t string, which is inconvenient for our programming.
* So in this homebrew namespace, I forcibly use \c char32_t as the basic char type.
*
* Due to the requirements of mine, this implementation is slightly different from the original one.
* These differences are listed below:
*
* \li We do not return negative value for Control Char in "wcwidth",
* because we need to support the analysis of ANSI Escape Sequence.
* \li Due to the previous change, the type of return value of "wcwidth" and "wcswidth"
* is changed from \c int to \c size_t because there is no negative return value.
* \li "wcswidth" now supports ANSI Escape Sequence (e.g. terminal color).
* So it can analyse colorful output with correct width.
*/
namespace yycc::carton::wcwidth {
/// @brief Error occurs in this module
enum class Error {
BadEncoding, ///< Given string can not be converted to UTF-32.
BadAnsiEscSeq, ///< Bad char when processing ANSI Escape Sequence
BadCsiSeq, ///< Bad char when processing CSI Sequence.
};
/// @brief Result type for this module
template<typename T>
using Result = std::expected<T, Error>;
/**
* @brief Measure how many terminal columns a single code point occupies.
* @param[in] wc The code point to measure.
* @return 0 for control chars and zero-width chars, 2 for wide East Asian chars, otherwise 1.
*/
size_t wcwidth(char32_t wc);
/**
* @brief Measure how many terminal columns a UTF-32 string occupies.
* @param[in] rhs The string to measure. It may contain ANSI Escape Sequences.
* @return The total width, or an error if a bad char is found inside an escape sequence.
*/
Result<size_t> wcswidth(const std::u32string_view& rhs);
/**
* @brief Measure how many terminal columns a UTF-8 string occupies.
* @param[in] rhs The string to measure. It may contain ANSI Escape Sequences.
* @return The total width, or an error if the string can not be converted to UTF-32
* or a bad char is found inside an escape sequence.
*/
Result<size_t> wcswidth(const std::u8string_view& rhs);
} // namespace yycc::carton::wcwidth

View File

@ -28,6 +28,7 @@ PRIVATE
yycc/windows/winfct.cpp
yycc/carton/pycodec.cpp
yycc/carton/wcwidth.cpp
)
target_sources(YYCCTestbench
PRIVATE

View File

@ -0,0 +1,52 @@
#include <gtest/gtest.h>
#include <yycc.hpp>
#include <yycc/carton/wcwidth.hpp>
#define WCWDITH ::yycc::carton::wcwidth
namespace yycctest::carton::wcwidth {

    /// Assert that measuring the string literal succeeds and yields the expected width.
#define TEST_SUCCESS(strl, len) \
    { \
        auto rv = WCWDITH::wcswidth(strl); \
        ASSERT_TRUE(rv.has_value()); \
        EXPECT_EQ(rv.value(), len); \
    }

    /// Assert that measuring the string literal reports an error.
#define TEST_FAIL(strl) \
    { \
        auto rv = WCWDITH::wcswidth(strl); \
        EXPECT_FALSE(rv.has_value()); \
    }

    TEST(CartonWcwdith, BadAnsi) {
        // '?' is not a valid char right after ESC
        TEST_FAIL(u8"\033?");
    }

    TEST(CartonWcwdith, BadCsi) {
        // TAB is not a valid char inside a CSI sequence
        TEST_FAIL(u8"\033[\t");
    }

    TEST(CartonWcwdith, English) {
        TEST_SUCCESS(u8"abc", 3);
    }

    TEST(CartonWcwdith, Chinese) {
        TEST_SUCCESS(u8"中文", 4);
        TEST_SUCCESS(u8"中a文", 5);
    }

    TEST(CartonWcwdith, Japanese) {
        // Hiragana: 5 wide chars
        TEST_SUCCESS(u8"ありがとう", 10);
        // Full-width katakana: 5 wide chars
        TEST_SUCCESS(u8"アリガトウ", 10);
        // Half-width katakana: 6 narrow chars (the voiced sound mark is a char of its own).
        // Written with universal character names so that text normalization
        // can not silently turn them into full-width katakana.
        TEST_SUCCESS(u8"\uFF71\uFF98\uFF76\uFF9E\uFF84\uFF73", 6);
    }

    TEST(CartonWcwdith, Termcolor) {
        // TODO: Fix this after finish "termcolor".
        // assert_eq!(wcswidth(&colored("abc", Color::Red, Default::default(), Default::default())), 3);
        // assert_eq!(wcswidth(&colored("中文", Color::Red, Default::default(), Default::default())), 4);
        // assert_eq!(wcswidth(&colored("ありがとう", Color::Red, Default::default(), Default::default())), 10);
    }

} // namespace yycctest::carton::wcwidth