From 2f11ba6023fb0f6c4c1afb52236ea817ae073cd5 Mon Sep 17 00:00:00 2001 From: yyc12345 Date: Fri, 15 Aug 2025 16:42:28 +0800 Subject: [PATCH] feat: add new package wcwidth - add wcwidth in carton. - order clang-format do not format some generated content. --- src/CMakeLists.txt | 2 + src/yycc/carton/pycodec.cpp | 104 ++++--- src/yycc/carton/wcwidth.cpp | 478 ++++++++++++++++++++++++++++++ src/yycc/carton/wcwidth.hpp | 47 +++ testbench/CMakeLists.txt | 1 + testbench/yycc/carton/wcwidth.cpp | 52 ++++ 6 files changed, 651 insertions(+), 33 deletions(-) create mode 100644 src/yycc/carton/wcwidth.cpp create mode 100644 src/yycc/carton/wcwidth.hpp create mode 100644 testbench/yycc/carton/wcwidth.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e7e169d..40138f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,6 +24,7 @@ PRIVATE yycc/encoding/iconv.cpp yycc/carton/pycodec.cpp + yycc/carton/wcwidth.cpp ) target_sources(YYCCommonplace PUBLIC @@ -67,6 +68,7 @@ FILES yycc/encoding/iconv.hpp yycc/carton/pycodec.hpp + yycc/carton/wcwidth.hpp ) # Setup header infomations target_include_directories(YYCCommonplace diff --git a/src/yycc/carton/pycodec.cpp b/src/yycc/carton/pycodec.cpp index 3216bcc..244abca 100644 --- a/src/yycc/carton/pycodec.cpp +++ b/src/yycc/carton/pycodec.cpp @@ -19,6 +19,7 @@ namespace yycc::carton::pycodec { #pragma region Encoding Name + // clang-format off static const std::map ALIAS_MAP{ {u8"646"sv, u8"ascii"sv}, {u8"us-ascii"sv, u8"ascii"sv}, @@ -224,6 +225,7 @@ namespace yycc::carton::pycodec { {u8"utf-8"sv, u8"utf_8"sv}, {u8"cp65001"sv, u8"utf_8"sv}, }; + // clang-format on /** * @brief Resolve encoding name alias and fetch real encoding name. 
@@ -248,40 +250,74 @@ namespace yycc::carton::pycodec {
 
     using CodePage = YYCC_PYCODEC_BACKEND_NS::CodePage;
 
-    static const std::map<std::u8string_view, CodePage> WINCP_MAP{
-        {u8"ascii"sv, static_cast<CodePage>(437u)}, {u8"big5"sv, static_cast<CodePage>(950u)},
-        {u8"cp037"sv, static_cast<CodePage>(037u)}, {u8"cp437"sv, static_cast<CodePage>(437u)},
-        {u8"cp500"sv, static_cast<CodePage>(500u)}, {u8"cp720"sv, static_cast<CodePage>(720u)},
-        {u8"cp737"sv, static_cast<CodePage>(737u)}, {u8"cp775"sv, static_cast<CodePage>(775u)},
-        {u8"cp850"sv, static_cast<CodePage>(850u)}, {u8"cp852"sv, static_cast<CodePage>(852u)},
-        {u8"cp855"sv, static_cast<CodePage>(855u)}, {u8"cp857"sv, static_cast<CodePage>(857u)},
-        {u8"cp858"sv, static_cast<CodePage>(858u)}, {u8"cp860"sv, static_cast<CodePage>(860u)},
-        {u8"cp861"sv, static_cast<CodePage>(861u)}, {u8"cp862"sv, static_cast<CodePage>(862u)},
-        {u8"cp863"sv, static_cast<CodePage>(863u)}, {u8"cp864"sv, static_cast<CodePage>(864u)},
-        {u8"cp865"sv, static_cast<CodePage>(865u)}, {u8"cp866"sv, static_cast<CodePage>(866u)},
-        {u8"cp869"sv, static_cast<CodePage>(869u)}, {u8"cp874"sv, static_cast<CodePage>(874u)},
-        {u8"cp875"sv, static_cast<CodePage>(875u)}, {u8"cp932"sv, static_cast<CodePage>(932u)},
-        {u8"cp949"sv, static_cast<CodePage>(949u)}, {u8"cp950"sv, static_cast<CodePage>(950u)},
-        {u8"cp1026"sv, static_cast<CodePage>(1026u)}, {u8"cp1140"sv, static_cast<CodePage>(1140u)},
-        {u8"cp1250"sv, static_cast<CodePage>(1250u)}, {u8"cp1251"sv, static_cast<CodePage>(1251u)},
-        {u8"cp1252"sv, static_cast<CodePage>(1252u)}, {u8"cp1253"sv, static_cast<CodePage>(1253u)},
-        {u8"cp1254"sv, static_cast<CodePage>(1254u)}, {u8"cp1255"sv, static_cast<CodePage>(1255u)},
-        {u8"cp1256"sv, static_cast<CodePage>(1256u)}, {u8"cp1257"sv, static_cast<CodePage>(1257u)},
-        {u8"cp1258"sv, static_cast<CodePage>(1258u)}, {u8"euc_jp"sv, static_cast<CodePage>(20932u)},
-        {u8"euc_kr"sv, static_cast<CodePage>(51949u)}, {u8"gb2312"sv, static_cast<CodePage>(936u)},
-        {u8"gbk"sv, static_cast<CodePage>(936u)}, {u8"gb18030"sv, static_cast<CodePage>(54936u)},
-        {u8"hz"sv, static_cast<CodePage>(52936u)}, {u8"iso2022_jp"sv, static_cast<CodePage>(50220u)},
-        {u8"iso2022_kr"sv, static_cast<CodePage>(50225u)}, {u8"latin_1"sv, static_cast<CodePage>(28591u)},
-        {u8"iso8859_2"sv, static_cast<CodePage>(28592u)}, {u8"iso8859_3"sv, static_cast<CodePage>(28593u)},
-        {u8"iso8859_4"sv, static_cast<CodePage>(28594u)}, {u8"iso8859_5"sv, static_cast<CodePage>(28595u)},
-        {u8"iso8859_6"sv, static_cast<CodePage>(28596u)}, {u8"iso8859_7"sv, static_cast<CodePage>(28597u)},
-        {u8"iso8859_8"sv, static_cast<CodePage>(28598u)}, {u8"iso8859_9"sv, static_cast<CodePage>(28599u)},
-        {u8"iso8859_13"sv, static_cast<CodePage>(28603u)}, {u8"iso8859_15"sv, static_cast<CodePage>(28605u)},
-        {u8"johab"sv, static_cast<CodePage>(1361u)}, {u8"mac_cyrillic"sv, static_cast<CodePage>(10007u)},
-        {u8"mac_greek"sv, static_cast<CodePage>(10006u)}, {u8"mac_iceland"sv, static_cast<CodePage>(10079u)},
-        {u8"mac_turkish"sv, static_cast<CodePage>(10081u)}, {u8"shift_jis"sv, static_cast<CodePage>(932u)},
-        {u8"utf_7"sv, static_cast<CodePage>(65000u)}, {u8"utf_8"sv, static_cast<CodePage>(65001u)},
+    // clang-format off
+    static const std::map<std::u8string_view, CodePage> WINCP_MAP { // Resolved encoding name -> code page number
+        { u8"ascii"sv, static_cast<CodePage>(437u) },
+        { u8"big5"sv, static_cast<CodePage>(950u) },
+        { u8"cp037"sv, static_cast<CodePage>(37u) }, // NOT "037u": a leading zero makes an octal literal (31)
+        { u8"cp437"sv, static_cast<CodePage>(437u) },
+        { u8"cp500"sv, static_cast<CodePage>(500u) },
+        { u8"cp720"sv, static_cast<CodePage>(720u) },
+        { u8"cp737"sv, static_cast<CodePage>(737u) },
+        { u8"cp775"sv, static_cast<CodePage>(775u) },
+        { u8"cp850"sv, static_cast<CodePage>(850u) },
+        { u8"cp852"sv, static_cast<CodePage>(852u) },
+        { u8"cp855"sv, static_cast<CodePage>(855u) },
+        { u8"cp857"sv, static_cast<CodePage>(857u) },
+        { u8"cp858"sv, static_cast<CodePage>(858u) },
+        { u8"cp860"sv, static_cast<CodePage>(860u) },
+        { u8"cp861"sv, static_cast<CodePage>(861u) },
+        { u8"cp862"sv, static_cast<CodePage>(862u) },
+        { u8"cp863"sv, static_cast<CodePage>(863u) },
+        { u8"cp864"sv, static_cast<CodePage>(864u) },
+        { u8"cp865"sv, static_cast<CodePage>(865u) },
+        { u8"cp866"sv, static_cast<CodePage>(866u) },
+        { u8"cp869"sv, static_cast<CodePage>(869u) },
+        { u8"cp874"sv, static_cast<CodePage>(874u) },
+        { u8"cp875"sv, static_cast<CodePage>(875u) },
+        { u8"cp932"sv, static_cast<CodePage>(932u) },
+        { u8"cp949"sv, static_cast<CodePage>(949u) },
+        { u8"cp950"sv, static_cast<CodePage>(950u) },
+        { u8"cp1026"sv, static_cast<CodePage>(1026u) },
+        { u8"cp1140"sv, static_cast<CodePage>(1140u) },
+        { u8"cp1250"sv, static_cast<CodePage>(1250u) },
+        { u8"cp1251"sv, static_cast<CodePage>(1251u) },
+        { u8"cp1252"sv, static_cast<CodePage>(1252u) },
+        { u8"cp1253"sv, static_cast<CodePage>(1253u) },
+        { u8"cp1254"sv, static_cast<CodePage>(1254u) },
+        { u8"cp1255"sv, static_cast<CodePage>(1255u) },
+        { u8"cp1256"sv, static_cast<CodePage>(1256u) },
+        { u8"cp1257"sv, static_cast<CodePage>(1257u) },
+        { u8"cp1258"sv, static_cast<CodePage>(1258u) },
+        { u8"euc_jp"sv, static_cast<CodePage>(20932u) },
+        { u8"euc_kr"sv, static_cast<CodePage>(51949u) },
+        { u8"gb2312"sv, static_cast<CodePage>(936u) },
+        { u8"gbk"sv, static_cast<CodePage>(936u) },
+        { u8"gb18030"sv, static_cast<CodePage>(54936u) },
+        { u8"hz"sv, static_cast<CodePage>(52936u) },
+        { u8"iso2022_jp"sv, static_cast<CodePage>(50220u) },
+        { u8"iso2022_kr"sv, static_cast<CodePage>(50225u) },
+        { u8"latin_1"sv, static_cast<CodePage>(28591u) },
+        { u8"iso8859_2"sv, static_cast<CodePage>(28592u) },
+        { u8"iso8859_3"sv, static_cast<CodePage>(28593u) },
+        { u8"iso8859_4"sv, static_cast<CodePage>(28594u) },
+        { u8"iso8859_5"sv, static_cast<CodePage>(28595u) },
+        { u8"iso8859_6"sv, static_cast<CodePage>(28596u) },
+        { u8"iso8859_7"sv, static_cast<CodePage>(28597u) },
+        { u8"iso8859_8"sv, static_cast<CodePage>(28598u) },
+        { u8"iso8859_9"sv, static_cast<CodePage>(28599u) },
+        { u8"iso8859_13"sv, static_cast<CodePage>(28603u) },
+        { u8"iso8859_15"sv, static_cast<CodePage>(28605u) },
+        { u8"johab"sv, static_cast<CodePage>(1361u) },
+        { u8"mac_cyrillic"sv, static_cast<CodePage>(10007u) },
+        { u8"mac_greek"sv, static_cast<CodePage>(10006u) },
+        { u8"mac_iceland"sv, static_cast<CodePage>(10079u) },
+        { u8"mac_turkish"sv, static_cast<CodePage>(10081u) },
+        { u8"shift_jis"sv, static_cast<CodePage>(932u) },
+        { u8"utf_7"sv, static_cast<CodePage>(65000u) },
+        { u8"utf_8"sv, static_cast<CodePage>(65001u) },
     };
+    // clang-format on
 
     static FetchResult fetch_code_page(const std::u8string_view& enc_name) {
         // resolve alias
@@ -296,6 +332,7 @@ namespace yycc::carton::pycodec {
 
 #else
 
+    // clang-format off
     static const std::map<std::u8string_view, std::string_view> ICONV_MAP{
         {u8"ascii"sv, "ASCII"sv},
         {u8"big5"sv, "BIG5"sv},
@@ -359,6 +396,7 @@ namespace yycc::carton::pycodec {
         {u8"utf_7"sv, "UTF-7"sv},
         {u8"utf_8"sv, "UTF-8"sv},
     };
+    // clang-format on
 
     static FetchResult fetch_iconv_name(const std::u8string_view& enc_name) {
         // resolve alias
diff --git a/src/yycc/carton/wcwidth.cpp b/src/yycc/carton/wcwidth.cpp
new file mode 100644
index 0000000..05188f0
--- /dev/null
+++ b/src/yycc/carton/wcwidth.cpp
@@ -0,0 +1,478 @@
+#include "wcwidth.hpp"
+#include "../encoding/stl.hpp"
+#include <optional>
+#include <utility>
+#include <vector>
+
+#define ENC ::yycc::encoding::stl
+
+namespace yycc::carton::wcwidth {
+
+    using Boundary = std::pair<char32_t, char32_t>; ///< Inclusive code point range: (first, last).
+    using BoundaryVector = std::vector<Boundary>;   ///< Table of ranges looked up by "bisearch".
+
+    // YYC MARK:
+    // Following tables and code are copied from Python package "wcwidth",
+    // although the code of that package is also copied from the original "wcwidth" C implementation.
+    //
+    // I do not need so much exact measurement.
+    // I just want an "it works" wcwidth on all platforms.
+    // So these tables come from the table with the lowest UNICODE version
+    // (original package provides different tables for different UNICODE versions).
+
+    // clang-format off
+    static const BoundaryVector ZERO_WIDTH{ // Code points occupying no column
+        {U'\x00000', U'\x00000'}, // (nil)
+        {U'\x000ad', U'\x000ad'}, // Soft Hyphen
+        {U'\x00300', U'\x0036f'}, // Combining Grave Accent ..Combining Latin Small Le
+        {U'\x00483', U'\x00486'}, // Combining Cyrillic Titlo..Combining Cyrillic Psili
+        {U'\x00488', U'\x00489'}, // Combining Cyrillic Hundr..Combining Cyrillic Milli
+        {U'\x00591', U'\x005b9'}, // Hebrew Accent Etnahta ..Hebrew Point Holam
+        {U'\x005bb', U'\x005bd'}, // Hebrew Point Qubuts ..Hebrew Point Meteg
+        {U'\x005bf', U'\x005bf'}, // Hebrew Point Rafe
+        {U'\x005c1', U'\x005c2'}, // Hebrew Point Shin Dot ..Hebrew Point Sin Dot
+        {U'\x005c4', U'\x005c5'}, // Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot
+        {U'\x005c7', U'\x005c7'}, // Hebrew Point Qamats Qatan
+        {U'\x00600', U'\x00603'}, // Arabic Number Sign ..Arabic Sign Safha
+        {U'\x00610', U'\x00615'}, // Arabic Sign Sallallahou ..Arabic Small High Tah
+        {U'\x0064b', U'\x0065e'}, // Arabic Fathatan ..Arabic Fatha With Two Do
+        {U'\x00670', U'\x00670'}, // Arabic Letter Superscript Alef
+        {U'\x006d6', U'\x006e4'}, // Arabic Small High Ligatu..Arabic Small High Madda
+        {U'\x006e7', U'\x006e8'}, // Arabic Small High Yeh ..Arabic Small High Noon
+        {U'\x006ea', U'\x006ed'}, // Arabic Empty Centre Low ..Arabic Small Low Meem
+        {U'\x0070f', U'\x0070f'}, // Syriac Abbreviation Mark
+        {U'\x00711', U'\x00711'}, // Syriac Letter Superscript Alaph
+        {U'\x00730', U'\x0074a'}, // Syriac Pthaha Above ..Syriac Barrekh
+        {U'\x007a6', U'\x007b0'}, // Thaana Abafili ..Thaana Sukun
+        {U'\x00901', U'\x00903'}, // Devanagari Sign Candrabi..Devanagari Sign Visarga
+        {U'\x0093c', U'\x0093c'}, // Devanagari Sign Nukta
+        {U'\x0093e', U'\x0094d'}, // Devanagari Vowel Sign Aa..Devanagari Sign Virama
+        {U'\x00951', U'\x00954'}, // Devanagari Stress Sign U..Devanagari Acute Accent
+        {U'\x00962', U'\x00963'}, // Devanagari Vowel Sign Vo..Devanagari Vowel Sign Vo
+        {U'\x00981', U'\x00983'}, // Bengali Sign Candrabindu..Bengali Sign Visarga
+        {U'\x009bc', U'\x009bc'}, // Bengali Sign Nukta
+        {U'\x009be', U'\x009c4'}, // Bengali Vowel Sign Aa ..Bengali Vowel Sign Vocal
+        {U'\x009c7', U'\x009c8'}, // Bengali Vowel Sign E ..Bengali Vowel Sign Ai
+        {U'\x009cb', U'\x009cd'}, // Bengali Vowel Sign O ..Bengali Sign Virama
+        {U'\x009d7', U'\x009d7'}, // Bengali Au Length Mark
+        {U'\x009e2', U'\x009e3'}, // Bengali Vowel Sign Vocal..Bengali Vowel Sign Vocal
+        {U'\x00a01', U'\x00a03'}, // Gurmukhi Sign Adak Bindi..Gurmukhi Sign Visarga
+        {U'\x00a3c', U'\x00a3c'}, // Gurmukhi Sign Nukta
+        {U'\x00a3e', U'\x00a42'}, // Gurmukhi Vowel Sign Aa ..Gurmukhi Vowel Sign Uu
+        {U'\x00a47', U'\x00a48'}, // Gurmukhi Vowel Sign Ee ..Gurmukhi Vowel Sign Ai
+        {U'\x00a4b', U'\x00a4d'}, // Gurmukhi Vowel Sign Oo ..Gurmukhi Sign Virama
+        {U'\x00a70', U'\x00a71'}, // Gurmukhi Tippi ..Gurmukhi Addak
+        {U'\x00a81', U'\x00a83'}, // Gujarati Sign Candrabind..Gujarati Sign Visarga
+        {U'\x00abc', U'\x00abc'}, // Gujarati Sign Nukta
+        {U'\x00abe', U'\x00ac5'}, // Gujarati Vowel Sign Aa ..Gujarati Vowel Sign Cand
+        {U'\x00ac7', U'\x00ac9'}, // Gujarati Vowel Sign E ..Gujarati Vowel Sign Cand
+        {U'\x00acb', U'\x00acd'}, // Gujarati Vowel Sign O ..Gujarati Sign Virama
+        {U'\x00ae2', U'\x00ae3'}, // Gujarati Vowel Sign Voca..Gujarati Vowel Sign Voca
+        {U'\x00b01', U'\x00b03'}, // Oriya Sign Candrabindu ..Oriya Sign Visarga
+        {U'\x00b3c', U'\x00b3c'}, // Oriya Sign Nukta
+        {U'\x00b3e', U'\x00b43'}, // Oriya Vowel Sign Aa ..Oriya Vowel Sign Vocalic
+        {U'\x00b47', U'\x00b48'}, // Oriya Vowel Sign E ..Oriya Vowel Sign Ai
+        {U'\x00b4b', U'\x00b4d'}, // Oriya Vowel Sign O ..Oriya Sign Virama
+        {U'\x00b56', U'\x00b57'}, // Oriya Ai Length Mark ..Oriya Au Length Mark
+        {U'\x00b82', U'\x00b82'}, // Tamil Sign Anusvara
+        {U'\x00bbe', U'\x00bc2'}, // Tamil Vowel Sign Aa ..Tamil Vowel Sign Uu
+        {U'\x00bc6', U'\x00bc8'}, // Tamil Vowel Sign E ..Tamil Vowel Sign Ai
+        {U'\x00bca', U'\x00bcd'}, // Tamil Vowel Sign O ..Tamil Sign Virama
+        {U'\x00bd7', U'\x00bd7'}, // Tamil Au Length Mark
+        {U'\x00c01', U'\x00c03'}, // Telugu Sign Candrabindu ..Telugu Sign Visarga
+        {U'\x00c3e', U'\x00c44'}, // Telugu Vowel Sign Aa ..Telugu Vowel Sign Vocali
+        {U'\x00c46', U'\x00c48'}, // Telugu Vowel Sign E ..Telugu Vowel Sign Ai
+        {U'\x00c4a', U'\x00c4d'}, // Telugu Vowel Sign O ..Telugu Sign Virama
+        {U'\x00c55', U'\x00c56'}, // Telugu Length Mark ..Telugu Ai Length Mark
+        {U'\x00c82', U'\x00c83'}, // Kannada Sign Anusvara ..Kannada Sign Visarga
+        {U'\x00cbc', U'\x00cbc'}, // Kannada Sign Nukta
+        {U'\x00cbe', U'\x00cc4'}, // Kannada Vowel Sign Aa ..Kannada Vowel Sign Vocal
+        {U'\x00cc6', U'\x00cc8'}, // Kannada Vowel Sign E ..Kannada Vowel Sign Ai
+        {U'\x00cca', U'\x00ccd'}, // Kannada Vowel Sign O ..Kannada Sign Virama
+        {U'\x00cd5', U'\x00cd6'}, // Kannada Length Mark ..Kannada Ai Length Mark
+        {U'\x00d02', U'\x00d03'}, // Malayalam Sign Anusvara ..Malayalam Sign Visarga
+        {U'\x00d3e', U'\x00d43'}, // Malayalam Vowel Sign Aa ..Malayalam Vowel Sign Voc
+        {U'\x00d46', U'\x00d48'}, // Malayalam Vowel Sign E ..Malayalam Vowel Sign Ai
+        {U'\x00d4a', U'\x00d4d'}, // Malayalam Vowel Sign O ..Malayalam Sign Virama
+        {U'\x00d57', U'\x00d57'}, // Malayalam Au Length Mark
+        {U'\x00d82', U'\x00d83'}, // Sinhala Sign Anusvaraya ..Sinhala Sign Visargaya
+        {U'\x00dca', U'\x00dca'}, // Sinhala Sign Al-lakuna
+        {U'\x00dcf', U'\x00dd4'}, // Sinhala Vowel Sign Aela-..Sinhala Vowel Sign Ketti
+        {U'\x00dd6', U'\x00dd6'}, // Sinhala Vowel Sign Diga Paa-pilla
+        {U'\x00dd8', U'\x00ddf'}, // Sinhala Vowel Sign Gaett..Sinhala Vowel Sign Gayan
+        {U'\x00df2', U'\x00df3'}, // Sinhala Vowel Sign Diga ..Sinhala Vowel Sign Diga
+        {U'\x00e31', U'\x00e31'}, // Thai Character Mai Han-akat
+        {U'\x00e34', U'\x00e3a'}, // Thai Character Sara I ..Thai Character Phinthu
+        {U'\x00e47', U'\x00e4e'}, // Thai Character Maitaikhu..Thai Character Yamakkan
+        {U'\x00eb1', U'\x00eb1'}, // Lao Vowel Sign Mai Kan
+        {U'\x00eb4', U'\x00eb9'}, // Lao Vowel Sign I ..Lao Vowel Sign Uu
+        {U'\x00ebb', U'\x00ebc'}, // Lao Vowel Sign Mai Kon ..Lao Semivowel Sign Lo
+        {U'\x00ec8', U'\x00ecd'}, // Lao Tone Mai Ek ..Lao Niggahita
+        {U'\x00f18', U'\x00f19'}, // Tibetan Astrological Sig..Tibetan Astrological Sig
+        {U'\x00f35', U'\x00f35'}, // Tibetan Mark Ngas Bzung Nyi Zla
+        {U'\x00f37', U'\x00f37'}, // Tibetan Mark Ngas Bzung Sgor Rtags
+        {U'\x00f39', U'\x00f39'}, // Tibetan Mark Tsa -phru
+        {U'\x00f3e', U'\x00f3f'}, // Tibetan Sign Yar Tshes ..Tibetan Sign Mar Tshes
+        {U'\x00f71', U'\x00f84'}, // Tibetan Vowel Sign Aa ..Tibetan Mark Halanta
+        {U'\x00f86', U'\x00f87'}, // Tibetan Sign Lci Rtags ..Tibetan Sign Yang Rtags
+        {U'\x00f90', U'\x00f97'}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter
+        {U'\x00f99', U'\x00fbc'}, // Tibetan Subjoined Letter..Tibetan Subjoined Letter
+        {U'\x00fc6', U'\x00fc6'}, // Tibetan Symbol Padma Gdan
+        {U'\x0102c', U'\x01032'}, // Myanmar Vowel Sign Aa ..Myanmar Vowel Sign Ai
+        {U'\x01036', U'\x01039'}, // Myanmar Sign Anusvara ..Myanmar Sign Virama
+        {U'\x01056', U'\x01059'}, // Myanmar Vowel Sign Vocal..Myanmar Vowel Sign Vocal
+        {U'\x01160', U'\x011ff'}, // Hangul Jungseong Filler ..Hangul Jongseong Ssangni
+        {U'\x0135f', U'\x0135f'}, // Ethiopic Combining Gemination Mark
+        {U'\x01712', U'\x01714'}, // Tagalog Vowel Sign I ..Tagalog Sign Virama
+        {U'\x01732', U'\x01734'}, // Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod
+        {U'\x01752', U'\x01753'}, // Buhid Vowel Sign I ..Buhid Vowel Sign U
+        {U'\x01772', U'\x01773'}, // Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U
+        {U'\x017b4', U'\x017d3'}, // Khmer Vowel Inherent Aq ..Khmer Sign Bathamasat
+        {U'\x017dd', U'\x017dd'}, // Khmer Sign Atthacan
+        {U'\x0180b', U'\x0180d'}, // Mongolian Free Variation..Mongolian Free Variation
+        {U'\x018a9', U'\x018a9'}, // Mongolian Letter Ali Gali Dagalga
+        {U'\x01920', U'\x0192b'}, // Limbu Vowel Sign A ..Limbu Subjoined Letter W
+        {U'\x01930', U'\x0193b'}, // Limbu Small Letter Ka ..Limbu Sign Sa-i
+        {U'\x019b0', U'\x019c0'}, // New Tai Lue Vowel Sign V..New Tai Lue Vowel Sign I
+        {U'\x019c8', U'\x019c9'}, // New Tai Lue Tone Mark-1 ..New Tai Lue Tone Mark-2
+        {U'\x01a17', U'\x01a1b'}, // Buginese Vowel Sign I ..Buginese Vowel Sign Ae
+        {U'\x01dc0', U'\x01dc3'}, // Combining Dotted Grave A..Combining Suspension Mar
+        {U'\x0200b', U'\x0200f'}, // Zero Width Space ..Right-to-left Mark
+        {U'\x02028', U'\x0202e'}, // Line Separator ..Right-to-left Override
+        {U'\x02060', U'\x02063'}, // Word Joiner ..Invisible Separator
+        {U'\x0206a', U'\x0206f'}, // Inhibit Symmetric Swappi..Nominal Digit Shapes
+        {U'\x020d0', U'\x020eb'}, // Combining Left Harpoon A..Combining Long Double So
+        {U'\x0302a', U'\x0302f'}, // Ideographic Level Tone M..Hangul Double Dot Tone M
+        {U'\x03099', U'\x0309a'}, // Combining Katakana-hirag..Combining Katakana-hirag
+        {U'\x0a802', U'\x0a802'}, // Syloti Nagri Sign Dvisvara
+        {U'\x0a806', U'\x0a806'}, // Syloti Nagri Sign Hasanta
+        {U'\x0a80b', U'\x0a80b'}, // Syloti Nagri Sign Anusvara
+        {U'\x0a823', U'\x0a827'}, // Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign
+        {U'\x0d7b0', U'\x0d7ff'}, // Hangul Jungseong O-yeo ..(nil)
+        {U'\x0fb1e', U'\x0fb1e'}, // Hebrew Point Judeo-spanish Varika
+        {U'\x0fe00', U'\x0fe0f'}, // Variation Selector-1 ..Variation Selector-16
+        {U'\x0fe20', U'\x0fe23'}, // Combining Ligature Left ..Combining Double Tilde R
+        {U'\x0feff', U'\x0feff'}, // Zero Width No-break Space
+        {U'\x0fff9', U'\x0fffb'}, // Interlinear Annotation A..Interlinear Annotation T
+        {U'\x10a01', U'\x10a03'}, // Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo
+        {U'\x10a05', U'\x10a06'}, // Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O
+        {U'\x10a0c', U'\x10a0f'}, // Kharoshthi Vowel Length ..Kharoshthi Sign Visarga
+        {U'\x10a38', U'\x10a3a'}, // Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo
+        {U'\x10a3f', U'\x10a3f'}, // Kharoshthi Virama
+        {U'\x1d165', U'\x1d169'}, // Musical Symbol Combining..Musical Symbol Combining
+        {U'\x1d16d', U'\x1d182'}, // Musical Symbol Combining..Musical Symbol Combining
+        {U'\x1d185', U'\x1d18b'}, // Musical Symbol Combining..Musical Symbol Combining
+        {U'\x1d1aa', U'\x1d1ad'}, // Musical Symbol Combining..Musical Symbol Combining
+        {U'\x1d242', U'\x1d244'}, // Combining Greek Musical ..Combining Greek Musical
+        {U'\xe0001', U'\xe0001'}, // Language Tag
+        {U'\xe0020', U'\xe007f'}, // Tag Space ..Cancel Tag
+        {U'\xe0100', U'\xe01ef'}, // Variation Selector-17 ..Variation Selector-256
+    };
+
+    static const BoundaryVector WIDE_EAST_ASIAN{ // Code points occupying two columns
+        {U'\x01100', U'\x01159'}, // Hangul Choseong Kiyeok ..Hangul Choseong Yeorinhi
+        {U'\x0115f', U'\x0115f'}, // Hangul Choseong Filler
+        {U'\x02329', U'\x0232a'}, // Left-pointing Angle Brac..Right-pointing Angle Bra
+        {U'\x02e80', U'\x02e99'}, // Cjk Radical Repeat ..Cjk Radical Rap
+        {U'\x02e9b', U'\x02ef3'}, // Cjk Radical Choke ..Cjk Radical C-simplified
+        {U'\x02f00', U'\x02fd5'}, // Kangxi Radical One ..Kangxi Radical Flute
+        {U'\x02ff0', U'\x02ffb'}, // Ideographic Description ..Ideographic Description
+        {U'\x03000', U'\x03029'}, // Ideographic Space ..Hangzhou Numeral Nine
+        {U'\x03030', U'\x0303e'}, // Wavy Dash ..Ideographic Variation In
+        {U'\x03041', U'\x03096'}, // Hiragana Letter Small A ..Hiragana Letter Small Ke
+        {U'\x0309b', U'\x030ff'}, // Katakana-hiragana Voiced..Katakana Digraph Koto
+        {U'\x03105', U'\x0312c'}, // Bopomofo Letter B ..Bopomofo Letter Gn
+        {U'\x03131', U'\x0318e'}, // Hangul Letter Kiyeok ..Hangul Letter Araeae
+        {U'\x03190', U'\x031b7'}, // Ideographic Annotation L..Bopomofo Final Letter H
+        {U'\x031c0', U'\x031cf'}, // Cjk Stroke T ..Cjk Stroke N
+        {U'\x031f0', U'\x0321e'}, // Katakana Letter Small Ku..Parenthesized Korean Cha
+        {U'\x03220', U'\x03243'}, // Parenthesized Ideograph ..Parenthesized Ideograph
+        {U'\x03250', U'\x032fe'}, // Partnership Sign ..Circled Katakana Wo
+        {U'\x03300', U'\x04db5'}, // Square Apaato ..Cjk Unified Ideograph-4d
+        {U'\x04e00', U'\x09fbb'}, // Cjk Unified Ideograph-4e..Cjk Unified Ideograph-9f
+        {U'\x0a000', U'\x0a48c'}, // Yi Syllable It ..Yi Syllable Yyr
+        {U'\x0a490', U'\x0a4c6'}, // Yi Radical Qot ..Yi Radical Ke
+        {U'\x0ac00', U'\x0d7a3'}, // Hangul Syllable Ga ..Hangul Syllable Hih
+        {U'\x0f900', U'\x0fa2d'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
+        {U'\x0fa30', U'\x0fa6a'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
+        {U'\x0fa70', U'\x0fad9'}, // Cjk Compatibility Ideogr..Cjk Compatibility Ideogr
+        {U'\x0fe10', U'\x0fe19'}, // Presentation Form For Ve..Presentation Form For Ve
+        {U'\x0fe30', U'\x0fe52'}, // Presentation Form For Ve..Small Full Stop
+        {U'\x0fe54', U'\x0fe66'}, // Small Semicolon ..Small Equals Sign
+        {U'\x0fe68', U'\x0fe6b'}, // Small Reverse Solidus ..Small Commercial At
+        {U'\x0ff01', U'\x0ff60'}, // Fullwidth Exclamation Ma..Fullwidth Right White Pa
+        {U'\x0ffe0', U'\x0ffe6'}, // Fullwidth Cent Sign ..Fullwidth Won Sign
+        {U'\x20000', U'\x2fffd'}, // Cjk Unified Ideograph-20..(nil)
+        {U'\x30000', U'\x3fffd'}, // Cjk Unified Ideograph-30..(nil)
+    };
+
+    static const BoundaryVector VS16_NARROW_TO_WIDE{ // Narrow code points widened by a following VS16 (U+FE0F)
+        {U'\x00023', U'\x00023'}, // Number Sign
+        {U'\x0002a', U'\x0002a'}, // Asterisk
+        {U'\x00030', U'\x00039'}, // Digit Zero ..Digit Nine
+        {U'\x000a9', U'\x000a9'}, // Copyright Sign
+        {U'\x000ae', U'\x000ae'}, // Registered Sign
+        {U'\x0203c', U'\x0203c'}, // Double Exclamation Mark
+        {U'\x02049', U'\x02049'}, // Exclamation Question Mark
+        {U'\x02122', U'\x02122'}, // Trade Mark Sign
+        {U'\x02139', U'\x02139'}, // Information Source
+        {U'\x02194', U'\x02199'}, // Left Right Arrow ..South West Arrow
+        {U'\x021a9', U'\x021aa'}, // Leftwards Arrow With Hoo..Rightwards Arrow With Ho
+        {U'\x02328', U'\x02328'}, // Keyboard
+        {U'\x023cf', U'\x023cf'}, // Eject Symbol
+        {U'\x023ed', U'\x023ef'}, // Black Right-pointing Dou..Black Right-pointing Tri
+        {U'\x023f1', U'\x023f2'}, // Stopwatch ..Timer Clock
+        {U'\x023f8', U'\x023fa'}, // Double Vertical Bar ..Black Circle For Record
+        {U'\x024c2', U'\x024c2'}, // Circled Latin Capital Letter M
+        {U'\x025aa', U'\x025ab'}, // Black Small Square ..White Small Square
+        {U'\x025b6', U'\x025b6'}, // Black Right-pointing Triangle
+        {U'\x025c0', U'\x025c0'}, // Black Left-pointing Triangle
+        {U'\x025fb', U'\x025fc'}, // White Medium Square ..Black Medium Square
+        {U'\x02600', U'\x02604'}, // Black Sun With Rays ..Comet
+        {U'\x0260e', U'\x0260e'}, // Black Telephone
+        {U'\x02611', U'\x02611'}, // Ballot Box With Check
+        {U'\x02618', U'\x02618'}, // Shamrock
+        {U'\x0261d', U'\x0261d'}, // White Up Pointing Index
+        {U'\x02620', U'\x02620'}, // Skull And Crossbones
+        {U'\x02622', U'\x02623'}, // Radioactive Sign ..Biohazard Sign
+        {U'\x02626', U'\x02626'}, // Orthodox Cross
+        {U'\x0262a', U'\x0262a'}, // Star And Crescent
+        {U'\x0262e', U'\x0262f'}, // Peace Symbol ..Yin Yang
+        {U'\x02638', U'\x0263a'}, // Wheel Of Dharma ..White Smiling Face
+        {U'\x02640', U'\x02640'}, // Female Sign
+        {U'\x02642', U'\x02642'}, // Male Sign
+        {U'\x0265f', U'\x02660'}, // Black Chess Pawn ..Black Spade Suit
+        {U'\x02663', U'\x02663'}, // Black Club Suit
+        {U'\x02665', U'\x02666'}, // Black Heart Suit ..Black Diamond Suit
+        {U'\x02668', U'\x02668'}, // Hot Springs
+        {U'\x0267b', U'\x0267b'}, // Black Universal Recycling Symbol
+        {U'\x0267e', U'\x0267e'}, // Permanent Paper Sign
+        {U'\x02692', U'\x02692'}, // Hammer And Pick
+        {U'\x02694', U'\x02697'}, // Crossed Swords ..Alembic
+        {U'\x02699', U'\x02699'}, // Gear
+        {U'\x0269b', U'\x0269c'}, // Atom Symbol ..Fleur-de-lis
+        {U'\x026a0', U'\x026a0'}, // Warning Sign
+        {U'\x026a7', U'\x026a7'}, // Male With Stroke And Male And Female Sign
+        {U'\x026b0', U'\x026b1'}, // Coffin ..Funeral Urn
+        {U'\x026c8', U'\x026c8'}, // Thunder Cloud And Rain
+        {U'\x026cf', U'\x026cf'}, // Pick
+        {U'\x026d1', U'\x026d1'}, // Helmet With White Cross
+        {U'\x026d3', U'\x026d3'}, // Chains
+        {U'\x026e9', U'\x026e9'}, // Shinto Shrine
+        {U'\x026f0', U'\x026f1'}, // Mountain ..Umbrella On Ground
+        {U'\x026f4', U'\x026f4'}, // Ferry
+        {U'\x026f7', U'\x026f9'}, // Skier ..Person With Ball
+        {U'\x02702', U'\x02702'}, // Black Scissors
+        {U'\x02708', U'\x02709'}, // Airplane ..Envelope
+        {U'\x0270c', U'\x0270d'}, // Victory Hand ..Writing Hand
+        {U'\x0270f', U'\x0270f'}, // Pencil
+        {U'\x02712', U'\x02712'}, // Black Nib
+        {U'\x02714', U'\x02714'}, // Heavy Check Mark
+        {U'\x02716', U'\x02716'}, // Heavy Multiplication X
+        {U'\x0271d', U'\x0271d'}, // Latin Cross
+        {U'\x02721', U'\x02721'}, // Star Of David
+        {U'\x02733', U'\x02734'}, // Eight Spoked Asterisk ..Eight Pointed Black Star
+        {U'\x02744', U'\x02744'}, // Snowflake
+        {U'\x02747', U'\x02747'}, // Sparkle
+        {U'\x02763', U'\x02764'}, // Heavy Heart Exclamation ..Heavy Black Heart
+        {U'\x027a1', U'\x027a1'}, // Black Rightwards Arrow
+        {U'\x02934', U'\x02935'}, // Arrow Pointing Rightward..Arrow Pointing Rightward
+        {U'\x02b05', U'\x02b07'}, // Leftwards Black Arrow ..Downwards Black Arrow
+        {U'\x1f170', U'\x1f171'}, // Negative Squared Latin C..Negative Squared Latin C
+        {U'\x1f17e', U'\x1f17f'}, // Negative Squared Latin C..Negative Squared Latin C
+        {U'\x1f321', U'\x1f321'}, // Thermometer
+        {U'\x1f324', U'\x1f32c'}, // White Sun With Small Clo..Wind Blowing Face
+        {U'\x1f336', U'\x1f336'}, // Hot Pepper
+        {U'\x1f37d', U'\x1f37d'}, // Fork And Knife With Plate
+        {U'\x1f396', U'\x1f397'}, // Military Medal ..Reminder Ribbon
+        {U'\x1f399', U'\x1f39b'}, // Studio Microphone ..Control Knobs
+        {U'\x1f39e', U'\x1f39f'}, // Film Frames ..Admission Tickets
+        {U'\x1f3cb', U'\x1f3ce'}, // Weight Lifter ..Racing Car
+        {U'\x1f3d4', U'\x1f3df'}, // Snow Capped Mountain ..Stadium
+        {U'\x1f3f3', U'\x1f3f3'}, // Waving White Flag
+        {U'\x1f3f5', U'\x1f3f5'}, // Rosette
+        {U'\x1f3f7', U'\x1f3f7'}, // Label
+        {U'\x1f43f', U'\x1f43f'}, // Chipmunk
+        {U'\x1f441', U'\x1f441'}, // Eye
+        {U'\x1f4fd', U'\x1f4fd'}, // Film Projector
+        {U'\x1f549', U'\x1f54a'}, // Om Symbol ..Dove Of Peace
+        {U'\x1f56f', U'\x1f570'}, // Candle ..Mantelpiece Clock
+        {U'\x1f573', U'\x1f579'}, // Hole ..Joystick
+        {U'\x1f587', U'\x1f587'}, // Linked Paperclips
+        {U'\x1f58a', U'\x1f58d'}, // Lower Left Ballpoint Pen..Lower Left Crayon
+        {U'\x1f590', U'\x1f590'}, // Raised Hand With Fingers Splayed
+        {U'\x1f5a5', U'\x1f5a5'}, // Desktop Computer
+        {U'\x1f5a8', U'\x1f5a8'}, // Printer
+        {U'\x1f5b1', U'\x1f5b2'}, // Three Button Mouse ..Trackball
+        {U'\x1f5bc', U'\x1f5bc'}, // Frame With Picture
+        {U'\x1f5c2', U'\x1f5c4'}, // Card Index Dividers ..File Cabinet
+        {U'\x1f5d1', U'\x1f5d3'}, // Wastebasket ..Spiral Calendar Pad
+        {U'\x1f5dc', U'\x1f5de'}, // Compression ..Rolled-up Newspaper
+        {U'\x1f5e1', U'\x1f5e1'}, // Dagger Knife
+        {U'\x1f5e3', U'\x1f5e3'}, // Speaking Head In Silhouette
+        {U'\x1f5e8', U'\x1f5e8'}, // Left Speech Bubble
+        {U'\x1f5ef', U'\x1f5ef'}, // Right Anger Bubble
+        {U'\x1f5f3', U'\x1f5f3'}, // Ballot Box With Ballot
+        {U'\x1f5fa', U'\x1f5fa'}, // World Map
+        {U'\x1f6cb', U'\x1f6cb'}, // Couch And Lamp
+        {U'\x1f6cd', U'\x1f6cf'}, // Shopping Bags ..Bed
+        {U'\x1f6e0', U'\x1f6e5'}, // Hammer And Wrench ..Motor Boat
+        {U'\x1f6e9', U'\x1f6e9'}, // Small Airplane
+        {U'\x1f6f0', U'\x1f6f0'}, // Satellite
+        {U'\x1f6f3', U'\x1f6f3'}, // Passenger Ship
+    };
+    // clang-format on
+
+    /// @brief Check whether a code point is covered by a sorted table of inclusive ranges.
+    /// @param[in] ucs The code point to look up.
+    /// @param[in] table Ranges in ascending order without overlap.
+    /// @return 1 if \c ucs falls in one of the ranges, otherwise 0 (including empty table).
+    static size_t bisearch(char32_t ucs, const BoundaryVector& table) {
+        // TODO: Use STD algorithm to optimize this function
+        // Binary search on the half-open index range [lower, upper): "upper" never goes
+        // below "lower", so unsigned size_t is safe and an empty table just skips the loop.
+        size_t lower = 0, upper = table.size();
+        while (lower < upper) {
+            const size_t mid = lower + (upper - lower) / 2;
+            if (ucs > table[mid].second) lower = mid + 1;
+            else if (ucs < table[mid].first) upper = mid;
+            else return 1;
+        }
+        return 0;
+    }
+
+    /// @brief Get the count of terminal columns (0, 1 or 2) occupied by the given code point.
+    size_t wcwidth(char32_t wc) {
+        // TODO: Add lru_cache(maxsize=1000) for this function
+
+        // Small optimize for ASCII
+        if (U'\x20' <= wc && wc < U'\x7F') [[likely]]
+            return 1;
+
+        // C0/C1 control char (NUL is not matched here; it is covered by ZERO_WIDTH below).
+        // NOTE: Not vanilla implementation. Return 0 instead of -1.
+        if ((wc && wc < U'\x20') || (U'\x7F' <= wc && wc < U'\xA0')) return 0;
+
+        // Zero-width char
+        if (bisearch(wc, ZERO_WIDTH)) return 0;
+
+        // Width 1 or 2
+        return 1 + bisearch(wc, WIDE_EAST_ASIAN);
+    }
+
+    /// @brief The state of the scanner used by "wcswidth".
+    enum class WcswidthState {
+        /// Normal character.
+        Normal,
+        /// Under ZWJ control char.
+        /// Ignore the width of next char.
+        ZeroWidthJoiner,
+        /// Under ANSI Escape Sequence.
+        /// Following chars should be treated as escape char.
+        AnsiEscape,
+        /// Under CSI control sequence, a part of ANSI Escape Sequence.
+        /// No width was accumulated before terminal char.
+        AnsiCsiEscape,
+    };
+
+    /// @brief The data carried from char to char when scanning a string in "wcswidth".
+    struct WcswidthContext {
+        /// Current state.
+        WcswidthState state;
+        /// Track the last measured char which has non-zero width.
+        /// It will be used for VS16 char.
+        std::optional<char32_t> last_measured_char;
+    };
+
+    /// @brief Get the count of terminal columns occupied by the given UTF-32 string.
+    /// @details ZWJ hides the char after it, VS16 may widen the char before it,
+    /// and ANSI Escape Sequence (including CSI sequence) occupies no column.
+    /// @return The width, or the error if there is a malformed escape sequence.
+    Result<size_t> wcswidth(const std::u32string_view& rhs) {
+        WcswidthContext ctx{WcswidthState::Normal, std::nullopt};
+        size_t width = 0;
+
+        for (char32_t chr : rhs) {
+            switch (ctx.state) {
+                case WcswidthState::Normal: {
+                    switch (chr) {
+                        case U'\x200D': {
+                            // ZWJ control char
+                            ctx.state = WcswidthState::ZeroWidthJoiner;
+                            break;
+                        }
+                        case U'\xFE0F': {
+                            // VS16 control char. If there is a measured char with non-zero width
+                            // before it, check whether VS16 widens that char from narrow to wide.
+                            if (ctx.last_measured_char.has_value()) {
+                                width += bisearch(ctx.last_measured_char.value(), VS16_NARROW_TO_WIDE);
+                                ctx.last_measured_char = std::nullopt;
+                            }
+                            break;
+                        }
+                        case U'\x1B': {
+                            // ANSI escape sequence
+                            ctx.state = WcswidthState::AnsiEscape;
+                            break;
+                        }
+                        default: {
+                            // Fetch width for normal char
+                            const size_t wcw = wcwidth(chr);
+                            // Track the char itself (not its width) for a following VS16 control char
+                            if (wcw > 0) ctx.last_measured_char = chr;
+                            // Accumulate width
+                            width += wcw;
+                            break;
+                        }
+                    }
+                    break;
+                }
+                case WcswidthState::ZeroWidthJoiner: {
+                    // Eat this char and back to normal state. This is what ZWJ does.
+                    ctx.state = WcswidthState::Normal;
+                    break;
+                }
+                case WcswidthState::AnsiEscape: {
+                    // Check the second char of escape sequence.
+                    // If it is '[', we enter CSI state. Otherwise it must be in range 0x40-0x5F,
+                    // then we eat it and back to normal state.
+                    if (chr == U'[') {
+                        ctx.state = WcswidthState::AnsiCsiEscape;
+                    } else if (chr >= U'\x40' && chr <= U'\x5F') {
+                        ctx.state = WcswidthState::Normal;
+                    } else {
+                        return std::unexpected(Error::BadAnsiEscSeq);
+                    }
+                    break;
+                }
+                case WcswidthState::AnsiCsiEscape: {
+                    // CSI sequence consists of any count of Parameter Char (0x30-0x3F),
+                    // any count of Middle Char (0x20-0x2F) and exactly one Final Char (0x40-0x7E).
+                    // So we eat all chars until we reach the Final Char.
+                    if (chr >= U'\x40' && chr <= U'\x7E') {
+                        // Final Char. Back to normal state.
+                        ctx.state = WcswidthState::Normal;
+                    } else if (chr >= U'\x30' && chr <= U'\x3F') {
+                        ; // Parameter Char. Do nothing
+                    } else if (chr >= U'\x20' && chr <= U'\x2F') {
+                        ; // Middle Char. Do nothing
+                    } else {
+                        return std::unexpected(Error::BadCsiSeq);
+                    }
+                    break;
+                }
+            }
+        }
+
+        return width;
+    }
+
+    /// @brief Get the count of terminal columns occupied by the given UTF-8 string.
+    /// @return The width, or the error if UTF-32 conversion fails or escape sequence is malformed.
+    Result<size_t> wcswidth(const std::u8string_view& rhs) {
+        auto u32str = ENC::to_utf32(rhs);
+        if (!u32str.has_value()) return std::unexpected(Error::BadEncoding);
+        return wcswidth(u32str.value());
+    }
+
+} // namespace yycc::carton::wcwidth
diff --git a/src/yycc/carton/wcwidth.hpp b/src/yycc/carton/wcwidth.hpp
new file mode 100644
index 0000000..2b1520c
--- /dev/null
+++ b/src/yycc/carton/wcwidth.hpp
@@ -0,0 +1,47 @@
+#pragma once
+#include <string_view>
+#include <expected>
+
+/**
+ * @brief The namespace replicating POSIX function "wcswidth" on all platforms.
+ * @details
+ * "wcswidth" is specified by POSIX (as an XSI extension), not by the C or C++ standard.
+ * So it is available on Linux but missing on some platforms such as Windows.
+ * This function can fetch how many columns the given string occupies in terminal.
+ * This is an essential and useful function in our library.
+ * So I create this namespace to make "wcswidth" available on all platforms.
+ *
+ * "wcswidth" is based on \c wchar_t. In Linux, \c wchar_t is 4 bytes long.
+ * It can represent any character without surrogate pair.
+ * However, in Windows, \c wchar_t is 2 bytes long.
+ * There may be surrogate pairs within \c wchar_t string, which is inconvenient for our programming.
+ * So in this homebrew namespace, I forcibly use \c char32_t as the basic char type.
+ *
+ * Due to the requirements of mine, this implementation is slightly different from the original one.
+ * These differences are listed below:
+ *
+ * \li We do not return negative value for Control Char in "wcwidth",
+ * because we need to support the analysis of ANSI Escape Sequence.
+ * \li Due to the previous change, the type of return value of "wcwidth" and "wcswidth"
+ * is changed from \c int to \c size_t because there is no negative return value.
+ * \li "wcswidth" now supports ANSI Escape Sequence (e.g. terminal color).
+ * So it can analyse colorful output with correct space.
+ */
+namespace yycc::carton::wcwidth {
+
+    /// @brief Error occurs in this module
+    enum class Error {
+        BadEncoding,   ///< Given string can not be converted into UTF-32 string.
+        BadAnsiEscSeq, ///< Bad char when processing ANSI Escape Sequence.
+        BadCsiSeq,     ///< Bad char when processing CSI Sequence.
+    };
+
+    /// @brief Result type for this module
+    template<typename T>
+    using Result = std::expected<T, Error>;
+
+    size_t wcwidth(char32_t wc);                             ///< Column count occupied by one code point.
+    Result<size_t> wcswidth(const std::u32string_view& rhs); ///< Column count occupied by UTF-32 string.
+    Result<size_t> wcswidth(const std::u8string_view& rhs);  ///< Column count occupied by UTF-8 string.
+
+} // namespace yycc::carton::wcwidth
diff --git a/testbench/CMakeLists.txt b/testbench/CMakeLists.txt
index f4cc139..9a731b6 100644
--- a/testbench/CMakeLists.txt
+++ b/testbench/CMakeLists.txt
@@ -28,6 +28,7 @@ PRIVATE
     yycc/windows/winfct.cpp
 
     yycc/carton/pycodec.cpp
+    yycc/carton/wcwidth.cpp
 )
 target_sources(YYCCTestbench
 PRIVATE
diff --git a/testbench/yycc/carton/wcwidth.cpp b/testbench/yycc/carton/wcwidth.cpp
new file mode 100644
index 0000000..ddaeebb
--- /dev/null
+++ b/testbench/yycc/carton/wcwidth.cpp
@@ -0,0 +1,52 @@
+#include <gtest/gtest.h>
+#include <yycc.hpp>
+#include <yycc/carton/wcwidth.hpp>
+
+#define WCWIDTH ::yycc::carton::wcwidth
+
+namespace yycctest::carton::wcwidth {
+
+#define TEST_SUCCESS(strl, len) \
+    { \
+        auto rv = WCWIDTH::wcswidth(strl); \
+        ASSERT_TRUE(rv.has_value()); \
+        EXPECT_EQ(rv.value(), len); \
+    }
+
+#define TEST_FAIL(strl) \
+    { \
+        auto rv = WCWIDTH::wcswidth(strl); \
+        EXPECT_FALSE(rv.has_value()); \
+    }
+
+    TEST(CartonWcwidth, BadAnsi) {
+        TEST_FAIL(u8"\033?");
+    }
+
+    TEST(CartonWcwidth, BadCsi) {
+        TEST_FAIL(u8"\033[\t");
+    }
+
+    TEST(CartonWcwidth, English) {
+        TEST_SUCCESS(u8"abc", 3);
+    }
+
+    TEST(CartonWcwidth, Chinese) {
+        TEST_SUCCESS(u8"中文", 4);
+        TEST_SUCCESS(u8"中a文", 5);
+    }
+
+    TEST(CartonWcwidth, Japanese) {
+        TEST_SUCCESS(u8"ありがとう", 10);
+        TEST_SUCCESS(u8"アリガトウ", 10);
+        TEST_SUCCESS(u8"ｱﾘｶﾞﾄｳ", 6); // Half-width katakana: 6 code points, one column for each
+    }
+
+    TEST(CartonWcwidth, Termcolor) {
+        // TODO: Fix this after finish "termcolor".
+        // assert_eq!(wcswidth(&colored("abc", Color::Red, Default::default(), Default::default())), 3);
+        // assert_eq!(wcswidth(&colored("中文", Color::Red, Default::default(), Default::default())), 4);
+        // assert_eq!(wcswidth(&colored("ありがとう", Color::Red, Default::default(), Default::default())), 10);
+    }
+
+} // namespace yycctest::carton::wcwidth