From 190beeed5849a9849b7020dbaacdca2dbb79b24c Mon Sep 17 00:00:00 2001
From: yyc12345
Date: Fri, 26 Sep 2025 21:43:12 +0800
Subject: [PATCH] feat: add code point splitter for utf8 string.

- this feature is added for strip function in string op.
- implement strip, lstrip and rstrip on top of the code point splitter.
---
Review notes (ignored by git, not part of the commit message):
- The From header carries no e-mail address; add one before using git-am.
- Leading whitespace of the hunks was rebuilt assuming 4-space indentation;
  use `git apply --ignore-whitespace` if a context line does not match.

 src/yycc/string/op.cpp       | 190 +++++++++++++++++++++++++++++++++++
 src/yycc/string/op.hpp       |  72 +++++++++++++++-
 testbench/yycc/string/op.cpp |  22 ++++++
 3 files changed, 282 insertions(+), 2 deletions(-)

diff --git a/src/yycc/string/op.cpp b/src/yycc/string/op.cpp
index 6e1d9bd..03f3d17 100644
--- a/src/yycc/string/op.cpp
+++ b/src/yycc/string/op.cpp
@@ -166,6 +166,196 @@ namespace yycc::string::op {
 
 #pragma endregion
 
+#pragma region Strip
+
+#pragma region Code Point Iterator
+
+    class CodePointIterator {
+    public:
+        using iterator_category = std::forward_iterator_tag;
+        using value_type = std::u8string_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const std::u8string_view*;
+        using reference = const std::u8string_view&;
+
+    private:
+        std::u8string_view current_str;
+        std::u8string_view next_str;
+
+    public:
+        explicit CodePointIterator(const std::u8string_view& strl) : current_str(), next_str(strl) { ++(*this); }
+
+        reference operator*() const { return this->current_str; }
+
+        pointer operator->() const { return &this->current_str; }
+
+        CodePointIterator& operator++() {
+            // move the unprocessed string to current string and analyse it
+            current_str = next_str;
+            next_str = std::u8string_view();
+
+            // only process it if there are some chars left
+            if (!current_str.empty()) {
+                // extract the string to be checked
+                std::u8string_view strl = current_str;
+
+                // get how many bytes this code point occupies.
+                size_t bytes_to_skip = evaluate_utf8_byte_count(strl.front());
+                // if the evaluated size exceeds the remaining size of string, throw exception
+                if (bytes_to_skip > strl.size()) throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
+                // check that the following bytes start with 0b10
+                for (size_t i = 1; i < bytes_to_skip; ++i) {
+                    check_continuation_byte(strl[i]);
+                }
+
+                // Everything is okay, set current string and next string
+                current_str = strl.substr(0, bytes_to_skip);
+                next_str = strl.substr(bytes_to_skip);
+            }
+
+            // return self
+            return *this;
+        }
+
+        CodePointIterator operator++(int) {
+            CodePointIterator temp = *this;
+            ++(*this);
+            return temp;
+        }
+
+        bool operator==(const CodePointIterator& other) const {
+            return this->current_str == other.current_str && this->next_str == other.next_str;
+        }
+
+        bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
+
+    private:
+        /**
+         * @brief Calculate how many bytes a code point occupies according to the first byte of its sequence.
+         * @param[in] c The first byte of the sequence. Throws std::runtime_error if it is not a valid start byte.
+         * @return The count of bytes this code point occupies, ranging from 1 to 4 (inclusive).
+         */
+        size_t evaluate_utf8_byte_count(char8_t c) const {
+            auto byte = static_cast<unsigned char>(c);
+            if ((byte & 0x80) == 0x00) return 1; // 0xxxxxxx
+            if ((byte & 0xE0) == 0xC0) return 2; // 110xxxxx
+            if ((byte & 0xF0) == 0xE0) return 3; // 1110xxxx
+            if ((byte & 0xF8) == 0xF0) return 4; // 11110xxx
+            throw std::runtime_error("invalid utf8 sequence. bad start byte");
+        }
+        /**
+         * @brief Check whether given byte is a valid continuation byte (0b10xxxxxx) in UTF8.
+         * @param[in] c Byte for checking. Throws std::runtime_error if it is not a continuation byte.
+         */
+        void check_continuation_byte(char8_t c) const {
+            auto byte = static_cast<unsigned char>(c);
+            if ((byte & 0xC0) != 0x80) {
+                throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
+            }
+        }
+    };
+
+#pragma endregion
+
+#pragma region Code Point
+
+    class CodePoint {
+    private:
+        std::u8string_view u8str;
+
+    public:
+        explicit CodePoint(std::u8string_view u8str) : u8str(u8str) {}
+
+        CodePointIterator begin() const { return CodePointIterator(u8str); }
+
+        CodePointIterator end() const {
+            // Pass an empty string view to indicate the end.
+            return CodePointIterator(std::u8string_view());
+        }
+    };
+
+#pragma endregion
+
+    /**
+     * @brief Shared implementation of strip(), lstrip() and rstrip().
+     * @details
+     * The string is walked code point by code point (never byte by byte),
+     * so a multi-byte code point is either removed entirely or kept entirely.
+     * @tparam bDoLeft True to remove matching code points from the head of string.
+     * @tparam bDoRight True to remove matching code points from the tail of string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The set of code points to be stripped.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    template<bool bDoLeft, bool bDoRight>
+    void internal_strip(std::u8string& strl, const std::u8string_view& words) {
+        const std::u8string_view view(strl);
+        const CodePoint targets(words);
+
+        // Find the byte range [keep_begin, keep_end) spanning from the first
+        // to the last code point which is not listed in "words".
+        bool has_kept = false;
+        size_t keep_begin = 0;
+        size_t keep_end = 0;
+        for (const auto& code_point : CodePoint(view)) {
+            bool is_target = false;
+            for (const auto& target : targets) {
+                if (code_point == target) {
+                    is_target = true;
+                    break;
+                }
+            }
+            if (is_target) continue;
+
+            const auto offset = static_cast<size_t>(code_point.data() - view.data());
+            if (!has_kept) {
+                has_kept = true;
+                keep_begin = offset;
+            }
+            keep_end = offset + code_point.size();
+        }
+
+        // Every code point is listed in "words" (or the string is empty): nothing survives.
+        if (!has_kept) {
+            strl.clear();
+            return;
+        }
+
+        // Cut the tail first so that the offset of the head is still valid.
+        if constexpr (bDoRight) {
+            strl.erase(keep_end);
+        }
+        if constexpr (bDoLeft) {
+            strl.erase(0, keep_begin);
+        }
+    }
+
+    void strip(std::u8string& strl, const std::u8string_view& words) { internal_strip<true, true>(strl, words); }
+
+    std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::u8string rv(strl);
+        strip(rv, words);
+        return rv;
+    }
+
+    void lstrip(std::u8string& strl, const std::u8string_view& words) { internal_strip<true, false>(strl, words); }
+
+    std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::u8string rv(strl);
+        lstrip(rv, words);
+        return rv;
+    }
+
+    void rstrip(std::u8string& strl, const std::u8string_view& words) { internal_strip<false, true>(strl, words); }
+
+    std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::u8string rv(strl);
+        rstrip(rv, words);
+        return rv;
+    }
+
+#pragma endregion
+
 #pragma region Split
 
     // Reference:
diff --git a/src/yycc/string/op.hpp b/src/yycc/string/op.hpp
index f455153..c202e28 100644
--- a/src/yycc/string/op.hpp
+++ b/src/yycc/string/op.hpp
@@ -9,6 +9,8 @@
 
 namespace yycc::string::op {
 
+#pragma region Printf
+
     /**
      * @brief Perform an UTF8 string formatting operation.
      * @param[in] format The format string.
@@ -38,6 +40,10 @@ namespace yycc::string::op {
      */
     std::string vprintf(const char* format, va_list argptr);
 
+#pragma endregion
+
+#pragma region Replace
+
     /**
      * @brief Modify given string with all occurrences of substring \e old replaced by \e new.
      * @param[in,out] strl The string for replacing
@@ -54,6 +60,10 @@ namespace yycc::string::op {
      */
     std::u8string replace(const std::u8string_view& _strl, const std::u8string_view& _from_strl, const std::u8string_view& _to_strl);
 
+#pragma endregion
+
+#pragma region Join
+
     /**
      * @brief The data provider of general join function.
      * @details
@@ -100,6 +110,10 @@ namespace yycc::string::op {
             delimiter);
     }
 
+#pragma endregion
+
+#pragma region Lower Upper
+
     /**
      * @brief Convert given string to lowercase.
      * @param[in,out] strl The string to be lowercase.
@@ -123,8 +137,62 @@ namespace yycc::string::op {
      */
     std::u8string to_upper(const std::u8string_view& strl);
 
-    // TODO:
-    // Add strip, lstrip and rstrip functions.
+#pragma endregion
+
+#pragma region Strip
+
+    /**
+     * @brief Remove leading and trailing code points listed in \e words from the string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    void strip(std::u8string& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Return a copy of the string with leading and trailing code points listed in \e words removed.
+     * @param[in] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @return The copy of the string with leading and trailing code points listed in \e words removed.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Remove leading code points listed in \e words from the string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    void lstrip(std::u8string& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Return a copy of the string with leading code points listed in \e words removed.
+     * @param[in] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @return The copy of the string with leading code points listed in \e words removed.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Remove trailing code points listed in \e words from the string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    void rstrip(std::u8string& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Return a copy of the string with trailing code points listed in \e words removed.
+     * @param[in] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @return The copy of the string with trailing code points listed in \e words removed.
+     * @exception std::runtime_error \e strl or \e words is not a valid UTF8 string.
+     */
+    std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words);
+
+#pragma endregion
 
 #pragma region Split
 
diff --git a/testbench/yycc/string/op.cpp b/testbench/yycc/string/op.cpp
index c1102f8..3c86001 100644
--- a/testbench/yycc/string/op.cpp
+++ b/testbench/yycc/string/op.cpp
@@ -71,6 +71,28 @@ namespace yycctest::string::op {
         EXPECT_EQ(rv, u8", 1, 2, ");
     }
 
+    TEST(StringOp, Strip) {
+        // Normal strip
+        {
+            auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n");
+            EXPECT_EQ(rv, u8"aaa");
+        }
+
+        // Special strip
+        {
+            auto rv = OP::to_strip(u8"亜亜亜aaaあああ", u8"亜あ");
+            EXPECT_EQ(rv, u8"aaa");
+        }
+        {
+            auto rv = OP::to_strip(u8"亜亜亜aaaあああ", u8"亜");
+            EXPECT_EQ(rv, u8"aaaあああ");
+        }
+        {
+            auto rv = OP::to_strip(u8"亜亜亜aaaあああ", u8"あ");
+            EXPECT_EQ(rv, u8"亜亜亜aaa");
+        }
+    }
+
     TEST(StringOp, Split) {
         // Normal
         {