feat: add code point splitter for UTF-8 strings.

- this feature is added for the strip functions in string op.
2025-09-26 21:43:12 +08:00
parent 99146ddd55
commit 190beeed58
3 changed files with 231 additions and 2 deletions
--- a/src/yycc/string/op.cpp
+++ b/src/yycc/string/op.cpp
@@ -166,6 +166,151 @@ namespace yycc::string::op {

 #pragma endregion

+#pragma region Strip
+
+#pragma region Code Point Iterator
+
+    class CodePointIterator { // Iterator that walks a UTF-8 view one code point at a time, yielding each as a sub-view.
+    public:
+        using iterator_category = std::forward_iterator_tag;
+        using value_type = std::u8string_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const std::u8string_view*;
+        using reference = const std::u8string_view&;
+
+    private:
+        std::u8string_view current_str; // bytes of the code point currently pointed at (empty once exhausted)
+        std::u8string_view next_str; // remaining input after the current code point
+
+    public:
+        CodePointIterator(const std::u8string_view& strl) : current_str(), next_str(strl) { ++(*this); } // advances once to load the first code point; may throw std::runtime_error
+
+        reference operator*() const { return this->current_str; }
+
+        pointer operator->() const { return &this->current_str; }
+
+        CodePointIterator& operator++() {
+            // Take the unprocessed remainder as the working string and analyse it.
+            current_str = next_str;
+            next_str = std::u8string_view();
+
+            // Only process when there are bytes left; otherwise both views stay empty (the end state).
+            if (!current_str.empty()) {
+                // The string to be checked.
+                std::u8string_view strl = current_str;
+
+                // Get how many bytes this code point occupies (throws on an invalid first byte).
+                size_t bytes_to_skip = evaluate_utf8_byte_count(strl.front());
+                // If the evaluated size exceeds the remaining size of the string, throw an exception.
+                if (bytes_to_skip > strl.size()) throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
+                // Check that all following bytes of this code point start with 0b10.
+                for (size_t i = 1; i < bytes_to_skip; ++i) {
+                    check_continuation_byte(strl[i]);
+                }
+
+                // Everything is okay: split into the current code point and the remainder.
+                current_str = strl.substr(0, bytes_to_skip);
+                next_str = strl.substr(bytes_to_skip);
+            }
+
+            // Return self.
+            return *this;
+        }
+
+        CodePointIterator operator++(int) {
+            CodePointIterator temp = *this; // keep the pre-increment state to return
+            ++(*this);
+            return temp;
+        }
+
+        bool operator==(const CodePointIterator& other) const { // equality compares the contents of both views
+            return this->current_str == other.current_str && this->next_str == other.next_str;
+        }
+
+        bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
+
+    private:
+        /**
+         * @brief Calculate how many bytes a code point occupies, judging only from the first byte of its sequence.
+         * @param[in] c The first byte of the sequence.
+         * @return The byte count of the code point, from 1 to 4 (inclusive). Throws std::runtime_error if \c c matches no first-byte pattern.
+         */
+        size_t evaluate_utf8_byte_count(char8_t c) const {
+            auto byte = static_cast<uint8_t>(c);
+            if ((byte & 0x80) == 0x00) return 1; // 0xxxxxxx
+            if ((byte & 0xE0) == 0xC0) return 2; // 110xxxxx (NOTE: overlong lead bytes 0xC0/0xC1 are not rejected)
+            if ((byte & 0xF0) == 0xE0) return 3; // 1110xxxx
+            if ((byte & 0xF8) == 0xF0) return 4; // 11110xxx (NOTE: 0xF5..0xF7 are not rejected)
+            throw std::runtime_error("invalid utf8 sequence. bad start byte"); // reached for 10xxxxxx and 11111xxx
+        }
+        /**
+         * @brief Check that the given byte is a UTF-8 continuation byte (10xxxxxx); throws std::runtime_error otherwise.
+         * @param[in] c The byte to check.
+         */
+        void check_continuation_byte(char8_t c) const {
+            auto byte = static_cast<uint8_t>(c);
+            if ((byte & 0xC0) != 0x80) { // top two bits must be exactly 10
+                throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
+            }
+        }
+    };
+
+#pragma endregion
+
+#pragma region Code Point
+
+    class CodePoint { // Wraps a UTF-8 view and exposes begin()/end() so it can be iterated code point by code point.
+    private:
+        std::u8string_view u8str; // the viewed string; this class does not own the underlying bytes
+
+    public:
+        explicit CodePoint(std::u8string_view u8str) : u8str(u8str) {}
+
+        CodePointIterator begin() const { return CodePointIterator(u8str); } // constructing the iterator validates the first code point and may throw
+
+        CodePointIterator end() const {
+            // An iterator built from an empty view has both of its views empty, which is the state of an exhausted iterator.
+            return CodePointIterator(std::u8string_view());
+        }
+    };
+
+#pragma endregion
+
+    template<bool bDoLeft, bool bDoRight> // presumably select whether the left and/or right side is stripped — TODO confirm once implemented
+    void internal_strip(std::u8string& strl, const std::u8string_view& words) {
+        if constexpr (bDoLeft) { // TODO: placeholder, no left-side logic yet
+        }
+
+        if constexpr (bDoRight) { // TODO: placeholder, no right-side logic yet
+        }
+    }
+
+    void strip(std::u8string& strl, const std::u8string_view& words) {} // TODO: empty stub — leaves strl unchanged for now
+
+    std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::u8string rv(strl); // copy the input into an owned string
+        strip(rv, words); // delegate to the in-place variant
+        return rv;
+    }
+
+    void lstrip(std::u8string& strl, const std::u8string_view& words) {} // TODO: empty stub — leaves strl unchanged for now
+
+    std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::u8string rv(strl); // copy the input into an owned string
+        lstrip(rv, words); // delegate to the in-place variant
+        return rv;
+    }
+
+    void rstrip(std::u8string& strl, const std::u8string_view& words) {} // TODO: empty stub — leaves strl unchanged for now
+
+    std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::u8string rv(strl); // copy the input into an owned string
+        rstrip(rv, words); // delegate to the in-place variant
+        return rv;
+    }
+
+#pragma endregion
+
 #pragma region Split

    // Reference:
--- a/src/yycc/string/op.hpp
+++ b/src/yycc/string/op.hpp
@@ -9,6 +9,8 @@

 namespace yycc::string::op {

+#pragma region Printf
+
    /**
     * @brief Perform an UTF8 string formatting operation.
     * @param[in] format The format string.
@@ -38,6 +40,10 @@ namespace yycc::string::op {
    */
    std::string vprintf(const char* format, va_list argptr);

+#pragma endregion
+
+#pragma region Replace
+
    /**
 	 * @brief Modify given string with all occurrences of substring \e old replaced by \e new.
 	 * @param[in,out] strl The string for replacing
@@ -54,6 +60,10 @@ namespace yycc::string::op {
 	*/
    std::u8string replace(const std::u8string_view& _strl, const std::u8string_view& _from_strl, const std::u8string_view& _to_strl);

+#pragma endregion
+
+#pragma region Join
+
    /**
 	 * @brief The data provider of general join function.
 	 * @details
@@ -100,6 +110,10 @@ namespace yycc::string::op {
            delimiter);
    }

+#pragma endregion
+
+#pragma region Lower Upper
+
    /**
 	 * @brief Convert given string to lowercase.
 	 * @param[in,out] strl The string to be lowercase.
@@ -123,8 +137,56 @@ namespace yycc::string::op {
 	*/
    std::u8string to_upper(const std::u8string_view& strl);

-    // TODO:
-    // Add strip, lstrip and rstrip functions.
+#pragma endregion
+
+#pragma region Strip
+
+    /**
+     * @brief Remove leading and trailing characters listed in \e words from the string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     */
+    void strip(std::u8string& strl, const std::u8string_view& words);
+    /**
+     * @brief Return a copy of the string with leading and trailing characters listed in \e words removed.
+     * @param[in] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @return The copy of the string with leading and trailing characters listed in \e words removed.
+     */
+    std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Remove leading characters listed in \e words from the string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     */
+    void lstrip(std::u8string& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Return a copy of the string with leading characters listed in \e words removed.
+     * @param[in] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @return The copy of the string with leading characters listed in \e words removed.
+     */
+    std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Remove trailing characters listed in \e words from the string.
+     * @param[in,out] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     */
+    void rstrip(std::u8string& strl, const std::u8string_view& words);
+
+    /**
+     * @brief Return a copy of the string with trailing characters listed in \e words removed.
+     * @param[in] strl The string to be stripped.
+     * @param[in] words The characters to be stripped.
+     * @return The copy of the string with trailing characters listed in \e words removed.
+     */
+    std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words);
+
+
+#pragma endregion

 #pragma region Split

--- a/testbench/yycc/string/op.cpp
+++ b/testbench/yycc/string/op.cpp
@@ -71,6 +71,28 @@ namespace yycctest::string::op {
        EXPECT_EQ(rv, u8", 1, 2, ");
    }

+    TEST(StringOp, Strip) { // NOTE(review): the strip implementations added by this patch are empty stubs, so these expectations cannot pass yet
+        // Strip ASCII whitespace from both ends
+        {
+            auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n");
+            EXPECT_EQ(rv, u8"aaa");
+        }
+
+        // Strip multi-byte (non-ASCII) characters; only those listed in the second argument may be removed
+        {
+            auto rv = OP::to_strip(u8"亜亜亜aaaあああ", u8"亜あ");
+            EXPECT_EQ(rv, u8"aaa");
+        }
+        {
+            auto rv = OP::to_strip(u8"亜亜亜aaaあああ", u8"亜");
+            EXPECT_EQ(rv, u8"aaaあああ");
+        }
+        {
+            auto rv = OP::to_strip(u8"亜亜亜aaaあああ", u8"あ");
+            EXPECT_EQ(rv, u8"亜亜亜aaa");
+        }
+    }
+
    TEST(StringOp, Split) {
        // Normal
        {