feat: finish string strip (maybe)

- add Trie Tree for string strip op. - finish string strip but it may still buggy (though i have tested it)
2025-09-28 16:13:06 +08:00
parent 190beeed58
commit cc1ce5bb04
3 changed files with 164 additions and 55 deletions
--- a/src/yycc/string/op.cpp
+++ b/src/yycc/string/op.cpp
@ -1,4 +1,7 @@
 #include "op.hpp"
+#include <map>
+#include <memory>
+#include <iterator>
 #include <type_traits>
 #include <algorithm>
 #include <stdexcept>
@ -172,7 +175,7 @@ namespace yycc::string::op {

    class CodePointIterator {
    public:
-        using iterator_category = std::forward_iterator_tag;
+        using iterator_category = std::input_iterator_tag;
        using value_type = std::u8string_view;
        using difference_type = std::ptrdiff_t;
        using pointer = const std::u8string_view*;
@ -274,39 +277,149 @@ namespace yycc::string::op {
        }
    };

+#pragma endregion
+
+#pragma region Trie Tree Node
+
+    struct TrieTreeNode {
+        TrieTreeNode() : is_end(false), children() {}
+        bool is_end;                                               ///< Whether this node is a viable end.
+        std::map<char8_t, std::unique_ptr<TrieTreeNode>> children; ///< The children node.
+    };
+
+#pragma endregion
+
+#pragma region Trie Tree
+
+    class TrieTree {
+    private:
+        std::unique_ptr<TrieTreeNode> root;
+
+    public:
+        TrieTree() : root(std::make_unique<TrieTreeNode>()) {
+            // Do not accept root element always (no empty string).
+            root->is_end = false;
+        }
+        
+        /**
+         * @brief Insert new words in trie tree.
+         * @details
+         * The reason why use iterator, rather than string view, is that in strip function, we may need insert a string backwardly,
+         * so using string view reverse iterator and iterator argument can easily achieve this goal without any more burden.
+         * @tparam InputIt The iterator following input iterator name convention.
+         * @param[in] first The head of iterator.
+         * @param[in] last The tail of iterator.
+         */
+        template<std::input_iterator InputIt>
+            requires std::is_same_v<std::iter_value_t<InputIt>, char8_t>
+        void insert(InputIt first, InputIt last) {
+            // prevent empty string
+            if (first == last) return;
+
+            // insert item
+            TrieTreeNode* node = root.get();
+            for (auto it = first; it != last; ++it) {
+                char8_t c = *it;
+                if (node->children.find(c) == node->children.end()) {
+                    node->children[c] = std::make_unique<TrieTreeNode>();
+                }
+                node = node->children[c].get();
+            }
+            node->is_end = true;
+        }
+
+        /**
+         * @brief Check whether given words has prefix stored in this trie tree.
+         * @details Same reason for using iterator as function argument.
+         * @tparam InputIt The iterator following input iterator name convention.
+         * @param[in] first The head of iterator.
+         * @param[in] last The tail of iterator.
+         * @return \c std::nullopt if there is no match, otherwise the length of matched prefix.
+         */
+        template<std::input_iterator InputIt>
+            requires std::is_same_v<std::iter_value_t<InputIt>, char8_t>
+        std::optional<size_t> search(InputIt first, InputIt last) {
+            TrieTreeNode* node = root.get();
+            size_t cnt = 0;
+
+            for (auto it = first; it != last; ++it) {
+                char8_t c = *it;
+                auto finder = node->children.find(c);
+                if (finder == node->children.end()) {
+                    // There is no more matching, break the while.
+                    break;
+                } else {
+                    // There are more matching item, find next one.
+                    node = finder->second.get();
+                    ++cnt;
+                }
+            }
+
+            // YYC MARK:
+            // There is a fatal bug for Trie Tree, but it doesn't matter with our usage scenario.
+            // 
+            // Assume there is two string "ab" and "abcd". If user give "abc",
+            // we should match it with "ab" prefix, but this function will return there is no match.
+            // However, this is impossible for UTF8 sequence.
+            // There is no possibility that two UTF8 sequence, indicating two different Unicode code point respectively,
+            // has the same prefix and different length. Because their first byte must be different,
+            // the first byte indicate the length of sequence.
+            // 
+            // This result also can be proven for suffix, 
+            // because first byte must not be equal to any other continuation bytes.
+            // It is impossible that they have same "ab".
+            // 
+            // So it is safe for our usage scenario although this bug is presented.
+
+            // check whether current is valid end.
+            // if it is, return the count of prefix, otherwise return nothing.
+            if (node->is_end) {
+                return cnt;
+            } else {
+                return std::nullopt;
+            }
+        }
+    };
+
 #pragma endregion

    template<bool bDoLeft, bool bDoRight>
-    void internal_strip(std::u8string& strl, const std::u8string_view& words) {
+    std::u8string_view internal_strip(const std::u8string_view& strl, const std::u8string_view& words) {
+        std::optional<TrieTree> prefix, suffix;
+        if constexpr (bDoLeft) prefix = TrieTree();
+        if constexpr (bDoRight) suffix = TrieTree();
+
+        CodePoint code_point(words);
+        for (const auto& seq : code_point) {
+            if (prefix.has_value()) prefix.value().insert(seq.begin(), seq.end());
+            if (suffix.has_value()) suffix.value().insert(seq.rbegin(), seq.rend());
+        }
+
+        std::u8string_view striped = strl;
        if constexpr (bDoLeft) {
+            while (auto cnt = prefix.value().search(striped.begin(), striped.end())) {
+                striped = striped.substr(cnt.value());
+            }
        }
-
        if constexpr (bDoRight) {
+            while (auto cnt = suffix.value().search(striped.rbegin(), striped.rend())) {
+                striped = striped.substr(0, striped.size() - cnt.value());
            }
        }

-    void strip(std::u8string& strl, const std::u8string_view& words) {}
-
-    std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) {
-        std::u8string rv(strl);
-        strip(rv, words);
-        return rv;
+        return striped;
    }

-    void lstrip(std::u8string& strl, const std::u8string_view& words) {}
-
-    std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
-        std::u8string rv(strl);
-        lstrip(rv, words);
-        return rv;
+    std::u8string_view strip(const std::u8string_view& strl, const std::u8string_view& words) {
+        return internal_strip<true, true>(strl, words);
    }

-    void rstrip(std::u8string& strl, const std::u8string_view& words) {}
+    std::u8string_view lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
+        return internal_strip<true, false>(strl, words);
+    }

-    std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
-        std::u8string rv(strl);
-        rstrip(rv, words);
-        return rv;
+    std::u8string_view rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
+        return internal_strip<false, true>(strl, words);
    }

 #pragma endregion
--- a/src/yycc/string/op.hpp
+++ b/src/yycc/string/op.hpp
@ -145,46 +145,25 @@ namespace yycc::string::op {
     * @brief Remove leading and trailing whitespace from the string.
     * @param[in,out] strl The string to be stripped.
     * @param[in] words The characters to be stripped.
+     * @return The string view with leading and trailing whitespace removed.
     */
-    void strip(std::u8string& strl, const std::u8string_view& words);
-    /**
-     * @brief Return a copy of the string with leading and trailing whitespace removed.
-     * @param[in] strl The string to be stripped.
-     * @param[in] words The characters to be stripped.
-     * @return The copy of the string with leading and trailing whitespace removed.
-     */
-    std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words);
+    std::u8string_view strip(const std::u8string_view& strl, const std::u8string_view& words);

    /**
     * @brief Remove leading whitespace from the string.
     * @param[in,out] strl The string to be stripped.
     * @param[in] words The characters to be stripped.
+     * @return The string view with leading whitespace removed.
     */
-    void lstrip(std::u8string& strl, const std::u8string_view& words);
-
-    /**
-     * @brief Return a copy of the string with leading whitespace removed.
-     * @param[in] strl The string to be stripped.
-     * @param[in] words The characters to be stripped.
-     * @return The copy of the string with leading whitespace removed.
-     */
-    std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words);
+    std::u8string_view lstrip(const std::u8string_view& strl, const std::u8string_view& words);

    /**
     * @brief Remove trailing whitespace from the string.
     * @param[in,out] strl The string to be stripped.
     * @param[in] words The characters to be stripped.
+     * @return The string view with trailing whitespace removed.
     */
-    void rstrip(std::u8string& strl, const std::u8string_view& words);
-
-    /**
-     * @brief Return a copy of the string with trailing whitespace removed.
-     * @param[in] strl The string to be stripped.
-     * @param[in] words The characters to be stripped.
-     * @return The copy of the string with trailing whitespace removed.
-     */
-    std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words);
-
+    std::u8string_view rstrip(const std::u8string_view& strl, const std::u8string_view& words);

 #pragma endregion

@ -195,7 +174,7 @@ namespace yycc::string::op {
     */
    class LazySplitIterator {
    public:
-        using iterator_category = std::forward_iterator_tag;
+        using iterator_category = std::input_iterator_tag;
        using value_type = std::u8string_view;
        using difference_type = std::ptrdiff_t;
        using pointer = const std::u8string_view*;
--- a/testbench/yycc/string/op.cpp
+++ b/testbench/yycc/string/op.cpp
@ -74,22 +74,39 @@ namespace yycctest::string::op {
    TEST(StringOp, Strip) {
        // Normal strip
        {
-            auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n");
+            auto rv = OP::strip(u8" \taaa\n", u8" \t\r\n");
            EXPECT_EQ(rv, u8"aaa");
        }
+        {
+            auto rv = OP::lstrip(u8" \taaa\n", u8" \t\r\n");
+            EXPECT_EQ(rv, u8"aaa\n");
+        }
+        {
+            auto rv = OP::rstrip(u8" \taaa\n", u8" \t\r\n");
+            EXPECT_EQ(rv, u8" \taaa");
+        }

        // Special strip
        {
-            auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
+            auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"啊あ");
            EXPECT_EQ(rv, u8"aaa");
        }
        {
-            auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
-            EXPECT_EQ(rv, u8"aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
+            auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"啊");
+            EXPECT_EQ(rv, u8"aaaあああ");
        }
        {
-            auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
-            EXPECT_EQ(rv, u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa");
+            auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"あ");
+            EXPECT_EQ(rv, u8"啊啊啊aaa");
+        }
+
+        // Possible buggy strip.
+        // We use 2 UTF8 code points introduced following:
+        // U+00AA  (UTF-8: C2 AA)
+        // U+1002A (UTF-8 : F0 90 80 AA)
+        {
+            auto rv = OP::rstrip(u8"aaa\u00AA", u8"\u00AA\U0001002A");
+            EXPECT_EQ(rv, u8"aaa");
        }
    }