diff --git a/src/yycc/string/op.cpp b/src/yycc/string/op.cpp index 03f3d17..79af3ef 100644 --- a/src/yycc/string/op.cpp +++ b/src/yycc/string/op.cpp @@ -1,4 +1,7 @@ #include "op.hpp" +#include +#include +#include #include #include #include @@ -172,7 +175,7 @@ namespace yycc::string::op { class CodePointIterator { public: - using iterator_category = std::forward_iterator_tag; + using iterator_category = std::input_iterator_tag; using value_type = std::u8string_view; using difference_type = std::ptrdiff_t; using pointer = const std::u8string_view*; @@ -274,39 +277,149 @@ namespace yycc::string::op { } }; +#pragma endregion + +#pragma region Trie Tree Node + + struct TrieTreeNode { + TrieTreeNode() : is_end(false), children() {} + bool is_end; ///< Whether this node is a valid end of a stored word. + std::map> children; ///< The child nodes. + }; + +#pragma endregion + +#pragma region Trie Tree + + class TrieTree { + private: + std::unique_ptr root; + + public: + TrieTree() : root(std::make_unique()) { + // Never accept the root node as an end (no empty string). + root->is_end = false; + } + + /** + * @brief Insert a new word into the trie tree. + * @details + * Iterators are used instead of a string view because the strip function may need to insert a string backwards, + * and passing the string view's reverse iterators as arguments achieves this without any extra burden. + * @tparam InputIt The iterator type, following the input iterator naming convention. + * @param[in] first Iterator to the beginning of the word. + * @param[in] last Iterator past the end of the word. 
+ */ + template + requires std::is_same_v, char8_t> + void insert(InputIt first, InputIt last) { + // Reject empty strings. + if (first == last) return; + + // Insert the item. + TrieTreeNode* node = root.get(); + for (auto it = first; it != last; ++it) { + char8_t c = *it; + if (node->children.find(c) == node->children.end()) { + node->children[c] = std::make_unique(); + } + node = node->children[c].get(); + } + node->is_end = true; + } + + /** + * @brief Check whether the given sequence starts with a word stored in this trie tree. + * @details Iterators are used as arguments for the same reason as in insert(). + * @tparam InputIt The iterator type, following the input iterator naming convention. + * @param[in] first Iterator to the beginning of the sequence. + * @param[in] last Iterator past the end of the sequence. + * @return \c std::nullopt if there is no match, otherwise the length of the matched prefix. + */ + template + requires std::is_same_v, char8_t> + std::optional search(InputIt first, InputIt last) { + TrieTreeNode* node = root.get(); + size_t cnt = 0; + + for (auto it = first; it != last; ++it) { + char8_t c = *it; + auto finder = node->children.find(c); + if (finder == node->children.end()) { + // No further match is possible, so leave the loop. + break; + } else { + // This element matched; move on to the child node. + node = finder->second.get(); + ++cnt; + } + } + + // YYC MARK: + // There is a known bug in this Trie Tree, but it does not matter for our usage scenario. + // + // Assume there are two stored strings, "ab" and "abcd". If the user gives "abc", + // we should match it with the "ab" prefix, but this function reports that there is no match. + // However, this is impossible for UTF-8 sequences. + // It is not possible for two UTF-8 sequences, each encoding a different Unicode code point, + // to share the same prefix while having different lengths, because their first bytes must differ: + // the first byte indicates the length of the sequence. 
+ // + // The same can be shown for suffixes, + // because a leading byte is never equal to any continuation byte. + // It is impossible for them to share the same "ab" part. + // + // So this is safe for our usage scenario even though this bug is present. + + // Check whether the current node is a valid end. + // If it is, return the length of the matched prefix; otherwise return nothing. + if (node->is_end) { + return cnt; + } else { + return std::nullopt; + } + } + }; + #pragma endregion template - void internal_strip(std::u8string& strl, const std::u8string_view& words) { + std::u8string_view internal_strip(const std::u8string_view& strl, const std::u8string_view& words) { + std::optional prefix, suffix; + if constexpr (bDoLeft) prefix = TrieTree(); + if constexpr (bDoRight) suffix = TrieTree(); + + CodePoint code_point(words); + for (const auto& seq : code_point) { + if (prefix.has_value()) prefix.value().insert(seq.begin(), seq.end()); + if (suffix.has_value()) suffix.value().insert(seq.rbegin(), seq.rend()); + } + + std::u8string_view striped = strl; if constexpr (bDoLeft) { + while (auto cnt = prefix.value().search(striped.begin(), striped.end())) { + striped = striped.substr(cnt.value()); + } } - if constexpr (bDoRight) { + while (auto cnt = suffix.value().search(striped.rbegin(), striped.rend())) { + striped = striped.substr(0, striped.size() - cnt.value()); + } } + + return striped; } - void strip(std::u8string& strl, const std::u8string_view& words) {} - - std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) { - std::u8string rv(strl); - strip(rv, words); - return rv; + std::u8string_view strip(const std::u8string_view& strl, const std::u8string_view& words) { + return internal_strip(strl, words); } - void lstrip(std::u8string& strl, const std::u8string_view& words) {} - - std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) { - std::u8string rv(strl); - lstrip(rv, words); - return rv; + std::u8string_view 
lstrip(const std::u8string_view& strl, const std::u8string_view& words) { + return internal_strip(strl, words); } - void rstrip(std::u8string& strl, const std::u8string_view& words) {} - - std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) { - std::u8string rv(strl); - rstrip(rv, words); - return rv; + std::u8string_view rstrip(const std::u8string_view& strl, const std::u8string_view& words) { + return internal_strip(strl, words); } #pragma endregion diff --git a/src/yycc/string/op.hpp b/src/yycc/string/op.hpp index c202e28..e9a9a3d 100644 --- a/src/yycc/string/op.hpp +++ b/src/yycc/string/op.hpp @@ -145,46 +145,25 @@ namespace yycc::string::op { * @brief Remove leading and trailing whitespace from the string. * @param[in,out] strl The string to be stripped. * @param[in] words The characters to be stripped. + * @return The string view with leading and trailing whitespace removed. */ - void strip(std::u8string& strl, const std::u8string_view& words); - /** - * @brief Return a copy of the string with leading and trailing whitespace removed. - * @param[in] strl The string to be stripped. - * @param[in] words The characters to be stripped. - * @return The copy of the string with leading and trailing whitespace removed. - */ - std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words); + std::u8string_view strip(const std::u8string_view& strl, const std::u8string_view& words); /** * @brief Remove leading whitespace from the string. * @param[in,out] strl The string to be stripped. * @param[in] words The characters to be stripped. + * @return The string view with leading whitespace removed. */ - void lstrip(std::u8string& strl, const std::u8string_view& words); - - /** - * @brief Return a copy of the string with leading whitespace removed. - * @param[in] strl The string to be stripped. - * @param[in] words The characters to be stripped. - * @return The copy of the string with leading whitespace removed. 
- */ - std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words); + std::u8string_view lstrip(const std::u8string_view& strl, const std::u8string_view& words); /** * @brief Remove trailing whitespace from the string. * @param[in,out] strl The string to be stripped. * @param[in] words The characters to be stripped. + * @return The string view with trailing whitespace removed. */ - void rstrip(std::u8string& strl, const std::u8string_view& words); - - /** - * @brief Return a copy of the string with trailing whitespace removed. - * @param[in] strl The string to be stripped. - * @param[in] words The characters to be stripped. - * @return The copy of the string with trailing whitespace removed. - */ - std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words); - + std::u8string_view rstrip(const std::u8string_view& strl, const std::u8string_view& words); #pragma endregion @@ -195,7 +174,7 @@ namespace yycc::string::op { */ class LazySplitIterator { public: - using iterator_category = std::forward_iterator_tag; + using iterator_category = std::input_iterator_tag; using value_type = std::u8string_view; using difference_type = std::ptrdiff_t; using pointer = const std::u8string_view*; diff --git a/testbench/yycc/string/op.cpp b/testbench/yycc/string/op.cpp index 3c86001..ff290ad 100644 --- a/testbench/yycc/string/op.cpp +++ b/testbench/yycc/string/op.cpp @@ -74,22 +74,39 @@ namespace yycctest::string::op { TEST(StringOp, Strip) { // Normal strip { - auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n"); + auto rv = OP::strip(u8" \taaa\n", u8" \t\r\n"); EXPECT_EQ(rv, u8"aaa"); } + { + auto rv = OP::lstrip(u8" \taaa\n", u8" \t\r\n"); + EXPECT_EQ(rv, u8"aaa\n"); + } + { + auto rv = OP::rstrip(u8" \taaa\n", u8" \t\r\n"); + EXPECT_EQ(rv, u8" \taaa"); + } // Special strip { - auto rv = OP::to_strip(u8"aaa", u8""); + auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"啊あ"); EXPECT_EQ(rv, u8"aaa"); } { - auto rv = OP::to_strip(u8"aaa", 
u8""); - EXPECT_EQ(rv, u8"aaa"); + auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"啊"); + EXPECT_EQ(rv, u8"aaaあああ"); } { - auto rv = OP::to_strip(u8"aaa", u8""); - EXPECT_EQ(rv, u8"aaa"); + auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"あ"); + EXPECT_EQ(rv, u8"啊啊啊aaa"); + } + + // Potentially buggy strip. + // We use the following two UTF-8 code points: + // U+00AA (UTF-8: C2 AA) + // U+1002A (UTF-8: F0 90 80 AA) + { + auto rv = OP::rstrip(u8"aaa\u00AA", u8"\u00AA\U0001002A"); + EXPECT_EQ(rv, u8"aaa"); } }