feat: finish string strip (maybe)
- add Trie Tree for string strip op. - finish string strip, but it may still be buggy (though I have tested it)
This commit is contained in:
@ -1,4 +1,7 @@
|
||||
#include "op.hpp"
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
@ -172,7 +175,7 @@ namespace yycc::string::op {
|
||||
|
||||
class CodePointIterator {
|
||||
public:
|
||||
using iterator_category = std::forward_iterator_tag;
|
||||
using iterator_category = std::input_iterator_tag;
|
||||
using value_type = std::u8string_view;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using pointer = const std::u8string_view*;
|
||||
@ -274,39 +277,149 @@ namespace yycc::string::op {
|
||||
}
|
||||
};
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Trie Tree Node
|
||||
|
||||
struct TrieTreeNode {
|
||||
TrieTreeNode() : is_end(false), children() {}
|
||||
bool is_end; ///< Whether this node is a viable end.
|
||||
std::map<char8_t, std::unique_ptr<TrieTreeNode>> children; ///< The children node.
|
||||
};
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Trie Tree
|
||||
|
||||
class TrieTree {
|
||||
private:
|
||||
std::unique_ptr<TrieTreeNode> root;
|
||||
|
||||
public:
|
||||
TrieTree() : root(std::make_unique<TrieTreeNode>()) {
|
||||
// Do not accept root element always (no empty string).
|
||||
root->is_end = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Insert new words in trie tree.
|
||||
* @details
|
||||
* The reason why use iterator, rather than string view, is that in strip function, we may need insert a string backwardly,
|
||||
* so using string view reverse iterator and iterator argument can easily achieve this goal without any more burden.
|
||||
* @tparam InputIt The iterator following input iterator name convention.
|
||||
* @param[in] first The head of iterator.
|
||||
* @param[in] last The tail of iterator.
|
||||
*/
|
||||
template<std::input_iterator InputIt>
|
||||
requires std::is_same_v<std::iter_value_t<InputIt>, char8_t>
|
||||
void insert(InputIt first, InputIt last) {
|
||||
// prevent empty string
|
||||
if (first == last) return;
|
||||
|
||||
// insert item
|
||||
TrieTreeNode* node = root.get();
|
||||
for (auto it = first; it != last; ++it) {
|
||||
char8_t c = *it;
|
||||
if (node->children.find(c) == node->children.end()) {
|
||||
node->children[c] = std::make_unique<TrieTreeNode>();
|
||||
}
|
||||
node = node->children[c].get();
|
||||
}
|
||||
node->is_end = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check whether given words has prefix stored in this trie tree.
|
||||
* @details Same reason for using iterator as function argument.
|
||||
* @tparam InputIt The iterator following input iterator name convention.
|
||||
* @param[in] first The head of iterator.
|
||||
* @param[in] last The tail of iterator.
|
||||
* @return \c std::nullopt if there is no match, otherwise the length of matched prefix.
|
||||
*/
|
||||
template<std::input_iterator InputIt>
|
||||
requires std::is_same_v<std::iter_value_t<InputIt>, char8_t>
|
||||
std::optional<size_t> search(InputIt first, InputIt last) {
|
||||
TrieTreeNode* node = root.get();
|
||||
size_t cnt = 0;
|
||||
|
||||
for (auto it = first; it != last; ++it) {
|
||||
char8_t c = *it;
|
||||
auto finder = node->children.find(c);
|
||||
if (finder == node->children.end()) {
|
||||
// There is no more matching, break the while.
|
||||
break;
|
||||
} else {
|
||||
// There are more matching item, find next one.
|
||||
node = finder->second.get();
|
||||
++cnt;
|
||||
}
|
||||
}
|
||||
|
||||
// YYC MARK:
|
||||
// There is a fatal bug for Trie Tree, but it doesn't matter with our usage scenario.
|
||||
//
|
||||
// Assume there is two string "ab" and "abcd". If user give "abc",
|
||||
// we should match it with "ab" prefix, but this function will return there is no match.
|
||||
// However, this is impossible for UTF8 sequence.
|
||||
// There is no possibility that two UTF8 sequence, indicating two different Unicode code point respectively,
|
||||
// has the same prefix and different length. Because their first byte must be different,
|
||||
// the first byte indicate the length of sequence.
|
||||
//
|
||||
// This result also can be proven for suffix,
|
||||
// because first byte must not be equal to any other continuation bytes.
|
||||
// It is impossible that they have same "ab".
|
||||
//
|
||||
// So it is safe for our usage scenario although this bug is presented.
|
||||
|
||||
// check whether current is valid end.
|
||||
// if it is, return the count of prefix, otherwise return nothing.
|
||||
if (node->is_end) {
|
||||
return cnt;
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#pragma endregion
|
||||
|
||||
/**
 * @brief Shared implementation of strip(), lstrip() and rstrip().
 * @details
 * Every item produced by iterating \c CodePoint over \c words is stored in a trie:
 * forwards for the left side, backwards for the right side.
 * Then stored sequences are repeatedly cut from the front and/or the back of the view
 * until the trie reports no further match.
 * NOTE(review): \c CodePoint is defined outside this excerpt; it presumably yields one
 * UTF-8 sequence per code point of \c words -- confirm.
 * @tparam bDoLeft True to strip the head of the string.
 * @tparam bDoRight True to strip the tail of the string.
 * @param[in] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 * @return A sub-view of \c strl with the matching leading and/or trailing sequences removed.
 */
template<bool bDoLeft, bool bDoRight>
|
||||
void internal_strip(std::u8string& strl, const std::u8string_view& words) {
|
||||
std::u8string_view internal_strip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
// Only the trie of a requested side is created; the other optional stays empty.
std::optional<TrieTree> prefix, suffix;
|
||||
if constexpr (bDoLeft) prefix = TrieTree();
|
||||
if constexpr (bDoRight) suffix = TrieTree();
|
||||
|
||||
// Fill the tries. The suffix trie receives each sequence reversed,
// because it is later searched with reverse iterators of the string.
CodePoint code_point(words);
|
||||
for (const auto& seq : code_point) {
|
||||
if (prefix.has_value()) prefix.value().insert(seq.begin(), seq.end());
|
||||
if (suffix.has_value()) suffix.value().insert(seq.rbegin(), seq.rend());
|
||||
}
|
||||
|
||||
// Work on a view of the input; only the bounds of the view are moved.
std::u8string_view striped = strl;
|
||||
if constexpr (bDoLeft) {
|
||||
// Drop matched sequences from the front until search() returns std::nullopt.
while (auto cnt = prefix.value().search(striped.begin(), striped.end())) {
|
||||
striped = striped.substr(cnt.value());
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (bDoRight) {
|
||||
// Same as above, but walking the string backwards and shrinking it from the back.
while (auto cnt = suffix.value().search(striped.rbegin(), striped.rend())) {
|
||||
striped = striped.substr(0, striped.size() - cnt.value());
|
||||
}
|
||||
}
|
||||
|
||||
void strip(std::u8string& strl, const std::u8string_view& words) {}
|
||||
|
||||
std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
std::u8string rv(strl);
|
||||
strip(rv, words);
|
||||
return rv;
|
||||
return striped;
|
||||
}
|
||||
|
||||
void lstrip(std::u8string& strl, const std::u8string_view& words) {}
|
||||
|
||||
std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
std::u8string rv(strl);
|
||||
lstrip(rv, words);
|
||||
return rv;
|
||||
// Strip on both sides: forwards to internal_strip with bDoLeft = true and bDoRight = true.
std::u8string_view strip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
return internal_strip<true, true>(strl, words);
|
||||
}
|
||||
|
||||
void rstrip(std::u8string& strl, const std::u8string_view& words) {}
|
||||
// Strip the head only: forwards to internal_strip with bDoLeft = true and bDoRight = false.
std::u8string_view lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
return internal_strip<true, false>(strl, words);
|
||||
}
|
||||
|
||||
std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
std::u8string rv(strl);
|
||||
rstrip(rv, words);
|
||||
return rv;
|
||||
// Strip the tail only: forwards to internal_strip with bDoLeft = false and bDoRight = true.
std::u8string_view rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
return internal_strip<false, true>(strl, words);
|
||||
}
|
||||
|
||||
#pragma endregion
|
||||
|
@ -145,46 +145,25 @@ namespace yycc::string::op {
|
||||
* @brief Remove the leading and trailing characters listed in \c words from the string.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return A view into \c strl with the leading and trailing characters listed in \c words removed.
|
||||
*/
|
||||
void strip(std::u8string& strl, const std::u8string_view& words);
|
||||
/**
|
||||
* @brief Return a copy of the string with leading and trailing whitespace removed.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return The copy of the string with leading and trailing whitespace removed.
|
||||
*/
|
||||
std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
std::u8string_view strip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Remove the leading characters listed in \c words from the string.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return A view into \c strl with the leading characters listed in \c words removed.
|
||||
*/
|
||||
void lstrip(std::u8string& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Return a copy of the string with leading whitespace removed.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return The copy of the string with leading whitespace removed.
|
||||
*/
|
||||
std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
std::u8string_view lstrip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Remove the trailing characters listed in \c words from the string.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return A view into \c strl with the trailing characters listed in \c words removed.
|
||||
*/
|
||||
void rstrip(std::u8string& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Return a copy of the string with trailing whitespace removed.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return The copy of the string with trailing whitespace removed.
|
||||
*/
|
||||
std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
std::u8string_view rstrip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
#pragma endregion
|
||||
|
||||
@ -195,7 +174,7 @@ namespace yycc::string::op {
|
||||
*/
|
||||
class LazySplitIterator {
|
||||
public:
|
||||
using iterator_category = std::forward_iterator_tag;
|
||||
using iterator_category = std::input_iterator_tag;
|
||||
using value_type = std::u8string_view;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using pointer = const std::u8string_view*;
|
||||
|
@ -74,22 +74,39 @@ namespace yycctest::string::op {
|
||||
TEST(StringOp, Strip) {
|
||||
// Normal strip
|
||||
{
|
||||
auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n");
|
||||
auto rv = OP::strip(u8" \taaa\n", u8" \t\r\n");
|
||||
EXPECT_EQ(rv, u8"aaa");
|
||||
}
|
||||
{
|
||||
auto rv = OP::lstrip(u8" \taaa\n", u8" \t\r\n");
|
||||
EXPECT_EQ(rv, u8"aaa\n");
|
||||
}
|
||||
{
|
||||
auto rv = OP::rstrip(u8" \taaa\n", u8" \t\r\n");
|
||||
EXPECT_EQ(rv, u8" \taaa");
|
||||
}
|
||||
|
||||
// Special strip
|
||||
{
|
||||
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"啊あ");
|
||||
EXPECT_EQ(rv, u8"aaa");
|
||||
}
|
||||
{
|
||||
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
|
||||
EXPECT_EQ(rv, u8"aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"啊");
|
||||
EXPECT_EQ(rv, u8"aaaあああ");
|
||||
}
|
||||
{
|
||||
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
|
||||
EXPECT_EQ(rv, u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa");
|
||||
auto rv = OP::strip(u8"啊啊啊aaaあああ", u8"あ");
|
||||
EXPECT_EQ(rv, u8"啊啊啊aaa");
|
||||
}
|
||||
|
||||
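// Potentially buggy strip.
|
||||
// We use the following 2 code points, whose UTF-8 sequences end with the same byte (AA),
// so stripping from the right has to tell them apart by the bytes before it:
|
||||
// U+00AA (UTF-8: C2 AA)
|
||||
// U+1002A (UTF-8: F0 90 80 AA)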
|
||||
{
|
||||
auto rv = OP::rstrip(u8"aaa\u00AA", u8"\u00AA\U0001002A");
|
||||
EXPECT_EQ(rv, u8"aaa");
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user