1
0

feat: add code point splitter for utf8 string.

- this feature is added for strip function in string op.
This commit is contained in:
2025-09-26 21:43:12 +08:00
parent 99146ddd55
commit 190beeed58
3 changed files with 231 additions and 2 deletions

View File

@ -166,6 +166,151 @@ namespace yycc::string::op {
#pragma endregion
#pragma region Strip
#pragma region Code Point Iterator
class CodePointIterator {
public:
using iterator_category = std::forward_iterator_tag;
using value_type = std::u8string_view;
using difference_type = std::ptrdiff_t;
using pointer = const std::u8string_view*;
using reference = const std::u8string_view&;
private:
std::u8string_view current_str;
std::u8string_view next_str;
public:
CodePointIterator(const std::u8string_view& strl) : current_str(), next_str(strl) { ++(*this); }
reference operator*() const { return this->current_str; }
pointer operator->() const { return &this->current_str; }
CodePointIterator& operator++() {
// move next string to current string and analyse it
current_str = next_str;
next_str = std::u8string_view();
// we only process it if there is some chars
if (!current_str.empty()) {
// extract the string to be checked
std::u8string_view strl = current_str;
// get how many bytes this code point occupied.
size_t bytes_to_skip = evaluate_utf8_byte_count(strl.front());
// if evaluate skip size is overflow the whole size of string, throw exception
if (bytes_to_skip > strl.size()) throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
// check following bytes are starts with 0b10
for (size_t i = 1; i < bytes_to_skip; ++i) {
check_continuation_byte(strl[i]);
}
// Everything is okey, set current string and next string
current_str = strl.substr(0, bytes_to_skip);
next_str = strl.substr(bytes_to_skip);
}
// return self
return *this;
}
CodePointIterator operator++(int) {
CodePointIterator temp = *this;
++(*this);
return temp;
}
bool operator==(const CodePointIterator& other) const {
return this->current_str == other.current_str && this->next_str == other.next_str;
}
bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
private:
/**
* @brief Calulate how many bytes following code point occupied according to first byte of sequence.
* @param[in] byte First sequence for checking.
* @return The size of following code point occupied ranging from 1 to 4 (inclusive).
*/
size_t evaluate_utf8_byte_count(char8_t c) const {
auto byte = static_cast<uint8_t>(c);
if ((byte & 0x80) == 0x00) return 1; // 0xxxxxxx
if ((byte & 0xE0) == 0xC0) return 2; // 110xxxxx
if ((byte & 0xF0) == 0xE0) return 3; // 1110xxxx
if ((byte & 0xF8) == 0xF0) return 4; // 11110xxx
throw std::runtime_error("invalid utf8 sequence. bad start byte");
}
/**
* @brief Check whether given byte is a valid continuation byte in UTF8.
* @param[in] c Byte for checking.
*/
void check_continuation_byte(char8_t c) const {
auto byte = static_cast<uint8_t>(c);
if ((byte & 0xC0) != 0x80) {
throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
}
}
};
#pragma endregion
#pragma region Code Point
/**
 * @brief Range adaptor exposing an UTF8 string as a sequence of code points.
 * @details
 * It only stores a view, so the viewed data must stay alive while this object or its iterators are used.
 * Suitable for range-based for loop: each element is a view over the bytes of one code point.
 */
class CodePoint {
public:
    explicit CodePoint(std::u8string_view u8str) : u8str(u8str) {}

    /** @brief Get the iterator pointing at the first code point of viewed string. */
    CodePointIterator begin() const {
        return CodePointIterator(this->u8str);
    }
    /** @brief Get the iterator marking the end, which is an iterator built from an empty view. */
    CodePointIterator end() const {
        const std::u8string_view nothing;
        return CodePointIterator(nothing);
    }

private:
    std::u8string_view u8str;
};
#pragma endregion
/**
 * @brief The shared implementation of strip(), lstrip() and rstrip().
 * @details
 * \e words is treated as a set of code points, not as a prefix or suffix.
 * Code points are removed from the enabled side(s) until one outside of that set is met.
 * @tparam bDoLeft True if leading code points should be removed.
 * @tparam bDoRight True if trailing code points should be removed.
 * @param[in,out] strl The string to be stripped in place.
 * @param[in] words The code points to be stripped.
 * @exception std::runtime_error Malformed UTF8 was met while scanning \e strl or \e words.
 */
template<bool bDoLeft, bool bDoRight>
void internal_strip(std::u8string& strl, const std::u8string_view& words) {
    // Nothing can be removed from an empty string, or with an empty strip set.
    if (strl.empty() || words.empty()) return;

    // Compare whole code points, so a multi-byte character is never cut in half.
    const CodePoint strip_set(words);
    auto is_strippable = [&strip_set](const std::u8string_view& code_point) -> bool {
        for (const auto& candidate : strip_set) {
            if (candidate == code_point) return true;
        }
        return false;
    };

    // Find the byte range [keep_begin, keep_end) spanning from
    // the first code point to be kept to the last code point to be kept.
    size_t offset = 0;
    size_t keep_begin = 0;
    [[maybe_unused]] size_t keep_end = 0;
    bool found_kept = false;
    for (const auto& code_point : CodePoint(std::u8string_view(strl))) {
        if (!is_strippable(code_point)) {
            if (!found_kept) {
                keep_begin = offset;
                found_kept = true;
            }
            keep_end = offset + code_point.size();
            // The tail is irrelevant when only the left side is stripped.
            if constexpr (!bDoRight) break;
        }
        offset += code_point.size();
    }

    // Every code point belongs to the strip set: the whole string goes away.
    if (!found_kept) {
        if constexpr (bDoLeft || bDoRight) strl.clear();
        return;
    }

    // Erase the tail first, so the offset of the head is still valid.
    if constexpr (bDoRight) strl.erase(keep_end);
    if constexpr (bDoLeft) strl.erase(0, keep_begin);
}

// Strip both sides in place.
void strip(std::u8string& strl, const std::u8string_view& words) {
    internal_strip<true, true>(strl, words);
}

std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) {
    std::u8string rv(strl);
    strip(rv, words);
    return rv;
}

// Strip the left side only in place.
void lstrip(std::u8string& strl, const std::u8string_view& words) {
    internal_strip<true, false>(strl, words);
}

std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
    std::u8string rv(strl);
    lstrip(rv, words);
    return rv;
}

// Strip the right side only in place.
void rstrip(std::u8string& strl, const std::u8string_view& words) {
    internal_strip<false, true>(strl, words);
}

std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
    std::u8string rv(strl);
    rstrip(rv, words);
    return rv;
}
#pragma endregion
#pragma region Split
// Reference:

View File

@ -9,6 +9,8 @@
namespace yycc::string::op {
#pragma region Printf
/**
* @brief Perform an UTF8 string formatting operation.
* @param[in] format The format string.
@ -38,6 +40,10 @@ namespace yycc::string::op {
*/
std::string vprintf(const char* format, va_list argptr);
#pragma endregion
#pragma region Replace
/**
* @brief Modify given string with all occurrences of substring \e old replaced by \e new.
* @param[in,out] strl The string for replacing
@ -54,6 +60,10 @@ namespace yycc::string::op {
*/
std::u8string replace(const std::u8string_view& _strl, const std::u8string_view& _from_strl, const std::u8string_view& _to_strl);
#pragma endregion
#pragma region Join
/**
* @brief The data provider of general join function.
* @details
@ -100,6 +110,10 @@ namespace yycc::string::op {
delimiter);
}
#pragma endregion
#pragma region Lower Upper
/**
* @brief Convert given string to lowercase.
* @param[in,out] strl The string to be lowercase.
@ -123,8 +137,56 @@ namespace yycc::string::op {
*/
std::u8string to_upper(const std::u8string_view& strl);
// TODO:
// Add strip, lstrip and rstrip functions.
#pragma endregion
#pragma region Strip
/**
 * @brief Remove the leading and trailing characters listed in \e words from the string.
 * @param[in,out] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 */
void strip(std::u8string& strl, const std::u8string_view& words);
/**
 * @brief Return a copy of the string with the leading and trailing characters listed in \e words removed.
 * @param[in] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 * @return The copy of the string with leading and trailing characters listed in \e words removed.
 */
std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words);
/**
 * @brief Remove the leading characters listed in \e words from the string.
 * @param[in,out] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 */
void lstrip(std::u8string& strl, const std::u8string_view& words);
/**
 * @brief Return a copy of the string with the leading characters listed in \e words removed.
 * @param[in] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 * @return The copy of the string with leading characters listed in \e words removed.
 */
std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words);
/**
 * @brief Remove the trailing characters listed in \e words from the string.
 * @param[in,out] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 */
void rstrip(std::u8string& strl, const std::u8string_view& words);
/**
 * @brief Return a copy of the string with the trailing characters listed in \e words removed.
 * @param[in] strl The string to be stripped.
 * @param[in] words The characters to be stripped.
 * @return The copy of the string with trailing characters listed in \e words removed.
 */
std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words);
#pragma endregion
#pragma region Split

View File

@ -71,6 +71,28 @@ namespace yycctest::string::op {
EXPECT_EQ(rv, u8", 1, 2, ");
}
// Checks OP::to_strip against an ASCII strip set and against non-ASCII strip sets.
TEST(StringOp, Strip) {
// Normal strip
// Every character of the strip set is ASCII; both ends of the input are expected to be cleaned.
{
auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n");
EXPECT_EQ(rv, u8"aaa");
}
// Special strip
// NOTE(review): the non-ASCII literals below show up as replacement-character residue in this view,
// so their real content can not be read here. Confirm this file is stored as UTF-8.
// Expectation: both the leading and the trailing run are removed.
{
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
EXPECT_EQ(rv, u8"aaa");
}
// Expectation: only the leading run is removed.
// Presumably this strip set only holds the leading character - TODO confirm against the original literals.
{
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
EXPECT_EQ(rv, u8"aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
// Expectation: only the trailing run is removed.
// Presumably this strip set only holds the trailing character - TODO confirm against the original literals.
{
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
EXPECT_EQ(rv, u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa");
}
}
TEST(StringOp, Split) {
// Normal
{