feat: add code point splitter for utf8 string.
- this feature is added for strip function in string op.
This commit is contained in:
@ -166,6 +166,151 @@ namespace yycc::string::op {
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Strip
|
||||
|
||||
#pragma region Code Point Iterator
|
||||
|
||||
class CodePointIterator {
|
||||
public:
|
||||
using iterator_category = std::forward_iterator_tag;
|
||||
using value_type = std::u8string_view;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using pointer = const std::u8string_view*;
|
||||
using reference = const std::u8string_view&;
|
||||
|
||||
private:
|
||||
std::u8string_view current_str;
|
||||
std::u8string_view next_str;
|
||||
|
||||
public:
|
||||
CodePointIterator(const std::u8string_view& strl) : current_str(), next_str(strl) { ++(*this); }
|
||||
|
||||
reference operator*() const { return this->current_str; }
|
||||
|
||||
pointer operator->() const { return &this->current_str; }
|
||||
|
||||
CodePointIterator& operator++() {
|
||||
// move next string to current string and analyse it
|
||||
current_str = next_str;
|
||||
next_str = std::u8string_view();
|
||||
|
||||
// we only process it if there is some chars
|
||||
if (!current_str.empty()) {
|
||||
// extract the string to be checked
|
||||
std::u8string_view strl = current_str;
|
||||
|
||||
// get how many bytes this code point occupied.
|
||||
size_t bytes_to_skip = evaluate_utf8_byte_count(strl.front());
|
||||
// if evaluate skip size is overflow the whole size of string, throw exception
|
||||
if (bytes_to_skip > strl.size()) throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
|
||||
// check following bytes are starts with 0b10
|
||||
for (size_t i = 1; i < bytes_to_skip; ++i) {
|
||||
check_continuation_byte(strl[i]);
|
||||
}
|
||||
|
||||
// Everything is okey, set current string and next string
|
||||
current_str = strl.substr(0, bytes_to_skip);
|
||||
next_str = strl.substr(bytes_to_skip);
|
||||
}
|
||||
|
||||
// return self
|
||||
return *this;
|
||||
}
|
||||
|
||||
CodePointIterator operator++(int) {
|
||||
CodePointIterator temp = *this;
|
||||
++(*this);
|
||||
return temp;
|
||||
}
|
||||
|
||||
bool operator==(const CodePointIterator& other) const {
|
||||
return this->current_str == other.current_str && this->next_str == other.next_str;
|
||||
}
|
||||
|
||||
bool operator!=(const CodePointIterator& other) const { return !(*this == other); }
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Calulate how many bytes following code point occupied according to first byte of sequence.
|
||||
* @param[in] byte First sequence for checking.
|
||||
* @return The size of following code point occupied ranging from 1 to 4 (inclusive).
|
||||
*/
|
||||
size_t evaluate_utf8_byte_count(char8_t c) const {
|
||||
auto byte = static_cast<uint8_t>(c);
|
||||
if ((byte & 0x80) == 0x00) return 1; // 0xxxxxxx
|
||||
if ((byte & 0xE0) == 0xC0) return 2; // 110xxxxx
|
||||
if ((byte & 0xF0) == 0xE0) return 3; // 1110xxxx
|
||||
if ((byte & 0xF8) == 0xF0) return 4; // 11110xxx
|
||||
throw std::runtime_error("invalid utf8 sequence. bad start byte");
|
||||
}
|
||||
/**
|
||||
* @brief Check whether given byte is a valid continuation byte in UTF8.
|
||||
* @param[in] c Byte for checking.
|
||||
*/
|
||||
void check_continuation_byte(char8_t c) const {
|
||||
auto byte = static_cast<uint8_t>(c);
|
||||
if ((byte & 0xC0) != 0x80) {
|
||||
throw std::runtime_error("bad utf8 sequence. no sufficient continuation bytes.");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Code Point
|
||||
|
||||
class CodePoint {
|
||||
private:
|
||||
std::u8string_view u8str;
|
||||
|
||||
public:
|
||||
explicit CodePoint(std::u8string_view u8str) : u8str(u8str) {}
|
||||
|
||||
CodePointIterator begin() const { return CodePointIterator(u8str); }
|
||||
|
||||
CodePointIterator end() const {
|
||||
// Pass empty string view indicate end.
|
||||
return CodePointIterator(std::u8string_view());
|
||||
}
|
||||
};
|
||||
|
||||
#pragma endregion
|
||||
|
||||
template<bool bDoLeft, bool bDoRight>
|
||||
void internal_strip(std::u8string& strl, const std::u8string_view& words) {
|
||||
if constexpr (bDoLeft) {
|
||||
}
|
||||
|
||||
if constexpr (bDoRight) {
|
||||
}
|
||||
}
|
||||
|
||||
void strip(std::u8string& strl, const std::u8string_view& words) {}
|
||||
|
||||
std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
std::u8string rv(strl);
|
||||
strip(rv, words);
|
||||
return rv;
|
||||
}
|
||||
|
||||
void lstrip(std::u8string& strl, const std::u8string_view& words) {}
|
||||
|
||||
std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
std::u8string rv(strl);
|
||||
lstrip(rv, words);
|
||||
return rv;
|
||||
}
|
||||
|
||||
void rstrip(std::u8string& strl, const std::u8string_view& words) {}
|
||||
|
||||
std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words) {
|
||||
std::u8string rv(strl);
|
||||
rstrip(rv, words);
|
||||
return rv;
|
||||
}
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Split
|
||||
|
||||
// Reference:
|
||||
|
@ -9,6 +9,8 @@
|
||||
|
||||
namespace yycc::string::op {
|
||||
|
||||
#pragma region Printf
|
||||
|
||||
/**
|
||||
* @brief Perform an UTF8 string formatting operation.
|
||||
* @param[in] format The format string.
|
||||
@ -38,6 +40,10 @@ namespace yycc::string::op {
|
||||
*/
|
||||
std::string vprintf(const char* format, va_list argptr);
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Replace
|
||||
|
||||
/**
|
||||
* @brief Modify given string with all occurrences of substring \e old replaced by \e new.
|
||||
* @param[in,out] strl The string for replacing
|
||||
@ -54,6 +60,10 @@ namespace yycc::string::op {
|
||||
*/
|
||||
std::u8string replace(const std::u8string_view& _strl, const std::u8string_view& _from_strl, const std::u8string_view& _to_strl);
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Join
|
||||
|
||||
/**
|
||||
* @brief The data provider of general join function.
|
||||
* @details
|
||||
@ -100,6 +110,10 @@ namespace yycc::string::op {
|
||||
delimiter);
|
||||
}
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Lower Upper
|
||||
|
||||
/**
|
||||
* @brief Convert given string to lowercase.
|
||||
* @param[in,out] strl The string to be lowercase.
|
||||
@ -123,8 +137,56 @@ namespace yycc::string::op {
|
||||
*/
|
||||
std::u8string to_upper(const std::u8string_view& strl);
|
||||
|
||||
// TODO:
|
||||
// Add strip, lstrip and rstrip functions.
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Strip
|
||||
|
||||
/**
|
||||
* @brief Remove leading and trailing characters listed in \e words from the string.
|
||||
* @param[in,out] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
*/
|
||||
void strip(std::u8string& strl, const std::u8string_view& words);
|
||||
/**
|
||||
* @brief Return a copy of the string with leading and trailing characters listed in \e words removed.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return The copy of the string with leading and trailing characters listed in \e words removed.
|
||||
*/
|
||||
std::u8string to_strip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Remove leading characters listed in \e words from the string.
|
||||
* @param[in,out] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
*/
|
||||
void lstrip(std::u8string& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Return a copy of the string with leading characters listed in \e words removed.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return The copy of the string with leading characters listed in \e words removed.
|
||||
*/
|
||||
std::u8string to_lstrip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Remove trailing characters listed in \e words from the string.
|
||||
* @param[in,out] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
*/
|
||||
void rstrip(std::u8string& strl, const std::u8string_view& words);
|
||||
|
||||
/**
|
||||
* @brief Return a copy of the string with trailing characters listed in \e words removed.
|
||||
* @param[in] strl The string to be stripped.
|
||||
* @param[in] words The characters to be stripped.
|
||||
* @return The copy of the string with trailing characters listed in \e words removed.
|
||||
*/
|
||||
std::u8string to_rstrip(const std::u8string_view& strl, const std::u8string_view& words);
|
||||
|
||||
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Split
|
||||
|
||||
|
@ -71,6 +71,28 @@ namespace yycctest::string::op {
|
||||
EXPECT_EQ(rv, u8", 1, 2, ");
|
||||
}
|
||||
|
||||
// Checks OP::to_strip with an ASCII character set and with non-ASCII operands.
TEST(StringOp, Strip) {
|
||||
// Normal strip: every character of the given ASCII set is removed from both ends.
|
||||
{
|
||||
auto rv = OP::to_strip(u8" \taaa\n", u8" \t\r\n");
|
||||
EXPECT_EQ(rv, u8"aaa");
|
||||
}
|
||||
|
||||
// Special strip: non-ASCII operands.
// NOTE(review): the literals below show up as <EFBFBD> placeholders in this view,
// so the real characters can not be read here. Confirm against the original file.
|
||||
{
|
||||
// Expected value has both the leading and the trailing run removed.
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
EXPECT_EQ(rv, u8"aaa");
|
||||
}
|
||||
{
|
||||
// Expected value keeps the trailing run: only the leading run is removed.
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
|
||||
EXPECT_EQ(rv, u8"aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
|
||||
}
|
||||
{
|
||||
// Expected value keeps the leading run: only the trailing run is removed.
auto rv = OP::to_strip(u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", u8"<EFBFBD><EFBFBD>");
|
||||
EXPECT_EQ(rv, u8"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>aaa");
|
||||
}
|
||||
}
|
||||
|
||||
TEST(StringOp, Split) {
|
||||
// Normal
|
||||
{
|
||||
|
Reference in New Issue
Block a user