From e1823d4b8ee8a494c86b6759cf977d44fbab15ca Mon Sep 17 00:00:00 2001 From: yyc12345 Date: Sat, 29 Jun 2024 17:39:13 +0800 Subject: [PATCH] feat: add new split function reducing memory cost. - add a new split function, SplitView which can reduce cost memory by using string view. - add a new testbench for split function for testing empty source string. - add documentation for some string helper function. - improve library encoding documentation. --- doc/src/intro.dox | 10 ++-- doc/src/library_encoding.dox | 113 +++++++++++++++++++++++++---------- doc/src/string_helper.dox | 37 ++++++++++++ src/StringHelper.cpp | 31 ++++++---- src/StringHelper.hpp | 6 +- testbench/main.cpp | 9 ++- 6 files changed, 153 insertions(+), 53 deletions(-) diff --git a/doc/src/intro.dox b/doc/src/intro.dox index aa43b28..0600780 100644 --- a/doc/src/intro.dox +++ b/doc/src/intro.dox @@ -11,9 +11,9 @@ It's also good for bug fix. If I found bug in these code, I only need to fix it in this project. Otherwise I need to fix them one by one in each project because they share the same code. -\section intro_why Why YYCCommonplace +\section intro__why Why YYCCommonplace -\subsection intro_why_windows Windows Issues +\subsection intro__why__windows Windows Issues I frequently program on Windows environment because the software I programming for, Virtools, is Windows-only software. During programming, I found Windows is super lack in UTF8 supports. @@ -33,7 +33,7 @@ This is one of the reasons why I create this library. I create much wrappers for these weird Windows functions. Thus I can have a similar Linux C++ programming experience on Windows. -\subsection intro_why_std Standard Library Issues +\subsection intro__why__std Standard Library Issues The eccentric decision of standard commission also is the reason why I create this library. @@ -50,7 +50,7 @@ That's why I create this library. I bring these function in this library. 
Not industrial level, but easy to use and have enough performance in my project. -\subsection intro_why_boost Boost Issues +\subsection intro__why__boost Boost Issues Bosst is a powerful C++ library. But the shortcoming is overt. It's tooooo big. This drawback will be more obvious considering the bad dependency mechanism of C++. @@ -65,7 +65,7 @@ I don't need extreme performance. I just want my code works. So I create this library, bring some Boost functions with ordinary but not bad implementation. -\section intro_usage Library Usage +\section intro__usage Library Usage Before using this library, I suggest you read this manual fully to have a full overview of this library. Otherwise you may make mistake during using this library. diff --git a/doc/src/library_encoding.dox b/doc/src/library_encoding.dox index 1b10d8c..c04b40e 100644 --- a/doc/src/library_encoding.dox +++ b/doc/src/library_encoding.dox @@ -8,31 +8,94 @@ for example, function explicitly order the encoding of input parameters. In following content of this article, you will know the details about how we use UTF8 in this library. -\section library_encoding_utf8_type UTF8 Type +\section library_encoding__utf8_type UTF8 Type + +YYCC uses custom UTF8 char type, string container and string view all over the library, from parameters to return value. +Following content will introduce how we define them. + +\subsection library_encoding__utf8_type__char_type Char Type YYCC library has its own UTF8 char type, \c yycc_char8_t. -You may notice C++ standard library also has a UTF8 char type called \c char8_t. You are right. +This is how we define it: + +\code +#if defined(__cpp_char8_t) +using yycc_char8_t = char8_t; +#else +using yycc_char8_t = unsigned char; +#endif +\endcode + If your environment (higher or equal to C++ 20) supports \c char8_t provided by standard library, \c yycc_char8_t is just an alias to \c char8_t, otherwise (lower than C++ 20, e.g. 
C++ 17), \c yycc_char8_t will be defined as \c unsigned \c char like C++ 20 does (this can be seen as a polyfill). -After confirming the UTF8 char type, other derived types also will be decided. -YYCC also defines \c yycc_u8string to \c std::basic_string and \c yycc_u8string_view to \c std::basic_string_view. -In \c char8_t environment, they are just the alias to \c std::u8string and \c std::u8string_view respectively. +This means that if you have already used \c char8_t provided by standard library, +you do not need to make any extra modification before using this library, +because all types are compatible. -Now, library has all essential UTF8 related types. -These types are used in library everywhere, from parameters to return value. +\subsection library_encoding__utf8_type__container_type String Container and View + +We define string container and string view like this: + +\code +using yycc_u8string = std::basic_string<yycc_char8_t>; +using yycc_u8string_view = std::basic_string_view<yycc_char8_t>; +\endcode + +The real code written in the library may differ slightly from this, but it has the same meaning. + +In \c char8_t environment, they are just aliases of \c std::u8string and \c std::u8string_view respectively. +So if you have already used them, you do not need to modify your code before using this library. + +\subsection library_encoding__utf8_type__why Why? You may curious why I create a new UTF8 char type, rather than using standard library UTF8 char type directly. There are 2 reasons. + First, It was too late that I notice I can use standard library UTF8 char type. My UTF8 char type has been used in library everywhere and its tough to fully replace them into standard library UTF8 char type. + Second, UTF8 related content of standard library is \e volatile. I notice standard library change UTF8 related functions frequently and its API are not stable. For example, standard library brings \c std::codecvt_utf8 in C++ 11, deprecate it in C++ 17 and even remove it in C++ 26. 
That's unacceptable! So I create my own UTF8 type to avoid the scenario that standard library remove \c char8_t in future. -\section library_encoding_utf8_literal UTF8 Literal +\section library_encoding__utf8_literal UTF8 Literal -C++ standard allows programmer declare an UTF8 literal explicitly by writing code like this: +String literal is a C++ concept. +If you are not familiar with it, please read a related article first, such as the one on CppReference. + +\subsection library_encoding__utf8_literal__single Single Literal + +In short, YYCC allows you to declare a UTF8 literal like this: + +\code +YYCC_U8("This is UTF8 literal.") +\endcode + +YYCC_U8 is a macro. +You don't need to add an extra \c u8 prefix to the string given to the macro. +This macro will do this automatically. + +In detail, this macro does a \c reinterpret_cast to forcibly change the type of the given argument to \c const \c yycc_char8_t*. +This ensures that the declared UTF8 literal is compatible with YYCC UTF8 types. + +\subsection library_encoding__utf8_literal__concatenation Literal Concatenation + +YYCC_U8 macro also works for string literal concatenation: + +\code +YYCC_U8("Error code: %" PRIu32 ". Please contact me."); +\endcode + +According to C++ standard for string literal concatenation, +"If one of the strings has an encoding prefix and the other does not, the one that does not will be considered to have the same encoding prefix as the other." +At the same time, YYCC_U8 macro will automatically add \c u8 prefix for the first component of this string literal concatenation. +So the whole string will be a UTF8 literal. +This also means you should \b not add any prefix to the other components of this string literal concatenation. + +\subsection library_encoding__utf8_literal__why Why? + +You may know that C++ standard allows programmers to declare a UTF8 literal explicitly by writing code like this: \code u8"foo bar" @@ -44,27 +107,12 @@ otherwise it will return \c const \c char*. 
This behavior cause that you can not assign this UTF8 literal to \c yycc_u8string if you are in the environment which do not support \c char8_t, because their types are different. Thereas you can not use the functions provided by this library because they are all use YYCC defined UTF8 char type. -So I will tell you how to correctly create UTF8 literal in the following content. -YYCC provides a macro \c YYCC_U8 to resolve this issue. -You can declare UTF8 literal like this: +\section library_encoding__utf8_pointer UTF8 String Pointer -\code -YYCC_U8("This is UTF8 literal.") -\endcode - -You don't need add extra \c u8 prefix in string given to the macro. -This macro will do this automatically. - -In detail, this macro do a \c reinterpret_cast to change the type of given argument to \c const \c yycc_char8_t* forcely. -This ensure that declared UTF8 literal is compatible with YYCC UTF8 types. - -\section library_encoding_utf8_pointer UTF8 String Pointer - -Besides UTF8 literal, another issue you may be faced is how to convert native UTF8 string pointer to YYCC UTF8 type -(\e native means \c const \c char* or \c char*, the string using char as its char type). -Many legacy code assume \c char* is encoded with UTF8 (the exception is Windows). But \c char* is incompatible with yycc_char8_t. +String pointer means the raw pointer pointing to a string, such as \c const \c char*, \c char*, \c char32_t* and etc. +Many legacy code assume \c char* is encoded with UTF8 (the exception is Windows). But \c char* is incompatible with \c yycc_char8_t. YYCC provides YYCC::EncodingHelper::ToUTF8 to resolve this issue. There is an exmaple: \code @@ -77,7 +125,7 @@ yycc_char8_t* mutable_converted = YYCC::EncodingHelper::ToUTF8(mutable_utf8); YYCC::EncodingHelper::ToUTF8 has 2 overloads which can handle const and mutable stirng pointer convertion respectively. -YYCC also provide ability that convert YYCC UTF8 char type to native char type by YYCC::EncodingHelper::ToNative. 
+YYCC also has ability that convert YYCC UTF8 char type to native char type by YYCC::EncodingHelper::ToNative. Here is an exmaple: \code @@ -90,15 +138,14 @@ char* mutable_converted = YYCC::EncodingHelper::ToNative(mutable_yycc_utf8); Same as YYCC::EncodingHelper::ToUTF8, YYCC::EncodingHelper::ToNative also has 2 overloads to handle const and mutable string pointer. -\section library_encoding_utf8_container UTF8 String Container +\section library_encoding__utf8_container UTF8 String Container + +String container usually means the standard library string container, such as \c std::string, \c std::wstring, \c std::u32string and etc. -The final issue you faced is string container. In many personal project, programmer may use \c std::string everywhere because \c std::u8string may not be presented when writing peoject. How to do convertion between native string container and YYCC UTF8 string container? - It is definitely illegal that directly do force convertion. Because they may have different class layout. Calm down and I will tell you how to do correct convertion. - YYCC provides YYCC::EncodingHelper::ToUTF8 to convert native string container to YYCC UTF8 string container. There is an exmaple: @@ -129,7 +176,7 @@ Same as UTF8 string pointer, we also have YYCC::EncodingHelper::ToNative and YYC Try to do your own research and figure out how to use them. It's pretty easy. -\section library_encoding_windows Warnings to Windows Programmer +\section library_encoding__windows Warnings to Windows Programmer Due to the legacy of MSVC, the encoding of \c char* may not be UTF8 in most cases. If you run the convertion code introduced in this article with the string which is not encoded with UTF8, it may cause undefined behavior. 
diff --git a/doc/src/string_helper.dox b/doc/src/string_helper.dox index bfd93c5..e8004b2 100644 --- a/doc/src/string_helper.dox +++ b/doc/src/string_helper.dox @@ -3,4 +3,41 @@ \page string_helper String Helper +\section string_helper__lower_upper Lower Upper + +String helper provides Python-like string lower and upper functions. +Both lower and upper functions have 2 overloads: + +\code +yycc_u8string Lower(const yycc_char8_t*); +void Lower(yycc_u8string&); +\endcode + +The first overload accepts a NULL-terminated string as argument and returns a \b copy whose content is the lower case of the original string. +The second overload accepts a mutable string container as argument and converts all characters stored in it to their lower case. +You can choose one of them according to your taste and requirements. +Upper also has 2 similar overloads. + +\section string_helper__split Split + +String helper provides Python-like string split functions. +There are 2 variants for you: + +\code +std::vector<yycc_u8string> Split(const yycc_u8string_view&, const yycc_char8_t*); +std::vector<yycc_u8string_view> SplitView(const yycc_u8string_view&, const yycc_char8_t*); +\endcode + +Both functions take a string view as the first argument, which is the string that needs to be split. +The second argument is a raw string pointer representing the delimiter for splitting. +The only difference between these 2 split functions is evident from their names. +The first split function returns a list of copied strings as its split result. +The second split function returns a list of string views as its split result, +and these views stay valid only as long as the string data referenced by the given string view argument stays alive. +It also means that the second function costs less memory if you don't need copies of the original string. + +If the source string (the string that needs to be split) is empty, or the delimiter is \c nullptr or empty, +the result will only have 1 item and this item is the source string itself. 
+There is no way that this method return an empty list, except the code is buggy. + */ diff --git a/src/StringHelper.cpp b/src/StringHelper.cpp index fc1eafa..b53c084 100644 --- a/src/StringHelper.cpp +++ b/src/StringHelper.cpp @@ -212,32 +212,43 @@ namespace YYCC::StringHelper { #pragma region Split - std::vector Split(const yycc_char8_t* _strl, const yycc_char8_t* _decilmer) { + std::vector Split(const yycc_u8string_view& strl, const yycc_char8_t* _decilmer) { + // call split view + auto view_result = SplitView(strl, _decilmer); + + // copy string view result to string + std::vector elems; + for (const auto& strl_view : view_result) { + elems.emplace_back(yycc_u8string(strl_view)); + } + // return copied result + return elems; + } + + std::vector SplitView(const yycc_u8string_view& strl, const yycc_char8_t* _decilmer) { // Reference: // https://stackoverflow.com/questions/14265581/parse-split-a-string-in-c-using-string-delimiter-standard-c // prepare return value - std::vector elems; + std::vector elems; - // if the string need to be splitted is nullptr, return empty result. - if (_strl == nullptr) return elems; - yycc_u8string strl(_strl); - // if decilmer is nullptr, or decilmer is zero length, return original string + // if string need to be splitted is empty, return original string (empty item). + // if decilmer is nullptr, or decilmer is zero length, return original string. 
yycc_u8string decilmer; - if (_decilmer == nullptr || (decilmer = _decilmer, decilmer.empty())) { - elems.push_back(strl); + if (strl.empty() || _decilmer == nullptr || (decilmer = _decilmer, decilmer.empty())) { + elems.emplace_back(strl); return elems; } // start spliting std::size_t previous = 0, current; while ((current = strl.find(decilmer.c_str(), previous)) != yycc_u8string::npos) { - elems.push_back(strl.substr(previous, current - previous)); + elems.emplace_back(strl.substr(previous, current - previous)); previous = current + decilmer.size(); } // try insert last part but prevent possible out of range exception if (previous <= strl.size()) { - elems.push_back(strl.substr(previous)); + elems.emplace_back(strl.substr(previous)); } return elems; } diff --git a/src/StringHelper.hpp b/src/StringHelper.hpp index fd5536f..5be70cc 100644 --- a/src/StringHelper.hpp +++ b/src/StringHelper.hpp @@ -49,7 +49,7 @@ namespace YYCC::StringHelper { /** * @brief General Split function. - * @param _strl[in] The string need to be splitting. + * @param strl[in] The string need to be splitting. * If this is nullptr, the result will be empty. * @param _decilmer[in] The decilmer for splitting. * If decilmer is nullptr or zero length, the result will only have 1 element which is original string. @@ -58,5 +58,7 @@ namespace YYCC::StringHelper { * It can works in most toy cases but not suit for high performance scenario. * Also, this function will produce a copy of original string because it is not zero copy. 
*/ - std::vector Split(const yycc_char8_t* _strl, const yycc_char8_t* _decilmer); + std::vector Split(const yycc_u8string_view& strl, const yycc_char8_t* _decilmer); + std::vector SplitView(const yycc_u8string_view& strl, const yycc_char8_t* _decilmer); + } diff --git a/testbench/main.cpp b/testbench/main.cpp index b62949e..62f2111 100644 --- a/testbench/main.cpp +++ b/testbench/main.cpp @@ -221,18 +221,21 @@ namespace YYCCTestbench { Assert(test_join == YYCC_U8(", 2, 1, "), YYCC_U8("YYCC::StringHelper::Join")); // Test Split - auto test_split = YYCC::StringHelper::Split(YYCC_U8(", 1, 2, "), YYCC_U8(", ")); + auto test_split = YYCC::StringHelper::Split(YYCC_U8(", 1, 2, "), YYCC_U8(", ")); // normal Assert(test_split.size() == 4u, YYCC_U8("YYCC::StringHelper::Split")); Assert(test_split[0] == YYCC_U8(""), YYCC_U8("YYCC::StringHelper::Split")); Assert(test_split[1] == YYCC_U8("1"), YYCC_U8("YYCC::StringHelper::Split")); Assert(test_split[2] == YYCC_U8("2"), YYCC_U8("YYCC::StringHelper::Split")); Assert(test_split[3] == YYCC_U8(""), YYCC_U8("YYCC::StringHelper::Split")); - test_split = YYCC::StringHelper::Split(YYCC_U8("test"), YYCC_U8("-")); + test_split = YYCC::StringHelper::Split(YYCC_U8("test"), YYCC_U8("-")); // no matched decilmer Assert(test_split.size() == 1u, YYCC_U8("YYCC::StringHelper::Split")); Assert(test_split[0] == YYCC_U8("test"), YYCC_U8("YYCC::StringHelper::Split")); - test_split = YYCC::StringHelper::Split(YYCC_U8("test"), YYCC_U8("")); + test_split = YYCC::StringHelper::Split(YYCC_U8("test"), YYCC_U8("")); // empty decilmer Assert(test_split.size() == 1u, YYCC_U8("YYCC::StringHelper::Split")); Assert(test_split[0] == YYCC_U8("test"), YYCC_U8("YYCC::StringHelper::Split")); + test_split = YYCC::StringHelper::Split(YYCC::yycc_u8string_view(), YYCC_U8("")); // empty source string + Assert(test_split.size() == 1u, YYCC_U8("YYCC::StringHelper::Split")); + Assert(test_split[0].empty(), YYCC_U8("YYCC::StringHelper::Split")); }