fix: fix NlpCodec compile issue.

- fix std::ifstream length getter. - use std::format in throwing exception.
2024-12-11 16:20:21 +08:00
parent 52ea2745dd
commit 6193a2ede6
1 changed files with 113 additions and 30 deletions
--- a/NlpCodec/NlpCodec.cpp
+++ b/NlpCodec/NlpCodec.cpp
@ -9,9 +9,13 @@
 #include <limits>
 #include <stdexcept>
 #include <utility>
 #include <format>
 namespace NlpCodec {
    /// @brief NlpCodec universal exception.
    /// @details Once this exception is thrown, it means that something went wrong,
    /// and the main function should catch it, output the error message and exit the program immediately.
    class NlpException : public std::exception {
    public:
        NlpException(const char* msg) : message(msg ? msg : "") {}
@ -22,25 +26,107 @@ namespace NlpCodec {
        std::string message;
    };
-    /// @brief The safe version of static_cast which throw exception
+    /// @brief The safe version of `static_cast` which throw exception
    /// if given value can not be cast into given type (out of range).
    template<typename _TyTo, typename _TyFrom>
    static constexpr _TyTo SafeCast(_TyFrom value) {
        if (!std::in_range<_TyTo>(value))
            throw NlpException(
-                    "Fail to cast integral number because given value is greater than container."
+                    "Fail to cast integral number because given value is greater than container. "
                    "This is usually caused by your input or output file is too long.");
        return static_cast<_TyTo>(value);
    }
-    /// @brief The magic DWORD for file length encrption.
+#pragma region "Encryption Stuff" {
    /// @details It is actually the DWORD consisted by the first 4 bytes of XOR_ARRAY.
    constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
    constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
    /// @brief The size of extra part of NLP file which store the size of original plain text file.
    constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
-    /// @brief The core array for data encryption.
+    /*
    # NLP File Structure
    |Annotation         |Size       |
    |:---               |:---       |
    |Body               |variable   |
    |Raw File Length    |4 bytes    |
    |Checksum           |4 bytes    |
    ## Body
    The first part is a zlib compressed byte array.
    Before any processing, we need to use zlib to decompress it first.
    If we need to do the reverse operation, i.e. build this compressed byte array,
    the compression level must be the maximum value (best compression, i.e. 9).
    After decompressing this byte array, we need to perform an extra step called the circular XOR operation
    to get human-readable plain text data.
    In this operation, we first have a hard-coded `XOR_ARRAY`,
    then the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
    When we reach the tail of `XOR_ARRAY`,
    the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
    That is why we call this a "circular" XOR operation.
    The reverse operation of this step is exactly the same,
    because the reverse of an XOR is performing the same XOR again.
    After all bytes are XORed, we get what we want:
    a human-readable translation file in plain text for further processing.
    ## Raw File Length
    The `uint32_t` field following Body is Raw File Length,
    which stores the length of the raw data, i.e. the length of the zlib decompressed byte array.
    It's convenient when decompressing Body.
    However, this field is encrypted when stored in the NLP file.
    We need to do some extra operations before using it.
    Basically, we first XOR it with `MAGIC_DWORD`, then subtract the result from `-1` (overflow is allowed).
    I don't know the rationale behind this operation; this is simply a faithful translation of the reverse-engineering result.
    So, just do it and don't worry too much.
    By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
    The reverse operation, i.e. building this field when creating an NLP file, is also simple:
    just apply the steps introduced above in reverse order.
    ## Checksum
    The `uint32_t` field following Raw File Length is Checksum,
    which is just the CRC32 of Body.
    This field is usually used to validate the integrity of the NLP file.
    Like Raw File Length, this field is also encrypted in the NLP file.
    But its encryption method is much simpler than that of Raw File Length.
    To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
    The reverse operation of this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
    */
    /// @brief The size, in bytes, of the non-Body part of NLP file.
    /// @details This is the combined size of the two `uint32_t` fields: Raw File Length and Checksum.
    constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
    /// @brief The magic DWORD for Raw File Length field encryption.
    /// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
    constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
    /// @brief Encrypt Raw File Length field for writing NLP file.
    /// @details The transform is `-(value + 1)` followed by an XOR with `MAGIC_DWORD`.
    /// In two's complement `-(x + 1)` is exactly `~x`, so the whole computation is done
    /// on unsigned integers. This avoids the signed overflow (undefined behavior) that
    /// the signed form hits when `value` is `0x7FFFFFFF`.
    /// @param[in] value The plain raw file length.
    /// @return The encrypted value to be stored in NLP file.
    static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
        return (~value) ^ MAGIC_DWORD;
    }
    /// @brief Decrypt Raw File Length field read from NLP file.
    /// @details The transform is an XOR with `MAGIC_DWORD` followed by `-1 - x`.
    /// In two's complement `-1 - x` is exactly `~x`, so no signed conversion is required.
    /// This is the exact inverse of `EncryptRawFileLength`.
    /// @param[in] value The encrypted value read from NLP file.
    /// @return The plain raw file length.
    static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
        return ~(MAGIC_DWORD ^ value);
    }
    /// @brief The constant offset used for Checksum field encryption.
    constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
    /// @brief Encrypt Checksum field for writing NLP file.
    /// @param[in] value The plain checksum.
    /// @return The checksum shifted up by `CHECKSUM_OFFSET` (unsigned wrap-around applies).
    static constexpr uint32_t EncryptChecksum(uint32_t value) {
        const uint32_t shifted = CHECKSUM_OFFSET + value;
        return shifted;
    }
    /// @brief Decrypt Checksum field read from NLP file.
    /// @param[in] value The encrypted checksum read from NLP file.
    /// @return The checksum shifted down by `CHECKSUM_OFFSET` (unsigned wrap-around applies).
    static constexpr uint32_t DecryptChecksum(uint32_t value) {
        const uint32_t restored = value - CHECKSUM_OFFSET;
        return restored;
    }
    /// @brief The core array for Body circular XOR encryption.
    /// @details The first byte will be XORed with the first byte of this array, and so on.
    /// When reaching the tail of this array, the next given byte will be XORed with the first byte again, and so on.
    constexpr const uint8_t XOR_ARRAY[] {
@ -53,37 +139,36 @@ namespace NlpCodec {
        0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
        0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
    };
-    /// @brief The size of above array.
+    /// @brief The size of `XOR_ARRAY`.
    constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
-    /// @brief A convenient mask for above array when performing modulo.
+    /// @brief A convenient mask for `XOR_ARRAY` when performing modulo during curcular XOR operation.
    constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
    // Use a static_assert to confirm computed XOR_ARRAY_MASK is what we desired.
    // Because some stupid programmers (like me) may change above array and fill a series of wrong data,
    // then this mask was computed wrongly.
    static_assert(XOR_ARRAY_MASK == 0x7Fu);
-
+    /// @brief Encrypt or decrypt decompressed Body field.
-    static void GeneralXorOperation(void* data, size_t data_len) {
+    static void CircularXorOperation(void* data, size_t data_len) {
        uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
        for (size_t i = 0u; i < data_len; ++i) {
            ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
        }
    }
 #pragma endregion }
    /// @brief Get the length of given file stream.
    static uint32_t GetFileLength(std::ifstream& fin) {
        // Fetch the types this stream used for following convenience.
        using stream_pos_t = std::ifstream::pos_type;
        using stream_off_t = std::ifstream::off_type;
        // Backups current file cursor.
-        stream_pos_t current_pos = fin.tellg();
+        stream_off_t current_pos = fin.tellg();
        // Seek to the tail and get corresponding offset to get the length of file.
        fin.seekg(0, std::ios_base::end);
-        stream_pos_t tail_pos = fin.tellg();
+        stream_off_t tail_pos = fin.tellg();
        if (std::numeric_limits<uint32_t>::max() < tail_pos)
            throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");
        // Restore to previous backup file cursor
-        fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
+        fin.seekg(current_pos, std::ios_base::beg);
        // Safely return the cast length.
        return SafeCast<uint32_t>(tail_pos);
@ -115,7 +200,7 @@ namespace NlpCodec {
            throw NlpException("Fail to read file data into buffer.");
        // Do XOR operation
-        GeneralXorOperation(inbuf.get(), raw_size);
+        CircularXorOperation(inbuf.get(), raw_size);
        // Do compress and get the size of compressed data.
        uLongf dest_len = static_cast<uLongf>(computed_boundary);
@ -139,8 +224,8 @@ namespace NlpCodec {
            throw NlpException("Fail to write data into file.");
        // Raw size and checksum need some extra encryption before writing
-        raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
+        raw_size = EncryptRawFileLength(raw_size);
-        checksum = checksum + CHECKSUM_OFFSET;
+        checksum = EncryptChecksum(checksum);
        // Write raw size and checksum
        fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
@ -167,8 +252,8 @@ namespace NlpCodec {
        fin.seekg(0, std::ios_base::beg);
        // Raw size and checksum data need to do some extra decryption.
-        expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
+        expected_raw_size = DecryptRawFileLength(expected_raw_size);
-        expected_checksum = expected_checksum - CHECKSUM_OFFSET;
+        expected_checksum = DecryptChecksum(expected_checksum);
        // Allocate memory to store data
        std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@ -183,12 +268,10 @@ namespace NlpCodec {
        // Test checksum
        uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
-        if (checksum != expected_checksum) {
+        if (checksum != expected_checksum)
-            fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
+            throw NlpException(
-                expected_checksum, checksum
+                std::format("Not matched crc32. Expect 0x{:<08x} got 0x{:<08x}.", expected_checksum, checksum).c_str()
            );
            return false;
        }
        // Do decompress
        uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@ -201,7 +284,7 @@ namespace NlpCodec {
            throw NlpException("Zlib uncompress() failed.");
        // do xor operation
-        GeneralXorOperation(outbuf.get(), expected_raw_size);
+        CircularXorOperation(outbuf.get(), expected_raw_size);
        // Write result into file
        fout.write(outbuf.get(), expected_raw_size);