fix: fix NlpCodec compile issue.

- fix std::ifstream length getter.
- use std::format when throwing exceptions.
yyc12345 2024-12-11 16:20:21 +08:00
parent 52ea2745dd
commit 6193a2ede6


@ -9,9 +9,13 @@
#include <limits>
#include <stdexcept>
#include <utility>
#include <format>
namespace NlpCodec {
/// @brief NlpCodec universal exception.
/// @details Once this exception is thrown, it means that something went wrong,
/// and the main function should catch it, output the error message, and exit the program immediately.
class NlpException : public std::exception {
public:
NlpException(const char* msg) : message(msg ? msg : "") {}
@ -22,7 +26,7 @@ namespace NlpCodec {
std::string message;
};
/// @brief The safe version of static_cast which throw exception
/// @brief The safe version of `static_cast` which throws an exception
/// if the given value cannot be cast into the given type (out of range).
template<typename _TyTo, typename _TyFrom>
static constexpr _TyTo SafeCast(_TyFrom value) {
@ -33,14 +37,96 @@ namespace NlpCodec {
return static_cast<_TyTo>(value);
}
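// Illustrative usage sketch (not part of the original file): SafeCast() either returns the
// converted value or throws NlpException when the given value does not fit into the target
// type. The function below is hypothetical and exists only for demonstration.
[[maybe_unused]] static void SafeCastUsageDemo() {
    uint32_t small_value = 42u;
    // 42 fits into uint8_t, so this call succeeds.
    uint8_t narrowed = SafeCast<uint8_t>(small_value);
    (void)narrowed;
    // By contrast, SafeCast<uint8_t>(1024u) would throw NlpException because 1024 > 255.
}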
/// @brief The magic DWORD for file length encrption.
/// @details It is actually the DWORD consisted by the first 4 bytes of XOR_ARRAY.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
/// @brief The size of extra part of NLP file which store the size of original plain text file.
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
#pragma region "Encryption Stuff" {
/// @brief The core array for data encryption.
/*
# NLP File Structure
|Field |Size |
|:--- |:--- |
|Body |variable |
|Raw File Length |4 bytes |
|Checksum |4 bytes |
## Body
The first part, Body, is a zlib-compressed byte array.
Before any further processing, we need to decompress it with zlib first.
If we need to do the reverse operation, i.e. build this compressed byte array,
the compression level must be the maximum value (best compression, i.e. 9).
After decompressing this byte array, we need an extra step, called the circular XOR operation,
to get human-readable plain text data.
In this operation, we first have a hard-coded `XOR_ARRAY`;
the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
When we reach the tail of `XOR_ARRAY`,
the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
That is why we call this the "circular" XOR operation.
The reverse operation of this step is exactly the same,
because the inverse of XOR is simply performing XOR again.
After all bytes are XORed, we get what we want:
a human-readable plain text translation file for the following processing.
## Raw File Length
The `uint32_t` field following Body is Raw File Length,
which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
It is convenient when decompressing Body.
However, this field is stored encrypted in the NLP file.
We need to do some extra operations before using it.
Basically, we XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
I don't know what the fuck this operation means; I just faithfully translated the result of the reverse engineering work.
So just do it and don't worry too much.
By the way, the value of `MAGIC_DWORD` is just the first 4 bytes of `XOR_ARRAY` combined in little endian.
The reverse operation, i.e. encrypting this field when creating an NLP file, is also simple:
just flip the whole sequence of steps introduced above.
## Checksum
The `uint32_t` field following Raw File Length is Checksum,
which is just the CRC32 of Body.
This field is usually used to validate the integrity of the NLP file.
Like Raw File Length, this field is also stored encrypted in the NLP file.
But its encryption method is much simpler than Raw File Length's.
To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
The reverse operation for this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
*/
/// @brief The size of the non-Body part of the NLP file.
/// @details Basically, this is the combined size of the Raw File Length and Checksum fields.
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
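// Illustrative sanity check (not part of the original file): assuming the usual 8-bit bytes,
// the non-Body tail described in the structure comment above occupies exactly 8 bytes
// (4 bytes Raw File Length + 4 bytes Checksum).
static_assert(TAIL_SIZE == 8u);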
/// @brief The magic DWORD for Raw File Length field encryption.
/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
/// @brief Encrypt Raw File Length field for writing NLP file.
static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
}
/// @brief Decrypt Raw File Length field read from NLP file.
static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
}
/// @brief The magic offset for Checksum field encryption.
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
/// @brief Encrypt Checksum field for writing NLP file.
static constexpr uint32_t EncryptChecksum(uint32_t value) {
return value + CHECKSUM_OFFSET;
}
/// @brief Decrypt Checksum field read from NLP file.
static constexpr uint32_t DecryptChecksum(uint32_t value) {
return value - CHECKSUM_OFFSET;
}
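// Illustrative round-trip checks (not part of the original file): for sample values that do not
// trigger signed overflow during constant evaluation, the decrypt helpers above exactly invert
// the corresponding encrypt helpers, mirroring the field descriptions in the structure comment.
static_assert(DecryptRawFileLength(EncryptRawFileLength(100u)) == 100u);
static_assert(DecryptChecksum(EncryptChecksum(0xDEADBEEFu)) == 0xDEADBEEFu);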
/// @brief The core array for Body circular XOR encryption.
/// @details The first byte is XORed with the first byte of this array, and so on.
/// When reaching the tail of this array, the next given byte is XORed with the first byte of this array again, and so on.
constexpr const uint8_t XOR_ARRAY[] {
@ -53,37 +139,36 @@ namespace NlpCodec {
0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
};
/// @brief The size of above array.
/// @brief The size of `XOR_ARRAY`.
constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
/// @brief A convenient mask for above array when performing modulo.
/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during the circular XOR operation.
constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
// Use a static_assert to confirm that the computed XOR_ARRAY_MASK is what we desire,
// because some stupid programmers (like me) may change the above array and fill in the wrong amount of data,
// causing this mask to be computed wrongly.
static_assert(XOR_ARRAY_MASK == 0x7Fu);
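// Illustrative check (not part of the original file): because XOR_ARRAY_LEN is a power of two,
// masking an index with XOR_ARRAY_MASK is equivalent to taking it modulo XOR_ARRAY_LEN,
// which is what the circular XOR operation below relies on.
static_assert((300u & XOR_ARRAY_MASK) == 300u % XOR_ARRAY_LEN);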
static void GeneralXorOperation(void* data, size_t data_len) {
/// @brief Encrypt or decrypt decompressed Body field.
static void CircularXorOperation(void* data, size_t data_len) {
uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
for (size_t i = 0u; i < data_len; ++i) {
ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
}
}
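// Illustrative usage sketch (not part of the original file): because XOR is its own inverse,
// applying CircularXorOperation() twice restores the original bytes, which is exactly how the
// Body field is both encrypted and decrypted. The function below is hypothetical and exists
// only for demonstration.
[[maybe_unused]] static void CircularXorRoundTripDemo() {
    uint8_t sample[] { 0x4Eu, 0x4Cu, 0x50u };     // arbitrary demo bytes ("NLP")
    CircularXorOperation(sample, sizeof(sample)); // obfuscate
    CircularXorOperation(sample, sizeof(sample)); // second pass restores the original bytes
}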
#pragma endregion }
/// @brief Get the length of given file stream.
static uint32_t GetFileLength(std::ifstream& fin) {
// Fetch the types used by this stream, for convenience below.
using stream_pos_t = std::ifstream::pos_type;
using stream_off_t = std::ifstream::off_type;
// Back up the current file cursor.
stream_pos_t current_pos = fin.tellg();
stream_off_t current_pos = fin.tellg();
// Seek to the tail and use the corresponding offset as the length of the file.
fin.seekg(0, std::ios_base::end);
stream_pos_t tail_pos = fin.tellg();
if (std::numeric_limits<uint32_t>::max() < tail_pos)
throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");
stream_off_t tail_pos = fin.tellg();
// Restore the previously backed-up file cursor.
fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
fin.seekg(current_pos, std::ios_base::beg);
// Safely cast and return the length.
return SafeCast<uint32_t>(tail_pos);
@ -115,7 +200,7 @@ namespace NlpCodec {
throw NlpException("Fail to read file data into buffer.");
// Do XOR operation
GeneralXorOperation(inbuf.get(), raw_size);
CircularXorOperation(inbuf.get(), raw_size);
// Do compress and get the size of compressed data.
uLongf dest_len = static_cast<uLongf>(computed_boundary);
@ -139,8 +224,8 @@ namespace NlpCodec {
throw NlpException("Fail to write data into file.");
// Raw size and checksum need some extra encryption before writing
raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
checksum = checksum + CHECKSUM_OFFSET;
raw_size = EncryptRawFileLength(raw_size);
checksum = EncryptChecksum(checksum);
// Write raw size and checksum
fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
@ -167,8 +252,8 @@ namespace NlpCodec {
fin.seekg(0, std::ios_base::beg);
// Raw size and checksum need some extra decryption before use.
expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
expected_checksum = expected_checksum - CHECKSUM_OFFSET;
expected_raw_size = DecryptRawFileLength(expected_raw_size);
expected_checksum = DecryptChecksum(expected_checksum);
// Allocate memory to store data
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@ -183,12 +268,10 @@ namespace NlpCodec {
// Test checksum
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
if (checksum != expected_checksum) {
fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
expected_checksum, checksum
if (checksum != expected_checksum)
throw NlpException(
std::format("Not matched crc32. Expect 0x{:<08x} got 0x{:<08x}.", expected_checksum, checksum).c_str()
);
return false;
}
// Do decompress
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@ -201,7 +284,7 @@ namespace NlpCodec {
throw NlpException("Zlib uncompress() failed.");
// Do XOR operation
GeneralXorOperation(outbuf.get(), expected_raw_size);
CircularXorOperation(outbuf.get(), expected_raw_size);
// Write result into file
fout.write(outbuf.get(), expected_raw_size);