fix: fix NlpCodec compile issue.
- fix std::ifstream length getter. - use std::format in throwing exception.
This commit is contained in:
		@ -9,9 +9,13 @@
 | 
			
		||||
#include <limits>
 | 
			
		||||
#include <stdexcept>
 | 
			
		||||
#include <utility>
 | 
			
		||||
#include <format>
 | 
			
		||||
 | 
			
		||||
namespace NlpCodec {
 | 
			
		||||
 | 
			
		||||
    /// @brief NlpCodec universal exception.
 | 
			
		||||
    /// @details Once this exception was thrown, it means that something went wrong,
 | 
			
		||||
    /// and main function should catch it, output error message and exit program immediately.
 | 
			
		||||
    class NlpException : public std::exception {
 | 
			
		||||
    public:
 | 
			
		||||
        NlpException(const char* msg) : message(msg ? msg : "") {}
 | 
			
		||||
@ -22,25 +26,107 @@ namespace NlpCodec {
 | 
			
		||||
        std::string message;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    /// @brief The safe version of static_cast which throw exception
 | 
			
		||||
    /// @brief The safe version of `static_cast` which throw exception
 | 
			
		||||
    /// if given value can not be cast into given type (out of range).
 | 
			
		||||
    template<typename _TyTo, typename _TyFrom>
 | 
			
		||||
    static constexpr _TyTo SafeCast(_TyFrom value) {
 | 
			
		||||
        if (!std::in_range<_TyTo>(value))
 | 
			
		||||
            throw NlpException(
 | 
			
		||||
                    "Fail to cast integral number because given value is greater than container."
 | 
			
		||||
                    "Fail to cast integral number because given value is greater than container. "
 | 
			
		||||
                    "This is usually caused by your input or output file is too long.");
 | 
			
		||||
        return static_cast<_TyTo>(value);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// @brief The magic DWORD for file length encrption.
 | 
			
		||||
    /// @details It is actually the DWORD consisted by the first 4 bytes of XOR_ARRAY.
 | 
			
		||||
    constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
 | 
			
		||||
    constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
 | 
			
		||||
    /// @brief The size of extra part of NLP file which store the size of original plain text file.
 | 
			
		||||
    constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
 | 
			
		||||
#pragma region "Encryption Stuff" {
 | 
			
		||||
 | 
			
		||||
    /// @brief The core array for data encryption.
 | 
			
		||||
    /*
 | 
			
		||||
 | 
			
		||||
    # NLP File Structure
 | 
			
		||||
 | 
			
		||||
    |Annotation         |Size       |
 | 
			
		||||
    |:---               |:---       |
 | 
			
		||||
    |Body               |variable   |
 | 
			
		||||
    |Raw File Length    |4 bytes    |
 | 
			
		||||
    |Checksum           |4 bytes    |
 | 
			
		||||
 | 
			
		||||
    ## Body
 | 
			
		||||
 | 
			
		||||
    The first part is a zlib compressed byte array.
 | 
			
		||||
    Before any processing, we need to use zlib to decompress it first.
 | 
			
		||||
    If we need to do the reverse operation, e.g. build this compressed byte array,
 | 
			
		||||
    the compression level must be maximum value (best compression, e.g. 9).
 | 
			
		||||
 | 
			
		||||
    After decompressing this byte array, we need to do an extra step called circular XOR operation
 | 
			
		||||
    to get human-readable plain text data.
 | 
			
		||||
    In this operation, we first have a hard-code `XOR_ARRAY`,
 | 
			
		||||
    then the first byte of decompressed byte array will perform XOR operation with the first byte of `XOR_ARRAY` and so on.
 | 
			
		||||
    When we reach the tail of `XOR_ARRAY`,
 | 
			
		||||
    the next byte of decompressed byte array will perform XOR with the first byte of `XOR_ARRAY` again and so on.
 | 
			
		||||
    That's the reason why we call this operation is "circular" XOR operation.
 | 
			
		||||
    The reverse operation of this step is nothing changed.
 | 
			
		||||
    Because the reverse operation of XOR is perform it again.
 | 
			
		||||
 | 
			
		||||
    After all byte are XORed, we can get what we want,
 | 
			
		||||
    a human-readable translation file in plain text for following processing.
 | 
			
		||||
 | 
			
		||||
    ## Raw File Length
 | 
			
		||||
 | 
			
		||||
    The `uint32_t` field following Body is Raw File Length,
 | 
			
		||||
    which store the length of raw data, e.g. the length of zlib decompressed byte array.
 | 
			
		||||
    It's convenient when decompress Body.
 | 
			
		||||
 | 
			
		||||
    However, this field is encrypted when storing in NLP file.
 | 
			
		||||
    We need to do some extra operations before using it.
 | 
			
		||||
    Basically, what we need to do is XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
 | 
			
		||||
    I don't know what the fuck this operation is. I just honestly translate the result of reverse work.
 | 
			
		||||
    So, just do it and don't worry too much.
 | 
			
		||||
    By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
 | 
			
		||||
 | 
			
		||||
    The reverse operation, e.g. building this file when creating NLP, is also simple.
 | 
			
		||||
    It is okay to just reverse the steps introduced above.
 | 
			
		||||
 | 
			
		||||
    ## Checksum
 | 
			
		||||
 | 
			
		||||
    The `uint32_t` field following Raw File Length is Checksum,
 | 
			
		||||
    which is just the CRC32 of Body.
 | 
			
		||||
    This field is usually used to validate the integrity of NLP file.
 | 
			
		||||
 | 
			
		||||
    Same like Raw File Length, this field is also encrypted in NLP file.
 | 
			
		||||
    But its encryption method is considerably simpler than that of Raw File Length.
 | 
			
		||||
    To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
 | 
			
		||||
 | 
			
		||||
    The reverse operation of this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
 | 
			
		||||
 | 
			
		||||
    */
 | 
			
		||||
 | 
			
		||||
    /// @brief The size of non-Body part of NLP file
 | 
			
		||||
    /// @details Basically this size is the size of the combination of Raw File Length and Checksum field.
 | 
			
		||||
    constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
 | 
			
		||||
 | 
			
		||||
    /// @brief The magic DWORD for Raw File Length field encryption.
 | 
			
		||||
    /// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
 | 
			
		||||
    constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
 | 
			
		||||
    /// @brief Encrypt Raw File Length field for writting NLP file.
 | 
			
		||||
    static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
 | 
			
		||||
        return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
 | 
			
		||||
    }
 | 
			
		||||
    /// @brief Decrypt Raw File Length field read from NLP file.
 | 
			
		||||
    static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
 | 
			
		||||
        return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// @brief The magic DWORD for Checksum field encryption.
 | 
			
		||||
    constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
 | 
			
		||||
    /// @brief Encrypt Checksum field for writting NLP file.
 | 
			
		||||
    static constexpr uint32_t EncryptChecksum(uint32_t value) {
 | 
			
		||||
        return value + CHECKSUM_OFFSET;
 | 
			
		||||
    }
 | 
			
		||||
    /// @brief Decrypt Checksum field read from NLP file.
 | 
			
		||||
    static constexpr uint32_t DecryptChecksum(uint32_t value) {
 | 
			
		||||
        return value - CHECKSUM_OFFSET;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// @brief The core array for Body circular XOR encryption.
 | 
			
		||||
    /// @details First byte will XOR with the first byte of this array, and so on.
 | 
			
		||||
    /// When reaching the tail of this array, the next given byte will perform XOR with the first byte again and so on.
 | 
			
		||||
    constexpr const uint8_t XOR_ARRAY[] {
 | 
			
		||||
@ -53,37 +139,36 @@ namespace NlpCodec {
 | 
			
		||||
        0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
 | 
			
		||||
        0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
 | 
			
		||||
    };
 | 
			
		||||
    /// @brief The size of above array.
 | 
			
		||||
    /// @brief The size of `XOR_ARRAY`.
 | 
			
		||||
    constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
 | 
			
		||||
    /// @brief A convenient mask for above array when performing modulo.
 | 
			
		||||
    /// @brief A convenient mask for `XOR_ARRAY` when performing modulo during circular XOR operation.
 | 
			
		||||
    constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
 | 
			
		||||
    // Use a static_assert to confirm computed XOR_ARRAY_MASK is what we desired.
 | 
			
		||||
    // Because some stupid programmers (like me) may change above array and fill a series of wrong data,
 | 
			
		||||
    // then this mask was computed wrongly.
 | 
			
		||||
    static_assert(XOR_ARRAY_MASK == 0x7Fu);
 | 
			
		||||
 | 
			
		||||
    static void GeneralXorOperation(void* data, size_t data_len) {
 | 
			
		||||
    /// @brief Encrypt or decrypt decompressed Body field.
 | 
			
		||||
    static void CircularXorOperation(void* data, size_t data_len) {
 | 
			
		||||
        uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
 | 
			
		||||
        for (size_t i = 0u; i < data_len; ++i) {
 | 
			
		||||
            ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#pragma endregion }
 | 
			
		||||
 | 
			
		||||
    /// @brief Get the length of given file stream.
 | 
			
		||||
    static uint32_t GetFileLength(std::ifstream& fin) {
 | 
			
		||||
        // Fetch the types this stream used for following convenience.
 | 
			
		||||
        using stream_pos_t = std::ifstream::pos_type;
 | 
			
		||||
        using stream_off_t = std::ifstream::off_type;
 | 
			
		||||
 | 
			
		||||
        // Backups current file cursor.
 | 
			
		||||
        stream_pos_t current_pos = fin.tellg();
 | 
			
		||||
        stream_off_t current_pos = fin.tellg();
 | 
			
		||||
        // Seek to the tail and get corresponding offset to get the length of file.
 | 
			
		||||
        fin.seekg(0, std::ios_base::end);
 | 
			
		||||
        stream_pos_t tail_pos = fin.tellg();
 | 
			
		||||
        if (std::numeric_limits<uint32_t>::max() < tail_pos)
 | 
			
		||||
            throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");
 | 
			
		||||
        stream_off_t tail_pos = fin.tellg();
 | 
			
		||||
        // Restore to previous backup file cursor
 | 
			
		||||
        fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
 | 
			
		||||
        fin.seekg(current_pos, std::ios_base::beg);
 | 
			
		||||
 | 
			
		||||
        // Safely return cast length.
 | 
			
		||||
        return SafeCast<uint32_t>(tail_pos);
 | 
			
		||||
@ -115,7 +200,7 @@ namespace NlpCodec {
 | 
			
		||||
            throw NlpException("Fail to read file data into buffer.");
 | 
			
		||||
 | 
			
		||||
        // Do XOR operation
 | 
			
		||||
        GeneralXorOperation(inbuf.get(), raw_size);
 | 
			
		||||
        CircularXorOperation(inbuf.get(), raw_size);
 | 
			
		||||
 | 
			
		||||
        // Do compress and get the size of compressed data.
 | 
			
		||||
        uLongf dest_len = static_cast<uLongf>(computed_boundary);
 | 
			
		||||
@ -139,8 +224,8 @@ namespace NlpCodec {
 | 
			
		||||
            throw NlpException("Fail to write data into file.");
 | 
			
		||||
 | 
			
		||||
        // Raw size and checksum need some extra encryption before writing
 | 
			
		||||
        raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
 | 
			
		||||
        checksum = checksum + CHECKSUM_OFFSET;
 | 
			
		||||
        raw_size = EncryptRawFileLength(raw_size);
 | 
			
		||||
        checksum = EncryptChecksum(checksum);
 | 
			
		||||
 | 
			
		||||
        // Write raw size and checksum
 | 
			
		||||
        fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
 | 
			
		||||
@ -167,8 +252,8 @@ namespace NlpCodec {
 | 
			
		||||
        fin.seekg(0, std::ios_base::beg);
 | 
			
		||||
 | 
			
		||||
        // Raw size and checksum data need to do some extra decryption.
 | 
			
		||||
        expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
 | 
			
		||||
        expected_checksum = expected_checksum - CHECKSUM_OFFSET;
 | 
			
		||||
        expected_raw_size = DecryptRawFileLength(expected_raw_size);
 | 
			
		||||
        expected_checksum = DecryptChecksum(expected_checksum);
 | 
			
		||||
 | 
			
		||||
        // Allocate memory to store data
 | 
			
		||||
        std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
 | 
			
		||||
@ -183,12 +268,10 @@ namespace NlpCodec {
 | 
			
		||||
 | 
			
		||||
        // Test checksum
 | 
			
		||||
        uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
 | 
			
		||||
        if (checksum != expected_checksum) {
 | 
			
		||||
            fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
 | 
			
		||||
                expected_checksum, checksum
 | 
			
		||||
        if (checksum != expected_checksum)
 | 
			
		||||
            throw NlpException(
 | 
			
		||||
                std::format("Not matched crc32. Expect 0x{:<08x} got 0x{:<08x}.", expected_checksum, checksum).c_str()
 | 
			
		||||
            );
 | 
			
		||||
            return false;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Do decompress
 | 
			
		||||
        uLongf _destLen = static_cast<uLongf>(expected_raw_size);
 | 
			
		||||
@ -201,7 +284,7 @@ namespace NlpCodec {
 | 
			
		||||
            throw NlpException("Zlib uncompress() failed.");
 | 
			
		||||
 | 
			
		||||
        // do xor operation
 | 
			
		||||
        GeneralXorOperation(outbuf.get(), expected_raw_size);
 | 
			
		||||
        CircularXorOperation(outbuf.get(), expected_raw_size);
 | 
			
		||||
 | 
			
		||||
        // Write result into file
 | 
			
		||||
        fout.write(outbuf.get(), expected_raw_size);
 | 
			
		||||
 | 
			
		||||
		Reference in New Issue
	
	Block a user