NlpCodec/NlpCodec.cpp

#include <zlib.h>

#include <cinttypes>
#include <cstdint>
#include <filesystem>
#include <format>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <system_error>
#include <utility>

namespace NlpCodec {

    /// @brief The single exception type used by NlpCodec.
    /// @details Throwing this exception signals that something went wrong.
    /// The main function is expected to catch it, print the carried message and exit.
    class NlpException : public std::exception {
    public:
        /// @brief Build exception from a C-style message.
        /// @param[in] msg The error message. A null pointer is treated as an empty message.
        NlpException(const char* msg) : message(msg != nullptr ? msg : "") {}
        /// @brief Copy the message held by another exception.
        NlpException(const NlpException& rhs) = default;
        ~NlpException() override = default;
        /// @brief Get the message stored at construction.
        /// @return A null-terminated string owned by this exception object.
        [[nodiscard]] const char* what() const noexcept override { return message.c_str(); }
    private:
        /// @brief Owned copy of the error message.
        std::string message;
    };

    /// @brief A range-checked replacement of `static_cast` for integral values.
    /// @tparam TTo The integral type to convert to.
    /// @tparam TFrom The integral type of given value.
    /// @param[in] value The value to convert.
    /// @return The value converted to `TTo`.
    /// @exception NlpException The value can not be represented by `TTo`.
    template<typename TTo, typename TFrom>
    static constexpr TTo SafeCast(TFrom value) {
        // std::in_range compares by mathematical value, so mixed signedness is handled correctly.
        const bool representable = std::in_range<TTo>(value);
        if (representable) {
            return static_cast<TTo>(value);
        }
        throw NlpException(
                "Fail to cast integral number because given value is greater than container. "
                "This is usually caused by your input or output file is too long.");
    }

#pragma region "Encryption Stuff" {

    /*

    # NLP File Structure

    |Annotation         |Size       |
    |:---               |:---       |
    |Body               |variable   |
    |Raw File Length    |4 bytes    |
    |Checksum           |4 bytes    |

    ## Body

    The first part is a zlib compressed byte array.
    Before any other processing, we need to use zlib to decompress it first.
    If we need to do the reverse operation, i.e. build this compressed byte array,
    the compression level must be the maximum value (best compression, i.e. 9).

    After decompressing this byte array, we need an extra step called circular XOR operation
    to get human-readable plain text data.
    In this operation, we first have a hard-coded `XOR_ARRAY`,
    then the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
    When we reach the tail of `XOR_ARRAY`,
    the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
    That's the reason why we call this operation the "circular" XOR operation.
    The reverse operation of this step is exactly the same operation,
    because the reverse of an XOR is performing the same XOR again.

    After all bytes are XORed, we get what we want:
    a human-readable translation file in plain text for the following processing.

    ## Raw File Length

    The `uint32_t` field following Body is Raw File Length,
    which stores the length of the raw data, i.e. the length of the zlib decompressed byte array.
    It's convenient when decompressing Body.

    However, this field is encrypted when it is stored in NLP file.
    We need to do some extra operations before using it.
    Basically, what we need to do is XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
    The rationale behind this operation is unknown. It is just an honest translation of the result of reverse engineering.
    So, just do it and don't worry too much.
    By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.

    The reverse operation, i.e. building this field when creating an NLP file, is also simple.
    It's okay to just perform the steps introduced above in reverse order.

    ## Checksum

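    The `uint32_t` field following Raw File Length is Checksum,
    which is a checksum of Body. Although it is often referred to as a CRC32,
    the code in this file computes it with zlib `adler32` using 0 as the initial value.
    This field is usually used to validate the integrity of NLP file.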

    Like Raw File Length, this field is also encrypted in NLP file.
    But its encryption method is much simpler than that of Raw File Length.
    To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).

    The reverse operation of this field is just adding `CHECKSUM_OFFSET`. Fairly simple.

    */

    /// @brief The byte size of everything in NLP file which is not Body.
    /// @details The tail consists of two `uint32_t` fields: Raw File Length and Checksum.
    constexpr size_t TAIL_SIZE = 2u * sizeof(uint32_t);

    /// @brief The magic DWORD for Raw File Length field encryption.
    /// @details It is the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
    constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
    /// @brief Encrypt Raw File Length field for writing NLP file.
    /// @details The field is computed as `-(value + 1)` modulo 2^32, then XORed with `MAGIC_DWORD`.
    /// @param[in] value The plain raw file length.
    /// @return The encrypted value to be stored in NLP file.
    static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
        // Modulo 2^32, -(value + 1) is the bitwise NOT of value.
        // Doing it on the unsigned type avoids the signed overflow (undefined behavior)
        // which `static_cast<int32_t>(value) + 1` hits when value is 0x7FFFFFFF.
        return static_cast<uint32_t>(~value) ^ MAGIC_DWORD;
    }
    /// @brief Decrypt Raw File Length field read from NLP file.
    /// @details The field is XORed with `MAGIC_DWORD`, then subtracted from `-1` modulo 2^32.
    /// @param[in] value The encrypted value read from NLP file.
    /// @return The plain raw file length.
    static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
        // Modulo 2^32, (-1 - x) is the bitwise NOT of x; no signed round trip is needed.
        return static_cast<uint32_t>(~(MAGIC_DWORD ^ value));
    }

    /// @brief The offset added to Checksum field when it is stored in NLP file.
    constexpr uint32_t CHECKSUM_OFFSET = 1072u;
    /// @brief Encrypt Checksum field for writing NLP file.
    /// @param[in] value The plain checksum.
    /// @return The checksum shifted up by `CHECKSUM_OFFSET` (unsigned wrap-around applies).
    static constexpr uint32_t EncryptChecksum(uint32_t value) {
        const uint32_t shifted = CHECKSUM_OFFSET + value;
        return shifted;
    }
    /// @brief Decrypt Checksum field read from NLP file.
    /// @param[in] value The encrypted checksum read from NLP file.
    /// @return The checksum shifted down by `CHECKSUM_OFFSET` (unsigned wrap-around applies).
    static constexpr uint32_t DecryptChecksum(uint32_t value) {
        const uint32_t restored = value - CHECKSUM_OFFSET;
        return restored;
    }

    /// @brief The key table of the circular XOR applied to decompressed Body.
    /// @details Data byte 0 is XORed with key byte 0, data byte 1 with key byte 1, and so on.
    /// Once the last key byte has been used, the next data byte starts over from key byte 0.
    constexpr uint8_t XOR_ARRAY[] = {
        0x2C, 0xA8, 0x56, 0xF9, 0xBD, 0xA6, 0x8D, 0x15, 0x25, 0x38, 0x1A, 0xD4, 0x65, 0x58, 0x28, 0x37,
        0xFA, 0x6B, 0xB5, 0xA1, 0x2C, 0x96, 0x13, 0xA2, 0xAB, 0x4F, 0xC5, 0xA1, 0x3E, 0xA7, 0x91, 0x8D,
        0x2C, 0xDF, 0x78, 0x6D, 0x3C, 0xFC, 0x92, 0x1F, 0x1A, 0x62, 0xA7, 0x9C, 0x92, 0x29, 0x44, 0x6D,
        0x3D, 0xA9, 0x2B, 0xE1, 0x91, 0xAD, 0x49, 0x3C, 0xE2, 0x33, 0xD2, 0x1A, 0x55, 0x92, 0xE7, 0x95,
        0x8C, 0xDA, 0xD2, 0xCD, 0xA2, 0xCF, 0x92, 0x9A, 0xE1, 0xF9, 0x3A, 0x26, 0xFA, 0xC4, 0xA9, 0x23,
        0xA9, 0x4D, 0x1A, 0x2C, 0x3C, 0x2A, 0xAC, 0x62, 0xA3, 0x92, 0xAC, 0x1F, 0x3E, 0xA6, 0xC9, 0xC8,
        0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
        0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
    };
    /// @brief The count of key bytes in `XOR_ARRAY`.
    constexpr size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(XOR_ARRAY[0]);
    /// @brief The bit mask replacing `% XOR_ARRAY_LEN` when wrapping the key index.
    constexpr size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
    // Guard against accidental edits of the table above:
    // if the table no longer holds exactly 128 bytes, the mask is wrong and the build must fail.
    static_assert(XOR_ARRAY_MASK == 0x7Fu);
    /// @brief Encrypt or decrypt decompressed Body in place.
    /// @details The same call both encrypts and decrypts because XOR is its own inverse.
    /// @param[in,out] data Pointer to the bytes to process.
    /// @param[in] data_len The count of bytes to process.
    static void CircularXorOperation(void* data, size_t data_len) {
        uint8_t* const first = static_cast<uint8_t*>(data);
        uint8_t* const last = first + data_len;
        // The key position restarts from 0 on every call and wraps with the mask.
        size_t key_pos = 0u;
        for (uint8_t* cursor = first; cursor != last; ++cursor) {
            *cursor ^= XOR_ARRAY[key_pos];
            key_pos = (key_pos + 1u) & XOR_ARRAY_MASK;
        }
    }

#pragma endregion }

    /// @brief Measure the length of the file behind given input stream.
    /// @details The read cursor is moved to the end for measuring and then moved back
    /// to the offset it had when this function was called.
    /// @param[in] fin The input file stream to measure.
    /// @return The offset of the end of file, in bytes.
    /// @exception NlpException The measured offset does not fit into `uint32_t`.
    static uint32_t GetFileLength(std::ifstream& fin) {
        using offset_t = std::ifstream::off_type;

        // Remember where the caller left the cursor.
        const offset_t saved_cursor = fin.tellg();
        // The offset reported at the end of stream is the file length.
        const offset_t end_offset = fin.seekg(0, std::ios_base::end).tellg();
        // Put the cursor back.
        fin.seekg(saved_cursor, std::ios_base::beg);

        return SafeCast<uint32_t>(end_offset);
    }

    // HINTS:
    // In zlib, uLong and uLongf are 32-bit or wider.
    // So when casting them to uint32_t, you need to use SafeCast to perform a boundary check.
    // However, you can directly cast uint32_t to them because there is no overflow issue.
    // Additionally, uInt is 16-bit or wider.
    // So when dealing with uInt, please be more careful.

    static void EncodeNlp(std::ifstream& fin, std::ofstream& fout) {
        // Get file length and fetch
        uint32_t raw_size = GetFileLength(fin);
        // Fetch corresponding zlib boundary for the convenience of zlib encode.
        // uLong is 32-bit or more, so we need check whether uint32_t can hold the result first.
        uint32_t computed_boundary = SafeCast<uint32_t>(compressBound(static_cast<uLong>(raw_size)));

        // Create buffer first
        std::unique_ptr<char[]> inbuf(new(std::nothrow) char[raw_size]);
        std::unique_ptr<char[]> outbuf(new(std::nothrow) char[computed_boundary]);
        if (inbuf == nullptr || outbuf == nullptr)
            throw NlpException("Fail to allocate memory.");

        // Read data from file to input buffer
        fin.read(inbuf.get(), raw_size);
        if (!fin.good() || fin.gcount() != raw_size)
            throw NlpException("Fail to read file data into buffer.");

        // Do XOR operation
        CircularXorOperation(inbuf.get(), raw_size);

        // Do compress and get the size of compressed data.
        uLongf dest_len = static_cast<uLongf>(computed_boundary);
        int ret = compress2(
            reinterpret_cast<Bytef*>(outbuf.get()), &dest_len,
            reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
            Z_BEST_COMPRESSION
        );
        // Check ZLib result.
        if (ret != Z_OK)
            throw NlpException("Zlib compress() failed.");
        // Fetch final compressed size.
        uint32_t compressed_size = SafeCast<uint32_t>(dest_len);

        // Produce checksum
        uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));

        // Write compressed data into file
        fout.write(outbuf.get(), compressed_size);
        if (!fout.good())
            throw NlpException("Fail to write data into file.");

        // Raw size and checksum need some extra encryption before writting
        raw_size = EncryptRawFileLength(raw_size);
        checksum = EncryptChecksum(checksum);

        // Write raw size and checksum
        fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
        if (!fout.good())
            throw NlpException("Fail to write raw size into file.");
        fout.write(reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
        if (!fout.good())
            throw NlpException("Fail to write checksum into file.");

    }

    /// @brief Decode an NLP file into a plain text file.
    /// @details The tail (Raw File Length and Checksum) is read and decrypted first.
    /// Then Body is loaded, verified against the checksum, decompressed by zlib,
    /// processed by the circular XOR operation and written to the output.
    /// @param[in] fin The input stream of the NLP file.
    /// @param[in] fout The output stream of the plain text file.
    /// @exception NlpException The file is too short, the checksum does not match,
    /// or allocation, reading, decompression or writing failed.
    static void DecodeNlp(std::ifstream& fin, std::ofstream& fout) {
        // The file is Body followed by a fixed-size tail, so it can not be shorter than the tail.
        uint32_t compressed_size = GetFileLength(fin);
        if (compressed_size < TAIL_SIZE)
            throw NlpException("Invalid file. File is too short.");

        // Everything before the tail is Body. Read the two tail fields located right after it.
        compressed_size -= static_cast<uint32_t>(TAIL_SIZE);
        fin.seekg(compressed_size, std::ios_base::beg);
        uint32_t expected_raw_size = 0u, expected_checksum = 0u;
        fin.read(reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
        fin.read(reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
        // Without this check a failed read would leave both fields as zero
        // and they would be decrypted into meaningless values.
        if (!fin.good())
            throw NlpException("Fail to read raw size and checksum from file.");
        fin.seekg(0, std::ios_base::beg);

        // Raw size and checksum are stored encrypted.
        expected_raw_size = DecryptRawFileLength(expected_raw_size);
        expected_checksum = DecryptChecksum(expected_checksum);

        // Non-throwing allocation, so that failures are reported through NlpException.
        std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
        std::unique_ptr<char[]> outbuf(new(std::nothrow) char[expected_raw_size]);
        if (inbuf == nullptr || outbuf == nullptr)
            throw NlpException("Fail to allocate memory.");

        // Load the whole Body.
        fin.read(inbuf.get(), compressed_size);
        if (!fin.good() || fin.gcount() != compressed_size)
            throw NlpException("Fail to read data into buffer.\n");

        // Verify the checksum of Body. It is computed exactly like the encoder does: adler32 seeded with 0.
        uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
        if (checksum != expected_checksum)
            throw NlpException(
                // No explicit alignment in the format spec: an alignment option makes
                // the `0` flag ignored, which would pad with spaces instead of zeros.
                std::format("Not matched checksum. Expect 0x{:08x} got 0x{:08x}.", expected_checksum, checksum).c_str()
            );

        // Decompress Body. zlib updates dest_len to the count of bytes it really produced.
        uLongf dest_len = static_cast<uLongf>(expected_raw_size);
        int ret = uncompress(
            reinterpret_cast<Bytef*>(outbuf.get()), &dest_len,
            reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(compressed_size)
        );
        if (ret != Z_OK)
            throw NlpException("Zlib uncompress() failed.");

        // Only the bytes zlib produced are valid. If the stored raw size is larger than that,
        // the rest of the buffer is uninitialized and must not reach the output file.
        const uint32_t raw_size = SafeCast<uint32_t>(dest_len);

        // Undo the circular XOR to get plain text.
        CircularXorOperation(outbuf.get(), raw_size);

        // Write result into file
        fout.write(outbuf.get(), raw_size);
        if (!fout.good())
            throw NlpException("Fail to write data into file.");

    }

}

namespace NlpCodec::Runtime {

    /// @brief The operation requested by user on the command line.
    enum class UserOperation {
        Encode,     ///< Encode a text file into an NLP file.
        Decode,     ///< Decode an NLP file into a text file.
        Version,    ///< Print version info about this program.
        Help        ///< Print the help page.
    };

    /// @brief The user request parsed from command line arguments.
    struct UserRequest {
        /// @brief The operation to perform.
        UserOperation mUserOperation;
        /// @brief The path to the source file. ResolveArguments leaves it empty for `version` and `help`.
        std::filesystem::path mInputFile;
        /// @brief The path to the destination file. ResolveArguments leaves it empty for `version` and `help`.
        std::filesystem::path mOutputFile;
    };

    /// @brief Print the usage page of this program to standard output.
    static void PrintHelp() {
        // One entry per printed line. An empty entry produces a blank line.
        static constexpr const char* const HELP_LINES[] = {
            "NlpCodec Usage",
            "NlpCodec [encode | decode | version | help] <src> <dest>",
            "",
            "version - print version info about this program.",
            "help - print this page.",
            "",
            "encode - encode text file into NLP file.",
            "decode - decode NLP file into text file.",
            "<src> - the source file.",
            "        the path to text file in encode mode.",
            "        the path to NLP file in decode mode.",
            "<dest> - the destination file.",
            "         the path to NLP file in encode mode.",
            "         the path to text file in decode mode.",
            ""
        };
        for (const char* line : HELP_LINES) {
            std::cout << line << std::endl;
        }
    }

    /// @brief Print the build time and license notice of this program to standard output.
    static void PrintVersion() {
        // __DATE__ and __TIME__ are expanded by the preprocessor when this file is compiled.
        static constexpr const char BUILD_LINE[] = "NlpCodec built at " __DATE__ " " __TIME__;
        static constexpr const char LICENSE_LINE[] = "MIT License. Copyright (c) 2022-2024 yyc12345";
        std::cout << BUILD_LINE << std::endl;
        std::cout << LICENSE_LINE << std::endl;
    }

    /// @brief Parse command line arguments into a user request.
    /// @details Two arguments (program name and mode) select `version` or `help`.
    /// Four arguments (program name, mode, source, destination) select `encode` or `decode`.
    /// @param[in] argc The argument count given to main function.
    /// @param[in] argv The argument values given to main function.
    /// @return The parsed request. File paths stay empty for `version` and `help`.
    /// @exception NlpException The argument count or the mode string is not accepted.
    static UserRequest ResolveArguments(int argc, char* argv[]) {
        UserRequest request { UserOperation::Version, "", "" };

        // Form: <program> <mode>
        if (argc == 2) {
            const std::string verb(argv[1]);
            if (verb == "version") {
                request.mUserOperation = UserOperation::Version;
            } else if (verb == "help") {
                request.mUserOperation = UserOperation::Help;
            } else {
                throw NlpException("Invalid argument! Must be one of `version` or `help`");
            }
            return request;
        }

        // Form: <program> <mode> <src> <dest>
        if (argc == 4) {
            const std::string verb(argv[1]);
            if (verb == "encode") {
                request.mUserOperation = UserOperation::Encode;
            } else if (verb == "decode") {
                request.mUserOperation = UserOperation::Decode;
            } else {
                throw NlpException("Invalid argument! Must be one of `encode` or `decode`");
            }
            request.mInputFile = std::filesystem::path(argv[2]);
            request.mOutputFile = std::filesystem::path(argv[3]);
            return request;
        }

        // Any other argument count is rejected.
        throw NlpException("Invalid argument count!");
    }

    /// @brief Execute given user request.
    /// @details `version` and `help` only print text and return.
    /// `encode` and `decode` open the source and destination files in binary mode and run the codec.
    /// @param[in] user_request The request produced by ResolveArguments.
    /// @exception NlpException The files can not be opened, are the same file,
    /// the codec failed, or the output could not be flushed when closing.
    static void ExecuteWorker(const UserRequest& user_request) {
        // Take action according to different request first.
        // Initialized so that it is never read with an indeterminate value.
        bool is_encode = false;
        switch (user_request.mUserOperation) {
        case UserOperation::Version:
            PrintVersion();
            return;
        case UserOperation::Help:
            PrintHelp();
            return;
        case UserOperation::Encode:
            is_encode = true;
            break;
        case UserOperation::Decode:
            is_encode = false;
            break;
        }

        // Opening the output truncates it. If it is the very file we are about to read,
        // the input would be destroyed before a single byte is processed.
        // The error_code overload does not throw; it yields false when the two paths
        // can not be compared (presumably including an output file that does not exist yet).
        std::error_code compare_error;
        if (std::filesystem::equivalent(user_request.mInputFile, user_request.mOutputFile, compare_error)) {
            throw NlpException("Input file and output file must not be the same file.");
        }

        // Open and validate the input BEFORE touching the output,
        // so that a wrong source path does not create or truncate the destination file.
        std::ifstream in_file;
        in_file.open(user_request.mInputFile, std::ios_base::in | std::ios_base::binary);
        if (!in_file.is_open()) {
            throw NlpException("Fail to open input file.");
        }
        std::ofstream out_file;
        out_file.open(user_request.mOutputFile, std::ios_base::out | std::ios_base::binary);
        if (!out_file.is_open()) {
            throw NlpException("Fail to open output file.");
        }

        // Perform codec
        if (is_encode) {
            ::NlpCodec::EncodeNlp(in_file, out_file);
        } else {
            ::NlpCodec::DecodeNlp(in_file, out_file);
        }

        // Free resources.
        in_file.close();
        out_file.close();
        // Buffered data is flushed by close(); a failure there (e.g. disk full) must not pass silently.
        if (out_file.fail()) {
            throw NlpException("Fail to flush data into output file.");
        }
    }

}

int main(int argc, char* argv[]) {

    // Try parsing given arguments
    NlpCodec::Runtime::UserRequest user_request;
    try {
        user_request = NlpCodec::Runtime::ResolveArguments(argc, argv);
    } catch (const NlpCodec::NlpException& e) {
        std::cerr << "[Argument Error] " << e.what() << std::endl;
        return 1;
    }

    // Try executing real wroker
    try {
        NlpCodec::Runtime::ExecuteWorker(user_request);
    } catch (const NlpCodec::NlpException& e) {
        std::cerr << "[Codec Error] " << e.what() << std::endl;
        return 2;
    }

    return 0;
}