VirtoolsTranslation/NlpCodec/NlpCodec.cpp
yyc12345 52ea2745dd [refactor] do some works
- move gitignore in individual directories.
- change some directory layout.
- refactor NlpCodec but not finished.
2024-12-08 23:33:57 +08:00

367 lines
15 KiB
C++

#include <zlib.h>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
namespace NlpCodec {
/// @brief The exception type thrown by the routines of NlpCodec.
/// @details It only carries a human-readable message which can be fetched by what().
/// Copy, move and destruction are intentionally left to the compiler (Rule of Zero):
/// the only member is a std::string which already manages itself.
class NlpException : public std::exception {
public:
    /// @brief Build exception from a C-style string.
    /// @param[in] msg The message of this exception. nullptr is treated as empty message.
    NlpException(const char* msg) : message(msg ? msg : "") {}
    /// @brief Get the message of this exception.
    /// @return The pointer to stored message. It is valid until this exception is destroyed or assigned.
    [[nodiscard]] const char* what() const noexcept override { return message.c_str(); }
private:
    /// @brief The owned copy of message passed in constructor.
    std::string message;
};
/// @brief The safe version of static_cast which throws exception
/// if given value can not be represented by given type (out of range).
/// @tparam TTo The integer type cast to.
/// @tparam TFrom The integer type cast from. It is deduced from argument.
/// @param[in] value The value to be cast.
/// @return The value cast to TTo without any loss.
/// @exception NlpException Given value can not be held by TTo.
template<typename TTo, typename TFrom>
static constexpr TTo SafeCast(TFrom value) {
    // std::in_range compares the mathematical value, so mixing signed and unsigned types is safe.
    if (!std::in_range<TTo>(value))
        throw NlpException(
            "Fail to cast integral number because given value is greater than container. "
            "This is usually caused by your input or output file is too long.");
    return static_cast<TTo>(value);
}
/// @brief The magic DWORD used for the encryption of file length.
/// @details Its little-endian byte sequence (2C A8 56 F9) is the same as the first 4 bytes of XOR_ARRAY.
constexpr uint32_t MAGIC_DWORD { 0xF956A82Cu };
/// @brief The constant added to checksum before writing it, and subtracted from it after reading it.
constexpr uint32_t CHECKSUM_OFFSET { 1072u };
/// @brief The size of extra part located at the tail of NLP file.
/// @details This part consists of 2 DWORDs.
constexpr size_t TAIL_SIZE { 2u * sizeof(uint32_t) };
/// @brief The key table used by data encryption.
/// @details The N-th byte of data is XOR-ed with the N-th byte of this table.
/// Once the end of this table is reached, the walk restarts from its first byte.
constexpr uint8_t XOR_ARRAY[] = {
    0x2C, 0xA8, 0x56, 0xF9, 0xBD, 0xA6, 0x8D, 0x15, 0x25, 0x38, 0x1A, 0xD4, 0x65, 0x58, 0x28, 0x37,
    0xFA, 0x6B, 0xB5, 0xA1, 0x2C, 0x96, 0x13, 0xA2, 0xAB, 0x4F, 0xC5, 0xA1, 0x3E, 0xA7, 0x91, 0x8D,
    0x2C, 0xDF, 0x78, 0x6D, 0x3C, 0xFC, 0x92, 0x1F, 0x1A, 0x62, 0xA7, 0x9C, 0x92, 0x29, 0x44, 0x6D,
    0x3D, 0xA9, 0x2B, 0xE1, 0x91, 0xAD, 0x49, 0x3C, 0xE2, 0x33, 0xD2, 0x1A, 0x55, 0x92, 0xE7, 0x95,
    0x8C, 0xDA, 0xD2, 0xCD, 0xA2, 0xCF, 0x92, 0x9A, 0xE1, 0xF9, 0x3A, 0x26, 0xFA, 0xC4, 0xA9, 0x23,
    0xA9, 0x4D, 0x1A, 0x2C, 0x3C, 0x2A, 0xAC, 0x62, 0xA3, 0x92, 0xAC, 0x1F, 0x3E, 0xA6, 0xC9, 0xC8,
    0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
    0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
};
/// @brief The count of entries in XOR_ARRAY.
constexpr size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(XOR_ARRAY[0]);
/// @brief The bit mask which replaces `% XOR_ARRAY_LEN` when indexing XOR_ARRAY.
constexpr size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
// Guard against accidental edits of the table:
// if entries are added or removed, the mask computed above is no longer the desired one
// and this assertion stops the build.
static_assert(XOR_ARRAY_MASK == 0x7Fu);
/// @brief XOR given buffer in place with the repeating key stored in XOR_ARRAY.
/// @param[in,out] data The pointer to the buffer to be processed.
/// @param[in] data_len The count of bytes in given buffer.
static void GeneralXorOperation(void* data, size_t data_len) {
    auto* const bytes = static_cast<uint8_t*>(data);
    size_t offset = 0u;
    while (offset < data_len) {
        // Masking the offset wraps it back to the head of key table.
        const uint8_t key = XOR_ARRAY[offset & XOR_ARRAY_MASK];
        bytes[offset] ^= key;
        ++offset;
    }
}
/// @brief Get the length of given file stream.
/// @details The file cursor is moved to the tail to measure the file,
/// then moved back to where it was when this function was called.
/// @param[in] fin The file stream to be measured.
/// @return The length of file in bytes.
/// @exception NlpException Fail to query the cursor, or the file is longer than what uint32_t can hold.
static uint32_t GetFileLength(std::ifstream& fin) {
    // Fetch the offset type this stream used for following convenience.
    // Positions are converted to plain offsets immediately because std::fpos is not an integer
    // and thus can not be passed to SafeCast.
    using stream_off_t = std::ifstream::off_type;

    // Backup current file cursor. tellg() reports failure by returning -1.
    const stream_off_t current_off = static_cast<stream_off_t>(fin.tellg());
    if (current_off < 0)
        throw NlpException("Fail to get the current cursor of given file.");

    // Seek to the tail and get corresponding offset to get the length of file.
    fin.seekg(0, std::ios_base::end);
    const stream_off_t tail_off = static_cast<stream_off_t>(fin.tellg());
    if (tail_off < 0)
        throw NlpException("Fail to get the length of given file.");

    // Restore to previous backup file cursor.
    fin.seekg(current_off, std::ios_base::beg);

    // tail_off is not negative here so widening it to unsigned type is lossless.
    if (static_cast<std::uintmax_t>(tail_off) > std::numeric_limits<uint32_t>::max())
        throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");

    // Safely return cast length.
    return SafeCast<uint32_t>(tail_off);
}
// HINTS:
// In zlib, uLong and uLongf are 32-bit or wider.
// So when casting them to uint32_t, you need to use SafeCast to perform a boundary check.
// However, you can directly cast uint32_t to them because there is no overflow issue.
// Additionally, uInt is 16-bit or wider,
// so please be more careful when processing uInt.
/// @brief Encode plain text file into NLP file.
/// @details The whole input is XOR-ed by GeneralXorOperation, compressed by zlib with best compression level,
/// and written to output. Then 2 extra DWORDs are appended: the encrypted raw size and the checksum with offset.
/// @param[in] fin The stream of plain text file. The data is read from its current cursor,
/// so the cursor is expected to be at the beginning of file.
/// @param[in] fout The stream of NLP file to be written.
/// @exception NlpException Fail to allocate memory, read, compress or write, or the file is too large.
static void EncodeNlp(std::ifstream& fin, std::ofstream& fout) {
    // Get the length of raw data.
    const uint32_t raw_size = GetFileLength(fin);
    // Fetch corresponding zlib boundary for the convenience of zlib encode.
    // uLong is 32-bit or more, so we need check whether uint32_t can hold the result first.
    const uint32_t computed_boundary = SafeCast<uint32_t>(compressBound(static_cast<uLong>(raw_size)));

    // Create buffers first. Non-throwing new is used so that the failure is reported as NlpException.
    std::unique_ptr<char[]> inbuf(new(std::nothrow) char[raw_size]);
    std::unique_ptr<char[]> outbuf(new(std::nothrow) char[computed_boundary]);
    if (inbuf == nullptr || outbuf == nullptr)
        throw NlpException("Fail to allocate memory.");

    // Read data from file to input buffer
    fin.read(inbuf.get(), raw_size);
    if (!fin.good() || fin.gcount() != static_cast<std::streamsize>(raw_size))
        throw NlpException("Fail to read file data into buffer.");

    // Do XOR operation
    GeneralXorOperation(inbuf.get(), raw_size);

    // Do compress and get the size of compressed data.
    uLongf dest_len = static_cast<uLongf>(computed_boundary);
    const int ret = compress2(
        reinterpret_cast<Bytef*>(outbuf.get()), &dest_len,
        reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
        Z_BEST_COMPRESSION
    );
    // Check ZLib result.
    if (ret != Z_OK)
        throw NlpException("Zlib compress() failed.");
    // Fetch final compressed size.
    const uint32_t compressed_size = SafeCast<uint32_t>(dest_len);

    // Produce checksum.
    // NOTE(review): the seed passed to adler32 is 0 instead of the value returned by adler32(0, Z_NULL, 0).
    // DecodeNlp uses the same seed, so presumably this is part of the file format - confirm before changing.
    const uint32_t checksum = static_cast<uint32_t>(
        adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));

    // Write compressed data into file
    fout.write(outbuf.get(), compressed_size);
    if (!fout.good())
        throw NlpException("Fail to write data into file.");

    // Raw size and checksum need some extra encryption before writing.
    // `~x` equals to `-(x + 1)` in two's complement but, unlike the signed form,
    // it can not overflow when raw size is 0x7FFFFFFF.
    uint32_t encrypted_raw_size = static_cast<uint32_t>(~raw_size) ^ MAGIC_DWORD;
    uint32_t encrypted_checksum = checksum + CHECKSUM_OFFSET;

    // Write raw size and checksum. They are written with the byte order of host machine.
    fout.write(reinterpret_cast<char*>(&encrypted_raw_size), sizeof(uint32_t));
    if (!fout.good())
        throw NlpException("Fail to write raw size into file.");
    fout.write(reinterpret_cast<char*>(&encrypted_checksum), sizeof(uint32_t));
    if (!fout.good())
        throw NlpException("Fail to write checksum into file.");
}
static void DecodeNlp(std::ifstream& fin, std::ofstream& fout) {
// Seek to tail to get essential data
uint32_t compressed_size = GetFileLength(fin);
if (compressed_size < TAIL_SIZE)
throw NlpException("Invalid file. File is too short.");
// Get expected raw size and checksum
compressed_size -= TAIL_SIZE;
fin.seekg(compressed_size, std::ios_base::beg);
uint32_t expected_raw_size = 0u, expected_checksum = 0u;
fin.read(reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
fin.read(reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
fin.seekg(0, std::ios_base::beg);
// Raw size and checksum data need to do some extra decryption.
expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
expected_checksum = expected_checksum - CHECKSUM_OFFSET;
// Allocate memory to store data
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
std::unique_ptr<char[]> outbuf(new(std::nothrow) char[expected_raw_size]);
if (inbuf == nullptr || outbuf == nullptr)
throw NlpException("Fail to allocate memory.");
// Read file into buffer
fin.read(inbuf.get(), compressed_size);
if (!fin.good() || fin.gcount() != compressed_size)
throw NlpException("Fail to read data into buffer.\n");
// Test checksum
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
if (checksum != expected_checksum) {
fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
expected_checksum, checksum
);
return false;
}
// Do decompress
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
int ret = uncompress(
reinterpret_cast<Bytef*>(outbuf.get()), &_destLen,
reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(compressed_size)
);
// Check zlib result
if (ret != Z_OK)
throw NlpException("Zlib uncompress() failed.");
// do xor operation
GeneralXorOperation(outbuf.get(), expected_raw_size);
// Write result into file
fout.write(outbuf.get(), expected_raw_size);
if (!fout.good())
throw NlpException("Fail to write data into file.");
}
}
namespace NlpCodec::Runtime {
/// @brief The operation requested by user from command line.
enum class UserOperation {
    Encode,     ///< Requested by `encode` argument.
    Decode,     ///< Requested by `decode` argument.
    Version,    ///< Requested by `version` argument.
    Help,       ///< Requested by `help` argument.
};
/// @brief The request of user resolved from command line arguments.
/// @details Every member has an initializer, so a default-constructed request
/// never holds an indeterminate operation. It is still an aggregate.
struct UserRequest {
    /// @brief The operation user requested.
    UserOperation mUserOperation { UserOperation::Version };
    /// @brief The path to source file. Only filled for encode and decode.
    std::filesystem::path mInputFile {};
    /// @brief The path to destination file. Only filled for encode and decode.
    std::filesystem::path mOutputFile {};
};
/// @brief Print the usage of this program to standard output.
static void PrintHelp() {
    // The whole page is kept as one literal and flushed once.
    static constexpr const char* HELP_PAGE =
        "NlpCodec Usage\n"
        "NlpCodec [encode | decode | version | help] <src> <dest>\n"
        "\n"
        "version - print version info about this program.\n"
        "help - print this page.\n"
        "\n"
        "encode - encode text file into NLP file.\n"
        "decode - decode NLP file into text file.\n"
        "<src> - the source file.\n"
        " the path to text file in encode mode.\n"
        " the path to NLP file in decode mode.\n"
        "<dest> - the destination file.\n"
        " the path to NLP file in encode mode.\n"
        " the path to text file in decode mode.\n"
        "\n";
    std::cout << HELP_PAGE << std::flush;
}
/// @brief Print the build time and license of this program to standard output.
static void PrintVersion() {
    // __DATE__ and __TIME__ are string literals so they are merged into the banner at compile time.
    static constexpr const char* BANNER =
        "NlpCodec built at " __DATE__ " " __TIME__ "\n"
        "MIT License. Copyright (c) 2022-2024 yyc12345";
    std::cout << BANNER << std::endl;
}
/// @brief Resolve command line arguments into user request.
/// @details 2 arguments mean `version` or `help`. 4 arguments mean `encode` or `decode`
/// followed by the path to source file and the path to destination file.
/// @param[in] argc The count of arguments, including program name.
/// @param[in] argv The array of arguments.
/// @return The resolved user request.
/// @exception NlpException The count of arguments or the mode argument is invalid.
static UserRequest ResolveArguments(int argc, char* argv[]) {
    // Only 2 shapes of command line are accepted.
    const bool is_info_form = (argc == 2);
    const bool is_codec_form = (argc == 4);
    if (!is_info_form && !is_codec_form)
        throw NlpException("Invalid argument count!");

    UserRequest request { UserOperation::Version, "", "" };
    const std::string verb(argv[1]);

    if (is_info_form) {
        // `version` and `help` take no file path.
        if (verb == "help")
            request.mUserOperation = UserOperation::Help;
        else if (verb == "version")
            request.mUserOperation = UserOperation::Version;
        else
            throw NlpException("Invalid argument! Must be one of `version` or `help`");
        return request;
    }

    // `encode` and `decode` are followed by source and destination.
    if (verb == "decode")
        request.mUserOperation = UserOperation::Decode;
    else if (verb == "encode")
        request.mUserOperation = UserOperation::Encode;
    else
        throw NlpException("Invalid argument! Must be one of `encode` or `decode`");
    request.mInputFile = std::filesystem::path(argv[2]);
    request.mOutputFile = std::filesystem::path(argv[3]);
    return request;
}
/// @brief Execute the operation described by given user request.
/// @details `version` and `help` only print text.
/// `encode` and `decode` open the source and destination files in binary mode and run the codec on them.
/// @param[in] user_request The request to be executed.
/// @exception NlpException Fail to open files, the codec failed, or the request holds an unknown operation.
static void ExecuteWorker(const UserRequest& user_request) {
    // Take action according to different request first
    bool is_encode = false;
    switch (user_request.mUserOperation) {
        case UserOperation::Version:
            PrintVersion();
            return;
        case UserOperation::Help:
            PrintHelp();
            return;
        case UserOperation::Encode:
            is_encode = true;
            break;
        case UserOperation::Decode:
            is_encode = false;
            break;
        default:
            // An enum can hold a value matching no enumerator. Refuse it instead of guessing.
            throw NlpException("Invalid user operation.");
    }

    // Do real codec related works.
    // Open the input file first and check it before touching the output file,
    // because opening the output file creates or truncates it.
    std::ifstream in_file;
    in_file.open(user_request.mInputFile, std::ios_base::in | std::ios_base::binary);
    if (!in_file.is_open()) {
        throw NlpException("Fail to open input or output file.");
    }
    std::ofstream out_file;
    out_file.open(user_request.mOutputFile, std::ios_base::out | std::ios_base::binary);
    if (!out_file.is_open()) {
        throw NlpException("Fail to open input or output file.");
    }

    // Perform codec
    if (is_encode) {
        ::NlpCodec::EncodeNlp(in_file, out_file);
    } else {
        ::NlpCodec::DecodeNlp(in_file, out_file);
    }

    // Free resources.
    // close() flushes buffered data and sets failbit on failure, so check it to avoid reporting a false success.
    in_file.close();
    out_file.close();
    if (out_file.fail()) {
        throw NlpException("Fail to write data into file.");
    }
}
}
int main(int argc, char* argv[]) {
// Try parsing given arguments
NlpCodec::Runtime::UserRequest user_request;
try {
user_request = NlpCodec::Runtime::ResolveArguments(argc, argv);
} catch (const NlpCodec::NlpException& e) {
std::cerr << "[Argument Error] " << e.what() << std::endl;
return 1;
}
// Try executing real wroker
try {
NlpCodec::Runtime::ExecuteWorker(user_request);
} catch (const NlpCodec::NlpException& e) {
std::cerr << "[Codec Error] " << e.what() << std::endl;
return 2;
}
return 0;
}