457 lines
18 KiB
C++
457 lines
18 KiB
C++
#include <zlib.h>
|
|
#include <iostream>
|
|
#include <cstdint>
|
|
#include <cinttypes>
|
|
#include <filesystem>
|
|
#include <string>
|
|
#include <fstream>
|
|
#include <memory>
|
|
#include <limits>
|
|
#include <stdexcept>
|
|
#include <utility>
|
|
#include <format>
|
|
|
|
namespace NlpCodec {
|
|
|
|
#pragma region Help Structs and Functions
|
|
|
|
/// @brief NlpCodec universal exception.
/// @details Once this exception is thrown, it means that something went wrong.
/// The main function should catch it, output the error message and exit the program immediately.
class NlpException : public std::exception {
public:
    /// @brief Build an exception carrying the given error message.
    /// @param[in] msg The error message. A null pointer is treated as an empty message.
    NlpException(const char* msg) : message(msg ? msg : "") {}

    // Rule of Zero: copy/move operations and the (virtual) destructor are the
    // compiler-generated ones. They copy the std::exception base properly and
    // keep the class movable, which a hand-written copy constructor did not.

    /// @brief Get the stored error message.
    /// @return A null-terminated string owned by this exception object.
    [[nodiscard]] const char* what() const noexcept override { return message.c_str(); }

private:
    /// @brief Owned copy of the error message.
    std::string message;
};
|
|
|
|
/// @brief The safe version of `static_cast` which throw exception
|
|
/// if given value can not be cast into given type (out of range).
|
|
template<typename _TyTo, typename _TyFrom>
|
|
static constexpr _TyTo SafeCast(_TyFrom value) {
|
|
if (!std::in_range<_TyTo>(value))
|
|
throw NlpException(
|
|
"Fail to cast integral number because given value is greater than the type can hold. "
|
|
"This is usually caused by your input or output file is too long.");
|
|
return static_cast<_TyTo>(value);
|
|
}
|
|
|
|
/// @brief The safe version of `std::ifstream::read`.
|
|
/// Throw exception if fail to read.
|
|
static void SafeRead(std::ifstream& fin, char* s, std::streamsize count) {
|
|
fin.read(s, count);
|
|
if (!fin.good() || fin.gcount() != count)
|
|
throw NlpException("Fail to read data from file.");
|
|
}
|
|
/// @brief The safe version of `std::ofstream::write`.
|
|
/// Throw exception if fail to write.
|
|
static void SafeWrite(std::ofstream& fout, const char* s, std::streamsize count) {
|
|
fout.write(s, count);
|
|
if (!fout.good())
|
|
throw NlpException("Fail to write data into file.");
|
|
}
|
|
|
|
#pragma endregion
|
|
|
|
#pragma region Encryption Stuff
|
|
|
|
/*
|
|
|
|
# NLP File Structure
|
|
|
|
|Annotation |Size |
|
|
|:--- |:--- |
|
|
|Body |variable |
|
|
|Raw File Length |4 bytes |
|
|
|Checksum |4 bytes |
|
|
|
|
## Body
|
|
|
|
The first part is a zlib compressed byte array.
|
|
Before any processing, we need to use zlib to decompress it first.
If we need to do the reverse operation, e.g. build this compressed byte array,
the compression level must be the maximum value (best compression, e.g. 9).
|
|
|
|
After decompressing this byte array, we need an extra step called the circular XOR operation
to get human-readable plain text data.
|
|
In this operation, we first have a hard-code `XOR_ARRAY`,
|
|
then the first byte of decompressed byte array will perform XOR operation with the first byte of `XOR_ARRAY` and so on.
|
|
When we reaching the tail of `XOR_ARRAY`,
|
|
the next byte of decompressed byte array will perform XOR with the first byte of `XOR_ARRAY` again and so on.
|
|
That's the reason why we call this operation the "circular" XOR operation.
The reverse operation of this step is exactly the same operation,
because an XOR is undone by performing it again.
|
|
|
|
After all byte are XORed, we can get what we want,
|
|
a human-readable translation file in plain text for following processing.
|
|
|
|
## Raw File Length
|
|
|
|
The `uint32_t` field following Body is Raw File Length,
|
|
which store the length of raw data, e.g. the length of zlib decompressed byte array.
|
|
It's convenient when decompress Body.
|
|
|
|
However, this field is encrypted when storing in NLP file.
|
|
We need to do some extra operations before using it.
|
|
Basically, what we need to do is XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
I don't know the purpose of this operation. I just faithfully translated the result of the reverse engineering work.
|
|
So, just do it and don't worry too much.
|
|
By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
|
|
|
|
The reverse operation, e.g. building this field when creating an NLP file, is also simple.
It's okay to just perform the steps introduced above in reverse order.
|
|
|
|
## Checksum
|
|
|
|
The `uint32_t` field following Raw File Length is Checksum,
which is the checksum of Body. Note that the code computes it with zlib `adler32` (Adler-32), not CRC32.
|
|
This field is usually used to validate the integrity of NLP file.
|
|
|
|
Like Raw File Length, this field is also encrypted in the NLP file.
But its encryption method is much simpler than that of Raw File Length.
To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
|
|
|
|
The reverse operation of this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
|
|
|
|
*/
|
|
|
|
/// @brief The size in bytes of the non-Body part of an NLP file.
/// @details This is the combined size of the two `uint32_t` trailer fields:
/// Raw File Length and Checksum.
constexpr size_t TAIL_SIZE = 2u * sizeof(uint32_t);
|
|
|
|
/// @brief The magic DWORD for Raw File Length field encryption.
/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;

/// @brief Encrypt Raw File Length field for writing NLP file.
/// @details The result is `-(value + 1) XOR MAGIC_DWORD` with wrap-around.
/// In two's complement `-(value + 1)` equals `~value`, so the computation is done
/// purely on unsigned values. This avoids the signed overflow (undefined behavior)
/// that `static_cast<int32_t>(value) + 1` hits when `value` is 0x7FFFFFFF.
/// @param[in] value The plain raw file length.
/// @return The encrypted field value as stored in NLP file.
static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
    return static_cast<uint32_t>(~value) ^ MAGIC_DWORD;
}

/// @brief Decrypt Raw File Length field read from NLP file.
/// @details The result is `-1 - (MAGIC_DWORD XOR value)` with wrap-around,
/// which equals the bitwise complement of `MAGIC_DWORD XOR value`.
/// @param[in] value The encrypted field value read from NLP file.
/// @return The plain raw file length.
static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
    return static_cast<uint32_t>(~(MAGIC_DWORD ^ value));
}
|
|
|
|
/// @brief The offset added to (or removed from) the Checksum field for its encryption.
constexpr uint32_t CHECKSUM_OFFSET = 1072u;

/// @brief Encrypt Checksum field for writing NLP file.
/// @param[in] value The plain checksum.
/// @return The checksum shifted up by `CHECKSUM_OFFSET` (unsigned wrap-around applies).
static constexpr uint32_t EncryptChecksum(uint32_t value) {
    const uint32_t shifted_up = value + CHECKSUM_OFFSET;
    return shifted_up;
}

/// @brief Decrypt Checksum field read from NLP file.
/// @param[in] value The encrypted checksum read from NLP file.
/// @return The checksum shifted down by `CHECKSUM_OFFSET` (unsigned wrap-around applies).
static constexpr uint32_t DecryptChecksum(uint32_t value) {
    const uint32_t shifted_down = value - CHECKSUM_OFFSET;
    return shifted_down;
}
|
|
|
|
/// @brief The key array used by the circular XOR encryption of Body.
/// @details The first data byte is XORed with the first byte of this array, and so on.
/// Once the end of this array is reached, the next data byte is XORed with the first byte again.
constexpr uint8_t XOR_ARRAY[] {
    0x2C, 0xA8, 0x56, 0xF9, 0xBD, 0xA6, 0x8D, 0x15, 0x25, 0x38, 0x1A, 0xD4, 0x65, 0x58, 0x28, 0x37,
    0xFA, 0x6B, 0xB5, 0xA1, 0x2C, 0x96, 0x13, 0xA2, 0xAB, 0x4F, 0xC5, 0xA1, 0x3E, 0xA7, 0x91, 0x8D,
    0x2C, 0xDF, 0x78, 0x6D, 0x3C, 0xFC, 0x92, 0x1F, 0x1A, 0x62, 0xA7, 0x9C, 0x92, 0x29, 0x44, 0x6D,
    0x3D, 0xA9, 0x2B, 0xE1, 0x91, 0xAD, 0x49, 0x3C, 0xE2, 0x33, 0xD2, 0x1A, 0x55, 0x92, 0xE7, 0x95,
    0x8C, 0xDA, 0xD2, 0xCD, 0xA2, 0xCF, 0x92, 0x9A, 0xE1, 0xF9, 0x3A, 0x26, 0xFA, 0xC4, 0xA9, 0x23,
    0xA9, 0x4D, 0x1A, 0x2C, 0x3C, 0x2A, 0xAC, 0x62, 0xA3, 0x92, 0xAC, 0x1F, 0x3E, 0xA6, 0xC9, 0xC8,
    0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
    0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
};

/// @brief The number of bytes in `XOR_ARRAY`.
constexpr size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(XOR_ARRAY[0]);

/// @brief Bit mask replacing the modulo by `XOR_ARRAY_LEN` during the circular XOR operation.
constexpr size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;

// Guard against accidental edits of XOR_ARRAY: the mask trick above only
// works while the array holds exactly 128 entries.
static_assert(XOR_ARRAY_MASK == 0x7Fu);

/// @brief Encrypt or decrypt decompressed Body data in place.
/// @details XOR is its own inverse, so the same function serves both directions.
/// @param[in,out] data Pointer to the bytes to process.
/// @param[in] data_len The number of bytes to process.
static void CircularXorOperation(void* data, size_t data_len) {
    auto* const bytes = static_cast<uint8_t*>(data);
    size_t key_pos = 0u;
    for (size_t offset = 0u; offset != data_len; ++offset) {
        bytes[offset] ^= XOR_ARRAY[key_pos];
        // Advance the key cursor and wrap it around at the end of the key array.
        key_pos = (key_pos + 1u) & XOR_ARRAY_MASK;
    }
}
|
|
|
|
#pragma endregion
|
|
|
|
/// @brief Get the length of given file stream.
|
|
static uint32_t GetFileLength(std::ifstream& fin) {
|
|
// Fetch the types this stream used for following convenience.
|
|
using stream_off_t = std::ifstream::off_type;
|
|
|
|
// Backups current file cursor.
|
|
stream_off_t current_pos = fin.tellg();
|
|
// Seek to the tail and get corresponding offset to get the length of file.
|
|
fin.seekg(0, std::ios_base::end);
|
|
stream_off_t tail_pos = fin.tellg();
|
|
// Restore to previous backup file cursor
|
|
fin.seekg(current_pos, std::ios_base::beg);
|
|
|
|
// Safely reurn cast length.
|
|
return SafeCast<uint32_t>(tail_pos);
|
|
}
|
|
|
|
// HINTS:
|
|
// In zlib, uLong and uLongf is 32-bit or more.
|
|
// So when casting them to uint32_t, you need use SafeCast to perform boundary check.
|
|
// However, you can directly cast uint32_t to them because there is no overflow issue.
|
|
// Additionally, uInt is 16-bit or more.
// So please be more careful when processing uInt values.
|
|
|
|
static void EncodeNlp(std::ifstream& fin, std::ofstream& fout) {
|
|
// Get file length and fetch
|
|
uint32_t raw_size = GetFileLength(fin);
|
|
// Fetch corresponding zlib boundary for the convenience of zlib encode.
|
|
// uLong is 32-bit or more, so we need check whether uint32_t can hold the result first.
|
|
uint32_t computed_boundary = SafeCast<uint32_t>(compressBound(static_cast<uLong>(raw_size)));
|
|
|
|
// Create buffer first
|
|
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[raw_size]);
|
|
std::unique_ptr<char[]> outbuf(new(std::nothrow) char[computed_boundary]);
|
|
if (inbuf == nullptr || outbuf == nullptr)
|
|
throw NlpException("Fail to allocate memory.");
|
|
|
|
// Read data from file to input buffer
|
|
SafeRead(fin, inbuf.get(), raw_size);
|
|
|
|
// Do XOR operation
|
|
CircularXorOperation(inbuf.get(), raw_size);
|
|
|
|
// Do compress and get the size of compressed data.
|
|
uLongf dest_len = static_cast<uLongf>(computed_boundary);
|
|
int ret = compress2(
|
|
reinterpret_cast<Bytef*>(outbuf.get()), &dest_len,
|
|
reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
|
|
Z_BEST_COMPRESSION
|
|
);
|
|
// Check zlib result.
|
|
if (ret != Z_OK)
|
|
throw NlpException("zlib compress() failed.");
|
|
// Fetch final compressed size.
|
|
uint32_t compressed_size = SafeCast<uint32_t>(dest_len);
|
|
|
|
// Produce checksum
|
|
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));
|
|
|
|
// Write compressed data into file
|
|
SafeWrite(fout, outbuf.get(), compressed_size);
|
|
|
|
// Raw size and checksum need some extra encryption before writting
|
|
raw_size = EncryptRawFileLength(raw_size);
|
|
checksum = EncryptChecksum(checksum);
|
|
|
|
// Write raw size and checksum
|
|
SafeWrite(fout, reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
|
|
SafeWrite(fout, reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
|
|
|
|
}
|
|
|
|
/// @brief Decode an NLP file into a plain text file.
/// @details Reads the trailer (Raw File Length and Checksum), validates the checksum
/// of the compressed Body, decompresses it with zlib, applies the circular XOR and
/// writes the resulting plain data.
/// @param[in] fin The opened input stream of the NLP file.
/// @param[in] fout The opened output stream of the plain text file.
/// @throws NlpException if the file is too short, the checksum does not match,
/// memory allocation fails, file I/O fails or zlib reports an error.
static void DecodeNlp(std::ifstream& fin, std::ofstream& fout) {
    // Seek to tail to get essential data
    uint32_t compressed_size = GetFileLength(fin);
    if (compressed_size < TAIL_SIZE)
        throw NlpException("Invalid file. File is too short.");

    // Get expected raw size and checksum
    compressed_size -= static_cast<uint32_t>(TAIL_SIZE);
    fin.seekg(compressed_size, std::ios_base::beg);
    uint32_t expected_raw_size = 0u, expected_checksum = 0u;
    SafeRead(fin, reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
    SafeRead(fin, reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
    fin.seekg(0, std::ios_base::beg);

    // Raw size and checksum data need to do some extra decryption.
    expected_raw_size = DecryptRawFileLength(expected_raw_size);
    expected_checksum = DecryptChecksum(expected_checksum);

    // Allocate memory to store data
    std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
    std::unique_ptr<char[]> outbuf(new(std::nothrow) char[expected_raw_size]);
    if (inbuf == nullptr || outbuf == nullptr)
        throw NlpException("Fail to allocate memory.");

    // Read file into buffer
    SafeRead(fin, inbuf.get(), compressed_size);

    // Test checksum
    uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
    if (checksum != expected_checksum) {
        // `{:08x}` zero-pads to 8 hex digits. An explicit alignment such as `<`
        // would make std::format ignore the `0` flag and pad with spaces instead.
        const std::string message = std::format(
            "Not matched checksum. Expect 0x{:08x} got 0x{:08x}.", expected_checksum, checksum);
        throw NlpException(message.c_str());
    }

    // Do decompress
    uLongf dest_len = static_cast<uLongf>(expected_raw_size);
    int ret = uncompress(
        reinterpret_cast<Bytef*>(outbuf.get()), &dest_len,
        reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(compressed_size)
    );
    // Check zlib result
    if (ret != Z_OK)
        throw NlpException("zlib uncompress() failed.");

    // zlib stores the real decompressed length back into dest_len. It never exceeds
    // the buffer size we passed in, but it may be smaller than the size field claims.
    // Only process the bytes that were really produced so that no uninitialized
    // memory ends up in the output file.
    const uint32_t actual_raw_size = SafeCast<uint32_t>(dest_len);

    // Do XOR operation
    CircularXorOperation(outbuf.get(), actual_raw_size);

    // Write result into file
    SafeWrite(fout, outbuf.get(), actual_raw_size);
}
|
|
|
|
}
|
|
|
|
namespace NlpCodec::Runtime {
|
|
|
|
/// @brief The operation requested by user on command line.
enum class UserOperation {
    Encode,  ///< Encode text file into NLP file.
    Decode,  ///< Decode NLP file into text file.
    Version, ///< Print version info.
    Help     ///< Print help page.
};

/// @brief The parsed command line request.
/// @details All members have default initializers so that a default-constructed
/// request never holds an indeterminate operation. It is still an aggregate,
/// so brace initialization with explicit values keeps working.
struct UserRequest {
    /// @brief The requested operation.
    UserOperation mUserOperation { UserOperation::Version };
    /// @brief The path to the source file. Only meaningful for Encode and Decode.
    std::filesystem::path mInputFile {};
    /// @brief The path to the destination file. Only meaningful for Encode and Decode.
    std::filesystem::path mOutputFile {};
};
|
|
|
|
/// @brief Print the usage page of this program to standard output.
static void PrintHelp() {
    // Use plain newlines and flush only once at the end;
    // std::endl after every line would force a flush per line.
    std::cout
        << "NlpCodec Usage\n"
        << "NlpCodec [encode | decode | version | help] <src> <dest>\n"
        << '\n'
        << "version - print version info about this program.\n"
        << "help - print this page.\n"
        << '\n'
        << "encode - encode text file into NLP file.\n"
        << "decode - decode NLP file into text file.\n"
        << "<src> - the source file.\n"
        << " encode mode: the path to text file.\n"
        << " decode mode: the path to NLP file.\n"
        << "<dest> - the destination file.\n"
        << " encode mode: the path to NLP file.\n"
        << " decode mode: the path to text file.\n"
        << std::endl;
}
|
|
|
|
/// @brief Print the build timestamp and license notice of this program to standard output.
static void PrintVersion() {
    // The build date and time are baked in by the preprocessor at compile time.
    static constexpr const char* kBuildLine = "NlpCodec built at " __DATE__ " " __TIME__;
    static constexpr const char* kLicenseLine = "MIT License. Copyright (c) 2022-2024 yyc12345";

    std::cout << kBuildLine << std::endl;
    std::cout << kLicenseLine << std::endl;
}
|
|
|
|
/// @brief Parse command line arguments into a user request.
/// @details Two accepted shapes:
/// - `<program> version|help` (argc == 2)
/// - `<program> encode|decode <src> <dest>` (argc == 4)
/// @param[in] argc The argument count passed to main.
/// @param[in] argv The argument values passed to main.
/// @return The parsed request.
/// @throws NlpException if the argument count or the mode string is not accepted.
static UserRequest ResolveArguments(int argc, char* argv[]) {
    // Reject every unsupported argument count up front.
    const bool info_form = (argc == 2);
    const bool codec_form = (argc == 4);
    if (!info_form && !codec_form)
        throw NlpException("Invalid argument count!");

    UserRequest request { UserOperation::Version, "", "" };
    const std::string mode(argv[1]);

    if (info_form) {
        // Only `version` and `help` are allowed without file paths.
        if (mode == "version")
            request.mUserOperation = UserOperation::Version;
        else if (mode == "help")
            request.mUserOperation = UserOperation::Help;
        else
            throw NlpException("Invalid argument! Must be one of `version` or `help`");
        return request;
    }

    // Only `encode` and `decode` are allowed together with file paths.
    if (mode == "encode")
        request.mUserOperation = UserOperation::Encode;
    else if (mode == "decode")
        request.mUserOperation = UserOperation::Decode;
    else
        throw NlpException("Invalid argument! Must be one of `encode` or `decode`");

    // Source comes first, destination second.
    request.mInputFile = std::filesystem::path(argv[2]);
    request.mOutputFile = std::filesystem::path(argv[3]);
    return request;
}
|
|
|
|
static void ExecuteWorker(const UserRequest& user_request) {
|
|
// Take action according to different request first
|
|
bool is_encode;
|
|
switch (user_request.mUserOperation) {
|
|
case UserOperation::Version:
|
|
PrintVersion();
|
|
return;
|
|
case NlpCodec::Runtime::UserOperation::Help:
|
|
PrintHelp();
|
|
return;
|
|
case NlpCodec::Runtime::UserOperation::Encode:
|
|
is_encode = true;
|
|
break;
|
|
case NlpCodec::Runtime::UserOperation::Decode:
|
|
is_encode = false;
|
|
break;
|
|
}
|
|
|
|
// Do real codec related works.
|
|
// Try to open files
|
|
std::ifstream in_file;
|
|
in_file.open(user_request.mInputFile, std::ios_base::in | std::ios_base::binary);
|
|
std::ofstream out_file;
|
|
out_file.open(user_request.mOutputFile, std::ios_base::out | std::ios_base::binary);
|
|
// Check file status
|
|
if (!in_file.is_open() || !out_file.is_open()) {
|
|
throw NlpException("Fail to open input or output file.");
|
|
}
|
|
|
|
// Perform codec
|
|
if (is_encode) {
|
|
::NlpCodec::EncodeNlp(in_file, out_file);
|
|
} else {
|
|
::NlpCodec::DecodeNlp(in_file, out_file);
|
|
}
|
|
|
|
// Free resources
|
|
in_file.close();
|
|
out_file.close();
|
|
}
|
|
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
|
|
// Try parsing given arguments
|
|
NlpCodec::Runtime::UserRequest user_request;
|
|
try {
|
|
user_request = NlpCodec::Runtime::ResolveArguments(argc, argv);
|
|
} catch (const NlpCodec::NlpException& e) {
|
|
std::cerr << "[Argument Error] " << e.what() << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
// Try executing real wroker
|
|
try {
|
|
NlpCodec::Runtime::ExecuteWorker(user_request);
|
|
} catch (const NlpCodec::NlpException& e) {
|
|
std::cerr << "[Codec Error] " << e.what() << std::endl;
|
|
return 2;
|
|
}
|
|
|
|
return 0;
|
|
}
|