fix: fix NlpCodec compile issue.

- fix std::ifstream length getter.
- use std::format when throwing exceptions.
yyc12345 2024-12-11 16:20:21 +08:00
parent 52ea2745dd
commit 6193a2ede6


@ -9,9 +9,13 @@
#include <limits>
#include <stdexcept>
#include <utility>
#include <format>
namespace NlpCodec {
/// @brief NlpCodec universal exception.
/// @details Once this exception is thrown, it means that something went wrong,
/// and the main function should catch it, output the error message, and exit the program immediately.
class NlpException : public std::exception {
public:
NlpException(const char* msg) : message(msg ? msg : "") {}
@ -22,7 +26,7 @@ namespace NlpCodec {
std::string message;
};
/// @brief The safe version of static_cast which throw exception
/// @brief The safe version of `static_cast` which throws an exception
/// if the given value cannot be cast into the given type (out of range).
template<typename _TyTo, typename _TyFrom>
static constexpr _TyTo SafeCast(_TyFrom value) {
@ -33,14 +37,96 @@ namespace NlpCodec {
return static_cast<_TyTo>(value);
}
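// Illustrative usage sketch (not part of the original file): SafeCast() either returns the
// converted value or throws NlpException when the given value does not fit into the target
// type. The function below is hypothetical and exists only for demonstration.
[[maybe_unused]] static void SafeCastUsageDemo() {
    uint32_t small_value = 42u;
    // 42 fits into uint8_t, so this call succeeds.
    uint8_t narrowed = SafeCast<uint8_t>(small_value);
    (void)narrowed;
    // By contrast, SafeCast<uint8_t>(1024u) would throw NlpException because 1024 > 255.
}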
/// @brief The magic DWORD for file length encrption.
/// @details It is actually the DWORD consisted by the first 4 bytes of XOR_ARRAY.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
/// @brief The size of extra part of NLP file which store the size of original plain text file.
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
#pragma region "Encryption Stuff" {
/// @brief The core array for data encryption.
/*
# NLP File Structure
|Field |Size |
|:--- |:--- |
|Body |variable |
|Raw File Length |4 bytes |
|Checksum |4 bytes |
## Body
The first part, Body, is a zlib-compressed byte array.
Before any further processing, we need to decompress it with zlib first.
If we need to do the reverse operation, i.e. build this compressed byte array,
the compression level must be the maximum value (best compression, i.e. 9).
After decompressing this byte array, we need an extra step, called the circular XOR operation,
to get human-readable plain text data.
In this operation, we first have a hard-coded `XOR_ARRAY`;
the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
When we reach the tail of `XOR_ARRAY`,
the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
That is why we call this the "circular" XOR operation.
The reverse operation of this step is exactly the same,
because the inverse of XOR is simply performing XOR again.
After all bytes are XORed, we get what we want:
a human-readable plain text translation file for the following processing.
## Raw File Length
The `uint32_t` field following Body is Raw File Length,
which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
It is convenient when decompressing Body.
However, this field is stored encrypted in the NLP file.
We need to do some extra operations before using it.
Basically, we XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
I don't know what the fuck this operation means; I just faithfully translated the result of the reverse engineering work.
So just do it and don't worry too much.
By the way, the value of `MAGIC_DWORD` is just the first 4 bytes of `XOR_ARRAY` combined in little endian.
The reverse operation, i.e. encrypting this field when creating an NLP file, is also simple:
just flip the whole sequence of steps introduced above.
## Checksum
The `uint32_t` field following Raw File Length is Checksum,
which is just the CRC32 of Body.
This field is usually used to validate the integrity of the NLP file.
Like Raw File Length, this field is also stored encrypted in the NLP file.
But its encryption method is much simpler than Raw File Length's.
To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
The reverse operation for this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
*/
/// @brief The size of the non-Body part of the NLP file.
/// @details Basically, this is the combined size of the Raw File Length and Checksum fields.
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
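// Illustrative sanity check (not part of the original file): assuming the usual 8-bit bytes,
// the non-Body tail described in the structure comment above occupies exactly 8 bytes
// (4 bytes Raw File Length + 4 bytes Checksum).
static_assert(TAIL_SIZE == 8u);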
/// @brief The magic DWORD for Raw File Length field encryption.
/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
/// @brief Encrypt Raw File Length field for writing NLP file.
static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
}
/// @brief Decrypt Raw File Length field read from NLP file.
static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
}
/// @brief The magic offset for Checksum field encryption.
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
/// @brief Encrypt Checksum field for writing NLP file.
static constexpr uint32_t EncryptChecksum(uint32_t value) {
return value + CHECKSUM_OFFSET;
}
/// @brief Decrypt Checksum field read from NLP file.
static constexpr uint32_t DecryptChecksum(uint32_t value) {
return value - CHECKSUM_OFFSET;
}
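// Illustrative round-trip checks (not part of the original file): for sample values that do not
// trigger signed overflow during constant evaluation, the decrypt helpers above exactly invert
// the corresponding encrypt helpers, mirroring the field descriptions in the structure comment.
static_assert(DecryptRawFileLength(EncryptRawFileLength(100u)) == 100u);
static_assert(DecryptChecksum(EncryptChecksum(0xDEADBEEFu)) == 0xDEADBEEFu);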
/// @brief The core array for Body circular XOR encryption.
/// @details The first byte is XORed with the first byte of this array, and so on.
/// When reaching the tail of this array, the next given byte is XORed with the first byte of this array again, and so on.
constexpr const uint8_t XOR_ARRAY[] {
@ -53,37 +139,36 @@ namespace NlpCodec {
0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
};
/// @brief The size of above array.
/// @brief The size of `XOR_ARRAY`.
constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
/// @brief A convenient mask for above array when performing modulo.
/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during the circular XOR operation.
constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
// Use a static_assert to confirm that the computed XOR_ARRAY_MASK is what we desire,
// because some stupid programmers (like me) may change the above array and fill in the wrong amount of data,
// causing this mask to be computed wrongly.
static_assert(XOR_ARRAY_MASK == 0x7Fu);
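// Illustrative check (not part of the original file): because XOR_ARRAY_LEN is a power of two,
// masking an index with XOR_ARRAY_MASK is equivalent to taking it modulo XOR_ARRAY_LEN,
// which is what the circular XOR operation below relies on.
static_assert((300u & XOR_ARRAY_MASK) == 300u % XOR_ARRAY_LEN);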
static void GeneralXorOperation(void* data, size_t data_len) {
/// @brief Encrypt or decrypt decompressed Body field.
static void CircularXorOperation(void* data, size_t data_len) {
uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
for (size_t i = 0u; i < data_len; ++i) {
ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
}
}
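// Illustrative usage sketch (not part of the original file): because XOR is its own inverse,
// applying CircularXorOperation() twice restores the original bytes, which is exactly how the
// Body field is both encrypted and decrypted. The function below is hypothetical and exists
// only for demonstration.
[[maybe_unused]] static void CircularXorRoundTripDemo() {
    uint8_t sample[] { 0x4Eu, 0x4Cu, 0x50u };     // arbitrary demo bytes ("NLP")
    CircularXorOperation(sample, sizeof(sample)); // obfuscate
    CircularXorOperation(sample, sizeof(sample)); // second pass restores the original bytes
}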
#pragma endregion }
/// @brief Get the length of given file stream.
static uint32_t GetFileLength(std::ifstream& fin) {
// Fetch the types used by this stream, for convenience below.
using stream_pos_t = std::ifstream::pos_type;
using stream_off_t = std::ifstream::off_type;
// Back up the current file cursor.
stream_pos_t current_pos = fin.tellg();
stream_off_t current_pos = fin.tellg();
// Seek to the tail and use the corresponding offset as the length of the file.
fin.seekg(0, std::ios_base::end);
stream_pos_t tail_pos = fin.tellg();
if (std::numeric_limits<uint32_t>::max() < tail_pos)
throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");
stream_off_t tail_pos = fin.tellg();
// Restore the previously backed-up file cursor.
fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
fin.seekg(current_pos, std::ios_base::beg);
// Safely cast and return the length.
return SafeCast<uint32_t>(tail_pos);
@ -115,7 +200,7 @@ namespace NlpCodec {
throw NlpException("Fail to read file data into buffer.");
// Do XOR operation
GeneralXorOperation(inbuf.get(), raw_size);
CircularXorOperation(inbuf.get(), raw_size);
// Do compress and get the size of compressed data.
uLongf dest_len = static_cast<uLongf>(computed_boundary);
@ -139,8 +224,8 @@ namespace NlpCodec {
throw NlpException("Fail to write data into file.");
// Raw size and checksum need some extra encryption before writing
raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
checksum = checksum + CHECKSUM_OFFSET;
raw_size = EncryptRawFileLength(raw_size);
checksum = EncryptChecksum(checksum);
// Write raw size and checksum
fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
@ -167,8 +252,8 @@ namespace NlpCodec {
fin.seekg(0, std::ios_base::beg);
// Raw size and checksum need some extra decryption before use.
expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
expected_checksum = expected_checksum - CHECKSUM_OFFSET;
expected_raw_size = DecryptRawFileLength(expected_raw_size);
expected_checksum = DecryptChecksum(expected_checksum);
// Allocate memory to store data
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@ -183,12 +268,10 @@ namespace NlpCodec {
// Test checksum
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
if (checksum != expected_checksum) {
fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
expected_checksum, checksum
if (checksum != expected_checksum)
throw NlpException(
std::format("Not matched crc32. Expect 0x{:<08x} got 0x{:<08x}.", expected_checksum, checksum).c_str()
);
return false;
}
// Do decompress
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@ -201,7 +284,7 @@ namespace NlpCodec {
throw NlpException("Zlib uncompress() failed.");
// Do XOR operation
GeneralXorOperation(outbuf.get(), expected_raw_size);
CircularXorOperation(outbuf.get(), expected_raw_size);
// Write result into file
fout.write(outbuf.get(), expected_raw_size);