fix: fix NlpCodec compile issue.
- fix std::ifstream length getter. - use std::format in throwing exception.
This commit is contained in:
parent
52ea2745dd
commit
6193a2ede6
@ -9,9 +9,13 @@
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
#include <format>
|
||||||
|
|
||||||
namespace NlpCodec {
|
namespace NlpCodec {
|
||||||
|
|
||||||
|
/// @brief NlpCodec universal exception.
|
||||||
|
/// @details Once this exception is thrown, it means that something went wrong,
|
||||||
|
/// and main function should catch it, output error message and exit program immediately.
|
||||||
class NlpException : public std::exception {
|
class NlpException : public std::exception {
|
||||||
public:
|
public:
|
||||||
NlpException(const char* msg) : message(msg ? msg : "") {}
|
NlpException(const char* msg) : message(msg ? msg : "") {}
|
||||||
@ -22,25 +26,107 @@ namespace NlpCodec {
|
|||||||
std::string message;
|
std::string message;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// @brief The safe version of static_cast which throw exception
|
/// @brief The safe version of `static_cast` which throw exception
|
||||||
/// if given value can not be cast into given type (out of range).
|
/// if given value can not be cast into given type (out of range).
|
||||||
template<typename _TyTo, typename _TyFrom>
|
template<typename _TyTo, typename _TyFrom>
|
||||||
static constexpr _TyTo SafeCast(_TyFrom value) {
|
static constexpr _TyTo SafeCast(_TyFrom value) {
|
||||||
if (!std::in_range<_TyTo>(value))
|
if (!std::in_range<_TyTo>(value))
|
||||||
throw NlpException(
|
throw NlpException(
|
||||||
"Fail to cast integral number because given value is greater than container."
|
"Fail to cast integral number because given value is greater than container. "
|
||||||
"This is usually caused by your input or output file is too long.");
|
"This is usually caused by your input or output file is too long.");
|
||||||
return static_cast<_TyTo>(value);
|
return static_cast<_TyTo>(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// @brief The magic DWORD for file length encrption.
|
#pragma region "Encryption Stuff" {
|
||||||
/// @details It is actually the DWORD consisted by the first 4 bytes of XOR_ARRAY.
|
|
||||||
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
|
|
||||||
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
|
|
||||||
/// @brief The size of extra part of NLP file which store the size of original plain text file.
|
|
||||||
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
|
|
||||||
|
|
||||||
/// @brief The core array for data encryption.
|
/*
|
||||||
|
|
||||||
|
# NLP File Structure
|
||||||
|
|
||||||
|
|Annotation |Size |
|
||||||
|
|:--- |:--- |
|
||||||
|
|Body |variable |
|
||||||
|
|Raw File Length |4 bytes |
|
||||||
|
|Checksum |4 bytes |
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
The first part is a zlib compressed byte array.
|
||||||
|
Before any processing, we need to use zlib to decompress it first.
|
||||||
|
If we need to do the reverse operation, i.e. build this compressed byte array,
|
||||||
|
the compression level must be maximum value (best compression, e.g. 9).
|
||||||
|
|
||||||
|
After decompressing this byte array, we need to perform an extra step called the circular XOR operation
|
||||||
|
to get human-readable plain text data.
|
||||||
|
In this operation, we first have a hard-code `XOR_ARRAY`,
|
||||||
|
then the first byte of decompressed byte array will perform XOR operation with the first byte of `XOR_ARRAY` and so on.
|
||||||
|
When we reach the tail of `XOR_ARRAY`,
|
||||||
|
the next byte of decompressed byte array will perform XOR with the first byte of `XOR_ARRAY` again and so on.
|
||||||
|
That's the reason why we call this operation a "circular" XOR operation.
|
||||||
|
The reverse operation of this step is exactly the same operation.
|
||||||
|
This is because the reverse operation of XOR is to perform it again.
|
||||||
|
|
||||||
|
After all byte are XORed, we can get what we want,
|
||||||
|
a human-readable translation file in plain text for following processing.
|
||||||
|
|
||||||
|
## Raw File Length
|
||||||
|
|
||||||
|
The `uint32_t` field following Body is Raw File Length,
|
||||||
|
which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
|
||||||
|
It's convenient when decompress Body.
|
||||||
|
|
||||||
|
However, this field is encrypted when storing in NLP file.
|
||||||
|
We need to do some extra operations before using it.
|
||||||
|
Basically, what we need to do is XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
|
||||||
|
I don't know what the fuck this operation is. I just honestly translate the result of reverse work.
|
||||||
|
So, just do it and don't worry too much.
|
||||||
|
By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
|
||||||
|
|
||||||
|
The reverse operation, i.e. building this field when creating an NLP file, is also simple.
|
||||||
|
It is enough to simply reverse the steps I introduced above.
|
||||||
|
|
||||||
|
## Checksum
|
||||||
|
|
||||||
|
The `uint32_t` field following Raw File Length is Checksum,
|
||||||
|
which is just the CRC32 of Body.
|
||||||
|
This field is usually used to validate the integrity of NLP file.
|
||||||
|
|
||||||
|
Like Raw File Length, this field is also encrypted in NLP file.
|
||||||
|
But its encryption method is much simpler than that of Raw File Length.
|
||||||
|
To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
|
||||||
|
|
||||||
|
The reverse operation of this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
/// @brief The size of non-Body part of NLP file
|
||||||
|
/// @details Basically this size is the size of the combination of Raw File Length and Checksum field.
|
||||||
|
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
|
||||||
|
|
||||||
|
/// @brief The magic DWORD for Raw File Length field encryption.
/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;

/// @brief Encrypt Raw File Length field for writing NLP file.
/// @details The stored value is `-(value + 1)` (modulo 2^32) XORed with `MAGIC_DWORD`.
/// Modulo 2^32, `-(value + 1)` is exactly `~value`, so the unsigned bitwise form is used.
/// Doing the arithmetic on `int32_t` instead overflows for `value == 0x7FFFFFFF`,
/// which is undefined behavior and is rejected inside a constant expression.
/// @param value The plain raw file length.
/// @return The encrypted value which should be written into NLP file.
static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
    return static_cast<uint32_t>(~value) ^ MAGIC_DWORD;
}

/// @brief Decrypt Raw File Length field read from NLP file.
/// @details The plain value is `-1 - (MAGIC_DWORD ^ value)` (modulo 2^32), which is exactly
/// `~(MAGIC_DWORD ^ value)`. The unsigned bitwise form avoids any signed conversion.
/// @param value The encrypted value read from NLP file.
/// @return The plain raw file length.
static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
    return static_cast<uint32_t>(~(MAGIC_DWORD ^ value));
}
|
||||||
|
|
||||||
|
/// @brief The magic DWORD for Checksum field encryption.
/// @details EncryptChecksum() adds this offset and DecryptChecksum() subtracts it.
|
||||||
|
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
|
||||||
|
/// @brief Encrypt Checksum field for writing NLP file.
/// @param value The plain checksum.
/// @return The given value plus `CHECKSUM_OFFSET`; unsigned arithmetic wraps around on overflow.
|
||||||
|
static constexpr uint32_t EncryptChecksum(uint32_t value) {
|
||||||
|
return value + CHECKSUM_OFFSET;
|
||||||
|
}
|
||||||
|
/// @brief Decrypt Checksum field read from NLP file.
/// @param value The encrypted checksum.
/// @return The given value minus `CHECKSUM_OFFSET`; unsigned arithmetic wraps around on underflow.
|
||||||
|
static constexpr uint32_t DecryptChecksum(uint32_t value) {
|
||||||
|
return value - CHECKSUM_OFFSET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// @brief The core array for Body circular XOR encryption.
|
||||||
/// @details First byte will XOR with the first byte of this array, and so on.
|
/// @details First byte will XOR with the first byte of this array, and so on.
|
||||||
/// When reaching the tail of this array, next give byte will perform XOR with the first byte again and so on.
|
/// When reaching the tail of this array, next give byte will perform XOR with the first byte again and so on.
|
||||||
constexpr const uint8_t XOR_ARRAY[] {
|
constexpr const uint8_t XOR_ARRAY[] {
|
||||||
@ -53,37 +139,36 @@ namespace NlpCodec {
|
|||||||
0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
|
0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
|
||||||
0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
|
0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
|
||||||
};
|
};
|
||||||
/// @brief The size of above array.
|
/// @brief The size of `XOR_ARRAY`.
|
||||||
constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
|
constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
|
||||||
/// @brief A convenient mask for above array when performing modulo.
|
/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during circular XOR operation.
|
||||||
constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
|
constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
|
||||||
// Use a static_assert to confirm computed XOR_ARRAY_MASK is what we desired.
|
// Use a static_assert to confirm computed XOR_ARRAY_MASK is what we desired.
|
||||||
// Because some stupid programmers (like me) may change above array and fill a series of wrong data,
|
// Because some stupid programmers (like me) may change above array and fill a series of wrong data,
|
||||||
// then this mask was computed wrongly.
|
// then this mask was computed wrongly.
|
||||||
static_assert(XOR_ARRAY_MASK == 0x7Fu);
|
static_assert(XOR_ARRAY_MASK == 0x7Fu);
|
||||||
|
/// @brief Encrypt or decrypt decompressed Body field.
|
||||||
static void GeneralXorOperation(void* data, size_t data_len) {
|
static void CircularXorOperation(void* data, size_t data_len) {
|
||||||
uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
|
uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
|
||||||
for (size_t i = 0u; i < data_len; ++i) {
|
for (size_t i = 0u; i < data_len; ++i) {
|
||||||
ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
|
ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#pragma endregion }
|
||||||
|
|
||||||
/// @brief Get the length of given file stream.
|
/// @brief Get the length of given file stream.
|
||||||
static uint32_t GetFileLength(std::ifstream& fin) {
|
static uint32_t GetFileLength(std::ifstream& fin) {
|
||||||
// Fetch the types this stream used for following convenience.
|
// Fetch the types this stream used for following convenience.
|
||||||
using stream_pos_t = std::ifstream::pos_type;
|
|
||||||
using stream_off_t = std::ifstream::off_type;
|
using stream_off_t = std::ifstream::off_type;
|
||||||
|
|
||||||
// Backups current file cursor.
|
// Backups current file cursor.
|
||||||
stream_pos_t current_pos = fin.tellg();
|
stream_off_t current_pos = fin.tellg();
|
||||||
// Seek to the tail and get corresponding offset to get the length of file.
|
// Seek to the tail and get corresponding offset to get the length of file.
|
||||||
fin.seekg(0, std::ios_base::end);
|
fin.seekg(0, std::ios_base::end);
|
||||||
stream_pos_t tail_pos = fin.tellg();
|
stream_off_t tail_pos = fin.tellg();
|
||||||
if (std::numeric_limits<uint32_t>::max() < tail_pos)
|
|
||||||
throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");
|
|
||||||
// Restore to previous backup file cursor
|
// Restore to previous backup file cursor
|
||||||
fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
|
fin.seekg(current_pos, std::ios_base::beg);
|
||||||
|
|
||||||
// Safely reurn cast length.
|
// Safely reurn cast length.
|
||||||
return SafeCast<uint32_t>(tail_pos);
|
return SafeCast<uint32_t>(tail_pos);
|
||||||
@ -115,7 +200,7 @@ namespace NlpCodec {
|
|||||||
throw NlpException("Fail to read file data into buffer.");
|
throw NlpException("Fail to read file data into buffer.");
|
||||||
|
|
||||||
// Do XOR operation
|
// Do XOR operation
|
||||||
GeneralXorOperation(inbuf.get(), raw_size);
|
CircularXorOperation(inbuf.get(), raw_size);
|
||||||
|
|
||||||
// Do compress and get the size of compressed data.
|
// Do compress and get the size of compressed data.
|
||||||
uLongf dest_len = static_cast<uLongf>(computed_boundary);
|
uLongf dest_len = static_cast<uLongf>(computed_boundary);
|
||||||
@ -139,8 +224,8 @@ namespace NlpCodec {
|
|||||||
throw NlpException("Fail to write data into file.");
|
throw NlpException("Fail to write data into file.");
|
||||||
|
|
||||||
// Raw size and checksum need some extra encryption before writting
|
// Raw size and checksum need some extra encryption before writting
|
||||||
raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
|
raw_size = EncryptRawFileLength(raw_size);
|
||||||
checksum = checksum + CHECKSUM_OFFSET;
|
checksum = EncryptChecksum(checksum);
|
||||||
|
|
||||||
// Write raw size and checksum
|
// Write raw size and checksum
|
||||||
fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
|
fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
|
||||||
@ -167,8 +252,8 @@ namespace NlpCodec {
|
|||||||
fin.seekg(0, std::ios_base::beg);
|
fin.seekg(0, std::ios_base::beg);
|
||||||
|
|
||||||
// Raw size and checksum data need to do some extra decryption.
|
// Raw size and checksum data need to do some extra decryption.
|
||||||
expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
|
expected_raw_size = DecryptRawFileLength(expected_raw_size);
|
||||||
expected_checksum = expected_checksum - CHECKSUM_OFFSET;
|
expected_checksum = DecryptChecksum(expected_checksum);
|
||||||
|
|
||||||
// Allocate memory to store data
|
// Allocate memory to store data
|
||||||
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
|
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
|
||||||
@ -183,12 +268,10 @@ namespace NlpCodec {
|
|||||||
|
|
||||||
// Test checksum
|
// Test checksum
|
||||||
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
|
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
|
||||||
if (checksum != expected_checksum) {
|
if (checksum != expected_checksum)
|
||||||
fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
|
throw NlpException(
|
||||||
expected_checksum, checksum
|
std::format("Not matched crc32. Expect 0x{:<08x} got 0x{:<08x}.", expected_checksum, checksum).c_str()
|
||||||
);
|
);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Do decompress
|
// Do decompress
|
||||||
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
|
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
|
||||||
@ -201,7 +284,7 @@ namespace NlpCodec {
|
|||||||
throw NlpException("Zlib uncompress() failed.");
|
throw NlpException("Zlib uncompress() failed.");
|
||||||
|
|
||||||
// do xor operation
|
// do xor operation
|
||||||
GeneralXorOperation(outbuf.get(), expected_raw_size);
|
CircularXorOperation(outbuf.get(), expected_raw_size);
|
||||||
|
|
||||||
// Write result into file
|
// Write result into file
|
||||||
fout.write(outbuf.get(), expected_raw_size);
|
fout.write(outbuf.get(), expected_raw_size);
|
||||||
|
Loading…
Reference in New Issue
Block a user