Compare commits
3 Commits: 52ea2745dd ... master

Author | SHA1 | Date
---|---|---
 | 60fca862f3 |
 | b71f6867c5 |
 | 6193a2ede6 |
@ -9,9 +9,15 @@
 #include <limits>
 #include <stdexcept>
 #include <utility>
+#include <format>
 
 namespace NlpCodec {
 
+#pragma region Help Structs and Functions
+
+	/// @brief NlpCodec universal exception.
+	/// @details Once this exception is thrown, it means that something went wrong,
+	/// and the main function should catch it, output an error message, and exit the program immediately.
 	class NlpException : public std::exception {
 	public:
 		NlpException(const char* msg) : message(msg ? msg : "") {}
@ -22,25 +28,124 @@ namespace NlpCodec {
 		std::string message;
 	};
 
-	/// @brief The safe version of static_cast which throws an exception
+	/// @brief The safe version of `static_cast` which throws an exception
 	/// if the given value cannot be cast into the given type (out of range).
 	template<typename _TyTo, typename _TyFrom>
 	static constexpr _TyTo SafeCast(_TyFrom value) {
 		if (!std::in_range<_TyTo>(value))
 			throw NlpException(
-				"Fail to cast integral number because given value is greater than container."
+				"Fail to cast integral number because given value is greater than the type can hold. "
 				"This is usually caused by your input or output file being too long.");
 		return static_cast<_TyTo>(value);
 	}
 
-	/// @brief The magic DWORD for file length encryption.
-	/// @details It is actually the DWORD consisting of the first 4 bytes of XOR_ARRAY.
-	constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
-	constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
-	/// @brief The size of the extra part of the NLP file which stores the size of the original plain text file.
-	constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
+	/// @brief The safe version of `std::ifstream::read`.
+	/// Throws an exception if the read fails.
+	static void SafeRead(std::ifstream& fin, char* s, std::streamsize count) {
+		fin.read(s, count);
+		if (!fin.good() || fin.gcount() != count)
+			throw NlpException("Fail to read data from file.");
+	}
+	/// @brief The safe version of `std::ofstream::write`.
+	/// Throws an exception if the write fails.
+	static void SafeWrite(std::ofstream& fout, const char* s, std::streamsize count) {
+		fout.write(s, count);
+		if (!fout.good())
+			throw NlpException("Fail to write data into file.");
+	}
+
-	/// @brief The core array for data encryption.
+#pragma endregion
+
+#pragma region Encryption Stuff
+
+	/*
+
+	# NLP File Structure
+
+	|Annotation      |Size    |
+	|:---            |:---    |
+	|Body            |variable|
+	|Raw File Length |4 bytes |
+	|Checksum        |4 bytes |
+
+	## Body
+
+	The first part is a zlib-compressed byte array.
+	Before any processing, we need to use zlib to decompress it first.
+	For the reverse operation, i.e. building this compressed byte array,
+	the compression level must be the maximum value (best compression, i.e. 9).
+
+	After decompressing this byte array, we need an extra step, called the circular XOR operation,
+	to get human-readable plain text data.
+	In this operation, we first have a hard-coded `XOR_ARRAY`;
+	the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
+	When we reach the tail of `XOR_ARRAY`,
+	the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
+	That's why we call this operation a "circular" XOR operation.
+	The reverse operation of this step is exactly the same,
+	because the reverse of an XOR is to perform it again.
+
+	After all bytes are XORed, we get what we want:
+	a human-readable plain text translation file for further processing.
+
+	## Raw File Length
+
+	The `uint32_t` field following Body is Raw File Length,
+	which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
+	It's convenient when decompressing Body.
+
+	However, this field is encrypted when stored in the NLP file.
+	We need to do some extra operations before using it.
+	Basically, we XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
+	I don't know what the fuck this operation is; I just faithfully translated the result of the reverse engineering work.
+	So, just do it and don't worry too much.
+	By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
+
+	The reverse operation, i.e. building this field when creating an NLP file, is also simple:
+	just flip the whole sequence of steps introduced above.
+
+	## Checksum
+
+	The `uint32_t` field following Raw File Length is Checksum,
+	which is just the CRC32 of Body.
+	This field is usually used to validate the integrity of the NLP file.
+
+	Like Raw File Length, this field is also encrypted in the NLP file,
+	but its encryption method is considerably simpler than Raw File Length's.
+	To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
+
+	The reverse operation for this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
+
+	*/
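
(A minimal, self-contained sketch for illustration, not part of this commit: the circular XOR described above is its own inverse, since (b ^ k) ^ k == b for any byte b and key byte k. The two-byte kDemoKey here is hypothetical; the real codec uses the 128-byte XOR_ARRAY defined below.)

    #include <cstdint>
    #include <cstddef>

    constexpr uint8_t kDemoKey[]{0x2C, 0xA8}; // hypothetical stand-in key

    constexpr bool DemoRoundTrip() {
        uint8_t data[]{'N', 'L', 'P', '!'};
        for (size_t pass = 0u; pass < 2u; ++pass)              // encode pass, then decode pass
            for (size_t i = 0u; i < sizeof(data); ++i)
                data[i] ^= kDemoKey[i % sizeof(kDemoKey)];     // circular key indexing
        return data[0] == 'N' && data[3] == '!';               // original bytes restored
    }
    static_assert(DemoRoundTrip(), "circular XOR must be an involution");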
+
+	/// @brief The size of the non-Body part of the NLP file.
+	/// @details Basically this is the combined size of the Raw File Length and Checksum fields.
+	constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
+
+	/// @brief The magic DWORD for Raw File Length field encryption.
+	/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
+	constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
+	/// @brief Encrypt the Raw File Length field for writing an NLP file.
+	static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
+		return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
+	}
+	/// @brief Decrypt the Raw File Length field read from an NLP file.
+	static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
+		return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
+	}
+
+	/// @brief The magic offset for Checksum field encryption.
+	constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
+	/// @brief Encrypt the Checksum field for writing an NLP file.
+	static constexpr uint32_t EncryptChecksum(uint32_t value) {
+		return value + CHECKSUM_OFFSET;
+	}
+	/// @brief Decrypt the Checksum field read from an NLP file.
+	static constexpr uint32_t DecryptChecksum(uint32_t value) {
+		return value - CHECKSUM_OFFSET;
+	}
+
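(A quick sanity sketch, not from this commit: since the four helpers above are constexpr, their round-trip property can be checked at compile time; Decrypt(Encrypt(v)) = -1 - (MAGIC_DWORD ^ ((-(v + 1)) ^ MAGIC_DWORD)) = -1 + (v + 1) = v.)

    static_assert(DecryptRawFileLength(EncryptRawFileLength(0x12345678u)) == 0x12345678u);
    static_assert(DecryptChecksum(EncryptChecksum(0x0BADC0DEu)) == 0x0BADC0DEu);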
+	/// @brief The core array for the Body circular XOR encryption.
 	/// @details The first byte will be XORed with the first byte of this array, and so on.
 	/// When reaching the tail of this array, the next given byte will be XORed with the first byte again, and so on.
 	constexpr const uint8_t XOR_ARRAY[] {
@ -53,37 +158,36 @@ namespace NlpCodec {
 		0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
 		0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
 	};
-	/// @brief The size of the above array.
+	/// @brief The size of `XOR_ARRAY`.
 	constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
-	/// @brief A convenient mask for the above array when performing modulo.
+	/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during the circular XOR operation.
 	constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
 	// Use a static_assert to confirm the computed XOR_ARRAY_MASK is what we desired.
 	// Because some stupid programmers (like me) may change the above array and fill in a series of wrong data,
 	// and then this mask would be computed wrongly.
 	static_assert(XOR_ARRAY_MASK == 0x7Fu);
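(A small sketch, not from this commit: for a power-of-two array length, masking with length-1 is equivalent to modulo, which is exactly what the static_assert above guarantees for the 128-byte array.)

    static_assert(XOR_ARRAY_LEN == 128u);
    static_assert((130u & XOR_ARRAY_MASK) == (130u % XOR_ARRAY_LEN)); // both evaluate to 2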
-	static void GeneralXorOperation(void* data, size_t data_len) {
+	/// @brief Encrypt or decrypt the decompressed Body field.
+	static void CircularXorOperation(void* data, size_t data_len) {
 		uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
 		for (size_t i = 0u; i < data_len; ++i) {
 			ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
 		}
 	}
 
+#pragma endregion
 
 	/// @brief Get the length of the given file stream.
 	static uint32_t GetFileLength(std::ifstream& fin) {
 		// Fetch the types this stream uses, for following convenience.
-		using stream_pos_t = std::ifstream::pos_type;
 		using stream_off_t = std::ifstream::off_type;
 
 		// Back up the current file cursor.
-		stream_pos_t current_pos = fin.tellg();
+		stream_off_t current_pos = fin.tellg();
 		// Seek to the tail and get the corresponding offset to get the length of the file.
 		fin.seekg(0, std::ios_base::end);
-		stream_pos_t tail_pos = fin.tellg();
-		if (std::numeric_limits<uint32_t>::max() < tail_pos)
-			throw NlpException("The size of given file is too large. It should not be larger than the capacity of uint32_t.");
+		stream_off_t tail_pos = fin.tellg();
 		// Restore the previously backed-up file cursor.
-		fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
+		fin.seekg(current_pos, std::ios_base::beg);
 
 		// Safely return the cast length.
 		return SafeCast<uint32_t>(tail_pos);
@ -110,12 +214,10 @@
 			throw NlpException("Fail to allocate memory.");
 
 		// Read data from file to input buffer
-		fin.read(inbuf.get(), raw_size);
-		if (!fin.good() || fin.gcount() != raw_size)
-			throw NlpException("Fail to read file data into buffer.");
+		SafeRead(fin, inbuf.get(), raw_size);
 
 		// Do XOR operation
-		GeneralXorOperation(inbuf.get(), raw_size);
+		CircularXorOperation(inbuf.get(), raw_size);
 
 		// Do compress and get the size of compressed data.
 		uLongf dest_len = static_cast<uLongf>(computed_boundary);
@ -124,9 +226,9 @@
 			reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
 			Z_BEST_COMPRESSION
 		);
-		// Check ZLib result.
+		// Check zlib result.
 		if (ret != Z_OK)
-			throw NlpException("Zlib compress() failed.");
+			throw NlpException("zlib compress() failed.");
 		// Fetch final compressed size.
 		uint32_t compressed_size = SafeCast<uint32_t>(dest_len);
 
@ -134,21 +236,15 @@
 		uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));
 
 		// Write compressed data into file
-		fout.write(outbuf.get(), compressed_size);
-		if (!fout.good())
-			throw NlpException("Fail to write data into file.");
+		SafeWrite(fout, outbuf.get(), compressed_size);
 
 		// Raw size and checksum need some extra encryption before writing
-		raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
-		checksum = checksum + CHECKSUM_OFFSET;
+		raw_size = EncryptRawFileLength(raw_size);
+		checksum = EncryptChecksum(checksum);
 
 		// Write raw size and checksum
-		fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
-		if (!fout.good())
-			throw NlpException("Fail to write raw size into file.");
-		fout.write(reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
-		if (!fout.good())
-			throw NlpException("Fail to write checksum into file.");
+		SafeWrite(fout, reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
+		SafeWrite(fout, reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
 
 	}
 
@ -162,13 +258,13 @@
 		compressed_size -= TAIL_SIZE;
 		fin.seekg(compressed_size, std::ios_base::beg);
 		uint32_t expected_raw_size = 0u, expected_checksum = 0u;
-		fin.read(reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
-		fin.read(reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
+		SafeRead(fin, reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
+		SafeRead(fin, reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
 		fin.seekg(0, std::ios_base::beg);
 
 		// Raw size and checksum data need to do some extra decryption.
-		expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
-		expected_checksum = expected_checksum - CHECKSUM_OFFSET;
+		expected_raw_size = DecryptRawFileLength(expected_raw_size);
+		expected_checksum = DecryptChecksum(expected_checksum);
 
 		// Allocate memory to store data
 		std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@ -177,18 +273,14 @@
 			throw NlpException("Fail to allocate memory.");
 
 		// Read file into buffer
-		fin.read(inbuf.get(), compressed_size);
-		if (!fin.good() || fin.gcount() != compressed_size)
-			throw NlpException("Fail to read data into buffer.\n");
+		SafeRead(fin, inbuf.get(), compressed_size);
 
 		// Test checksum
 		uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
-		if (checksum != expected_checksum) {
-			fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
-				expected_checksum, checksum
-			);
-			return false;
-		}
+		if (checksum != expected_checksum)
+			throw NlpException(
+				std::format("Not matched crc32. Expect 0x{:<08x} got 0x{:<08x}.", expected_checksum, checksum).c_str()
+			);
 
 		// Do decompress
 		uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@ -198,15 +290,13 @@
 		);
 		// Check zlib result
 		if (ret != Z_OK)
-			throw NlpException("Zlib uncompress() failed.");
+			throw NlpException("zlib uncompress() failed.");
 
-		// do xor operation
-		GeneralXorOperation(outbuf.get(), expected_raw_size);
+		// Do XOR operation
+		CircularXorOperation(outbuf.get(), expected_raw_size);
 
 		// Write result into file
-		fout.write(outbuf.get(), expected_raw_size);
-		if (!fout.good())
-			throw NlpException("Fail to write data into file.");
+		SafeWrite(fout, outbuf.get(), expected_raw_size);
 
 	}
 
@ -233,16 +323,16 @@ namespace NlpCodec::Runtime {
 			<< "NlpCodec [encode | decode | version | help] <src> <dest>" << std::endl
 			<< std::endl
 			<< "version - print version info about this program." << std::endl
 			<< "help - print this page." << std::endl
 			<< std::endl
 			<< "encode - encode text file into NLP file." << std::endl
 			<< "decode - decode NLP file into text file." << std::endl
 			<< "<src> - the source file." << std::endl
-			<< " the path to text file in encode mode." << std::endl
-			<< " the path to NLP file in decode mode." << std::endl
+			<< " encode mode: the path to text file." << std::endl
+			<< " decode mode: the path to NLP file." << std::endl
 			<< "<dest> - the destination file." << std::endl
-			<< " the path to NLP file in encode mode." << std::endl
-			<< " the path to text file in decode mode." << std::endl
+			<< " encode mode: the path to NLP file." << std::endl
+			<< " decode mode: the path to text file." << std::endl
 			<< "" << std::endl;
 	}
 
NlpParser/JsonConverter.java (new file, 100 lines)
@ -0,0 +1,100 @@
+import java.util.Stack;
+import java.util.stream.Collectors;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+public class JsonConverter extends NlpBaseListener {
+	public JsonConverter() {
+		mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();
+		mRoot = new JsonObject();
+		mSection = new JsonArray();
+		mSectionStack = new Stack<JsonArray>();
+	}
+
+	/* ========== JSON related stuff ========== */
+
+	Gson mGsonInstance;
+
+	public String buildJsonString() {
+		return mGsonInstance.toJson(mRoot);
+	}
+
+	/* ========== Section layout related stuff ========== */
+
+	JsonObject mRoot;
+	JsonArray mSection;
+	Stack<JsonArray> mSectionStack;
+
+	private void pushSection() {
+		mSectionStack.push(mSection);
+		mSection = new JsonArray();
+	}
+
+	private void popSection() {
+		mSection = mSectionStack.pop();
+	}
+
+	/* ========== Listener ========== */
+
+	@Override
+	public void enterDocument(NlpParser.DocumentContext ctx) {
+		// insert language prop
+		mRoot.addProperty("language", StringHelper.cutLanguageHead(ctx.LANG_HEADER().getText()));
+	}
+
+	@Override
+	public void exitDocument(NlpParser.DocumentContext ctx) {
+		// insert document prop
+		mRoot.add("entries", mSection);
+	}
+
+	@Override
+	public void enterSection(NlpParser.SectionContext ctx) {
+		pushSection();
+	}
+
+	@Override
+	public void exitSection(NlpParser.SectionContext ctx) {
+		// create new object
+		JsonObject objSection = new JsonObject();
+		objSection.addProperty("section", StringHelper.cutSectionHead(ctx.SECTION_HEAD().getText()));
+		objSection.add("entries", mSection);
+		// pop and insert
+		popSection();
+		mSection.add(objSection);
+	}
+
+	@Override
+	public void enterSubSection(NlpParser.SubSectionContext ctx) {
+		pushSection();
+	}
+
+	@Override
+	public void exitSubSection(NlpParser.SubSectionContext ctx) {
+		// create new object
+		JsonObject objSubSection = new JsonObject();
+		objSubSection.addProperty("section", StringHelper.cutSectionHead(ctx.SUB_SECTION_HEAD().getText()));
+		objSubSection.add("entries", mSection);
+		// pop and insert
+		popSection();
+		mSection.add(objSubSection);
+	}
+
+	@Override
+	public void enterEntryString(NlpParser.EntryStringContext ctx) {
+		mSection.add(StringHelper.processString(ctx.ENTRY_STRING().getText()));
+	}
+
+	@Override
+	public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) {
+		mSection.add(StringHelper.processConcatedString(
+			ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList())));
+	}
+
+	@Override
+	public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) {
+		mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText()));
+	}
+}
NlpParser/MainRunner.java
@ -1,20 +1,6 @@
-// import antlr stuff
 import org.antlr.v4.runtime.*;
 import org.antlr.v4.runtime.tree.*;
-// import container
-import java.util.Stack;
-import java.util.stream.Collectors;
-import java.util.List;
-import java.lang.StringBuilder;
-// import json
-import com.google.gson.JsonArray;
-import com.google.gson.JsonObject;
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
-// import regex
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-// import io related
 import java.io.FileOutputStream;
 import java.io.FileInputStream;
 import java.io.OutputStreamWriter;
@ -22,179 +8,84 @@ import java.nio.charset.StandardCharsets;
 import java.nio.charset.Charset;
 
 public class MainRunner {
-	public static class NlpJsonConverter extends NlpBaseListener {
-		public NlpJsonConverter() {
-			mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();
-			mRoot = new JsonObject();
-			mSection = new JsonArray();
-			mSectionStack = new Stack<JsonArray>();
-		}
-		/* JSON related stuff */
-
-		Gson mGsonInstance;
-		public String buildJsonString() {
-			return mGsonInstance.toJson(mRoot);
-		}
-
-		/* String related stuff */
-
-		// \\\\[^\\rn] match the concator. concator must not be appended with \n \r or \\
-		// [^\\r\\n]*[\\r\\n]+ is match to line breaker.
-		private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
-		private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\"");
-		// private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\");
-		private static final Pattern mRegEscTab = Pattern.compile("\\t");
-		private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n");
-		private String cutLangHead(String strl) {
-			return strl.substring("Language:".length());
-		}
-		private String cutSectionHead(String strl) {
-			return strl.substring(1, strl.length() - 1);
-		}
-		private String cutString(String strl) {
-			return strl.substring(1, strl.length() - 1);
-		}
-		private String regulateString(String strl) {
-			strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement("")); // remove string concator
-			strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\"")); // replace "" with "
-			// strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\")); // leave double back slash alone. we still need it.
-			strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); // replace real escape to escape char
-			strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
-
-			return strl;
-		}
-		private String processString(String strl) {
-			return regulateString(cutString(strl));
-		}
-		private String processConcatedString(List<String> ls) {
-			StringBuilder sb = new StringBuilder();
-			for (String node : ls) {
-				sb.append(regulateString(cutString(node)));
-			}
-
-			return sb.toString();
-		}
-
-		/* Section layout related stuff */
-
-		JsonObject mRoot;
-		JsonArray mSection;
-		Stack<JsonArray> mSectionStack;
-		private void pushSection() {
-			mSectionStack.push(mSection);
-			mSection = new JsonArray();
-		}
-		private void popSection() {
-			mSection = mSectionStack.pop();
-		}
-
-		/* Listener */
-
-		@Override
-		public void enterDocument(NlpParser.DocumentContext ctx) {
-			// insert language prop
-			mRoot.addProperty("language", cutLangHead(ctx.LANG_HEADER().getText()));
-		}
-		@Override
-		public void exitDocument(NlpParser.DocumentContext ctx) {
-			// insert document prop
-			mRoot.add("entries", mSection);
-		}
-
-		@Override
-		public void enterSection(NlpParser.SectionContext ctx) {
-			pushSection();
-		}
-		@Override
-		public void exitSection(NlpParser.SectionContext ctx) {
-			// create new object
-			JsonObject objSection = new JsonObject();
-			objSection.addProperty("section", cutSectionHead(ctx.SECTION_HEAD().getText()));
-			objSection.add("entries", mSection);
-			// pop and insert
-			popSection();
-			mSection.add(objSection);
-		}
-
-		@Override
-		public void enterSubSection(NlpParser.SubSectionContext ctx) {
-			pushSection();
-		}
-		@Override
-		public void exitSubSection(NlpParser.SubSectionContext ctx) {
-			// create new object
-			JsonObject objSubSection = new JsonObject();
-			objSubSection.addProperty("section", cutSectionHead(ctx.SUB_SECTION_HEAD().getText()));
-			objSubSection.add("entries", mSection);
-			// pop and insert
-			popSection();
-			mSection.add(objSubSection);
-		}
-
-		@Override
-		public void enterEntryString(NlpParser.EntryStringContext ctx) {
-			mSection.add(processString(ctx.ENTRY_STRING().getText()));
-		}
-		@Override
-		public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) {
-			mSection.add(processConcatedString(
-				ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList())
-			));
-		}
-		@Override
-		public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) {
-			mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText()));
-		}
-	}
-
 	private static void printHelp() {
+		System.out.println("NlpParser Usage");
 		System.out.println("NlpParser <src> <dest>");
 		System.out.println();
-		System.out.println("<src> - the decoded nlp text file.");
+		System.out.println("<src> - the decoded NLP text file.");
 		System.out.println("<dest> - the output json file.");
 	}
 
-	public static void main(String[] args) throws Exception {
-		// check parameter
-		if (args.length != 2) {
-			System.out.println("[ERR] Invalid arguments!");
-			printHelp();
-			System.exit(1);
-		}
-
-		// open file stream
-		FileInputStream fin = null;
-		FileOutputStream fout = null;
-		try {
-			fin = new FileInputStream(args[0]);
-			fout = new FileOutputStream(args[1]);
-		} catch (Exception e) {
-			if (fin != null) fin.close();
-			if (fout != null) fout.close();
-
-			System.out.println("[ERR] Fail to open file!");
-			printHelp();
-			System.exit(1);
-		}
-
-		// start lex and parse
-		CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252"));
-		NlpLexer lexer = new NlpLexer(input);
-		CommonTokenStream tokens = new CommonTokenStream(lexer);
-		NlpParser parser = new NlpParser(tokens);
-
-		// walk tree to build json
-		ParseTree tree = parser.document();
-		ParseTreeWalker walker = new ParseTreeWalker();
-		NlpJsonConverter converter = new NlpJsonConverter();
-		walker.walk(converter, tree);
-
-		// write json
-		OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8);
-		fw.write(converter.buildJsonString());
-
-		// close file stream
-		fin.close();
-		fw.close();
+	private static class UserRequest {
+		public UserRequest(String input_filepath, String output_filepath) {
+			this.mInputFilePath = input_filepath;
+			this.mOutputFilePath = output_filepath;
+		}
+
+		String mInputFilePath;
+		String mOutputFilePath;
+
+		public String getInputFilePath() {
+			return this.mInputFilePath;
+		}
+
+		public String getOutputFilePath() {
+			return this.mOutputFilePath;
+		}
+	}
+
+	private static UserRequest resolveArguments(String[] args) throws Exception {
+		// Check parameter
+		if (args.length != 2) {
+			throw new Exception("Invalid arguments count!");
+		}
+		// Return fetched arguments
+		return new UserRequest(args[0], args[1]);
+	}
+
+	private static void executeWorker(UserRequest user_request) throws Exception {
+		// Use try-with-resources to safely manage file streams.
+		try (FileInputStream fin = new FileInputStream(user_request.getInputFilePath());
+			FileOutputStream fout = new FileOutputStream(user_request.getOutputFilePath());
+			OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8);) {
+			// Start lex and parse
+			CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252"));
+			NlpLexer lexer = new NlpLexer(input);
+			CommonTokenStream tokens = new CommonTokenStream(lexer);
+			NlpParser parser = new NlpParser(tokens);
+
+			// Walk tree to build json
+			ParseTree tree = parser.document();
+			ParseTreeWalker walker = new ParseTreeWalker();
+			JsonConverter converter = new JsonConverter();
+			walker.walk(converter, tree);
+
+			// Write json
+			fw.write(converter.buildJsonString());
+		}
+	}
+
+	public static void main(String[] args) throws Exception {
+		// Check argument
+		UserRequest user_request = null;
+		try {
+			user_request = resolveArguments(args);
+		} catch (Exception e) {
+			System.out.print("[Argument Error] ");
+			System.out.println(e.getMessage());
+			printHelp();
+			return;
+		}
+
+		// Call converter
+		try {
+			executeWorker(user_request);
+		} catch (Exception e) {
+			System.out.print("[Converter Error] ");
+			System.out.println(e.getMessage());
+			return;
+		}
 	}
 }
NlpParser/StringHelper.java (new file, 72 lines)
@ -0,0 +1,72 @@
+import java.util.List;
+import java.lang.StringBuilder;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+/**
+ * String related stuff
+ */
+public class StringHelper {
+
+	/*
+	 * Regex Constants.
+	 *
+	 * Hints:
+	 *
+	 * \\\\[^\\rn] matches the concatenator; the concatenator must not be followed by \n, \r, or \\.
+	 *
+	 * [^\\r\\n]*[\\r\\n]+ matches the rest of the line plus the line break.
+	 *
+	 */
+
+	private static final Pattern gRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
+	private static final Pattern gRegDoubleQuote = Pattern.compile("\\\"\\\"");
+	// private static final Pattern gRegEscSlash = Pattern.compile("\\\\\\\\");
+	private static final Pattern gRegEscTab = Pattern.compile("\\t");
+	private static final Pattern gRegEscEol = Pattern.compile("\\r?\\n");
+
+	public static String cutLanguageHead(String strl) {
+		return strl.substring("Language:".length());
+	}
+
+	public static String cutSectionHead(String strl) {
+		return strl.substring(1, strl.length() - 1);
+	}
+
+	public static String cutString(String strl) {
+		return strl.substring(1, strl.length() - 1);
+	}
+
+	public static String regulateString(String strl) {
+		// remove string concatenator
+		strl = gRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement(""));
+
+		// replace "" with "
+		strl = gRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));
+
+		// leave double back slash alone. we still need it.
+		// strl = gRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));
+
+		// replace real escape characters with escape sequences
+		strl = gRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));
+		strl = gRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
+
+		return strl;
+	}
+
+	public static String processString(String strl) {
+		return regulateString(cutString(strl));
+	}
+
+	public static String processConcatedString(List<String> ls) {
+		StringBuilder sb = new StringBuilder();
+		for (String node : ls) {
+			sb.append(regulateString(cutString(node)));
+		}
+
+		return sb.toString();
+	}
+
+}
@ -5,11 +5,11 @@
 ./NlpCodec/out/NlpCodec decode NlpSrc/VT50.nlp NlpSrc/VT50.txt
 
 cd NlpParser
-java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.json
-java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.json
-java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.json
-java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.json
-java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.json
+java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.nested.json
+java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.nested.json
+java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.nested.json
+java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.nested.json
+java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.nested.json
 cd ..
 
 cd NlpProc