Compare commits: 52ea2745dd...master (3 commits)

Commits (SHA1):
- 60fca862f3
- b71f6867c5
- 6193a2ede6
@@ -9,9 +9,15 @@
 #include <limits>
 #include <stdexcept>
 #include <utility>
+#include <format>

 namespace NlpCodec {

+#pragma region Help Structs and Functions
+
 	/// @brief NlpCodec universal exception.
+	/// @details Once this exception is thrown, something went wrong; the main function
+	/// should catch it, print the error message, and exit the program immediately.
 	class NlpException : public std::exception {
 	public:
 		NlpException(const char* msg) : message(msg ? msg : "") {}
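Per the @details note above, NlpException is designed to be caught once at the top level. A minimal sketch of that contract (editor's example, not part of the diff; it assumes NlpException overrides what() to return the stored message):

    #include <cstdio>

    int main(int argc, char* argv[]) {
        try {
            // ... run the encode/decode work here ...
        } catch (const NlpCodec::NlpException& e) {
            std::fprintf(stderr, "[ERR] %s\n", e.what()); // report the error message
            return 1;                                     // and exit immediately
        }
        return 0;
    }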
@@ -22,25 +28,124 @@ namespace NlpCodec {
 		std::string message;
 	};

-	/// @brief The safe version of static_cast which throw exception
+	/// @brief The safe version of `static_cast` which throws an exception
 	/// if given value can not be cast into given type (out of range).
 	template<typename _TyTo, typename _TyFrom>
 	static constexpr _TyTo SafeCast(_TyFrom value) {
 		if (!std::in_range<_TyTo>(value))
 			throw NlpException(
-				"Fail to cast integral number because given value is greater than container."
+				"Fail to cast integral number because the given value is greater than the type can hold. "
 				"This is usually caused by an input or output file that is too long.");
 		return static_cast<_TyTo>(value);
 	}

-	/// @brief The magic DWORD for file length encrption.
-	/// @details It is actually the DWORD consisted by the first 4 bytes of XOR_ARRAY.
-	constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
-	constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
-	/// @brief The size of extra part of NLP file which store the size of original plain text file.
-	constexpr const size_t TAIL_SIZE = sizeof(uint32_t) * 2u;
+	/// @brief The safe version of `std::ifstream::read`.
+	/// @details Throws an exception if reading fails.
+	static void SafeRead(std::ifstream& fin, char* s, std::streamsize count) {
+		fin.read(s, count);
+		if (!fin.good() || fin.gcount() != count)
+			throw NlpException("Fail to read data from file.");
+	}
+	/// @brief The safe version of `std::ofstream::write`.
+	/// @details Throws an exception if writing fails.
+	static void SafeWrite(std::ofstream& fout, const char* s, std::streamsize count) {
+		fout.write(s, count);
+		if (!fout.good())
+			throw NlpException("Fail to write data into file.");
+	}

-	/// @brief The core array for data encryption.
+#pragma endregion
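As a quick illustration of SafeCast's contract (editor's check, not part of the diff): narrowing succeeds when the value fits the target type, and throws NlpException at run time when it does not.

    static_assert(NlpCodec::SafeCast<uint8_t>(255) == 255u); // 255 fits in uint8_t
    // NlpCodec::SafeCast<uint8_t>(256) compiles fine but throws NlpException at
    // run time, because std::in_range<uint8_t>(256) is false.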

+#pragma region Encryption Stuff
+
+	/*
+
+	# NLP File Structure
+
+	|Annotation      |Size    |
+	|:---            |:---    |
+	|Body            |variable|
+	|Raw File Length |4 bytes |
+	|Checksum        |4 bytes |
+
+	## Body
+
+	The first part is a zlib-compressed byte array.
+	Before any processing, we need to decompress it with zlib first.
+	For the reverse operation, i.e. building this compressed byte array,
+	the compression level must be the maximum value (best compression, i.e. 9).
+
+	After decompressing this byte array, we need an extra step, called the circular XOR operation,
+	to get human-readable plain text data.
+	In this operation, we start from a hard-coded `XOR_ARRAY`:
+	the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
+	When we reach the tail of `XOR_ARRAY`,
+	the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
+	That is why we call this the "circular" XOR operation.
+	The reverse operation of this step is exactly the same,
+	because the inverse of XOR is to apply it again.
+
+	After all bytes are XORed, we get what we want:
+	a human-readable translation file in plain text for further processing.
+
+	## Raw File Length
+
+	The `uint32_t` field following Body is Raw File Length,
+	which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
+	It is convenient when decompressing Body.
+
+	However, this field is encrypted when stored in the NLP file,
+	and we need some extra operations before using it.
+	Basically, we XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
+	I don't know what this operation is supposed to mean; I just faithfully translated the result of reverse engineering.
+	So just do it and don't worry too much.
+	By the way, the value of `MAGIC_DWORD` is just the first 4 bytes of `XOR_ARRAY` combined in little endian.
+
+	The reverse operation, i.e. building this field when creating an NLP file, is also simple:
+	just flip the steps introduced above.
+
+	## Checksum
+
+	The `uint32_t` field following Raw File Length is Checksum,
+	which is just the Adler-32 checksum of Body (computed with zlib's `adler32()`).
+	This field is used to validate the integrity of the NLP file.
+
+	Like Raw File Length, this field is also encrypted in the NLP file,
+	but its encryption method is considerably simpler than Raw File Length's:
+	to decrypt it, just subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
+
+	The reverse operation for this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
+
+	*/

+	/// @brief The size of the non-Body part of the NLP file.
+	/// @details Basically, this is the combined size of the Raw File Length and Checksum fields.
+	constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
+
+	/// @brief The magic DWORD for Raw File Length field encryption.
+	/// @details It is actually the first 4 bytes of `XOR_ARRAY` combined in little endian.
+	constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
+	/// @brief Encrypt the Raw File Length field for writing an NLP file.
+	static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
+		return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
+	}
+	/// @brief Decrypt the Raw File Length field read from an NLP file.
+	static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
+		return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
+	}
+
+	/// @brief The offset for Checksum field encryption.
+	constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
+	/// @brief Encrypt the Checksum field for writing an NLP file.
+	static constexpr uint32_t EncryptChecksum(uint32_t value) {
+		return value + CHECKSUM_OFFSET;
+	}
+	/// @brief Decrypt the Checksum field read from an NLP file.
+	static constexpr uint32_t DecryptChecksum(uint32_t value) {
+		return value - CHECKSUM_OFFSET;
+	}
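A quick sanity check on these field ciphers (editor's sketch, not part of the diff; the sample values are arbitrary). Note that -(x + 1) is just ~x in two's complement, so EncryptRawFileLength is a bitwise NOT followed by the XOR, and both pairs are exact inverses:

    static_assert(NlpCodec::EncryptRawFileLength(0x00000100u) == 0x06A956D3u); // ~0x100 ^ 0xF956A82C
    static_assert(NlpCodec::DecryptRawFileLength(NlpCodec::EncryptRawFileLength(0x00001234u)) == 0x00001234u);
    static_assert(NlpCodec::DecryptChecksum(NlpCodec::EncryptChecksum(0xDEADBEEFu)) == 0xDEADBEEFu);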

+	/// @brief The core array for the Body circular XOR encryption.
+	/// @details The first byte is XORed with the first byte of this array, and so on.
+	/// When reaching the tail of this array, the next given byte is XORed with the first byte again, and so on.
 	constexpr const uint8_t XOR_ARRAY[] {
@@ -53,37 +158,36 @@ namespace NlpCodec {
 		0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
 		0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
 	};
-	/// @brief The size of above array.
+	/// @brief The size of `XOR_ARRAY`.
 	constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
-	/// @brief A convenient mask for above array when performing modulo.
+	/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during the circular XOR operation.
 	constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
 	// Use a static_assert to confirm that the computed XOR_ARRAY_MASK is what we desire,
 	// because some stupid programmers (like me) may change the above array, fill in a series of wrong data,
 	// and end up with a wrongly computed mask.
 	static_assert(XOR_ARRAY_MASK == 0x7Fu);
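The mask only works because the array length is a power of two (128 bytes here), which makes `i & XOR_ARRAY_MASK` equivalent to `i % XOR_ARRAY_LEN`; that equivalence is what the static_assert above ultimately protects. A spot check (editor's sketch, not part of the diff):

    static_assert((130u & NlpCodec::XOR_ARRAY_MASK) == (130u % NlpCodec::XOR_ARRAY_LEN)); // both equal 2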

-	static void GeneralXorOperation(void* data, size_t data_len) {
+	/// @brief Encrypt or decrypt the decompressed Body field.
+	static void CircularXorOperation(void* data, size_t data_len) {
 		uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
 		for (size_t i = 0u; i < data_len; ++i) {
 			ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
 		}
 	}

+#pragma endregion
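Because XOR is its own inverse, CircularXorOperation serves for both encryption and decryption, and the whole decode path falls out of the helpers above. A self-contained sketch (editor's example, not part of the diff; it assumes the declarations above are visible in the same translation unit and that zlib is linked):

    #include <cstring>
    #include <vector>
    #include <zlib.h>

    static std::vector<uint8_t> DecodeNlpBuffer(const std::vector<uint8_t>& file) {
        // Split the 8-byte tail (Raw File Length + Checksum) from the Body.
        size_t body_size = file.size() - NlpCodec::TAIL_SIZE;
        uint32_t raw_size = 0u, checksum = 0u;
        std::memcpy(&raw_size, file.data() + body_size, sizeof(uint32_t));
        std::memcpy(&checksum, file.data() + body_size + sizeof(uint32_t), sizeof(uint32_t));
        raw_size = NlpCodec::DecryptRawFileLength(raw_size);
        checksum = NlpCodec::DecryptChecksum(checksum);
        // Verify the Body, then decompress it and strip the circular XOR.
        if (adler32(0u, file.data(), static_cast<uInt>(body_size)) != checksum)
            throw NlpCodec::NlpException("Checksum mismatch.");
        std::vector<uint8_t> out(raw_size);
        uLongf dest_len = static_cast<uLongf>(raw_size);
        if (uncompress(out.data(), &dest_len, file.data(), static_cast<uLong>(body_size)) != Z_OK)
            throw NlpCodec::NlpException("zlib uncompress() failed.");
        NlpCodec::CircularXorOperation(out.data(), out.size());
        return out;
    }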

 	/// @brief Get the length of the given file stream.
 	static uint32_t GetFileLength(std::ifstream& fin) {
 		// Fetch the types this stream uses, for convenience below.
 		using stream_pos_t = std::ifstream::pos_type;
 		using stream_off_t = std::ifstream::off_type;

 		// Back up the current file cursor.
-		stream_pos_t current_pos = fin.tellg();
+		stream_off_t current_pos = fin.tellg();
 		// Seek to the tail and take the corresponding offset as the length of the file.
 		fin.seekg(0, std::ios_base::end);
-		stream_pos_t tail_pos = fin.tellg();
-		if (std::numeric_limits<uint32_t>::max() < tail_pos)
-			throw NlpException("The size of given file is too large. It should not larger than the capacity of uint32_t.");
+		stream_off_t tail_pos = fin.tellg();
 		// Restore the file cursor to the previous backup.
-		fin.seekg(static_cast<stream_off_t>(current_pos), std::ios_base::beg);
+		fin.seekg(current_pos, std::ios_base::beg);

 		// Safely return the cast length.
 		return SafeCast<uint32_t>(tail_pos);
@@ -110,12 +214,10 @@
 			throw NlpException("Fail to allocate memory.");

 		// Read data from file into the input buffer.
-		fin.read(inbuf.get(), raw_size);
-		if (!fin.good() || fin.gcount() != raw_size)
-			throw NlpException("Fail to read file data into buffer.");
+		SafeRead(fin, inbuf.get(), raw_size);

 		// Do the XOR operation.
-		GeneralXorOperation(inbuf.get(), raw_size);
+		CircularXorOperation(inbuf.get(), raw_size);

 		// Do the compression and get the size of the compressed data.
 		uLongf dest_len = static_cast<uLongf>(computed_boundary);
@@ -124,9 +226,9 @@
 			reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
 			Z_BEST_COMPRESSION
 		);
-		// Check ZLib result.
+		// Check zlib result.
 		if (ret != Z_OK)
-			throw NlpException("Zlib compress() failed.");
+			throw NlpException("zlib compress() failed.");
 		// Fetch the final compressed size.
 		uint32_t compressed_size = SafeCast<uint32_t>(dest_len);

@@ -134,21 +236,15 @@
 		uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));

 		// Write the compressed data into the file.
-		fout.write(outbuf.get(), compressed_size);
-		if (!fout.good())
-			throw NlpException("Fail to write data into file.");
+		SafeWrite(fout, outbuf.get(), compressed_size);

 		// Raw size and checksum need some extra encryption before writing.
-		raw_size = static_cast<uint32_t>(-(static_cast<int32_t>(raw_size) + 1)) ^ MAGIC_DWORD;
-		checksum = checksum + CHECKSUM_OFFSET;
+		raw_size = EncryptRawFileLength(raw_size);
+		checksum = EncryptChecksum(checksum);

 		// Write the raw size and checksum.
-		fout.write(reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
-		if (!fout.good())
-			throw NlpException("Fail to write raw size into file.");
-		fout.write(reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
-		if (!fout.good())
-			throw NlpException("Fail to write checksum into file.");
+		SafeWrite(fout, reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
+		SafeWrite(fout, reinterpret_cast<char*>(&checksum), sizeof(uint32_t));

 	}
@@ -162,13 +258,13 @@
 		compressed_size -= TAIL_SIZE;
 		fin.seekg(compressed_size, std::ios_base::beg);
 		uint32_t expected_raw_size = 0u, expected_checksum = 0u;
-		fin.read(reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
-		fin.read(reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
+		SafeRead(fin, reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
+		SafeRead(fin, reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
 		fin.seekg(0, std::ios_base::beg);

 		// The raw size and checksum need some extra decryption.
-		expected_raw_size = static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ expected_raw_size));
-		expected_checksum = expected_checksum - CHECKSUM_OFFSET;
+		expected_raw_size = DecryptRawFileLength(expected_raw_size);
+		expected_checksum = DecryptChecksum(expected_checksum);

 		// Allocate memory to store the data.
 		std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@@ -177,18 +273,14 @@
 			throw NlpException("Fail to allocate memory.");

 		// Read the file into the buffer.
-		fin.read(inbuf.get(), compressed_size);
-		if (!fin.good() || fin.gcount() != compressed_size)
-			throw NlpException("Fail to read data into buffer.\n");
+		SafeRead(fin, inbuf.get(), compressed_size);

 		// Test the checksum.
 		uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
-		if (checksum != expected_checksum) {
-			fprintf(stdout, "[ERR] Fail to match crc32. Expect 0x%" PRIx32 " got 0x%" PRIx32 ".\n",
-				expected_checksum, checksum
+		if (checksum != expected_checksum)
+			throw NlpException(
+				std::format("Checksum mismatch. Expect 0x{:08x} but got 0x{:08x}.", expected_checksum, checksum).c_str()
 			);
-			return false;
-		}

 		// Do the decompression.
 		uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@@ -198,15 +290,13 @@
 		);
 		// Check zlib result.
 		if (ret != Z_OK)
-			throw NlpException("Zlib uncompress() failed.");
+			throw NlpException("zlib uncompress() failed.");

-		// do xor operation
-		GeneralXorOperation(outbuf.get(), expected_raw_size);
+		// Do the XOR operation.
+		CircularXorOperation(outbuf.get(), expected_raw_size);

 		// Write the result into the file.
-		fout.write(outbuf.get(), expected_raw_size);
-		if (!fout.good())
-			throw NlpException("Fail to write data into file.");
+		SafeWrite(fout, outbuf.get(), expected_raw_size);

 	}
@@ -238,11 +328,11 @@ namespace NlpCodec::Runtime {
 			<< "encode - encode text file into NLP file." << std::endl
 			<< "decode - decode NLP file into text file." << std::endl
 			<< "<src>  - the source file." << std::endl
-			<< "         the path to text file in encode mode." << std::endl
-			<< "         the path to NLP file in decode mode." << std::endl
+			<< "         encode mode: the path to text file." << std::endl
+			<< "         decode mode: the path to NLP file." << std::endl
 			<< "<dest> - the destination file." << std::endl
-			<< "         the path to NLP file in encode mode." << std::endl
-			<< "         the path to text file in decode mode." << std::endl
+			<< "         encode mode: the path to NLP file." << std::endl
+			<< "         decode mode: the path to text file." << std::endl
 			<< "" << std::endl;
 	}

NlpParser/JsonConverter.java (new file, 100 lines)

@@ -0,0 +1,100 @@
import java.util.Stack;
import java.util.stream.Collectors;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

public class JsonConverter extends NlpBaseListener {
    public JsonConverter() {
        mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();
        mRoot = new JsonObject();
        mSection = new JsonArray();
        mSectionStack = new Stack<JsonArray>();
    }

    /* ========== JSON related stuff ========== */

    Gson mGsonInstance;

    public String buildJsonString() {
        return mGsonInstance.toJson(mRoot);
    }

    /* ========== Section layout related stuff ========== */

    JsonObject mRoot;
    JsonArray mSection;
    Stack<JsonArray> mSectionStack;

    private void pushSection() {
        mSectionStack.push(mSection);
        mSection = new JsonArray();
    }

    private void popSection() {
        mSection = mSectionStack.pop();
    }

    /* ========== Listener ========== */

    @Override
    public void enterDocument(NlpParser.DocumentContext ctx) {
        // insert language prop
        mRoot.addProperty("language", StringHelper.cutLanguageHead(ctx.LANG_HEADER().getText()));
    }

    @Override
    public void exitDocument(NlpParser.DocumentContext ctx) {
        // insert document prop
        mRoot.add("entries", mSection);
    }

    @Override
    public void enterSection(NlpParser.SectionContext ctx) {
        pushSection();
    }

    @Override
    public void exitSection(NlpParser.SectionContext ctx) {
        // create new object
        JsonObject objSection = new JsonObject();
        objSection.addProperty("section", StringHelper.cutSectionHead(ctx.SECTION_HEAD().getText()));
        objSection.add("entries", mSection);
        // pop and insert
        popSection();
        mSection.add(objSection);
    }

    @Override
    public void enterSubSection(NlpParser.SubSectionContext ctx) {
        pushSection();
    }

    @Override
    public void exitSubSection(NlpParser.SubSectionContext ctx) {
        // create new object
        JsonObject objSubSection = new JsonObject();
        objSubSection.addProperty("section", StringHelper.cutSectionHead(ctx.SUB_SECTION_HEAD().getText()));
        objSubSection.add("entries", mSection);
        // pop and insert
        popSection();
        mSection.add(objSubSection);
    }

    @Override
    public void enterEntryString(NlpParser.EntryStringContext ctx) {
        mSection.add(StringHelper.processString(ctx.ENTRY_STRING().getText()));
    }

    @Override
    public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) {
        mSection.add(StringHelper.processConcatedString(
            ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList())));
    }

    @Override
    public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) {
        mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText()));
    }
}
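For reference, the converter's output has the following shape (editor's illustration with made-up section names and entries; not part of the diff). The root object carries the language header, sections and subsections become nested objects thanks to the push/pop section stack, and plain entries stay as strings or integers:

    {
      "language": "English",
      "entries": [
        {
          "section": "Menus",
          "entries": [
            "Start Game",
            {
              "section": "Options",
              "entries": ["Music Volume", 10]
            }
          ]
        }
      ]
    }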
NlpParser/MainRunner.java

@@ -1,20 +1,6 @@
-// import antlr stuff
 import org.antlr.v4.runtime.*;
 import org.antlr.v4.runtime.tree.*;
-// import container
-import java.util.Stack;
-import java.util.stream.Collectors;
-import java.util.List;
-import java.lang.StringBuilder;
-// import json
-import com.google.gson.JsonArray;
-import com.google.gson.JsonObject;
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
-// import regex
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-// import io related
+
 import java.io.FileOutputStream;
 import java.io.FileInputStream;
 import java.io.OutputStreamWriter;
@@ -22,179 +8,84 @@ import java.nio.charset.StandardCharsets;
 import java.nio.charset.Charset;

 public class MainRunner {
-    public static class NlpJsonConverter extends NlpBaseListener {
-        public NlpJsonConverter() {
-            mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();
-            mRoot = new JsonObject();
-            mSection = new JsonArray();
-            mSectionStack = new Stack<JsonArray>();
-        }
-        /* JSON related stuff */
-
-        Gson mGsonInstance;
-        public String buildJsonString() {
-            return mGsonInstance.toJson(mRoot);
-        }
-
-        /* String related stuff */
-
-        // \\\\[^\\rn] match the concator. concator must not be appended with \n \r or \\
-        // [^\\r\\n]*[\\r\\n]+ is match to line breaker.
-        private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
-        private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\"");
-        // private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\");
-        private static final Pattern mRegEscTab = Pattern.compile("\\t");
-        private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n");
-        private String cutLangHead(String strl) {
-            return strl.substring("Language:".length());
-        }
-        private String cutSectionHead(String strl) {
-            return strl.substring(1, strl.length() - 1);
-        }
-        private String cutString(String strl) {
-            return strl.substring(1, strl.length() - 1);
-        }
-        private String regulateString(String strl) {
-            strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement(""));  // remove string concator
-            strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\"")); // replace "" with "
-            // strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\")); // leave double back slash alone. we still need it.
-            strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); // replace real escape to escape char
-            strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
-
-            return strl;
-        }
-        private String processString(String strl) {
-            return regulateString(cutString(strl));
-        }
-        private String processConcatedString(List<String> ls) {
-            StringBuilder sb = new StringBuilder();
-            for (String node : ls) {
-                sb.append(regulateString(cutString(node)));
-            }
-
-            return sb.toString();
-        }
-
-        /* Section layout related stuff */
-
-        JsonObject mRoot;
-        JsonArray mSection;
-        Stack<JsonArray> mSectionStack;
-        private void pushSection() {
-            mSectionStack.push(mSection);
-            mSection = new JsonArray();
-        }
-        private void popSection() {
-            mSection = mSectionStack.pop();
-        }
-
-        /* Listener */
-
-        @Override
-        public void enterDocument(NlpParser.DocumentContext ctx) {
-            // insert language prop
-            mRoot.addProperty("language", cutLangHead(ctx.LANG_HEADER().getText()));
-        }
-        @Override
-        public void exitDocument(NlpParser.DocumentContext ctx) {
-            // insert document prop
-            mRoot.add("entries", mSection);
-        }
-
-        @Override
-        public void enterSection(NlpParser.SectionContext ctx) {
-            pushSection();
-        }
-        @Override
-        public void exitSection(NlpParser.SectionContext ctx) {
-            // create new object
-            JsonObject objSection = new JsonObject();
-            objSection.addProperty("section", cutSectionHead(ctx.SECTION_HEAD().getText()));
-            objSection.add("entries", mSection);
-            // pop and insert
-            popSection();
-            mSection.add(objSection);
-        }
-
-        @Override
-        public void enterSubSection(NlpParser.SubSectionContext ctx) {
-            pushSection();
-        }
-        @Override
-        public void exitSubSection(NlpParser.SubSectionContext ctx) {
-            // create new object
-            JsonObject objSubSection = new JsonObject();
-            objSubSection.addProperty("section", cutSectionHead(ctx.SUB_SECTION_HEAD().getText()));
-            objSubSection.add("entries", mSection);
-            // pop and insert
-            popSection();
-            mSection.add(objSubSection);
-        }
-
-        @Override
-        public void enterEntryString(NlpParser.EntryStringContext ctx) {
-            mSection.add(processString(ctx.ENTRY_STRING().getText()));
-        }
-        @Override
-        public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) {
-            mSection.add(processConcatedString(
-                ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList())
-            ));
-        }
-        @Override
-        public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) {
-            mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText()));
-        }
-    }
-
     private static void printHelp() {
         System.out.println("NlpParser Usage");
         System.out.println("NlpParser <src> <dest>");
         System.out.println();
-        System.out.println("<src> - the decoded nlp text file.");
+        System.out.println("<src> - the decoded NLP text file.");
         System.out.println("<dest> - the output json file.");
     }

-    public static void main(String[] args) throws Exception {
-        // check parameter
+    private static class UserRequest {
+        public UserRequest(String input_filepath, String output_filepath) {
+            this.mInputFilePath = input_filepath;
+            this.mOutputFilePath = output_filepath;
+        }
+
+        String mInputFilePath;
+        String mOutputFilePath;
+
+        public String getInputFilePath() {
+            return this.mInputFilePath;
+        }
+
+        public String getOutputFilePath() {
+            return this.mOutputFilePath;
+        }
+    }
+
+    private static UserRequest resolveArguments(String[] args) throws Exception {
+        // Check parameter
         if (args.length != 2) {
-            System.out.println("[ERR] Invalid arguments!");
-            printHelp();
-            System.exit(1);
+            throw new Exception("Invalid arguments count!");
         }
+        // Return fetched arguments
+        return new UserRequest(args[0], args[1]);
+    }

-        // open file stream
-        FileInputStream fin = null;
-        FileOutputStream fout = null;
-        try {
-            fin = new FileInputStream(args[0]);
-            fout = new FileOutputStream(args[1]);
-        } catch (Exception e) {
-            if (fin != null) fin.close();
-            if (fout != null) fout.close();
-
-            System.out.println("[ERR] Fail to open file!");
-            printHelp();
-            System.exit(1);
-        }
-
-        // start lex and parse
+    private static void executeWorker(UserRequest user_request) throws Exception {
+        // Use try-with-resources to safely manage the file streams.
+        try (FileInputStream fin = new FileInputStream(user_request.getInputFilePath());
+             FileOutputStream fout = new FileOutputStream(user_request.getOutputFilePath());
+             OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8)) {
+            // Start lex and parse
             CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252"));
             NlpLexer lexer = new NlpLexer(input);
             CommonTokenStream tokens = new CommonTokenStream(lexer);
             NlpParser parser = new NlpParser(tokens);

-            // walk tree to build json
+            // Walk tree to build json
             ParseTree tree = parser.document();
             ParseTreeWalker walker = new ParseTreeWalker();
-            NlpJsonConverter converter = new NlpJsonConverter();
+            JsonConverter converter = new JsonConverter();
             walker.walk(converter, tree);

-            // write json
-            OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8);
+            // Write json
             fw.write(converter.buildJsonString());
+        }
     }

-        // close file stream
-        fin.close();
-        fw.close();
+    public static void main(String[] args) throws Exception {
+        // Check arguments
+        UserRequest user_request = null;
+        try {
+            user_request = resolveArguments(args);
+        } catch (Exception e) {
+            System.out.print("[Argument Error] ");
+            System.out.println(e.getMessage());
+            printHelp();
+            return;
+        }
+
+        // Call converter
+        try {
+            executeWorker(user_request);
+        } catch (Exception e) {
+            System.out.print("[Converter Error] ");
+            System.out.println(e.getMessage());
+            return;
+        }
+    }
 }

NlpParser/StringHelper.java (new file, 72 lines)

@@ -0,0 +1,72 @@
import java.util.List;
import java.lang.StringBuilder;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * String related stuff.
 */
public class StringHelper {

    /*
     * Regex Constants.
     *
     * Hints:
     *
     * \\\\[^\\rn] matches the concatenator. The concatenator must not be
     * followed by \n, \r, or \\.
     *
     * [^\\r\\n]*[\\r\\n]+ matches the line breaker.
     */

    private static final Pattern gRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
    private static final Pattern gRegDoubleQuote = Pattern.compile("\\\"\\\"");
    // private static final Pattern gRegEscSlash = Pattern.compile("\\\\\\\\");
    private static final Pattern gRegEscTab = Pattern.compile("\\t");
    private static final Pattern gRegEscEol = Pattern.compile("\\r?\\n");

    public static String cutLanguageHead(String strl) {
        return strl.substring("Language:".length());
    }

    public static String cutSectionHead(String strl) {
        return strl.substring(1, strl.length() - 1);
    }

    public static String cutString(String strl) {
        return strl.substring(1, strl.length() - 1);
    }

    public static String regulateString(String strl) {
        // remove the string concatenator
        strl = gRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement(""));

        // replace "" with "
        strl = gRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));

        // leave double backslash alone. we still need it.
        // strl = gRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));

        // replace real escapes with escape chars
        strl = gRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));
        strl = gRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));

        return strl;
    }

    public static String processString(String strl) {
        return regulateString(cutString(strl));
    }

    public static String processConcatedString(List<String> ls) {
        StringBuilder sb = new StringBuilder();
        for (String node : ls) {
            sb.append(regulateString(cutString(node)));
        }

        return sb.toString();
    }
}
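As a worked example of the regulation pipeline (editor's illustration with a made-up entry): take the raw NLP entry `"Press ""Enter"" to continue"` where a literal TAB character sits between `""Enter""` and `to`. `cutString` strips the outer quotes, `gRegDoubleQuote` collapses each `""` into a single `"`, and `gRegEscTab` turns the literal TAB into the two-character sequence `\t`, so `processString` returns `Press "Enter"\tto continue`.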
@@ -5,11 +5,11 @@
 ./NlpCodec/out/NlpCodec decode NlpSrc/VT50.nlp NlpSrc/VT50.txt

 cd NlpParser
-java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.json
-java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.json
-java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.json
-java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.json
-java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.json
+java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.nested.json
+java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.nested.json
+java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.nested.json
+java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.nested.json
+java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.nested.json
 cd ..

 cd NlpProc