Compare commits

...

3 Commits

Author SHA1 Message Date
60fca862f3 refactor: update NlpParser project 2024-12-13 15:47:05 +08:00
b71f6867c5 fix: update NlpCodec 2024-12-11 20:19:48 +08:00
6193a2ede6 fix: fix NlpCodec compile issue.
- fix std::ifstream length getter.
- use std::format in throwing exception.
2024-12-11 16:20:21 +08:00
5 changed files with 403 additions and 250 deletions

View File

@@ -9,9 +9,15 @@
#include <limits>
#include <stdexcept>
#include <utility>
#include <format>
namespace NlpCodec {
#pragma region Help Structs and Functions
/// @brief NlpCodec universal exception.
/// @details Once this exception is thrown, it means that something went wrong,
/// and the main function should catch it, print the error message and exit the program immediately.
class NlpException : public std::exception {
public:
NlpException(const char* msg) : message(msg ? msg : "") {}
@@ -22,25 +28,124 @@ namespace NlpCodec {
std::string message;
};
/// @brief The safe version of `static_cast` which throws an exception
/// if the given value cannot be cast into the given type (out of range).
template<typename _TyTo, typename _TyFrom>
static constexpr _TyTo SafeCast(_TyFrom value) {
if (!std::in_range<_TyTo>(value))
throw NlpException(
"Fail to cast integral number because given value is greater than container."
"Fail to cast integral number because given value is greater than the type can hold. "
"This is usually caused by your input or output file is too long.");
return static_cast<_TyTo>(value);
}
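// Illustrative usage (added sketch, not part of the original commit):
// the cast succeeds when the value fits into the target type and throws otherwise.
static_assert(SafeCast<uint16_t>(42u) == 42u);
// SafeCast<uint16_t>(70000u) would throw NlpException at runtime (out of range).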
/// @brief The safe version of `std::ifstream::read`.
/// Throws an exception if reading fails.
static void SafeRead(std::ifstream& fin, char* s, std::streamsize count) {
fin.read(s, count);
if (!fin.good() || fin.gcount() != count)
throw NlpException("Fail to read data from file.");
}
/// @brief The safe version of `std::ofstream::write`.
/// Throws an exception if writing fails.
static void SafeWrite(std::ofstream& fout, const char* s, std::streamsize count) {
fout.write(s, count);
if (!fout.good())
throw NlpException("Fail to write data into file.");
}
#pragma endregion
#pragma region Encryption Stuff
/*
# NLP File Structure
|Annotation |Size |
|:--- |:--- |
|Body |variable |
|Raw File Length |4 bytes |
|Checksum |4 bytes |
## Body
The first part is a zlib-compressed byte array.
Before any processing, we need to use zlib to decompress it first.
If we need to do the reverse operation, i.e. build this compressed byte array,
the compression level must be the maximum value (best compression, i.e. 9).
After decompressing this byte array, we need an extra step called the circular XOR operation
to get human-readable plain text data.
In this operation, we first have a hard-coded `XOR_ARRAY`;
the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
When we reach the tail of `XOR_ARRAY`,
the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
That's why we call this the "circular" XOR operation.
The reverse of this step is exactly the same,
because the inverse of XOR is applying it again.
After all bytes are XORed, we get what we want:
a human-readable translation file in plain text for further processing.
## Raw File Length
The `uint32_t` field following Body is Raw File Length,
which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
It's convenient when decompressing Body.
However, this field is encrypted when stored in the NLP file,
and we need to do some extra operations before using it.
Basically, we XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
I don't know what the fuck this operation is; I just faithfully translated the result of the reverse engineering work.
So just do it and don't worry too much.
By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
The reverse operation, i.e. building this field when creating an NLP file, is also simple:
just flip the whole sequence of steps introduced above.
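For example (an illustrative calculation, not from the original commit), a raw length of 0 round-trips as:
encrypt: (-(0 + 1)) ^ 0xF956A82C = 0xFFFFFFFF ^ 0xF956A82C = 0x06A957D3
decrypt: -1 - (0xF956A82C ^ 0x06A957D3) = -1 - (-1) = 0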
## Checksum
The `uint32_t` field following Raw File Length is Checksum,
which is just the checksum of Body (this implementation computes it with zlib's `adler32`).
This field is usually used to validate the integrity of the NLP file.
Like Raw File Length, this field is also encrypted in the NLP file,
but its encryption method is much simpler than Raw File Length's.
To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
The reverse operation for this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
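For example (illustrative): a checksum of 0x00000010 is stored as 0x00000010 + 1072 = 0x00000440.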
*/
/// @brief The size of non-Body part of NLP file
/// @details Basically this size is the size of the combination of Raw File Length and Checksum field.
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
/// @brief The magic DWORD for Raw File Length field encryption.
/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
/// @brief Encrypt Raw File Length field for writing NLP file.
static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
}
/// @brief Decrypt Raw File Length field read from NLP file.
static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
}
/// @brief The magic DWORD for Checksum field encryption.
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
/// @brief Encrypt Checksum field for writing NLP file.
static constexpr uint32_t EncryptChecksum(uint32_t value) {
return value + CHECKSUM_OFFSET;
}
/// @brief Decrypt Checksum field read from NLP file.
static constexpr uint32_t DecryptChecksum(uint32_t value) {
return value - CHECKSUM_OFFSET;
}
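// Illustrative sanity checks (added sketch, not part of the original commit):
// the encrypt/decrypt pairs above are constexpr and mutually inverse,
// so their round trips can be verified at compile time.
static_assert(DecryptRawFileLength(EncryptRawFileLength(0x12345678u)) == 0x12345678u);
static_assert(DecryptChecksum(EncryptChecksum(0xDEADBEEFu)) == 0xDEADBEEFu);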
/// @brief The core array for Body circular XOR encryption.
/// @details The first data byte is XORed with the first byte of this array, and so on.
/// When reaching the tail of this array, the next data byte is XORed with the first byte again, and so on.
constexpr const uint8_t XOR_ARRAY[] {
@@ -53,37 +158,36 @@
0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
};
/// @brief The size of `XOR_ARRAY`.
constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during the circular XOR operation.
constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
// Use a static_assert to confirm the computed XOR_ARRAY_MASK is what we desire,
// because some stupid programmers (like me) may change the above array and fill in a series of wrong data,
// and then this mask would be computed wrongly.
static_assert(XOR_ARRAY_MASK == 0x7Fu);
/// @brief Encrypt or decrypt decompressed Body field.
static void CircularXorOperation(void* data, size_t data_len) {
uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
for (size_t i = 0u; i < data_len; ++i) {
ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
}
}
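// Illustrative notes (added, not part of the original commit):
// XOR is self-inverse, so calling CircularXorOperation twice on a buffer restores
// the original bytes; encryption and decryption therefore share this one function.
// The index wraps via the mask, e.g. byte 128 is XORed with XOR_ARRAY[128 & 0x7F],
// i.e. XOR_ARRAY[0].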
#pragma endregion
/// @brief Get the length of given file stream.
static uint32_t GetFileLength(std::ifstream& fin) {
// Fetch the types this stream uses, for convenience below.
using stream_pos_t = std::ifstream::pos_type;
using stream_off_t = std::ifstream::off_type;
// Back up the current file cursor.
stream_off_t current_pos = fin.tellg();
// Seek to the tail and get corresponding offset to get the length of file.
fin.seekg(0, std::ios_base::end);
stream_off_t tail_pos = fin.tellg();
// Restore the previously backed-up file cursor.
fin.seekg(current_pos, std::ios_base::beg);
// Safely cast and return the length.
return SafeCast<uint32_t>(tail_pos);
@@ -110,12 +214,10 @@
throw NlpException("Fail to allocate memory.");
// Read data from file to input buffer
SafeRead(fin, inbuf.get(), raw_size);
// Do XOR operation
CircularXorOperation(inbuf.get(), raw_size);
// Do compress and get the size of compressed data.
uLongf dest_len = static_cast<uLongf>(computed_boundary);
@@ -124,9 +226,9 @@
reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
Z_BEST_COMPRESSION
);
// Check zlib result.
if (ret != Z_OK)
throw NlpException("zlib compress() failed.");
// Fetch final compressed size.
uint32_t compressed_size = SafeCast<uint32_t>(dest_len);
@@ -134,21 +236,15 @@
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));
// Write compressed data into file
SafeWrite(fout, outbuf.get(), compressed_size);
// Raw size and checksum need some extra encryption before writing
raw_size = EncryptRawFileLength(raw_size);
checksum = EncryptChecksum(checksum);
// Write raw size and checksum
SafeWrite(fout, reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
SafeWrite(fout, reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
}
@@ -162,13 +258,13 @@
compressed_size -= TAIL_SIZE;
fin.seekg(compressed_size, std::ios_base::beg);
uint32_t expected_raw_size = 0u, expected_checksum = 0u;
SafeRead(fin, reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
SafeRead(fin, reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
fin.seekg(0, std::ios_base::beg);
// Raw size and checksum data need to do some extra decryption.
expected_raw_size = DecryptRawFileLength(expected_raw_size);
expected_checksum = DecryptChecksum(expected_checksum);
// Allocate memory to store data
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@@ -177,18 +273,14 @@
throw NlpException("Fail to allocate memory.");
// Read file into buffer
SafeRead(fin, inbuf.get(), compressed_size);
// Test checksum
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
if (checksum != expected_checksum)
throw NlpException(
std::format("Checksum mismatch. Expected 0x{:08x}, got 0x{:08x}.", expected_checksum, checksum).c_str()
);
// Do decompress
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@@ -198,15 +290,13 @@
);
// Check zlib result
if (ret != Z_OK)
throw NlpException("Zlib uncompress() failed.");
throw NlpException("zlib uncompress() failed.");
// Do XOR operation
CircularXorOperation(outbuf.get(), expected_raw_size);
// Write result into file
SafeWrite(fout, outbuf.get(), expected_raw_size);
}
@@ -233,16 +323,16 @@ namespace NlpCodec::Runtime {
<< "NlpCodec [encode | decode | version | help] <src> <dest>" << std::endl
<< std::endl
<< "version - print version info about this program." << std::endl
<< "help - print this page." << std::endl
<< "help - print this page." << std::endl
<< std::endl
<< "encode - encode text file into NLP file." << std::endl
<< "decode - decode NLP file into text file." << std::endl
<< "<src> - the source file." << std::endl
<< " the path to text file in encode mode." << std::endl
<< " the path to NLP file in decode mode." << std::endl
<< "<dest> - the destination file." << std::endl
<< " the path to NLP file in encode mode." << std::endl
<< " the path to text file in decode mode." << std::endl
<< "encode - encode text file into NLP file." << std::endl
<< "decode - decode NLP file into text file." << std::endl
<< "<src> - the source file." << std::endl
<< " encode mode: the path to text file." << std::endl
<< " decode mode: the path to NLP file." << std::endl
<< "<dest> - the destination file." << std::endl
<< " encode mode: the path to NLP file." << std::endl
<< " decode mode: the path to text file." << std::endl
<< "" << std::endl;
}

View File

@@ -0,0 +1,100 @@
import java.util.Stack;
import java.util.stream.Collectors;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
public class JsonConverter extends NlpBaseListener {
public JsonConverter() {
mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();
mRoot = new JsonObject();
mSection = new JsonArray();
mSectionStack = new Stack<JsonArray>();
}
/* ========== JSON related stuff ========== */
Gson mGsonInstance;
public String buildJsonString() {
return mGsonInstance.toJson(mRoot);
}
/* ========== Section layout related stuff ========== */
JsonObject mRoot;
JsonArray mSection;
Stack<JsonArray> mSectionStack;
private void pushSection() {
mSectionStack.push(mSection);
mSection = new JsonArray();
}
private void popSection() {
mSection = mSectionStack.pop();
}
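// Note (added comment, not part of the original commit): mSection always points at
// the JsonArray currently being filled. Entering a (sub)section pushes the parent
// array and starts a fresh one; exiting pops the parent back and appends the
// finished section object to it.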
/* ========== Listener ========== */
@Override
public void enterDocument(NlpParser.DocumentContext ctx) {
// insert language prop
mRoot.addProperty("language", StringHelper.cutLanguageHead(ctx.LANG_HEADER().getText()));
}
@Override
public void exitDocument(NlpParser.DocumentContext ctx) {
// insert document prop
mRoot.add("entries", mSection);
}
@Override
public void enterSection(NlpParser.SectionContext ctx) {
pushSection();
}
@Override
public void exitSection(NlpParser.SectionContext ctx) {
// create new object
JsonObject objSection = new JsonObject();
objSection.addProperty("section", StringHelper.cutSectionHead(ctx.SECTION_HEAD().getText()));
objSection.add("entries", mSection);
// pop and insert
popSection();
mSection.add(objSection);
}
@Override
public void enterSubSection(NlpParser.SubSectionContext ctx) {
pushSection();
}
@Override
public void exitSubSection(NlpParser.SubSectionContext ctx) {
// create new object
JsonObject objSubSection = new JsonObject();
objSubSection.addProperty("section", StringHelper.cutSectionHead(ctx.SUB_SECTION_HEAD().getText()));
objSubSection.add("entries", mSection);
// pop and insert
popSection();
mSection.add(objSubSection);
}
@Override
public void enterEntryString(NlpParser.EntryStringContext ctx) {
mSection.add(StringHelper.processString(ctx.ENTRY_STRING().getText()));
}
@Override
public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) {
mSection.add(StringHelper.processConcatedString(
ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList())));
}
@Override
public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) {
mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText()));
}
}

View File

@@ -1,20 +1,6 @@
// import antlr stuff
import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.tree.*;
// import io related
import java.io.FileOutputStream;
import java.io.FileInputStream;
import java.io.OutputStreamWriter;
@@ -22,179 +8,84 @@ import java.nio.charset.StandardCharsets;
import java.nio.charset.Charset;
public class MainRunner {
private static void printHelp() {
System.out.println("NlpParser Usage");
System.out.println("NlpParser <src> <dest>");
System.out.println();
System.out.println("<src> - the decoded nlp text file.");
System.out.println("<src> - the decoded NLP text file.");
System.out.println("<dest> - the output json file.");
}
private static class UserRequest {
public UserRequest(String input_filepath, String output_filepath) {
this.mInputFilePath = input_filepath;
this.mOutputFilePath = output_filepath;
}
String mInputFilePath;
String mOutputFilePath;
public String getInputFilePath() {
return this.mInputFilePath;
}
public String getOutputFilePath() {
return this.mOutputFilePath;
}
}
private static UserRequest resolveArguments(String[] args) throws Exception {
// Check parameter
if (args.length != 2) {
throw new Exception("Invalid arguments count!");
}
// Return the fetched arguments
return new UserRequest(args[0], args[1]);
}
private static void executeWorker(UserRequest user_request) throws Exception {
// Use try-with-resources to safely manage the file streams.
try (FileInputStream fin = new FileInputStream(user_request.getInputFilePath());
FileOutputStream fout = new FileOutputStream(user_request.getOutputFilePath());
OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8)) {
// Start lex and parse
CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252"));
NlpLexer lexer = new NlpLexer(input);
CommonTokenStream tokens = new CommonTokenStream(lexer);
NlpParser parser = new NlpParser(tokens);
// Walk tree to build json
ParseTree tree = parser.document();
ParseTreeWalker walker = new ParseTreeWalker();
JsonConverter converter = new JsonConverter();
walker.walk(converter, tree);
// Write json
fw.write(converter.buildJsonString());
}
}
public static void main(String[] args) throws Exception {
// Check argument
UserRequest user_request = null;
try {
user_request = resolveArguments(args);
} catch (Exception e) {
System.out.print("[Argument Error] ");
System.out.println(e.getMessage());
printHelp();
return;
}
// Call converter
try {
executeWorker(user_request);
} catch (Exception e) {
System.out.print("[Converter Error] ");
System.out.println(e.getMessage());
return;
}
}
}

View File

@@ -0,0 +1,72 @@
import java.util.List;
import java.lang.StringBuilder;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* String related stuff
*/
public class StringHelper {
/*
* Regex Constants.
*
* Hints:
*
* \\\\[^\\rn] matches the concatenator. The concatenator must not be followed
* by \\, r, or n.
*
* [^\\r\\n]*[\\r\\n]+ matches the rest of the line plus the line break.
*
*/
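/*
 * Illustrative example (added sketch, not part of the original commit):
 * given the cut string "line one\ " followed by a real line break and
 * "line two", regulateString removes the trailing "\ " concatenator
 * together with the line break, yielding "line oneline two".
 */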
private static final Pattern gRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
private static final Pattern gRegDoubleQuote = Pattern.compile("\\\"\\\"");
// private static final Pattern gRegEscSlash = Pattern.compile("\\\\\\\\");
private static final Pattern gRegEscTab = Pattern.compile("\\t");
private static final Pattern gRegEscEol = Pattern.compile("\\r?\\n");
public static String cutLanguageHead(String strl) {
return strl.substring("Language:".length());
}
public static String cutSectionHead(String strl) {
return strl.substring(1, strl.length() - 1);
}
public static String cutString(String strl) {
return strl.substring(1, strl.length() - 1);
}
public static String regulateString(String strl) {
// remove the string concatenator
strl = gRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement(""));
// replace "" with "
strl = gRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));
// leave double backslashes alone. we still need them.
// strl = gRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));
// replace real escape characters with their textual escape sequences
strl = gRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));
strl = gRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
return strl;
}
public static String processString(String strl) {
return regulateString(cutString(strl));
}
public static String processConcatedString(List<String> ls) {
StringBuilder sb = new StringBuilder();
for (String node : ls) {
sb.append(regulateString(cutString(node)));
}
return sb.toString();
}
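// Illustrative example (added, not part of the original commit):
// processConcatedString(List.of("\"Hello, \"", "\"world\"")) strips the
// surrounding quotes from each fragment and joins them, yielding: Hello, world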
}

View File

@@ -5,11 +5,11 @@
./NlpCodec/out/NlpCodec decode NlpSrc/VT50.nlp NlpSrc/VT50.txt
cd NlpParser
java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.nested.json
java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.nested.json
java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.nested.json
java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.nested.json
java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.nested.json
cd ..
cd NlpProc