Compare commits

...

3 Commits

Author SHA1 Message Date
60fca862f3 refactor: update NlpParser project 2024-12-13 15:47:05 +08:00
b71f6867c5 fix: update NlpCodec 2024-12-11 20:19:48 +08:00
6193a2ede6 fix: fix NlpCodec compile issue.
- fix std::ifstream length getter.
- use std::format in throwing exception.
2024-12-11 16:20:21 +08:00
5 changed files with 403 additions and 250 deletions

View File

@@ -9,9 +9,15 @@
#include <limits>
#include <stdexcept>
#include <utility>
#include <format>
namespace NlpCodec {
#pragma region Help Structs and Functions
/// @brief NlpCodec universal exception.
/// @details Once this exception is thrown, it means that something went wrong,
/// and the main function should catch it, print the error message and exit the program immediately.
class NlpException : public std::exception {
public:
NlpException(const char* msg) : message(msg ? msg : "") {}
@@ -22,25 +28,124 @@ namespace NlpCodec {
std::string message;
};
/// @brief The safe version of `static_cast` which throws an exception
/// if the given value cannot be cast into the given type (out of range).
template<typename _TyTo, typename _TyFrom>
static constexpr _TyTo SafeCast(_TyFrom value) {
if (!std::in_range<_TyTo>(value))
throw NlpException(
"Fail to cast integral number because given value is greater than container."
"Fail to cast integral number because given value is greater than the type can hold. "
"This is usually caused by your input or output file is too long.");
return static_cast<_TyTo>(value);
}
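// Illustrative usage (added sketch, not part of the original commit):
// the cast succeeds when the value fits into the target type and throws otherwise.
static_assert(SafeCast<uint16_t>(42u) == 42u);
// SafeCast<uint16_t>(70000u) would throw NlpException at runtime (out of range).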
/// @brief The safe version of `std::ifstream::read`.
/// Throws an exception if reading fails.
static void SafeRead(std::ifstream& fin, char* s, std::streamsize count) {
fin.read(s, count);
if (!fin.good() || fin.gcount() != count)
throw NlpException("Fail to read data from file.");
}
/// @brief The safe version of `std::ofstream::write`.
/// Throws an exception if writing fails.
static void SafeWrite(std::ofstream& fout, const char* s, std::streamsize count) {
fout.write(s, count);
if (!fout.good())
throw NlpException("Fail to write data into file.");
}
#pragma endregion
#pragma region Encryption Stuff
/*
# NLP File Structure
|Annotation |Size |
|:--- |:--- |
|Body |variable |
|Raw File Length |4 bytes |
|Checksum |4 bytes |
## Body
The first part is a zlib-compressed byte array.
Before any processing, we need to use zlib to decompress it first.
If we need to do the reverse operation, i.e. build this compressed byte array,
the compression level must be the maximum value (best compression, i.e. 9).
After decompressing this byte array, we need an extra step called the circular XOR operation
to get human-readable plain text data.
In this operation, we first have a hard-coded `XOR_ARRAY`;
the first byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY`, and so on.
When we reach the tail of `XOR_ARRAY`,
the next byte of the decompressed byte array is XORed with the first byte of `XOR_ARRAY` again, and so on.
That's why we call this the "circular" XOR operation.
The reverse of this step is exactly the same,
because the inverse of XOR is applying it again.
After all bytes are XORed, we get what we want:
a human-readable translation file in plain text for further processing.
## Raw File Length
The `uint32_t` field following Body is Raw File Length,
which stores the length of the raw data, i.e. the length of the zlib-decompressed byte array.
It's convenient when decompressing Body.
However, this field is encrypted when stored in the NLP file,
and we need to do some extra operations before using it.
Basically, we XOR it with `MAGIC_DWORD` first, then subtract the result from `-1` (overflow is allowed).
I don't know what the fuck this operation is; I just faithfully translated the result of the reverse engineering work.
So just do it and don't worry too much.
By the way, the value of `MAGIC_DWORD` is just the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
The reverse operation, i.e. building this field when creating an NLP file, is also simple:
just flip the whole sequence of steps introduced above.
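For example (an illustrative calculation, not from the original commit), a raw length of 0 round-trips as:
encrypt: (-(0 + 1)) ^ 0xF956A82C = 0xFFFFFFFF ^ 0xF956A82C = 0x06A957D3
decrypt: -1 - (0xF956A82C ^ 0x06A957D3) = -1 - (-1) = 0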
## Checksum
The `uint32_t` field following Raw File Length is Checksum,
which is just the checksum of Body (this implementation computes it with zlib's `adler32`).
This field is usually used to validate the integrity of the NLP file.
Like Raw File Length, this field is also encrypted in the NLP file,
but its encryption method is much simpler than Raw File Length's.
To decrypt it, you just need to subtract `CHECKSUM_OFFSET` from it (overflow is also allowed).
The reverse operation for this field is just adding `CHECKSUM_OFFSET`. Fairly simple.
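For example (illustrative): a checksum of 0x00000010 is stored as 0x00000010 + 1072 = 0x00000440.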
*/
/// @brief The size of non-Body part of NLP file
/// @details Basically this size is the size of the combination of Raw File Length and Checksum field.
constexpr const size_t TAIL_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
/// @brief The magic DWORD for Raw File Length field encryption.
/// @details It is actually the combination of the first 4 bytes of `XOR_ARRAY` in little endian.
constexpr const uint32_t MAGIC_DWORD = 0xF956A82Cu;
/// @brief Encrypt Raw File Length field for writing NLP file.
static constexpr uint32_t EncryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-(static_cast<int32_t>(value) + 1)) ^ MAGIC_DWORD;
}
/// @brief Decrypt Raw File Length field read from NLP file.
static constexpr uint32_t DecryptRawFileLength(uint32_t value) {
return static_cast<uint32_t>(-1 - static_cast<int32_t>(MAGIC_DWORD ^ value));
}
/// @brief The magic DWORD for Checksum field encryption.
constexpr const uint32_t CHECKSUM_OFFSET = 1072u;
/// @brief Encrypt Checksum field for writing NLP file.
static constexpr uint32_t EncryptChecksum(uint32_t value) {
return value + CHECKSUM_OFFSET;
}
/// @brief Decrypt Checksum field read from NLP file.
static constexpr uint32_t DecryptChecksum(uint32_t value) {
return value - CHECKSUM_OFFSET;
}
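// Illustrative sanity checks (added sketch, not part of the original commit):
// the encrypt/decrypt pairs above are constexpr and mutually inverse,
// so their round trips can be verified at compile time.
static_assert(DecryptRawFileLength(EncryptRawFileLength(0x12345678u)) == 0x12345678u);
static_assert(DecryptChecksum(EncryptChecksum(0xDEADBEEFu)) == 0xDEADBEEFu);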
/// @brief The core array for Body circular XOR encryption.
/// @details The first data byte is XORed with the first byte of this array, and so on.
/// When reaching the tail of this array, the next data byte is XORed with the first byte again, and so on.
constexpr const uint8_t XOR_ARRAY[] {
@@ -53,37 +158,36 @@
0x63, 0xCA, 0x52, 0xF9, 0xFB, 0x3A, 0x9C, 0x2A, 0xB2, 0x1A, 0x8D, 0x9A, 0x8C, 0x2A, 0x9C, 0x32,
0xAA, 0xC3, 0xA2, 0x97, 0x34, 0x92, 0xFA, 0x71, 0xBE, 0x3F, 0xAC, 0x28, 0x22, 0x9F, 0xAC, 0xE8
};
/// @brief The size of `XOR_ARRAY`.
constexpr const size_t XOR_ARRAY_LEN = sizeof(XOR_ARRAY) / sizeof(uint8_t);
/// @brief A convenient mask for `XOR_ARRAY` when performing modulo during the circular XOR operation.
constexpr const size_t XOR_ARRAY_MASK = XOR_ARRAY_LEN - 1u;
// Use a static_assert to confirm the computed XOR_ARRAY_MASK is what we desire,
// because some stupid programmers (like me) may change the above array and fill in a series of wrong data,
// and then this mask would be computed wrongly.
static_assert(XOR_ARRAY_MASK == 0x7Fu);
/// @brief Encrypt or decrypt decompressed Body field.
static void CircularXorOperation(void* data, size_t data_len) {
uint8_t* ptr = reinterpret_cast<uint8_t*>(data);
for (size_t i = 0u; i < data_len; ++i) {
ptr[i] ^= XOR_ARRAY[i & XOR_ARRAY_MASK];
}
}
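// Illustrative notes (added, not part of the original commit):
// XOR is self-inverse, so calling CircularXorOperation twice on a buffer restores
// the original bytes; encryption and decryption therefore share this one function.
// The index wraps via the mask, e.g. byte 128 is XORed with XOR_ARRAY[128 & 0x7F],
// i.e. XOR_ARRAY[0].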
#pragma endregion
/// @brief Get the length of given file stream.
static uint32_t GetFileLength(std::ifstream& fin) {
// Fetch the types this stream uses, for convenience below.
using stream_pos_t = std::ifstream::pos_type;
using stream_off_t = std::ifstream::off_type;
// Back up the current file cursor.
stream_off_t current_pos = fin.tellg();
// Seek to the tail and get corresponding offset to get the length of file.
fin.seekg(0, std::ios_base::end);
stream_off_t tail_pos = fin.tellg();
// Restore the previously backed-up file cursor.
fin.seekg(current_pos, std::ios_base::beg);
// Safely cast and return the length.
return SafeCast<uint32_t>(tail_pos);
@@ -110,12 +214,10 @@
throw NlpException("Fail to allocate memory.");
// Read data from file to input buffer
SafeRead(fin, inbuf.get(), raw_size);
// Do XOR operation
CircularXorOperation(inbuf.get(), raw_size);
// Do compress and get the size of compressed data.
uLongf dest_len = static_cast<uLongf>(computed_boundary);
@@ -124,9 +226,9 @@
reinterpret_cast<Bytef*>(inbuf.get()), static_cast<uLong>(raw_size),
Z_BEST_COMPRESSION
);
// Check zlib result.
if (ret != Z_OK)
throw NlpException("zlib compress() failed.");
// Fetch final compressed size.
uint32_t compressed_size = SafeCast<uint32_t>(dest_len);
@@ -134,21 +236,15 @@
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(outbuf.get()), SafeCast<uInt>(compressed_size)));
// Write compressed data into file
SafeWrite(fout, outbuf.get(), compressed_size);
// Raw size and checksum need some extra encryption before writing
raw_size = EncryptRawFileLength(raw_size);
checksum = EncryptChecksum(checksum);
// Write raw size and checksum
SafeWrite(fout, reinterpret_cast<char*>(&raw_size), sizeof(uint32_t));
SafeWrite(fout, reinterpret_cast<char*>(&checksum), sizeof(uint32_t));
}
@@ -162,13 +258,13 @@
compressed_size -= TAIL_SIZE;
fin.seekg(compressed_size, std::ios_base::beg);
uint32_t expected_raw_size = 0u, expected_checksum = 0u;
SafeRead(fin, reinterpret_cast<char*>(&expected_raw_size), sizeof(uint32_t));
SafeRead(fin, reinterpret_cast<char*>(&expected_checksum), sizeof(uint32_t));
fin.seekg(0, std::ios_base::beg);
// Raw size and checksum data need to do some extra decryption.
expected_raw_size = DecryptRawFileLength(expected_raw_size);
expected_checksum = DecryptChecksum(expected_checksum);
// Allocate memory to store data
std::unique_ptr<char[]> inbuf(new(std::nothrow) char[compressed_size]);
@@ -177,18 +273,14 @@
throw NlpException("Fail to allocate memory.");
// Read file into buffer
SafeRead(fin, inbuf.get(), compressed_size);
// Test checksum
uint32_t checksum = static_cast<uint32_t>(adler32(0u, reinterpret_cast<Bytef*>(inbuf.get()), SafeCast<uInt>(compressed_size)));
if (checksum != expected_checksum)
throw NlpException(
std::format("Checksum mismatch. Expected 0x{:08x}, got 0x{:08x}.", expected_checksum, checksum).c_str()
);
// Do decompress
uLongf _destLen = static_cast<uLongf>(expected_raw_size);
@@ -198,15 +290,13 @@
);
// Check zlib result
if (ret != Z_OK)
throw NlpException("Zlib uncompress() failed.");
throw NlpException("zlib uncompress() failed.");
// Do XOR operation
CircularXorOperation(outbuf.get(), expected_raw_size);
// Write result into file
SafeWrite(fout, outbuf.get(), expected_raw_size);
}
@@ -233,16 +323,16 @@ namespace NlpCodec::Runtime {
<< "NlpCodec [encode | decode | version | help] <src> <dest>" << std::endl
<< std::endl
<< "version - print version info about this program." << std::endl
<< "help - print this page." << std::endl
<< "help - print this page." << std::endl
<< std::endl
<< "encode - encode text file into NLP file." << std::endl
<< "decode - decode NLP file into text file." << std::endl
<< "<src> - the source file." << std::endl
<< " the path to text file in encode mode." << std::endl
<< " the path to NLP file in decode mode." << std::endl
<< "<dest> - the destination file." << std::endl
<< " the path to NLP file in encode mode." << std::endl
<< " the path to text file in decode mode." << std::endl
<< "encode - encode text file into NLP file." << std::endl
<< "decode - decode NLP file into text file." << std::endl
<< "<src> - the source file." << std::endl
<< " encode mode: the path to text file." << std::endl
<< " decode mode: the path to NLP file." << std::endl
<< "<dest> - the destination file." << std::endl
<< " encode mode: the path to NLP file." << std::endl
<< " decode mode: the path to text file." << std::endl
<< "" << std::endl;
}

View File

@@ -0,0 +1,100 @@
import java.util.Stack;
import java.util.stream.Collectors;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
public class JsonConverter extends NlpBaseListener {
public JsonConverter() {
mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();
mRoot = new JsonObject();
mSection = new JsonArray();
mSectionStack = new Stack<JsonArray>();
}
/* ========== JSON related stuff ========== */
Gson mGsonInstance;
public String buildJsonString() {
return mGsonInstance.toJson(mRoot);
}
/* ========== Section layout related stuff ========== */
JsonObject mRoot;
JsonArray mSection;
Stack<JsonArray> mSectionStack;
private void pushSection() {
mSectionStack.push(mSection);
mSection = new JsonArray();
}
private void popSection() {
mSection = mSectionStack.pop();
}
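// Note (added comment, not part of the original commit): mSection always points at
// the JsonArray currently being filled. Entering a (sub)section pushes the parent
// array and starts a fresh one; exiting pops the parent back and appends the
// finished section object to it.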
/* ========== Listener ========== */
@Override
public void enterDocument(NlpParser.DocumentContext ctx) {
// insert language prop
mRoot.addProperty("language", StringHelper.cutLanguageHead(ctx.LANG_HEADER().getText()));
}
@Override
public void exitDocument(NlpParser.DocumentContext ctx) {
// insert document prop
mRoot.add("entries", mSection);
}
@Override
public void enterSection(NlpParser.SectionContext ctx) {
pushSection();
}
@Override
public void exitSection(NlpParser.SectionContext ctx) {
// create new object
JsonObject objSection = new JsonObject();
objSection.addProperty("section", StringHelper.cutSectionHead(ctx.SECTION_HEAD().getText()));
objSection.add("entries", mSection);
// pop and insert
popSection();
mSection.add(objSection);
}
@Override
public void enterSubSection(NlpParser.SubSectionContext ctx) {
pushSection();
}
@Override
public void exitSubSection(NlpParser.SubSectionContext ctx) {
// create new object
JsonObject objSubSection = new JsonObject();
objSubSection.addProperty("section", StringHelper.cutSectionHead(ctx.SUB_SECTION_HEAD().getText()));
objSubSection.add("entries", mSection);
// pop and insert
popSection();
mSection.add(objSubSection);
}
@Override
public void enterEntryString(NlpParser.EntryStringContext ctx) {
mSection.add(StringHelper.processString(ctx.ENTRY_STRING().getText()));
}
@Override
public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) {
mSection.add(StringHelper.processConcatedString(
ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList())));
}
@Override
public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) {
mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText()));
}
}

View File

@@ -1,20 +1,6 @@
// import antlr stuff
import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.tree.*;
// import io related
import java.io.FileOutputStream;
import java.io.FileInputStream;
import java.io.OutputStreamWriter;
@@ -22,179 +8,84 @@ import java.nio.charset.StandardCharsets;
import java.nio.charset.Charset;
public class MainRunner {
private static void printHelp() {
System.out.println("NlpParser Usage");
System.out.println("NlpParser <src> <dest>");
System.out.println();
System.out.println("<src> - the decoded nlp text file.");
System.out.println("<src> - the decoded NLP text file.");
System.out.println("<dest> - the output json file.");
}
private static class UserRequest {
public UserRequest(String input_filepath, String output_filepath) {
this.mInputFilePath = input_filepath;
this.mOutputFilePath = output_filepath;
}
String mInputFilePath;
String mOutputFilePath;
public String getInputFilePath() {
return this.mInputFilePath;
}
public String getOutputFilePath() {
return this.mOutputFilePath;
}
}
private static UserRequest resolveArguments(String[] args) throws Exception {
// Check parameter
if (args.length != 2) {
throw new Exception("Invalid arguments count!");
}
// Return the fetched arguments
return new UserRequest(args[0], args[1]);
}
private static void executeWorker(UserRequest user_request) throws Exception {
// Use try-with-resources to safely manage the file streams.
try (FileInputStream fin = new FileInputStream(user_request.getInputFilePath());
FileOutputStream fout = new FileOutputStream(user_request.getOutputFilePath());
OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8)) {
// Start lex and parse
CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252"));
NlpLexer lexer = new NlpLexer(input);
CommonTokenStream tokens = new CommonTokenStream(lexer);
NlpParser parser = new NlpParser(tokens);
// Walk tree to build json
ParseTree tree = parser.document();
ParseTreeWalker walker = new ParseTreeWalker();
JsonConverter converter = new JsonConverter();
walker.walk(converter, tree);
// Write json
fw.write(converter.buildJsonString());
}
}
public static void main(String[] args) throws Exception {
// Check argument
UserRequest user_request = null;
try {
user_request = resolveArguments(args);
} catch (Exception e) {
System.out.print("[Argument Error] ");
System.out.println(e.getMessage());
printHelp();
return;
}
// Call converter
try {
executeWorker(user_request);
} catch (Exception e) {
System.out.print("[Converter Error] ");
System.out.println(e.getMessage());
return;
}
}
}

View File

@@ -0,0 +1,72 @@
import java.util.List;
import java.lang.StringBuilder;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* String related stuff
*/
public class StringHelper {
/*
* Regex Constants.
*
* Hints:
*
* \\\\[^\\rn] matches the concatenator. The concatenator must not be followed
* by \\, r, or n.
*
* [^\\r\\n]*[\\r\\n]+ matches the rest of the line plus the line break.
*
*/
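/*
 * Illustrative example (added sketch, not part of the original commit):
 * given the cut string "line one\ " followed by a real line break and
 * "line two", regulateString removes the trailing "\ " concatenator
 * together with the line break, yielding "line oneline two".
 */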
private static final Pattern gRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
private static final Pattern gRegDoubleQuote = Pattern.compile("\\\"\\\"");
// private static final Pattern gRegEscSlash = Pattern.compile("\\\\\\\\");
private static final Pattern gRegEscTab = Pattern.compile("\\t");
private static final Pattern gRegEscEol = Pattern.compile("\\r?\\n");
public static String cutLanguageHead(String strl) {
return strl.substring("Language:".length());
}
public static String cutSectionHead(String strl) {
return strl.substring(1, strl.length() - 1);
}
public static String cutString(String strl) {
return strl.substring(1, strl.length() - 1);
}
public static String regulateString(String strl) {
// remove the string concatenator
strl = gRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement(""));
// replace "" with "
strl = gRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));
// leave double backslashes alone. we still need them.
// strl = gRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));
// replace real escape characters with their textual escape sequences
strl = gRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));
strl = gRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
return strl;
}
public static String processString(String strl) {
return regulateString(cutString(strl));
}
public static String processConcatedString(List<String> ls) {
StringBuilder sb = new StringBuilder();
for (String node : ls) {
sb.append(regulateString(cutString(node)));
}
return sb.toString();
}
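// Illustrative example (added, not part of the original commit):
// processConcatedString(List.of("\"Hello, \"", "\"world\"")) strips the
// surrounding quotes from each fragment and joins them, yielding: Hello, world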
}

View File

@@ -5,11 +5,11 @@
./NlpCodec/out/NlpCodec decode NlpSrc/VT50.nlp NlpSrc/VT50.txt
cd NlpParser
java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.nested.json
java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.nested.json
java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.nested.json
java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.nested.json
java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.nested.json
cd ..
cd NlpProc