From 60fca862f3d3a624a74325038fdb441a73fdec6b Mon Sep 17 00:00:00 2001 From: yyc12345 Date: Fri, 13 Dec 2024 15:46:28 +0800 Subject: [PATCH] refactor: update NlpParser project --- NlpParser/JsonConverter.java | 100 ++++++++++++++ NlpParser/MainRunner.java | 255 ++++++++++------------------------- NlpParser/StringHelper.java | 72 ++++++++++ Scripts/generate_source.sh | 10 +- 4 files changed, 250 insertions(+), 187 deletions(-) create mode 100644 NlpParser/JsonConverter.java create mode 100644 NlpParser/StringHelper.java diff --git a/NlpParser/JsonConverter.java b/NlpParser/JsonConverter.java new file mode 100644 index 0000000..d09af95 --- /dev/null +++ b/NlpParser/JsonConverter.java @@ -0,0 +1,100 @@ +import java.util.Stack; +import java.util.stream.Collectors; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; + +public class JsonConverter extends NlpBaseListener { + public JsonConverter() { + mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create(); + mRoot = new JsonObject(); + mSection = new JsonArray(); + mSectionStack = new Stack<>(); + } + /* ========== JSON related stuff ========== */ + + Gson mGsonInstance; + + public String buildJsonString() { + return mGsonInstance.toJson(mRoot); + } + + /* ========== Section layout related stuff ========== */ + + JsonObject mRoot; + JsonArray mSection; + Stack<JsonArray> mSectionStack; + + private void pushSection() { + mSectionStack.push(mSection); + mSection = new JsonArray(); + } + + private void popSection() { + mSection = mSectionStack.pop(); + } + + /* ========== Listener ========== */ + + @Override + public void enterDocument(NlpParser.DocumentContext ctx) { + // insert language prop (LANG_HEADER text with the leading "Language:" cut off) + mRoot.addProperty("language", StringHelper.cutLanguageHead(ctx.LANG_HEADER().getText())); + } + + @Override + public void exitDocument(NlpParser.DocumentContext ctx) { + // insert the current entry array as the document's entries prop + mRoot.add("entries", mSection); + 
} + + @Override + public void enterSection(NlpParser.SectionContext ctx) { + pushSection(); + } + + @Override + public void exitSection(NlpParser.SectionContext ctx) { + // create the object describing this section + JsonObject objSection = new JsonObject(); + objSection.addProperty("section", StringHelper.cutSectionHead(ctx.SECTION_HEAD().getText())); + objSection.add("entries", mSection); + // restore the parent array, then append this section object to it + popSection(); + mSection.add(objSection); + } + + @Override + public void enterSubSection(NlpParser.SubSectionContext ctx) { + pushSection(); + } + + @Override + public void exitSubSection(NlpParser.SubSectionContext ctx) { + // create the object describing this sub-section + JsonObject objSubSection = new JsonObject(); + objSubSection.addProperty("section", StringHelper.cutSectionHead(ctx.SUB_SECTION_HEAD().getText())); + objSubSection.add("entries", mSection); + // restore the parent array, then append this sub-section object to it + popSection(); + mSection.add(objSubSection); + } + + @Override + public void enterEntryString(NlpParser.EntryStringContext ctx) { + mSection.add(StringHelper.processString(ctx.ENTRY_STRING().getText())); + } + + @Override + public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) { + mSection.add(StringHelper.processConcatedString( + ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList()))); + } + + @Override + public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) { + mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText())); + } +} diff --git a/NlpParser/MainRunner.java b/NlpParser/MainRunner.java index 4687be4..d1afe93 100644 --- a/NlpParser/MainRunner.java +++ b/NlpParser/MainRunner.java @@ -1,20 +1,6 @@ -// import antlr stuff import org.antlr.v4.runtime.*; import org.antlr.v4.runtime.tree.*; -// import container -import java.util.Stack; -import java.util.stream.Collectors; -import java.util.List; -import java.lang.StringBuilder; -// import json -import com.google.gson.JsonArray; -import com.google.gson.JsonObject; -import com.google.gson.Gson; -import 
com.google.gson.GsonBuilder; -// import regex -import java.util.regex.Pattern; -import java.util.regex.Matcher; -// import io related + import java.io.FileOutputStream; import java.io.FileInputStream; import java.io.OutputStreamWriter; @@ -22,179 +8,84 @@ import java.nio.charset.StandardCharsets; import java.nio.charset.Charset; public class MainRunner { - public static class NlpJsonConverter extends NlpBaseListener { - public NlpJsonConverter() { - mGsonInstance = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create(); - mRoot = new JsonObject(); - mSection = new JsonArray(); - mSectionStack = new Stack(); - } - /* JSON related stuff */ - - Gson mGsonInstance; - public String buildJsonString() { - return mGsonInstance.toJson(mRoot); - } - - /* String related stuff */ - - // \\\\[^\\rn] match the concator. concator must not be appended with \n \r or \\ - // [^\\r\\n]*[\\r\\n]+ is match to line breaker. - private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+"); - private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\""); - // private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\"); - private static final Pattern mRegEscTab = Pattern.compile("\\t"); - private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n"); - private String cutLangHead(String strl) { - return strl.substring("Language:".length()); - } - private String cutSectionHead(String strl) { - return strl.substring(1, strl.length() - 1); - } - private String cutString(String strl) { - return strl.substring(1, strl.length() - 1); - } - private String regulateString(String strl) { - strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement("")); // remove string concator - strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));// replace "" with " - // strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));// leave double back slash alone. we still need it. 
- strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); // replace real escape to escape char - strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n")); - - return strl; - } - private String processString(String strl) { - return regulateString(cutString(strl)); - } - private String processConcatedString(List ls) { - StringBuilder sb = new StringBuilder(); - for (String node : ls) { - sb.append(regulateString(cutString(node))); - } - - return sb.toString(); - } - - /* Section layout related stuff */ - - JsonObject mRoot; - JsonArray mSection; - Stack mSectionStack; - private void pushSection() { - mSectionStack.push(mSection); - mSection = new JsonArray(); - } - private void popSection() { - mSection = mSectionStack.pop(); - } - - /* Listener */ - - @Override - public void enterDocument(NlpParser.DocumentContext ctx) { - // insert language prop - mRoot.addProperty("language", cutLangHead(ctx.LANG_HEADER().getText())); - } - @Override - public void exitDocument(NlpParser.DocumentContext ctx) { - // insert document prop - mRoot.add("entries", mSection); - } - - @Override - public void enterSection(NlpParser.SectionContext ctx) { - pushSection(); - } - @Override - public void exitSection(NlpParser.SectionContext ctx) { - // create new object - JsonObject objSection = new JsonObject(); - objSection.addProperty("section", cutSectionHead(ctx.SECTION_HEAD().getText())); - objSection.add("entries", mSection); - // pop and insert - popSection(); - mSection.add(objSection); - } - - @Override - public void enterSubSection(NlpParser.SubSectionContext ctx) { - pushSection(); - } - @Override - public void exitSubSection(NlpParser.SubSectionContext ctx) { - // create new object - JsonObject objSubSection = new JsonObject(); - objSubSection.addProperty("section", cutSectionHead(ctx.SUB_SECTION_HEAD().getText())); - objSubSection.add("entries", mSection); - // pop and insert - popSection(); - mSection.add(objSubSection); - } - - @Override - 
public void enterEntryString(NlpParser.EntryStringContext ctx) { - mSection.add(processString(ctx.ENTRY_STRING().getText())); - } - @Override - public void enterEntryConcatedString(NlpParser.EntryConcatedStringContext ctx) { - mSection.add(processConcatedString( - ctx.ENTRY_STRING().stream().map(value -> value.getText()).collect(Collectors.toList()) - )); - } - @Override - public void enterEntryInteger(NlpParser.EntryIntegerContext ctx) { - mSection.add(Integer.parseInt(ctx.ENTRY_INTEGER().getText())); - } - } - + private static void printHelp() { + System.out.println("NlpParser Usage"); System.out.println("NlpParser "); System.out.println(); - System.out.println(" - the decoded nlp text file."); + System.out.println(" - the decoded NLP text file."); System.out.println(" - the output json file."); } - - public static void main(String[] args) throws Exception { - // check parameter - if (args.length != 2) { - System.out.println("[ERR] Invalid arguments!"); - printHelp(); - System.exit(1); - } - - // open file stream - FileInputStream fin = null; - FileOutputStream fout = null; - try { - fin = new FileInputStream(args[0]); - fout = new FileOutputStream(args[1]); - } catch (Exception e) { - if (fin != null) fin.close(); - if (fout != null) fout.close(); - - System.out.println("[ERR] Fail to open file!"); - printHelp(); - System.exit(1); + + private static class UserRequest { + public UserRequest(String input_filepath, String output_filepath) { + this.mInputFilePath = input_filepath; + this.mOutputFilePath = output_filepath; } - // start lex and parse - CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252")); - NlpLexer lexer = new NlpLexer(input); - CommonTokenStream tokens = new CommonTokenStream(lexer); - NlpParser parser = new NlpParser(tokens); - - // walk tree to build json - ParseTree tree = parser.document(); - ParseTreeWalker walker = new ParseTreeWalker(); - NlpJsonConverter converter = new NlpJsonConverter(); - walker.walk(converter, 
tree); - - // write json - OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8); - fw.write(converter.buildJsonString()); - - // close file stream - fin.close(); - fw.close(); + String mInputFilePath; + String mOutputFilePath; + + public String getInputFilePath() { + return this.mInputFilePath; + } + + public String getOutputFilePath() { + return this.mOutputFilePath; + } + + } + + private static UserRequest resolveArguments(String[] args) throws Exception { + // Check parameter: exactly an input path and an output path are expected + if (args.length != 2) { + throw new Exception("Invalid arguments count!"); + } + // Return fetched arguments + return new UserRequest(args[0], args[1]); + } + + private static void executeWorker(UserRequest user_request) throws Exception { + // Use try-with-resources to safely manage file stream. + try (FileInputStream fin = new FileInputStream(user_request.getInputFilePath()); + FileOutputStream fout = new FileOutputStream(user_request.getOutputFilePath()); + OutputStreamWriter fw = new OutputStreamWriter(fout, StandardCharsets.UTF_8);) { + // Start lex and parse (the input is decoded as windows-1252) + CharStream input = CharStreams.fromStream(fin, Charset.forName("windows-1252")); + NlpLexer lexer = new NlpLexer(input); + CommonTokenStream tokens = new CommonTokenStream(lexer); + NlpParser parser = new NlpParser(tokens); + + // Walk tree to build json + ParseTree tree = parser.document(); + ParseTreeWalker walker = new ParseTreeWalker(); + JsonConverter converter = new JsonConverter(); + walker.walk(converter, tree); + + // Write json (the writer encodes it as UTF-8) + fw.write(converter.buildJsonString()); + } + } + + public static void main(String[] args) throws Exception { + // Check argument; on failure show the usage and exit with a non-zero status + UserRequest user_request = null; + try { + user_request = resolveArguments(args); + } catch (Exception e) { + System.out.print("[Argument Error] "); + System.out.println(e.getMessage()); + printHelp(); + System.exit(1); + } + + // Call converter; on failure report the message and exit with a non-zero status + try { + executeWorker(user_request); + } catch (Exception e) { + System.out.print("[Converter Error] 
"); + System.out.println(e.getMessage()); + System.exit(1); + } } } diff --git a/NlpParser/StringHelper.java b/NlpParser/StringHelper.java new file mode 100644 index 0000000..60efa96 --- /dev/null +++ b/NlpParser/StringHelper.java @@ -0,0 +1,72 @@ +import java.util.List; +import java.lang.StringBuilder; + +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +/** + * Static string helpers used by JsonConverter to trim and normalize NLP token text. + */ +public class StringHelper { + + /* + * Regex Constants. + * + * Hints: + * + * \\\\[^\\\\rn] matches the concatenator: a backslash followed by any character + * other than a backslash, 'r' or 'n'. + * + * [^\\r\\n]*[\\r\\n]+ matches the rest of that line together with its line break(s). + * + */ + + private static final Pattern gRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+"); + private static final Pattern gRegDoubleQuote = Pattern.compile("\\\"\\\""); + // private static final Pattern gRegEscSlash = Pattern.compile("\\\\\\\\"); + private static final Pattern gRegEscTab = Pattern.compile("\\t"); + private static final Pattern gRegEscEol = Pattern.compile("\\r?\\n"); + + public static String cutLanguageHead(String strl) { + return strl.substring("Language:".length()); + } + + public static String cutSectionHead(String strl) { + return strl.substring(1, strl.length() - 1); + } + + public static String cutString(String strl) { + return strl.substring(1, strl.length() - 1); + } + + public static String regulateString(String strl) { + // remove string concatenator (from the backslash up to and including the following line break) + strl = gRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement("")); + + // replace "" with " + strl = gRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\"")); + + // leave double back slash alone. we still need it. 
+// strl = gRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\")); + + // replace literal tab and line-break characters with the two-character escapes \t and \n + strl = gRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); + strl = gRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n")); + + return strl; + } + + public static String processString(String strl) { + return regulateString(cutString(strl)); + } + + public static String processConcatedString(List<String> ls) { + StringBuilder sb = new StringBuilder(); + for (String node : ls) { + sb.append(regulateString(cutString(node))); + } + + return sb.toString(); + } + +} diff --git a/Scripts/generate_source.sh b/Scripts/generate_source.sh index 1638e95..dd291be 100644 --- a/Scripts/generate_source.sh +++ b/Scripts/generate_source.sh @@ -5,11 +5,11 @@ ./NlpCodec/out/NlpCodec decode NlpSrc/VT50.nlp NlpSrc/VT50.txt cd NlpParser -java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.json -java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.json -java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.json -java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.json -java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.json +java MainRunner ../NlpSrc/VT25.txt ../NlpSrc/VT25.nested.json +java MainRunner ../NlpSrc/VT30.txt ../NlpSrc/VT30.nested.json +java MainRunner ../NlpSrc/VT35.txt ../NlpSrc/VT35.nested.json +java MainRunner ../NlpSrc/VT40.txt ../NlpSrc/VT40.nested.json +java MainRunner ../NlpSrc/VT50.txt ../NlpSrc/VT50.nested.json cd .. cd NlpProc