From b740e95a62397e5b8aedd459ab65d814dc43cd36 Mon Sep 17 00:00:00 2001
From: yyc12345 <yyc12321@outlook.com>
Date: Tue, 11 Jul 2023 22:36:07 +0800
Subject: [PATCH] finish script. but vt can not recognize it

---
 NlpParser/NlpRunner.java  |  6 +--
 NlpProc/NlpJsonDecoder.py |  8 ++--
 NlpProc/NlpJsonEncoder.py | 52 ++++++++++++++++++----
 NlpProc/NlpUtils.py       | 91 ++++++++++++++++++++++++++++++++++++---
 4 files changed, 136 insertions(+), 21 deletions(-)

diff --git a/NlpParser/NlpRunner.java b/NlpParser/NlpRunner.java
index d53998b..1e20bc0 100644
--- a/NlpParser/NlpRunner.java
+++ b/NlpParser/NlpRunner.java
@@ -41,7 +41,7 @@ public class NlpRunner {
 		// [^\\r\\n]*[\\r\\n]+ is match to line breaker.
 		private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
 		private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\"");
-		private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\");
+		// private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\");
 		private static final Pattern mRegEscTab = Pattern.compile("\\t");
 		private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n");
 		private String cutLangHead(String strl) {
@@ -56,8 +56,8 @@ public class NlpRunner {
 		private String regulateString(String strl) {
 			strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement(""));		// remove string concator
 			strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));// replace "" with "
-			strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));	// replace real escape to escape char
-			strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));
+			// strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));// leave double back slash alone. we still need it.
+			strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));	// replace real escape to escape char
 			strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
 			
 			return strl;			
diff --git a/NlpProc/NlpJsonDecoder.py b/NlpProc/NlpJsonDecoder.py
index 6d0de6b..0e75770 100644
--- a/NlpProc/NlpJsonDecoder.py
+++ b/NlpProc/NlpJsonDecoder.py
@@ -13,7 +13,7 @@ def ConstructVtTrDataTuple() -> tuple[VtTrDataTuple]:
 
 if __name__ == "__main__":
 
-    prevJson = None
+    prevPlainValues = None
     for vtVer in ConstructVtTrDataTuple():
         print(f'Processing {vtVer.nlpJson}...')
 
@@ -25,14 +25,14 @@ if __name__ == "__main__":
         NlpUtils.DumpTrIndex(vtVer.trIndex, plainKeys)
 
         # compare with previous one
-        if prevJson is None:
+        if prevPlainValues is None:
             # this is first json. omit diff
             # write blank diff and write whole translation values
             NlpUtils.DumpTrDiff(vtVer.trDiff, [], [])
             NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict(zip(plainKeys, plainValues)))
         else:
             # compare with prev json
-            cmpResult = jsondiff.diff(prevJson, plainValues)
+            cmpResult = jsondiff.diff(prevPlainValues, plainValues)
             # seperate diff result
             (insertedKey, deletedKey, insertedVal) = NlpUtils.SeperatePlainJsonDiff(cmpResult)
 
@@ -42,5 +42,5 @@ if __name__ == "__main__":
             NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict((plainKeys[insertedKey[i]], insertedVal[i]) for i in range(len(insertedKey))))
 
         # assign prev json
-        prevJson = plainValues
+        prevPlainValues = plainValues
 
diff --git a/NlpProc/NlpJsonEncoder.py b/NlpProc/NlpJsonEncoder.py
index 4901183..fa6d6a1 100644
--- a/NlpProc/NlpJsonEncoder.py
+++ b/NlpProc/NlpJsonEncoder.py
@@ -3,12 +3,12 @@ import jsondiff
 import collections
 
 g_SupportedEncoding = {
-    'zh-cn': ('utf-8', 'gb2312', )
+    'zh-cn': ('Chinese', ('utf-8', 'gb2312', ), )
 }
 
 VtTrDataTuple = collections.namedtuple('VtTrDataTuple', ('rawNlp', 'trTemplate', 'trDiff', 'trIndex'))
-def GetRawNlpPath(ver: str, lang: str) -> str:
-    return f'../NlpTr/out/VT{ver}.{lang}.txt'
+def GetRawNlpPath(ver: str, lang: str, enc: str) -> str:
+    return f'../NlpTr/out/VT{ver}.{lang}.{enc}.txt'
 def GetTrPath(ver: str, lang: str) -> str:
     return f'../NlpTr/VT{ver}.{lang}.json'
 def GetTrDiffPath(ver: str) -> str:
@@ -18,16 +18,52 @@ def GetTrIndexPath(ver: str) -> str:
 
 if __name__ == "__main__":
 
+    # pre-load each version's diff data and index (key list) data for convenient use
+    PreLoadedDiffIdxTuple = collections.namedtuple('PreLoadedDiffIndexTuple', ('insertedKey', 'deletedKey', 'plainKeys'))
+    preLoadedData: dict[str, PreLoadedDiffIdxTuple] = {}
     for ver in NlpUtils.g_VirtoolsVersion:
         # load diff and index data
+        insertedKey, deletedKey = NlpUtils.LoadTrDiff(GetTrDiffPath(ver))
+        plainKeys = NlpUtils.LoadTrIndex(GetTrIndexPath(ver))
+        # insert to dict
+        preLoadedData[ver] = PreLoadedDiffIdxTuple._make((insertedKey, deletedKey, plainKeys))
+
+    # iterate lang first
+    # because we use progressive patch, vt versions must be iterated in order
+    for lang in NlpUtils.g_SupportedLangs:
+
+        prevPlainValues: list[str] = None
+        for ver in NlpUtils.g_VirtoolsVersion:
+            print(f'Processing {ver}.{lang}...')
+
+            # pick data from pre-loaded dict
+            diffIdxData = preLoadedData[ver]
 
-        for lang in NlpUtils.g_SupportedLangs:
             # load lang file
+            # and only keep its values. list() also copes with an empty file.
+            trFull = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang))
+            plainValues = list(trFull.values())
 
-            # patch it
+            # patch it if needed
+            if prevPlainValues is not None:
+                # this is not the first version. the values loaded above are
+                # passed on as the inserted entries of this version and get
+                # merged into the previous version's full value list below.
 
-            # convert plain json to nested json
+                # re-construct the diff structure understood by jsondiff
+                cmpResult = NlpUtils.CombinePlainJsonDiff(diffIdxData.insertedKey, diffIdxData.deletedKey, plainValues)
+
+                # patch data
+                plainValues = jsondiff.patch(prevPlainValues, cmpResult)
+
+            # convert plain json to nlp json. use the key list of this very version, not a leftover one
+            nlpJson = NlpUtils.PlainJson2NlpJson(diffIdxData.plainKeys, plainValues)
 
             # write into file with different encoding
-            for enc in g_SupportedEncoding[lang]:
-                print(f'Process {ver}.{lang}.{enc}...')
+            lang_macro, encs = g_SupportedEncoding[lang]
+            for enc in encs:
+                print(f'Processing {ver}.{lang}.{enc}...')
+                NlpUtils.DumpNlpJson(GetRawNlpPath(ver, lang, enc), enc, lang_macro, nlpJson)
+
+            # assign prev json
+            prevPlainValues = plainValues
\ No newline at end of file
diff --git a/NlpProc/NlpUtils.py b/NlpProc/NlpUtils.py
index 6adb18d..4abc24d 100644
--- a/NlpProc/NlpUtils.py
+++ b/NlpProc/NlpUtils.py
@@ -2,6 +2,7 @@ import jsondiff
 import collections
 import io
 import json
+import re
 
 g_VirtoolsVersion: tuple[str] = (
     '25', '35', '40', '50',
@@ -40,7 +41,7 @@ def DumpTrTemplate(filepath: str, templateData: dict[str, str]):
 def LoadTrTemplate(filepath: str) ->  dict[str, str]:
     return LoadJson(filepath)
 
-def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]):
+def DumpTrDiff(filepath: str, insertedKey: list[int], deletedKey: list[int]):
     with open(filepath, 'w', encoding='utf-8') as f:
         for entryIdx in insertedKey:
             f.write(f'i/{entryIdx}\n')
@@ -49,9 +50,9 @@ def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]):
             f.write(f'd/{entryIdx}\n')
 
 # return a tuple. (insertedKey, deletedKey)
-def LoadTrDiff(filepath: str) -> dict:
-    insertedKey: list[str] = []
-    deletedKey: list[str] = []
+def LoadTrDiff(filepath: str) -> tuple:
+    insertedKey: list[int] = []
+    deletedKey: list[int] = []
     with open(filepath, 'r', encoding='utf-8') as f:
         while True:
             ln = f.readline()
@@ -59,9 +60,9 @@ def LoadTrDiff(filepath: str) -> dict:
 
             sp = ln.strip('\n').split('/')
             if sp[0] == 'i':
-                insertedKey.append(sp[1])
+                insertedKey.append(int(sp[1]))
             else:
-                deletedKey.append(sp[1])
+                deletedKey.append(int(sp[1]))
 
     return (insertedKey, deletedKey)
 
@@ -121,3 +122,81 @@ def InternalNlpJson2PlainJson(nlpJson: dict, stack: collections.deque, keyList:
             InternalNlpJson2PlainJson(entry, stack, keyList, valueList)
             stack.pop()
 
+def PlainJson2NlpJson(keyList: list[str], valueList: list[str]) -> dict:
+    # create the base (root) section.
+    # each section has 3 k-v pairs. language/section and entities exist in the original nlp json,
+    # while key_map is an extra name -> sub section lookup used for path finding while building.
+    result: dict = {
+        "language": "English",
+        "entities": [],
+        "key_map": {}
+    }
+    # iterate both lists in lockstep and insert each key/value pair into the tree
+    for k, v in zip(keyList, valueList):
+        InternalPlainJson2NlpJson(result, k, v)
+    return result
+def InternalPlainJson2NlpJson(nlpJson: dict, pairKey: str, pairVal: str):
+    keypath = pairKey.split('/')  # the key is a '/' separated section path
+    # confirm the last node is a number (the entry index) and remove it
+    assert keypath[-1].isdecimal()
+    keypath = keypath[0:-1]
+
+    # walk down to the target sub section, creating missing ones on the way
+    for pathpart in keypath:
+        if pathpart in nlpJson['key_map']:
+            # sub section already exists. enter it directly
+            nlpJson = nlpJson['key_map'][pathpart]
+        else:
+            # create a new, empty sub section named after this path part
+            sub_section = {
+                'section': pathpart,
+                'entities': [],
+                'key_map': {}
+            }
+
+            # register it in the current section: in entities (output order) and in key_map (lookup)
+            nlpJson['entities'].append(sub_section)
+            nlpJson['key_map'][pathpart] = sub_section
+
+            # move into the newly created sub section
+            nlpJson = sub_section
+
+    # append the value to the entities of the section we ended up in
+    nlpJson['entities'].append(pairVal)
+
+
+
+def DumpNlpJson(filepath: str, encoding: str, lang_macro: str, nlpJson: dict):
+    # write in wb mode because we need to explicitly write \r\n, not \n
+    with open(filepath, 'wb') as f:
+        f.write(f'Language:{lang_macro}\r\n'.encode(encoding, errors='ignore'))  # NOTE: errors='ignore' silently drops characters the encoding can not represent
+        InternalDumpNlpJson(f, encoding, 0, nlpJson)
+
+g_NlpJsonStrRepl1 = re.compile('\\\\')  # matches one literal backslash. not used by NlpJsonStringProcessor
+g_NlpJsonStrRepl2 = re.compile('\"')  # matches one double quote
+def NlpJsonStringProcessor(strl: str) -> str:
+    return g_NlpJsonStrRepl2.sub('\"\"', strl)  # escape every " by doubling it to ""
+
+def InternalDumpNlpJson(f: io.BufferedWriter, encoding: str, depth: int, nlpJson: dict):
+    assert 'entities' in nlpJson
+
+    is_first: bool = True
+    for entity in nlpJson['entities']:
+        if isinstance(entity, str):
+            # write a comma separator before every string except the first one of this section
+            if not is_first: f.write(','.encode(encoding))
+            else: is_first = False
+
+            # write the string itself, wrapped in double quotes.
+            # every " inside it is doubled to "" as escape
+            f.write('"{0}"'.format(NlpJsonStringProcessor(entity)).encode(encoding, errors='ignore'))
+        else:
+            # sub section: write its header, [name] at depth 0 and <name> at any deeper level,
+            # then call self to dump its content.
+            if depth == 0:
+                f.write(f'\r\n[{entity["section"]}]\r\n'.encode(encoding, errors='ignore'))
+            else:
+                f.write(f'\r\n<{entity["section"]}>\r\n'.encode(encoding, errors='ignore'))
+
+            InternalDumpNlpJson(f, encoding, depth + 1, entity)
+