From b740e95a62397e5b8aedd459ab65d814dc43cd36 Mon Sep 17 00:00:00 2001 From: yyc12345 Date: Tue, 11 Jul 2023 22:36:07 +0800 Subject: [PATCH] finish script. but vt can not recognize it --- NlpParser/NlpRunner.java | 6 +-- NlpProc/NlpJsonDecoder.py | 8 ++-- NlpProc/NlpJsonEncoder.py | 52 ++++++++++++++++++---- NlpProc/NlpUtils.py | 91 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 136 insertions(+), 21 deletions(-) diff --git a/NlpParser/NlpRunner.java b/NlpParser/NlpRunner.java index d53998b..1e20bc0 100644 --- a/NlpParser/NlpRunner.java +++ b/NlpParser/NlpRunner.java @@ -41,7 +41,7 @@ public class NlpRunner { // [^\\r\\n]*[\\r\\n]+ is match to line breaker. private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+"); private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\""); - private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\"); + // private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\"); private static final Pattern mRegEscTab = Pattern.compile("\\t"); private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n"); private String cutLangHead(String strl) { @@ -56,8 +56,8 @@ public class NlpRunner { private String regulateString(String strl) { strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement("")); // remove string concator strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));// replace "" with " - strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\")); // replace real escape to escape char - strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); + // strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));// leave double back slash alone. we still need it. + strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); // replace real escape to escape char strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n")); return strl; diff --git a/NlpProc/NlpJsonDecoder.py b/NlpProc/NlpJsonDecoder.py index 6d0de6b..0e75770 100644 --- a/NlpProc/NlpJsonDecoder.py +++ b/NlpProc/NlpJsonDecoder.py @@ -13,7 +13,7 @@ def ConstructVtTrDataTuple() -> tuple[VtTrDataTuple]: if __name__ == "__main__": - prevJson = None + prevPlainValues = None for vtVer in ConstructVtTrDataTuple(): print(f'Processing {vtVer.nlpJson}...') @@ -25,14 +25,14 @@ if __name__ == "__main__": NlpUtils.DumpTrIndex(vtVer.trIndex, plainKeys) # compare with previous one - if prevJson is None: + if prevPlainValues is None: # this is first json. omit diff # write blank diff and write whole translation values NlpUtils.DumpTrDiff(vtVer.trDiff, [], []) NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict(zip(plainKeys, plainValues))) else: # compare with prev json - cmpResult = jsondiff.diff(prevJson, plainValues) + cmpResult = jsondiff.diff(prevPlainValues, plainValues) # seperate diff result (insertedKey, deletedKey, insertedVal) = NlpUtils.SeperatePlainJsonDiff(cmpResult) @@ -42,5 +42,5 @@ if __name__ == "__main__": NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict((plainKeys[insertedKey[i]], insertedVal[i]) for i in range(len(insertedKey)))) # assign prev json - prevJson = plainValues + prevPlainValues = plainValues diff --git a/NlpProc/NlpJsonEncoder.py b/NlpProc/NlpJsonEncoder.py index 4901183..fa6d6a1 100644 --- a/NlpProc/NlpJsonEncoder.py +++ b/NlpProc/NlpJsonEncoder.py @@ -3,12 +3,12 @@ import jsondiff import collections g_SupportedEncoding = { - 'zh-cn': ('utf-8', 'gb2312', ) + 'zh-cn': ('Chinese', ('utf-8', 'gb2312', ), ) } VtTrDataTuple = collections.namedtuple('VtTrDataTuple', ('rawNlp', 'trTemplate', 'trDiff', 'trIndex')) -def GetRawNlpPath(ver: str, lang: str) -> str: - return f'../NlpTr/out/VT{ver}.{lang}.txt' +def GetRawNlpPath(ver: str, lang: str, enc: str) -> str: + return f'../NlpTr/out/VT{ver}.{lang}.{enc}.txt' def GetTrPath(ver: str, lang: str) -> str: return f'../NlpTr/VT{ver}.{lang}.json' def GetTrDiffPath(ver: str) -> str: @@ -18,16 +18,52 @@ def GetTrIndexPath(ver: str) -> str: if __name__ == "__main__": + # load each version's diff data and patch data for conventient using + PreLoadedDiffIdxTuple = collections.namedtuple('PreLoadedDiffIndexTuple', ('insertedKey', 'deletedKey', 'plainKeys')) + preLoadedData: dict[str, PreLoadedDiffIdxTuple] = {} for ver in NlpUtils.g_VirtoolsVersion: # load diff and index data + insertedKey, deletedKey = NlpUtils.LoadTrDiff(GetTrDiffPath(ver)) + plainKeys = NlpUtils.LoadTrIndex(GetTrIndexPath(ver)) + # insert to dict + preLoadedData[ver] = PreLoadedDiffIdxTuple._make((insertedKey, deletedKey, plainKeys)) + + # iterate lang first + # because we use progressive patch. we need iterate vt ver in order + for lang in NlpUtils.g_SupportedLangs: + + prevPlainValues: list[str] = None + for ver in NlpUtils.g_VirtoolsVersion: + print(f'Processing {ver}.{lang}...') + + # pick data from pre-loaded dict + diffIdxData = preLoadedData[ver] - for lang in NlpUtils.g_SupportedLangs: # load lang file + # and only keeps its value. + trFull = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang)) + _, plainValues = zip(*trFull.items()) - # patch it + # patch it if needed + if prevPlainValues is not None: + # patch needed load + # load patch part first + trPart = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang)) - # convert plain json to nested json + # re-construct the diff structure understood by jsondiff + cmpResult = NlpUtils.CombinePlainJsonDiff(diffIdxData.insertedKey, diffIdxData.deletedKey, plainValues) + + # patch data + plainValues = jsondiff.patch(prevPlainValues, cmpResult) + + # convert plain json to nlp json + nlpJson = NlpUtils.PlainJson2NlpJson(plainKeys, plainValues) # write into file with different encoding - for enc in g_SupportedEncoding[lang]: - print(f'Process {ver}.{lang}.{enc}...') + lang_macro, encs = g_SupportedEncoding[lang] + for enc in encs: + print(f'Processing {ver}.{lang}.{enc}...') + NlpUtils.DumpNlpJson(GetRawNlpPath(ver, lang, enc), enc, lang_macro, nlpJson) + + # assign prev json + prevPlainValues = plainValues \ No newline at end of file diff --git a/NlpProc/NlpUtils.py b/NlpProc/NlpUtils.py index 6adb18d..4abc24d 100644 --- a/NlpProc/NlpUtils.py +++ b/NlpProc/NlpUtils.py @@ -2,6 +2,7 @@ import jsondiff import collections import io import json +import re g_VirtoolsVersion: tuple[str] = ( '25', '35', '40', '50', @@ -40,7 +41,7 @@ def DumpTrTemplate(filepath: str, templateData: dict[str, str]): def LoadTrTemplate(filepath: str) -> dict[str, str]: return LoadJson(filepath) -def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]): +def DumpTrDiff(filepath: str, insertedKey: list[int], deletedKey: list[int]): with open(filepath, 'w', encoding='utf-8') as f: for entryIdx in insertedKey: f.write(f'i/{entryIdx}\n') @@ -49,9 +50,9 @@ def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]): f.write(f'd/{entryIdx}\n') # return a tuple. (insertedKey, deletedKey) -def LoadTrDiff(filepath: str) -> dict: - insertedKey: list[str] = [] - deletedKey: list[str] = [] +def LoadTrDiff(filepath: str) -> tuple: + insertedKey: list[int] = [] + deletedKey: list[int] = [] with open(filepath, 'r', encoding='utf-8') as f: while True: ln = f.readline() @@ -59,9 +60,9 @@ def LoadTrDiff(filepath: str) -> dict: sp = ln.strip('\n').split('/') if sp[0] == 'i': - insertedKey.append(sp[1]) + insertedKey.append(int(sp[1])) else: - deletedKey.append(sp[1]) + deletedKey.append(int(sp[1])) return (insertedKey, deletedKey) @@ -121,3 +122,81 @@ def InternalNlpJson2PlainJson(nlpJson: dict, stack: collections.deque, keyList: InternalNlpJson2PlainJson(entry, stack, keyList, valueList) stack.pop() +def PlainJson2NlpJson(keyList: list[str], valueList: list[str]) -> dict: + # create the base section + # each section will have 3 k-v pair. language/section and entities are existed in original nlp json + # and key_map is served for path finding and convenient for looking for sub section. + result: dict = { + "language": "English", + "entities": [], + "key_map": {} + } + # inerate list and construct dict + for k, v in zip(keyList, valueList): + InternalPlainJson2NlpJson(result, k, v) + return result +def InternalPlainJson2NlpJson(nlpJson: dict, pairKey: str, pairVal: str): + keypath = pairKey.split('/') + # confirm last node is number and remove it + assert keypath[-1].isdecimal() + keypath = keypath[0:-1] + + # move to correct sub section + for pathpart in keypath: + if pathpart in nlpJson['key_map']: + # existed sub section. directly entering + nlpJson = nlpJson['key_map'][pathpart] + else: + # create a new one + sub_section = { + 'section': pathpart, + 'entities': [], + 'key_map': {} + } + + # add into current section + nlpJson['entities'].append(sub_section) + nlpJson['key_map'][pathpart] = sub_section + + # move to the new created sub section + nlpJson = sub_section + + # insert data + nlpJson['entities'].append(pairVal) + + + +def DumpNlpJson(filepath: str, encoding: str, lang_macro: str, nlpJson: dict): + # write in wb mode because we need explicitly write \r\n, not \n + with open(filepath, 'wb') as f: + f.write(f'Language:{lang_macro}\r\n'.encode(encoding, errors='ignore')) + InternalDumpNlpJson(f, encoding, 0, nlpJson) + +g_NlpJsonStrRepl1 = re.compile('\\\\') +g_NlpJsonStrRepl2 = re.compile('\"') +def NlpJsonStringProcessor(strl: str) -> str: + return g_NlpJsonStrRepl2.sub('\"\"', strl) + +def InternalDumpNlpJson(f: io.BufferedWriter, encoding: str, depth: int, nlpJson: dict): + assert 'entities' in nlpJson + + is_first: bool = True + for entity in nlpJson['entities']: + if isinstance(entity, str): + # write comma if not the first element + if not is_first: f.write(','.encode(encoding)) + else: is_first = False + + # write real data + # replace all " to "" to escape + f.write('"{0}"'.format(NlpJsonStringProcessor(entity)).encode(encoding, errors='ignore')) + else: + # sub section + # write section header and call self. + if depth == 0: + f.write(f'\r\n[{entity["section"]}]\r\n'.encode(encoding, errors='ignore')) + else: + f.write(f'\r\n<{entity["section"]}>\r\n'.encode(encoding, errors='ignore')) + + InternalDumpNlpJson(f, encoding, depth + 1, entity) +