Finish the script, but VT cannot recognize it.
This commit is contained in:
parent
4651110885
commit
b740e95a62
@ -41,7 +41,7 @@ public class NlpRunner {
|
|||||||
// [^\\r\\n]*[\\r\\n]+ is match to line breaker.
|
// [^\\r\\n]*[\\r\\n]+ is match to line breaker.
|
||||||
private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
|
private static final Pattern mRegStrCctor = Pattern.compile("\\\\[^\\\\rn][^\\r\\n]*[\\r\\n]+");
|
||||||
private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\"");
|
private static final Pattern mRegDoubleQuote = Pattern.compile("\\\"\\\"");
|
||||||
private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\");
|
// private static final Pattern mRegEscSlash = Pattern.compile("\\\\\\\\");
|
||||||
private static final Pattern mRegEscTab = Pattern.compile("\\t");
|
private static final Pattern mRegEscTab = Pattern.compile("\\t");
|
||||||
private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n");
|
private static final Pattern mRegEscEol = Pattern.compile("\\r?\\n");
|
||||||
private String cutLangHead(String strl) {
|
private String cutLangHead(String strl) {
|
||||||
@ -56,8 +56,8 @@ public class NlpRunner {
|
|||||||
private String regulateString(String strl) {
|
private String regulateString(String strl) {
|
||||||
strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement("")); // remove string concator
|
strl = mRegStrCctor.matcher(strl).replaceAll(Matcher.quoteReplacement("")); // remove string concator
|
||||||
strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));// replace "" with "
|
strl = mRegDoubleQuote.matcher(strl).replaceAll(Matcher.quoteReplacement("\""));// replace "" with "
|
||||||
strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\")); // replace real escape to escape char
|
// strl = mRegEscSlash.matcher(strl).replaceAll(Matcher.quoteReplacement("\\"));// leave double back slash alone. we still need it.
|
||||||
strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t"));
|
strl = mRegEscTab.matcher(strl).replaceAll(Matcher.quoteReplacement("\\t")); // replace real escape to escape char
|
||||||
strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
|
strl = mRegEscEol.matcher(strl).replaceAll(Matcher.quoteReplacement("\\n"));
|
||||||
|
|
||||||
return strl;
|
return strl;
|
||||||
|
@ -13,7 +13,7 @@ def ConstructVtTrDataTuple() -> tuple[VtTrDataTuple]:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
prevJson = None
|
prevPlainValues = None
|
||||||
for vtVer in ConstructVtTrDataTuple():
|
for vtVer in ConstructVtTrDataTuple():
|
||||||
print(f'Processing {vtVer.nlpJson}...')
|
print(f'Processing {vtVer.nlpJson}...')
|
||||||
|
|
||||||
@ -25,14 +25,14 @@ if __name__ == "__main__":
|
|||||||
NlpUtils.DumpTrIndex(vtVer.trIndex, plainKeys)
|
NlpUtils.DumpTrIndex(vtVer.trIndex, plainKeys)
|
||||||
|
|
||||||
# compare with previous one
|
# compare with previous one
|
||||||
if prevJson is None:
|
if prevPlainValues is None:
|
||||||
# this is first json. omit diff
|
# this is first json. omit diff
|
||||||
# write blank diff and write whole translation values
|
# write blank diff and write whole translation values
|
||||||
NlpUtils.DumpTrDiff(vtVer.trDiff, [], [])
|
NlpUtils.DumpTrDiff(vtVer.trDiff, [], [])
|
||||||
NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict(zip(plainKeys, plainValues)))
|
NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict(zip(plainKeys, plainValues)))
|
||||||
else:
|
else:
|
||||||
# compare with prev json
|
# compare with prev json
|
||||||
cmpResult = jsondiff.diff(prevJson, plainValues)
|
cmpResult = jsondiff.diff(prevPlainValues, plainValues)
|
||||||
# seperate diff result
|
# seperate diff result
|
||||||
(insertedKey, deletedKey, insertedVal) = NlpUtils.SeperatePlainJsonDiff(cmpResult)
|
(insertedKey, deletedKey, insertedVal) = NlpUtils.SeperatePlainJsonDiff(cmpResult)
|
||||||
|
|
||||||
@ -42,5 +42,5 @@ if __name__ == "__main__":
|
|||||||
NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict((plainKeys[insertedKey[i]], insertedVal[i]) for i in range(len(insertedKey))))
|
NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict((plainKeys[insertedKey[i]], insertedVal[i]) for i in range(len(insertedKey))))
|
||||||
|
|
||||||
# assign prev json
|
# assign prev json
|
||||||
prevJson = plainValues
|
prevPlainValues = plainValues
|
||||||
|
|
||||||
|
@ -3,12 +3,12 @@ import jsondiff
|
|||||||
import collections
|
import collections
|
||||||
|
|
||||||
g_SupportedEncoding = {
|
g_SupportedEncoding = {
|
||||||
'zh-cn': ('utf-8', 'gb2312', )
|
'zh-cn': ('Chinese', ('utf-8', 'gb2312', ), )
|
||||||
}
|
}
|
||||||
|
|
||||||
VtTrDataTuple = collections.namedtuple('VtTrDataTuple', ('rawNlp', 'trTemplate', 'trDiff', 'trIndex'))
|
VtTrDataTuple = collections.namedtuple('VtTrDataTuple', ('rawNlp', 'trTemplate', 'trDiff', 'trIndex'))
|
||||||
def GetRawNlpPath(ver: str, lang: str) -> str:
|
def GetRawNlpPath(ver: str, lang: str, enc: str) -> str:
|
||||||
return f'../NlpTr/out/VT{ver}.{lang}.txt'
|
return f'../NlpTr/out/VT{ver}.{lang}.{enc}.txt'
|
||||||
def GetTrPath(ver: str, lang: str) -> str:
|
def GetTrPath(ver: str, lang: str) -> str:
|
||||||
return f'../NlpTr/VT{ver}.{lang}.json'
|
return f'../NlpTr/VT{ver}.{lang}.json'
|
||||||
def GetTrDiffPath(ver: str) -> str:
|
def GetTrDiffPath(ver: str) -> str:
|
||||||
@ -18,16 +18,52 @@ def GetTrIndexPath(ver: str) -> str:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
# load each version's diff data and patch data up front for convenient use
|
||||||
|
PreLoadedDiffIdxTuple = collections.namedtuple('PreLoadedDiffIndexTuple', ('insertedKey', 'deletedKey', 'plainKeys'))
|
||||||
|
preLoadedData: dict[str, PreLoadedDiffIdxTuple] = {}
|
||||||
for ver in NlpUtils.g_VirtoolsVersion:
|
for ver in NlpUtils.g_VirtoolsVersion:
|
||||||
# load diff and index data
|
# load diff and index data
|
||||||
|
insertedKey, deletedKey = NlpUtils.LoadTrDiff(GetTrDiffPath(ver))
|
||||||
|
plainKeys = NlpUtils.LoadTrIndex(GetTrIndexPath(ver))
|
||||||
|
# insert to dict
|
||||||
|
preLoadedData[ver] = PreLoadedDiffIdxTuple._make((insertedKey, deletedKey, plainKeys))
|
||||||
|
|
||||||
|
# iterate lang first
|
||||||
|
# because we use progressive patch. we need iterate vt ver in order
|
||||||
for lang in NlpUtils.g_SupportedLangs:
|
for lang in NlpUtils.g_SupportedLangs:
|
||||||
|
|
||||||
|
prevPlainValues: list[str] = None
|
||||||
|
for ver in NlpUtils.g_VirtoolsVersion:
|
||||||
|
print(f'Processing {ver}.{lang}...')
|
||||||
|
|
||||||
|
# pick data from pre-loaded dict
|
||||||
|
diffIdxData = preLoadedData[ver]
|
||||||
|
|
||||||
# load lang file
|
# load lang file
|
||||||
|
# and only keeps its value.
|
||||||
|
trFull = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang))
|
||||||
|
_, plainValues = zip(*trFull.items())
|
||||||
|
|
||||||
# patch it
|
# patch it if needed
|
||||||
|
if prevPlainValues is not None:
|
||||||
|
# patch needed load
|
||||||
|
# load patch part first
|
||||||
|
trPart = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang))
|
||||||
|
|
||||||
# convert plain json to nested json
|
# re-construct the diff structure understood by jsondiff
|
||||||
|
cmpResult = NlpUtils.CombinePlainJsonDiff(diffIdxData.insertedKey, diffIdxData.deletedKey, plainValues)
|
||||||
|
|
||||||
|
# patch data
|
||||||
|
plainValues = jsondiff.patch(prevPlainValues, cmpResult)
|
||||||
|
|
||||||
|
# convert plain json to nlp json
|
||||||
|
nlpJson = NlpUtils.PlainJson2NlpJson(plainKeys, plainValues)
|
||||||
|
|
||||||
# write into file with different encoding
|
# write into file with different encoding
|
||||||
for enc in g_SupportedEncoding[lang]:
|
lang_macro, encs = g_SupportedEncoding[lang]
|
||||||
print(f'Process {ver}.{lang}.{enc}...')
|
for enc in encs:
|
||||||
|
print(f'Processing {ver}.{lang}.{enc}...')
|
||||||
|
NlpUtils.DumpNlpJson(GetRawNlpPath(ver, lang, enc), enc, lang_macro, nlpJson)
|
||||||
|
|
||||||
|
# assign prev json
|
||||||
|
prevPlainValues = plainValues
|
@ -2,6 +2,7 @@ import jsondiff
|
|||||||
import collections
|
import collections
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
g_VirtoolsVersion: tuple[str] = (
|
g_VirtoolsVersion: tuple[str] = (
|
||||||
'25', '35', '40', '50',
|
'25', '35', '40', '50',
|
||||||
@ -40,7 +41,7 @@ def DumpTrTemplate(filepath: str, templateData: dict[str, str]):
|
|||||||
def LoadTrTemplate(filepath: str) -> dict[str, str]:
|
def LoadTrTemplate(filepath: str) -> dict[str, str]:
|
||||||
return LoadJson(filepath)
|
return LoadJson(filepath)
|
||||||
|
|
||||||
def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]):
|
def DumpTrDiff(filepath: str, insertedKey: list[int], deletedKey: list[int]):
|
||||||
with open(filepath, 'w', encoding='utf-8') as f:
|
with open(filepath, 'w', encoding='utf-8') as f:
|
||||||
for entryIdx in insertedKey:
|
for entryIdx in insertedKey:
|
||||||
f.write(f'i/{entryIdx}\n')
|
f.write(f'i/{entryIdx}\n')
|
||||||
@ -49,9 +50,9 @@ def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]):
|
|||||||
f.write(f'd/{entryIdx}\n')
|
f.write(f'd/{entryIdx}\n')
|
||||||
|
|
||||||
# return a tuple. (insertedKey, deletedKey)
|
# return a tuple. (insertedKey, deletedKey)
|
||||||
def LoadTrDiff(filepath: str) -> dict:
|
def LoadTrDiff(filepath: str) -> tuple:
|
||||||
insertedKey: list[str] = []
|
insertedKey: list[int] = []
|
||||||
deletedKey: list[str] = []
|
deletedKey: list[int] = []
|
||||||
with open(filepath, 'r', encoding='utf-8') as f:
|
with open(filepath, 'r', encoding='utf-8') as f:
|
||||||
while True:
|
while True:
|
||||||
ln = f.readline()
|
ln = f.readline()
|
||||||
@ -59,9 +60,9 @@ def LoadTrDiff(filepath: str) -> dict:
|
|||||||
|
|
||||||
sp = ln.strip('\n').split('/')
|
sp = ln.strip('\n').split('/')
|
||||||
if sp[0] == 'i':
|
if sp[0] == 'i':
|
||||||
insertedKey.append(sp[1])
|
insertedKey.append(int(sp[1]))
|
||||||
else:
|
else:
|
||||||
deletedKey.append(sp[1])
|
deletedKey.append(int(sp[1]))
|
||||||
|
|
||||||
return (insertedKey, deletedKey)
|
return (insertedKey, deletedKey)
|
||||||
|
|
||||||
@ -121,3 +122,81 @@ def InternalNlpJson2PlainJson(nlpJson: dict, stack: collections.deque, keyList:
|
|||||||
InternalNlpJson2PlainJson(entry, stack, keyList, valueList)
|
InternalNlpJson2PlainJson(entry, stack, keyList, valueList)
|
||||||
stack.pop()
|
stack.pop()
|
||||||
|
|
||||||
|
def PlainJson2NlpJson(keyList: list[str], valueList: list[str]) -> dict:
    """Rebuild a nested NLP json tree from parallel plain key/value lists.

    The returned root dict has three entries: ``language`` (always
    ``"English"``), ``entities`` (the ordered children) and ``key_map``
    (a name -> sub section index used for path lookup while building).

    Keys and values are paired with ``zip``, so if the lists differ in
    length the surplus items of the longer one are not inserted.
    """
    # root section; sub sections get attached below it on demand
    root: dict = dict(language="English", entities=[], key_map={})

    # feed each key/value pair into the tree, in list order
    for plainKey, plainValue in zip(keyList, valueList):
        InternalPlainJson2NlpJson(root, plainKey, plainValue)

    return root
|
||||||
|
def InternalPlainJson2NlpJson(nlpJson: dict, pairKey: str, pairVal: str):
    """Insert one plain key/value pair into a nested NLP json tree, in place.

    ``pairKey`` is a '/'-separated path. Its last component must be a
    decimal index and is discarded; the remaining components name the
    chain of sub sections to descend through, starting at ``nlpJson``.
    Sub sections that do not exist yet are created, appended to the
    parent's ``entities`` and registered in the parent's ``key_map``.
    ``pairVal`` is finally appended to the ``entities`` of the innermost
    section.

    Raises:
        ValueError: if the last path component is not a decimal number.
            The check runs before the tree is touched, and unlike an
            ``assert`` it is not stripped when Python runs with ``-O``.
    """
    # split off the trailing index; everything before it is the section path
    *sectionPath, entryIndex = pairKey.split('/')
    if not entryIndex.isdecimal():
        raise ValueError(
            f'plain json key must end with a decimal index: {pairKey!r}')

    # walk down to the target section, creating missing levels on the way
    node = nlpJson
    for sectionName in sectionPath:
        keyMap = node['key_map']
        if sectionName not in keyMap:
            newSection = {
                'section': sectionName,
                'entities': [],
                'key_map': {}
            }
            # the same dict is stored twice: ordered child + lookup index
            node['entities'].append(newSection)
            keyMap[sectionName] = newSection
        node = keyMap[sectionName]

    # insert data
    node['entities'].append(pairVal)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def DumpNlpJson(filepath: str, encoding: str, lang_macro: str, nlpJson: dict):
    """Write an NLP json tree to ``filepath`` as text encoded with ``encoding``.

    The file starts with a ``Language:<lang_macro>`` line, followed by the
    output of ``InternalDumpNlpJson`` for ``nlpJson`` at depth 0. Characters
    of the header that the encoding cannot represent are dropped
    (``errors='ignore'``).
    """
    # binary mode, so the explicit \r\n line endings are written untouched
    with open(filepath, 'wb') as f:
        header = f'Language:{lang_macro}\r\n'
        f.write(header.encode(encoding, errors='ignore'))
        InternalDumpNlpJson(f, encoding, 0, nlpJson)
|
||||||
|
|
||||||
|
# g_NlpJsonStrRepl1 matches a single backslash, g_NlpJsonStrRepl2 a double quote.
# Both stay available as module-level names.
g_NlpJsonStrRepl1 = re.compile('\\\\')
g_NlpJsonStrRepl2 = re.compile('\"')


def NlpJsonStringProcessor(strl: str) -> str:
    """Return ``strl`` with every double quote doubled (" becomes "")."""
    return strl.replace('"', '""')
|
||||||
|
|
||||||
|
def InternalDumpNlpJson(f: io.BufferedWriter, encoding: str, depth: int, nlpJson: dict):
    """Recursively write the ``entities`` of one NLP json section to ``f``.

    String entities are written as double-quoted values, escaped through
    ``NlpJsonStringProcessor`` and separated by commas. Dict entities are
    sub sections: a header line is written (``[name]`` when ``depth`` is 0,
    ``<name>`` otherwise, surrounded by \\r\\n) and the function recurses
    with ``depth + 1``. Everything is encoded with ``encoding``; unencodable
    characters in values and headers are dropped (``errors='ignore'``).
    """
    assert 'entities' in nlpJson

    # sections directly under the root use [], deeper ones use <>
    if depth == 0:
        openMark, closeMark = '[', ']'
    else:
        openMark, closeMark = '<', '>'

    # becomes True once the first string of this section has been written;
    # it is not reset by sub sections
    needComma: bool = False
    for child in nlpJson['entities']:
        if not isinstance(child, str):
            # sub section: emit its header, then descend into it
            title = f'\r\n{openMark}{child["section"]}{closeMark}\r\n'
            f.write(title.encode(encoding, errors='ignore'))
            InternalDumpNlpJson(f, encoding, depth + 1, child)
            continue

        # plain value: comma-separate it from the previous value
        if needComma:
            f.write(','.encode(encoding))
        needComma = True

        # quote the value; inner " are escaped as ""
        quoted = '"{0}"'.format(NlpJsonStringProcessor(child))
        f.write(quoted.encode(encoding, errors='ignore'))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user