Finish script, but VT cannot recognize it.
This commit is contained in:
		@ -13,7 +13,7 @@ def ConstructVtTrDataTuple() -> tuple[VtTrDataTuple]:
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 | 
			
		||||
    prevJson = None
 | 
			
		||||
    prevPlainValues = None
 | 
			
		||||
    for vtVer in ConstructVtTrDataTuple():
 | 
			
		||||
        print(f'Processing {vtVer.nlpJson}...')
 | 
			
		||||
 | 
			
		||||
@ -25,14 +25,14 @@ if __name__ == "__main__":
 | 
			
		||||
        NlpUtils.DumpTrIndex(vtVer.trIndex, plainKeys)
 | 
			
		||||
 | 
			
		||||
        # compare with previous one
 | 
			
		||||
        if prevJson is None:
 | 
			
		||||
        if prevPlainValues is None:
 | 
			
		||||
            # this is first json. omit diff
 | 
			
		||||
            # write blank diff and write whole translation values
 | 
			
		||||
            NlpUtils.DumpTrDiff(vtVer.trDiff, [], [])
 | 
			
		||||
            NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict(zip(plainKeys, plainValues)))
 | 
			
		||||
        else:
 | 
			
		||||
            # compare with prev json
 | 
			
		||||
            cmpResult = jsondiff.diff(prevJson, plainValues)
 | 
			
		||||
            cmpResult = jsondiff.diff(prevPlainValues, plainValues)
 | 
			
		||||
            # separate diff result
 | 
			
		||||
            (insertedKey, deletedKey, insertedVal) = NlpUtils.SeperatePlainJsonDiff(cmpResult)
 | 
			
		||||
 | 
			
		||||
@ -42,5 +42,5 @@ if __name__ == "__main__":
 | 
			
		||||
            NlpUtils.DumpTrTemplate(vtVer.trTemplate, dict((plainKeys[insertedKey[i]], insertedVal[i]) for i in range(len(insertedKey))))
 | 
			
		||||
 | 
			
		||||
        # assign prev json
 | 
			
		||||
        prevJson = plainValues
 | 
			
		||||
        prevPlainValues = plainValues
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -3,12 +3,12 @@ import jsondiff
 | 
			
		||||
import collections
 | 
			
		||||
 | 
			
		||||
# Per-language output settings, keyed by language code.
# Each value is a pair: (language name, tuple of text encodings).
# NOTE(review): the language name is presumably the value written after
# "Language:" in the generated NLP text file -- confirm against the caller.
g_SupportedEncoding: dict[str, tuple[str, tuple[str, ...]]] = {
    'zh-cn': ('Chinese', ('utf-8', 'gb2312', ), ),
}
 | 
			
		||||
 | 
			
		||||
VtTrDataTuple = collections.namedtuple('VtTrDataTuple', ('rawNlp', 'trTemplate', 'trDiff', 'trIndex'))
 | 
			
		||||
def GetRawNlpPath(ver: str, lang: str, enc: str) -> str:
    """Return the relative path of the raw NLP text file for one output variant.

    Args:
        ver: version tag placed after the ``VT`` prefix of the file name.
        lang: language code placed in the file name.
        enc: encoding name placed in the file name.

    Returns:
        A path of the form ``../NlpTr/out/VT<ver>.<lang>.<enc>.txt``.
    """
    return f'../NlpTr/out/VT{ver}.{lang}.{enc}.txt'
 | 
			
		||||
def GetTrPath(ver: str, lang: str) -> str:
    """Return the relative path of the translation JSON for a version and language.

    The result has the form ``../NlpTr/VT<ver>.<lang>.json``.
    """
    filename = f'VT{ver}.{lang}.json'
    return f'../NlpTr/{filename}'
 | 
			
		||||
def GetTrDiffPath(ver: str) -> str:
 | 
			
		||||
@ -18,16 +18,52 @@ def GetTrIndexPath(ver: str) -> str:
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 | 
			
		||||
    # load each version's diff data and index data up front for convenient use
 | 
			
		||||
    PreLoadedDiffIdxTuple = collections.namedtuple('PreLoadedDiffIndexTuple', ('insertedKey', 'deletedKey', 'plainKeys'))
 | 
			
		||||
    preLoadedData: dict[str, PreLoadedDiffIdxTuple] = {}
 | 
			
		||||
    for ver in NlpUtils.g_VirtoolsVersion:
 | 
			
		||||
        # load diff and index data
 | 
			
		||||
        insertedKey, deletedKey = NlpUtils.LoadTrDiff(GetTrDiffPath(ver))
 | 
			
		||||
        plainKeys = NlpUtils.LoadTrIndex(GetTrIndexPath(ver))
 | 
			
		||||
        # insert to dict
 | 
			
		||||
        preLoadedData[ver] = PreLoadedDiffIdxTuple._make((insertedKey, deletedKey, plainKeys))
 | 
			
		||||
 | 
			
		||||
    # iterate lang first
 | 
			
		||||
    # because we use progressive patches, we need to iterate the VT versions in order
 | 
			
		||||
    for lang in NlpUtils.g_SupportedLangs:
 | 
			
		||||
        
 | 
			
		||||
        prevPlainValues: list[str] = None
 | 
			
		||||
        for ver in NlpUtils.g_VirtoolsVersion:
 | 
			
		||||
            print(f'Processing {ver}.{lang}...')
 | 
			
		||||
 | 
			
		||||
            # pick data from pre-loaded dict
 | 
			
		||||
            diffIdxData = preLoadedData[ver]
 | 
			
		||||
 | 
			
		||||
        for lang in NlpUtils.g_SupportedLangs:
 | 
			
		||||
            # load lang file
 | 
			
		||||
            # and only keeps its value.
 | 
			
		||||
            trFull = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang))
 | 
			
		||||
            _, plainValues = zip(*trFull.items())
 | 
			
		||||
 | 
			
		||||
            # patch it
 | 
			
		||||
            # patch it if needed
 | 
			
		||||
            if prevPlainValues is not None:
 | 
			
		||||
                # patch needed load
 | 
			
		||||
                # load patch part first
 | 
			
		||||
                trPart = NlpUtils.LoadTrTemplate(GetTrPath(ver, lang))
 | 
			
		||||
 | 
			
		||||
            # convert plain json to nested json
 | 
			
		||||
                # re-construct the diff structure understood by jsondiff
 | 
			
		||||
                cmpResult = NlpUtils.CombinePlainJsonDiff(diffIdxData.insertedKey, diffIdxData.deletedKey, plainValues)
 | 
			
		||||
 | 
			
		||||
                # patch data
 | 
			
		||||
                plainValues = jsondiff.patch(prevPlainValues, cmpResult)
 | 
			
		||||
 | 
			
		||||
            # convert plain json to nlp json
 | 
			
		||||
            nlpJson = NlpUtils.PlainJson2NlpJson(plainKeys, plainValues)
 | 
			
		||||
 | 
			
		||||
            # write into file with different encoding
 | 
			
		||||
            for enc in g_SupportedEncoding[lang]:
 | 
			
		||||
                print(f'Process {ver}.{lang}.{enc}...')
 | 
			
		||||
            lang_macro, encs = g_SupportedEncoding[lang]
 | 
			
		||||
            for enc in encs:
 | 
			
		||||
                print(f'Processing {ver}.{lang}.{enc}...')
 | 
			
		||||
                NlpUtils.DumpNlpJson(GetRawNlpPath(ver, lang, enc), enc, lang_macro, nlpJson)
 | 
			
		||||
 | 
			
		||||
            # assign prev json
 | 
			
		||||
            prevPlainValues = plainValues
 | 
			
		||||
@ -2,6 +2,7 @@ import jsondiff
 | 
			
		||||
import collections
 | 
			
		||||
import io
 | 
			
		||||
import json
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
g_VirtoolsVersion: tuple[str] = (
 | 
			
		||||
    '25', '35', '40', '50',
 | 
			
		||||
@ -40,7 +41,7 @@ def DumpTrTemplate(filepath: str, templateData: dict[str, str]):
 | 
			
		||||
def LoadTrTemplate(filepath: str) ->  dict[str, str]:
    """Load a translation template from ``filepath``.

    This is a thin wrapper: the file is read by ``LoadJson`` and its result
    is returned unchanged.
    """
    templateData = LoadJson(filepath)
    return templateData
 | 
			
		||||
 | 
			
		||||
def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]):
 | 
			
		||||
def DumpTrDiff(filepath: str, insertedKey: list[int], deletedKey: list[int]):
 | 
			
		||||
    with open(filepath, 'w', encoding='utf-8') as f:
 | 
			
		||||
        for entryIdx in insertedKey:
 | 
			
		||||
            f.write(f'i/{entryIdx}\n')
 | 
			
		||||
@ -49,9 +50,9 @@ def DumpTrDiff(filepath: str, insertedKey: list[str], deletedKey: list[str]):
 | 
			
		||||
            f.write(f'd/{entryIdx}\n')
 | 
			
		||||
 | 
			
		||||
# return a tuple. (insertedKey, deletedKey)
 | 
			
		||||
def LoadTrDiff(filepath: str) -> dict:
 | 
			
		||||
    insertedKey: list[str] = []
 | 
			
		||||
    deletedKey: list[str] = []
 | 
			
		||||
def LoadTrDiff(filepath: str) -> tuple:
 | 
			
		||||
    insertedKey: list[int] = []
 | 
			
		||||
    deletedKey: list[int] = []
 | 
			
		||||
    with open(filepath, 'r', encoding='utf-8') as f:
 | 
			
		||||
        while True:
 | 
			
		||||
            ln = f.readline()
 | 
			
		||||
@ -59,9 +60,9 @@ def LoadTrDiff(filepath: str) -> dict:
 | 
			
		||||
 | 
			
		||||
            sp = ln.strip('\n').split('/')
 | 
			
		||||
            if sp[0] == 'i':
 | 
			
		||||
                insertedKey.append(sp[1])
 | 
			
		||||
                insertedKey.append(int(sp[1]))
 | 
			
		||||
            else:
 | 
			
		||||
                deletedKey.append(sp[1])
 | 
			
		||||
                deletedKey.append(int(sp[1]))
 | 
			
		||||
 | 
			
		||||
    return (insertedKey, deletedKey)
 | 
			
		||||
 | 
			
		||||
@ -121,3 +122,81 @@ def InternalNlpJson2PlainJson(nlpJson: dict, stack: collections.deque, keyList:
 | 
			
		||||
            InternalNlpJson2PlainJson(entry, stack, keyList, valueList)
 | 
			
		||||
            stack.pop()
 | 
			
		||||
 | 
			
		||||
def PlainJson2NlpJson(keyList: list[str], valueList: list[str]) -> dict:
    """Rebuild the nested NLP JSON from parallel lists of plain keys and values.

    Each key/value pair is inserted with InternalPlainJson2NlpJson, in list
    order, into a root section whose "language" is fixed to "English".

    Every section dict (the root included) carries two working entries:
    "entities", the ordered children (strings and sub-section dicts), and
    "key_map", a name -> sub-section lookup used for path finding while
    building. The "key_map" entries are left in the returned structure.

    Args:
        keyList: plain keys, one per value.
        valueList: values, in the same order as keyList.

    Returns:
        The root section dict.

    Raises:
        ValueError: if keyList and valueList differ in length. A bare zip()
            would silently drop the surplus entries and hide the mismatch.
    """
    if len(keyList) != len(valueList):
        raise ValueError(
            f'key/value count mismatch: {len(keyList)} keys but {len(valueList)} values'
        )

    # create the base (root) section
    result: dict = {
        "language": "English",
        "entities": [],
        "key_map": {}
    }
    # iterate the lists and construct the nested dict
    for k, v in zip(keyList, valueList):
        InternalPlainJson2NlpJson(result, k, v)
    return result
 | 
			
		||||
def InternalPlainJson2NlpJson(nlpJson: dict, pairKey: str, pairVal: str):
    """Insert one plain key/value pair into a nested NLP JSON section tree.

    ``pairKey`` is a '/'-separated path whose last component must be a
    decimal number; that component is discarded and the remaining components
    name the chain of sub-sections to descend into, starting from
    ``nlpJson``. Missing sub-sections are created on the way, appended to
    the parent's "entities" and registered in the parent's "key_map".
    ``pairVal`` is then appended to the "entities" of the final section.

    Args:
        nlpJson: section dict with "entities" (list) and "key_map" (dict);
            it is modified in place.
        pairKey: plain key such as ``"A/B/0"``.
        pairVal: value to append.

    Raises:
        ValueError: if the last path component is not a decimal number.
            Raised explicitly (rather than via ``assert``) so the check
            still runs when Python is started with ``-O``.
    """
    keypath = pairKey.split('/')
    # confirm the last node is a number, then drop it
    if not keypath[-1].isdecimal():
        raise ValueError(f'plain key must end with a decimal index: {pairKey!r}')

    # walk down to the target section, creating sub-sections as needed
    section = nlpJson
    for pathpart in keypath[:-1]:
        key_map = section['key_map']
        if pathpart in key_map:
            # existing sub-section: enter it directly
            section = key_map[pathpart]
            continue

        sub_section = {
            'section': pathpart,
            'entities': [],
            'key_map': {}
        }
        # register in both the ordered child list and the lookup map
        section['entities'].append(sub_section)
        key_map[pathpart] = sub_section
        section = sub_section

    # insert data
    section['entities'].append(pairVal)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def DumpNlpJson(filepath: str, encoding: str, lang_macro: str, nlpJson: dict):
    """Write an NLP JSON tree to ``filepath`` as encoded text.

    The file starts with a ``Language:<lang_macro>`` line, followed by the
    sections emitted by InternalDumpNlpJson starting at depth 0.
    Characters of the header that ``encoding`` cannot represent are dropped
    (``errors='ignore'``).
    """
    # binary mode: line endings must be written explicitly as \r\n, not \n
    with open(filepath, 'wb') as f:
        header = f'Language:{lang_macro}\r\n'
        f.write(header.encode(encoding, errors='ignore'))
        InternalDumpNlpJson(f, encoding, 0, nlpJson)
 | 
			
		||||
 | 
			
		||||
# Module-level patterns kept for compatibility with any external users.
# g_NlpJsonStrRepl1 matches a single backslash, g_NlpJsonStrRepl2 a double quote.
g_NlpJsonStrRepl1 = re.compile(r'\\')
g_NlpJsonStrRepl2 = re.compile(r'"')
def NlpJsonStringProcessor(strl: str) -> str:
    """Escape a string for the NLP text format by doubling every double quote.

    Only ``"`` is rewritten (to ``""``); all other characters, backslashes
    included, are returned unchanged.
    """
    # a literal one-character replacement needs no regex
    return strl.replace('"', '""')
 | 
			
		||||
 | 
			
		||||
def InternalDumpNlpJson(f: io.BufferedWriter, encoding: str, depth: int, nlpJson: dict):
    """Recursively write one section's entities to the binary stream ``f``.

    String entities are written as double-quoted values, escaped with
    NlpJsonStringProcessor and separated by commas. Dict entities are
    sub-sections: a header line is written (``[name]`` when ``depth`` is 0,
    ``<name>`` otherwise, wrapped in ``\\r\\n``) and the sub-section is then
    dumped with ``depth + 1``. Characters that ``encoding`` cannot represent
    are dropped (``errors='ignore'``).
    """
    assert 'entities' in nlpJson

    # becomes True once the first string of this section has been written
    wrote_string: bool = False
    for entity in nlpJson['entities']:
        if not isinstance(entity, str):
            # sub-section: emit its header, then recurse into it
            if depth == 0:
                header = f'\r\n[{entity["section"]}]\r\n'
            else:
                header = f'\r\n<{entity["section"]}>\r\n'
            f.write(header.encode(encoding, errors='ignore'))
            InternalDumpNlpJson(f, encoding, depth + 1, entity)
            continue

        # every string except the first one is preceded by a comma
        if wrote_string:
            f.write(','.encode(encoding))
        wrote_string = True

        # write the quoted value; embedded " are escaped as ""
        quoted = '"{0}"'.format(NlpJsonStringProcessor(entity))
        f.write(quoted.encode(encoding, errors='ignore'))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Reference in New Issue
	
	Block a user