refactor: update project
- add documentation CMake build script. re-organise document layout for future changes. - move LIBCMO_EXPORT to BMap and rename it to BMAP_EXPORT because only BMap needs to use this macro. - fully refactor VTEncoding to make it more like Python - Now language names are platform independent. - Hide implementation details as much as possible. - Language mappings are still a work in progress. - add code gen for the newly added universal encoding feature to generate language name mappings for Windows and Iconv respectively. - remove old code of CMake build script. - update VTUtils for new requirements. - remove useless functions. - create LibCmo-specific custom exception classes.
This commit is contained in:
2
CodeGen/UniversalEncoding/.gitignore
vendored
Normal file
2
CodeGen/UniversalEncoding/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# Result
|
||||
*.cpp
|
98
CodeGen/UniversalEncoding/EncodingTable.csv
Normal file
98
CodeGen/UniversalEncoding/EncodingTable.csv
Normal file
@ -0,0 +1,98 @@
|
||||
Encoding Alias Code Page Iconv Identifier
|
||||
ascii 646, us-ascii ASCII
|
||||
big5 big5-tw, csbig5 950 BIG5
|
||||
big5hkscs big5-hkscs, hkscs BIG5-HKSCS
|
||||
cp037 IBM037, IBM039 037
|
||||
cp273 273, IBM273, csIBM273
|
||||
cp424 EBCDIC-CP-HE, IBM424
|
||||
cp437 437, IBM437 437
|
||||
cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 500
|
||||
cp720 720
|
||||
cp737 737
|
||||
cp775 IBM775 775
|
||||
cp850 850, IBM850 850 CP850
|
||||
cp852 852, IBM852 852
|
||||
cp855 855, IBM855 855
|
||||
cp856
|
||||
cp857 857, IBM857 857
|
||||
cp858 858, IBM858 858
|
||||
cp860 860, IBM860 860
|
||||
cp861 861, CP-IS, IBM861 861
|
||||
cp862 862, IBM862 862 CP862
|
||||
cp863 863, IBM863 863
|
||||
cp864 IBM864 864
|
||||
cp865 865, IBM865 865
|
||||
cp866 866, IBM866 866 CP866
|
||||
cp869 869, CP-GR, IBM869 869
|
||||
cp874 874 CP874
|
||||
cp875 875
|
||||
cp932 932, ms932, mskanji, ms-kanji, windows-31j 932 CP932
|
||||
cp949 949, ms949, uhc 949 CP949
|
||||
cp950 950, ms950 950 CP950
|
||||
cp1006
|
||||
cp1026 ibm1026 1026
|
||||
cp1125 1125, ibm1125, cp866u, ruscii
|
||||
cp1140 ibm1140 1140
|
||||
cp1250 windows-1250 1250 CP1250
|
||||
cp1251 windows-1251 1251 CP1251
|
||||
cp1252 windows-1252 1252 CP1252
|
||||
cp1253 windows-1253 1253 CP1253
|
||||
cp1254 windows-1254 1254 CP1254
|
||||
cp1255 windows-1255 1255 CP1255
|
||||
cp1256 windows-1256 1256 CP1256
|
||||
cp1257 windows-1257 1257 CP1257
|
||||
cp1258 windows-1258 1258 CP1258
|
||||
euc_jp eucjp, ujis, u-jis EUC-JP
|
||||
euc_jis_2004 jisx0213, eucjis2004
|
||||
euc_jisx0213 eucjisx0213
|
||||
euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 51949 EUC-KR
|
||||
gb2312 chinese, csiso58gb231280, euc-cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, iso-ir-58
|
||||
gbk 936, cp936, ms936 936 CP936
|
||||
gb18030 gb18030-2000 54936 GB18030
|
||||
hz hzgb, hz-gb, hz-gb-2312 52936 HZ
|
||||
iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp ISO-2022-JP
|
||||
iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 ISO-2022-JP-1
|
||||
iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 ISO-2022-JP-2
|
||||
iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004
|
||||
iso2022_jp_3 iso2022jp-3, iso-2022-jp-3
|
||||
iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext
|
||||
iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr 50225 ISO-2022-KR
|
||||
latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 28591 ISO-8859-1
|
||||
iso8859_2 iso-8859-2, latin2, L2 28592 ISO-8859-2
|
||||
iso8859_3 iso-8859-3, latin3, L3 28593 ISO-8859-3
|
||||
iso8859_4 iso-8859-4, latin4, L4 28594 ISO-8859-4
|
||||
iso8859_5 iso-8859-5, cyrillic 28595 ISO-8859-5
|
||||
iso8859_6 iso-8859-6, arabic 28596 ISO-8859-6
|
||||
iso8859_7 iso-8859-7, greek, greek8 28597 ISO-8859-7
|
||||
iso8859_8 iso-8859-8, hebrew 28598 ISO-8859-8
|
||||
iso8859_9 iso-8859-9, latin5, L5 28599 ISO-8859-9
|
||||
iso8859_10 iso-8859-10, latin6, L6 ISO-8859-10
|
||||
iso8859_11 iso-8859-11, thai ISO-8859-11
|
||||
iso8859_13 iso-8859-13, latin7, L7 28603 ISO-8859-13
|
||||
iso8859_14 iso-8859-14, latin8, L8 ISO-8859-14
|
||||
iso8859_15 iso-8859-15, latin9, L9 28605 ISO-8859-15
|
||||
iso8859_16 iso-8859-16, latin10, L10 ISO-8859-16
|
||||
johab cp1361, ms1361 1361 JOHAB
|
||||
koi8_r
|
||||
koi8_t KOI8-T
|
||||
koi8_u
|
||||
kz1048 kz_1048, strk1048_2002, rk1048
|
||||
mac_cyrillic maccyrillic 10007 MacCyrillic
|
||||
mac_greek macgreek 10006 MacGreek
|
||||
mac_iceland maciceland 10079 MacIceland
|
||||
mac_latin2 maclatin2, maccentraleurope, mac_centeuro
|
||||
mac_roman macroman, macintosh MacRoman
|
||||
mac_turkish macturkish 10081 MacTurkish
|
||||
ptcp154 csptcp154, pt154, cp154, cyrillic-asian PT154
|
||||
shift_jis csshiftjis, shiftjis, sjis, s_jis 932 SHIFT_JIS
|
||||
shift_jis_2004 shiftjis2004, sjis_2004, sjis2004
|
||||
shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213
|
||||
utf_32 U32, utf32 UTF-32
|
||||
utf_32_be UTF-32BE UTF-32BE
|
||||
utf_32_le UTF-32LE UTF-32LE
|
||||
utf_16 U16, utf16 UTF16
|
||||
utf_16_be UTF-16BE UTF-16BE
|
||||
utf_16_le UTF-16LE UTF-16LE
|
||||
utf_7 U7, unicode-1-1-utf-7 65000 UTF-7
|
||||
utf_8 U8, UTF, utf8, cp65001 65001 UTF-8
|
||||
utf_8_sig
|
|
57
CodeGen/UniversalEncoding/UniversalEncoding.py
Normal file
57
CodeGen/UniversalEncoding/UniversalEncoding.py
Normal file
@ -0,0 +1,57 @@
|
||||
import typing
|
||||
import io
|
||||
|
||||
class LanguageToken:
|
||||
m_Name: str
|
||||
m_Alias: tuple[str, ...]
|
||||
m_CodePage: str | None
|
||||
m_IconvCode: str | None
|
||||
|
||||
def __init__(self, name: str, alias: typing.Iterator[str], code_page: str, iconv_code: str):
|
||||
self.m_Name = name.lower()
|
||||
self.m_Alias = tuple(map(lambda x: x.lower(), alias))
|
||||
self.m_CodePage = None if code_page == '' else code_page
|
||||
self.m_IconvCode = None if iconv_code == '' else iconv_code
|
||||
|
||||
def extract_data(fs: io.TextIOWrapper) -> tuple[str, ...]:
    """Read all data rows of the encoding table, dropping the header line.

    :param fs: Readable text stream positioned at the start of the table.
    :return: Every line after the first, with line terminators kept, as a
        tuple. The result is empty when the stream holds only the header or
        nothing at all.
    """
    # The first line is the column header, not a record. Convert to a tuple
    # so the returned value actually matches the declared return type.
    return tuple(fs.readlines()[1:])
|
||||
|
||||
def extract_token(csv_data: tuple[str, ...]) -> tuple[LanguageToken, ...]:
    """Parse tab-separated table rows into LanguageToken records.

    Each row is expected to hold four tab-separated columns: name, a
    comma-separated alias list, code page and iconv identifier.

    :param csv_data: Table rows without the header line; a trailing newline
        on each row is tolerated.
    :return: One LanguageToken per non-blank row, in input order.
    """
    column_count = 4
    ret: list[LanguageToken] = []
    for line in csv_data:
        line = line.strip('\n')
        # Rows that are empty or whitespace-only (for example a blank line at
        # the end of the file) carry no record and must not abort the run.
        if line.strip() == '':
            continue
        fields = line.split('\t')
        # Tools that trim trailing whitespace remove the tabs of empty
        # trailing columns; treat the missing columns as empty strings.
        fields.extend([''] * (column_count - len(fields)))
        name, alias_field, code_page, iconv_code = fields[:column_count]
        # Aliases are comma-separated; drop surrounding blanks and empty items.
        alias_sp = filter(None, (item.strip() for item in alias_field.split(',')))
        ret.append(LanguageToken(name, alias_sp, code_page, iconv_code))
    return tuple(ret)
|
||||
|
||||
def write_alias_map(fs: io.TextIOWrapper, data: tuple[LanguageToken, ...]) -> None:
    """Write the C++ ``c_AliasMap`` initializer mapping each alias to its canonical name.

    :param fs: Writable text stream receiving the generated C++ source.
    :param data: Tokens whose ``m_Alias`` entries become map keys and whose
        ``m_Name`` becomes the mapped value.
    """
    fs.write('static const std::map<std::u8string, std::u8string> c_AliasMap {\n')
    for token in data:
        canonical = token.m_Name
        # One initializer entry per alias, all pointing at the same canonical name.
        fs.writelines(
            f'\t{{ u8"{alias}", u8"{canonical}" }},\n' for alias in token.m_Alias
        )
    fs.write('};\n')
|
||||
|
||||
def write_win_cp_map(fs: io.TextIOWrapper, data: tuple[LanguageToken, ...]) -> None:
    """Write the C++ ``c_WinCPMap`` initializer mapping canonical names to code pages.

    Tokens whose ``m_CodePage`` is None are left out of the map.

    :param fs: Writable text stream receiving the generated C++ source.
    :param data: Tokens supplying ``m_Name`` (key) and ``m_CodePage`` (value).
    :raises ValueError: If a code page is present but is not a decimal integer.
    """
    fs.write('static const std::map<std::u8string, UINT> c_WinCPMap {\n')
    for token in data:
        if token.m_CodePage is None:
            continue
        # The table may write a code page with a leading zero (e.g. "037").
        # Pasted verbatim, C++ would read "037u" as an octal literal (31), so
        # parse it as decimal and emit the number without leading zeros.
        code_page = int(token.m_CodePage, 10)
        fs.write(f'\t{{ u8"{token.m_Name}", static_cast<UINT>({code_page}u) }},\n')
    fs.write('};\n')
|
||||
|
||||
def write_iconv_map(fs: io.TextIOWrapper, data: tuple[LanguageToken, ...]) -> None:
    """Write the C++ ``c_IconvMap`` initializer mapping canonical names to iconv identifiers.

    Tokens whose ``m_IconvCode`` is None are left out of the map.

    :param fs: Writable text stream receiving the generated C++ source.
    :param data: Tokens supplying ``m_Name`` (key) and ``m_IconvCode`` (value).
    """
    fs.write('static const std::map<std::u8string, std::string> c_IconvMap {\n')
    for token in data:
        iconv_code = token.m_IconvCode
        if iconv_code is None:
            # No iconv identifier recorded for this encoding.
            continue
        fs.write(f'\t{{ u8"{token.m_Name}", "{iconv_code}" }},\n')
    fs.write('};\n')
|
||||
|
||||
if __name__ == '__main__':
    # Read the encoding table and write all three generated C++ maps into one
    # source file. Both paths are resolved against the current working directory.
    with (
        open('EncodingTable.csv', 'r', encoding='utf-8') as table_file,
        open('UEncodingTable.cpp', 'w', encoding='utf-8') as cpp_file,
    ):
        tokens = extract_token(extract_data(table_file))
        # Output order: alias map, then code page map, then iconv map.
        for emit in (write_alias_map, write_win_cp_map, write_iconv_map):
            emit(cpp_file, tokens)
|
Reference in New Issue
Block a user