refactor: update project

- add documentation CMake build script. re-organise document layout for future changes.
- move LIBCMO_EXPORT to BMap and rename it to BMAP_EXPORT because only BMap need to use this macro.
- fully refactor VTEncoding to make it more like Python
	- Now language name is platform independent.
	- Hide implementation detail as possible as I can.
	- Language mapping are still work in progress.
- add code gen for new added universal encoding feature to generate language name mapping in Windows and Iconv respectively.
- remove old code of CMake build script.
- update VTUtils for new requirement.
	- remove useless functions.
	- create LibCmo specific custom exception classes.
This commit is contained in:
2024-08-16 22:07:23 +08:00
parent afa06339b2
commit f870d4dde3
20 changed files with 1013 additions and 726 deletions

2
CodeGen/UniversalEncoding/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Result
*.cpp

View File

@ -0,0 +1,98 @@
Encoding Alias Code Page Iconv Identifier
ascii 646, us-ascii ASCII
big5 big5-tw, csbig5 950 BIG5
big5hkscs big5-hkscs, hkscs BIG5-HKSCS
cp037 IBM037, IBM039 037
cp273 273, IBM273, csIBM273
cp424 EBCDIC-CP-HE, IBM424
cp437 437, IBM437 437
cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 500
cp720 720
cp737 737
cp775 IBM775 775
cp850 850, IBM850 850 CP850
cp852 852, IBM852 852
cp855 855, IBM855 855
cp856
cp857 857, IBM857 857
cp858 858, IBM858 858
cp860 860, IBM860 860
cp861 861, CP-IS, IBM861 861
cp862 862, IBM862 862 CP862
cp863 863, IBM863 863
cp864 IBM864 864
cp865 865, IBM865 865
cp866 866, IBM866 866 CP866
cp869 869, CP-GR, IBM869 869
cp874 874 CP874
cp875 875
cp932 932, ms932, mskanji, ms-kanji, windows-31j 932 CP932
cp949 949, ms949, uhc 949 CP949
cp950 950, ms950 950 CP950
cp1006
cp1026 ibm1026 1026
cp1125 1125, ibm1125, cp866u, ruscii
cp1140 ibm1140 1140
cp1250 windows-1250 1250 CP1250
cp1251 windows-1251 1251 CP1251
cp1252 windows-1252 1252 CP1252
cp1253 windows-1253 1253 CP1253
cp1254 windows-1254 1254 CP1254
cp1255 windows-1255 1255 CP1255
cp1256 windows-1256 1256 CP1256
cp1257 windows-1257 1257 CP1257
cp1258 windows-1258 1258 CP1258
euc_jp eucjp, ujis, u-jis EUC-JP
euc_jis_2004 jisx0213, eucjis2004
euc_jisx0213 eucjisx0213
euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 51949 EUC-KR
gb2312 chinese, csiso58gb231280, euc-cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, iso-ir-58
gbk 936, cp936, ms936 936 CP936
gb18030 gb18030-2000 54936 GB18030
hz hzgb, hz-gb, hz-gb-2312 52936 HZ
iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp ISO-2022-JP
iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 ISO-2022-JP-1
iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 ISO-2022-JP-2
iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004
iso2022_jp_3 iso2022jp-3, iso-2022-jp-3
iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext
iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr 50225 ISO-2022-KR
latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 28591 ISO-8859-1
iso8859_2 iso-8859-2, latin2, L2 28592 ISO-8859-2
iso8859_3 iso-8859-3, latin3, L3 28593 ISO-8859-3
iso8859_4 iso-8859-4, latin4, L4 28594 ISO-8859-4
iso8859_5 iso-8859-5, cyrillic 28595 ISO-8859-5
iso8859_6 iso-8859-6, arabic 28596 ISO-8859-6
iso8859_7 iso-8859-7, greek, greek8 28597 ISO-8859-7
iso8859_8 iso-8859-8, hebrew 28598 ISO-8859-8
iso8859_9 iso-8859-9, latin5, L5 28599 ISO-8859-9
iso8859_10 iso-8859-10, latin6, L6 ISO-8859-10
iso8859_11 iso-8859-11, thai ISO-8859-11
iso8859_13 iso-8859-13, latin7, L7 28603 ISO-8859-13
iso8859_14 iso-8859-14, latin8, L8 ISO-8859-14
iso8859_15 iso-8859-15, latin9, L9 28605 ISO-8859-15
iso8859_16 iso-8859-16, latin10, L10 ISO-8859-16
johab cp1361, ms1361 1361 JOHAB
koi8_r
koi8_t KOI8-T
koi8_u
kz1048 kz_1048, strk1048_2002, rk1048
mac_cyrillic maccyrillic 10007 MacCyrillic
mac_greek macgreek 10006 MacGreek
mac_iceland maciceland 10079 MacIceland
mac_latin2 maclatin2, maccentraleurope, mac_centeuro
mac_roman macroman, macintosh MacRoman
mac_turkish macturkish 10081 MacTurkish
ptcp154 csptcp154, pt154, cp154, cyrillic-asian PT154
shift_jis csshiftjis, shiftjis, sjis, s_jis 932 SHIFT_JIS
shift_jis_2004 shiftjis2004, sjis_2004, sjis2004
shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213
utf_32 U32, utf32 UTF-32
utf_32_be UTF-32BE UTF-32BE
utf_32_le UTF-32LE UTF-32LE
utf_16 U16, utf16 UTF16
utf_16_be UTF-16BE UTF-16BE
utf_16_le UTF-16LE UTF-16LE
utf_7 U7, unicode-1-1-utf-7 65000 UTF-7
utf_8 U8, UTF, utf8, cp65001 65001 UTF-8
utf_8_sig
1 Encoding Alias Code Page Iconv Identifier
2 ascii 646, us-ascii ASCII
3 big5 big5-tw, csbig5 950 BIG5
4 big5hkscs big5-hkscs, hkscs BIG5-HKSCS
5 cp037 IBM037, IBM039 037
6 cp273 273, IBM273, csIBM273
7 cp424 EBCDIC-CP-HE, IBM424
8 cp437 437, IBM437 437
9 cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 500
10 cp720 720
11 cp737 737
12 cp775 IBM775 775
13 cp850 850, IBM850 850 CP850
14 cp852 852, IBM852 852
15 cp855 855, IBM855 855
16 cp856
17 cp857 857, IBM857 857
18 cp858 858, IBM858 858
19 cp860 860, IBM860 860
20 cp861 861, CP-IS, IBM861 861
21 cp862 862, IBM862 862 CP862
22 cp863 863, IBM863 863
23 cp864 IBM864 864
24 cp865 865, IBM865 865
25 cp866 866, IBM866 866 CP866
26 cp869 869, CP-GR, IBM869 869
27 cp874 874 CP874
28 cp875 875
29 cp932 932, ms932, mskanji, ms-kanji, windows-31j 932 CP932
30 cp949 949, ms949, uhc 949 CP949
31 cp950 950, ms950 950 CP950
32 cp1006
33 cp1026 ibm1026 1026
34 cp1125 1125, ibm1125, cp866u, ruscii
35 cp1140 ibm1140 1140
36 cp1250 windows-1250 1250 CP1250
37 cp1251 windows-1251 1251 CP1251
38 cp1252 windows-1252 1252 CP1252
39 cp1253 windows-1253 1253 CP1253
40 cp1254 windows-1254 1254 CP1254
41 cp1255 windows-1255 1255 CP1255
42 cp1256 windows-1256 1256 CP1256
43 cp1257 windows-1257 1257 CP1257
44 cp1258 windows-1258 1258 CP1258
45 euc_jp eucjp, ujis, u-jis EUC-JP
46 euc_jis_2004 jisx0213, eucjis2004
47 euc_jisx0213 eucjisx0213
48 euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 51949 EUC-KR
49 gb2312 chinese, csiso58gb231280, euc-cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, iso-ir-58
50 gbk 936, cp936, ms936 936 CP936
51 gb18030 gb18030-2000 54936 GB18030
52 hz hzgb, hz-gb, hz-gb-2312 52936 HZ
53 iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp ISO-2022-JP
54 iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 ISO-2022-JP-1
55 iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 ISO-2022-JP-2
56 iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004
57 iso2022_jp_3 iso2022jp-3, iso-2022-jp-3
58 iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext
59 iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr 50225 ISO-2022-KR
60 latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 28591 ISO-8859-1
61 iso8859_2 iso-8859-2, latin2, L2 28592 ISO-8859-2
62 iso8859_3 iso-8859-3, latin3, L3 28593 ISO-8859-3
63 iso8859_4 iso-8859-4, latin4, L4 28594 ISO-8859-4
64 iso8859_5 iso-8859-5, cyrillic 28595 ISO-8859-5
65 iso8859_6 iso-8859-6, arabic 28596 ISO-8859-6
66 iso8859_7 iso-8859-7, greek, greek8 28597 ISO-8859-7
67 iso8859_8 iso-8859-8, hebrew 28598 ISO-8859-8
68 iso8859_9 iso-8859-9, latin5, L5 28599 ISO-8859-9
69 iso8859_10 iso-8859-10, latin6, L6 ISO-8859-10
70 iso8859_11 iso-8859-11, thai ISO-8859-11
71 iso8859_13 iso-8859-13, latin7, L7 28603 ISO-8859-13
72 iso8859_14 iso-8859-14, latin8, L8 ISO-8859-14
73 iso8859_15 iso-8859-15, latin9, L9 28605 ISO-8859-15
74 iso8859_16 iso-8859-16, latin10, L10 ISO-8859-16
75 johab cp1361, ms1361 1361 JOHAB
76 koi8_r
77 koi8_t KOI8-T
78 koi8_u
79 kz1048 kz_1048, strk1048_2002, rk1048
80 mac_cyrillic maccyrillic 10007 MacCyrillic
81 mac_greek macgreek 10006 MacGreek
82 mac_iceland maciceland 10079 MacIceland
83 mac_latin2 maclatin2, maccentraleurope, mac_centeuro
84 mac_roman macroman, macintosh MacRoman
85 mac_turkish macturkish 10081 MacTurkish
86 ptcp154 csptcp154, pt154, cp154, cyrillic-asian PT154
87 shift_jis csshiftjis, shiftjis, sjis, s_jis 932 SHIFT_JIS
88 shift_jis_2004 shiftjis2004, sjis_2004, sjis2004
89 shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213
90 utf_32 U32, utf32 UTF-32
91 utf_32_be UTF-32BE UTF-32BE
92 utf_32_le UTF-32LE UTF-32LE
93 utf_16 U16, utf16 UTF16
94 utf_16_be UTF-16BE UTF-16BE
95 utf_16_le UTF-16LE UTF-16LE
96 utf_7 U7, unicode-1-1-utf-7 65000 UTF-7
97 utf_8 U8, UTF, utf8, cp65001 65001 UTF-8
98 utf_8_sig

View File

@ -0,0 +1,57 @@
import typing
import io
class LanguageToken:
m_Name: str
m_Alias: tuple[str, ...]
m_CodePage: str | None
m_IconvCode: str | None
def __init__(self, name: str, alias: typing.Iterator[str], code_page: str, iconv_code: str):
self.m_Name = name.lower()
self.m_Alias = tuple(map(lambda x: x.lower(), alias))
self.m_CodePage = None if code_page == '' else code_page
self.m_IconvCode = None if iconv_code == '' else iconv_code
def extract_data(fs: io.TextIOWrapper) -> tuple[str, ...]:
# remove first line to remove table header
return fs.readlines()[1:]
def extract_token(csv_data: tuple[str, ...]) -> tuple[LanguageToken, ...]:
ret: list[LanguageToken] = []
for line in csv_data:
line = line.strip('\n')
line_sp = line.split('\t')
alias_sp = filter(lambda x: x != '', map(lambda x: x.strip(), line_sp[1].split(',')))
ret.append(LanguageToken(line_sp[0], alias_sp, line_sp[2], line_sp[3]))
return tuple(ret)
def write_alias_map(fs: io.TextIOWrapper, data: tuple[LanguageToken, ...]) -> None:
fs.write('static const std::map<std::u8string, std::u8string> c_AliasMap {\n')
for i in data:
for j in i.m_Alias:
fs.write(f'\t{{ u8"{j}", u8"{i.m_Name}" }},\n')
fs.write('};\n')
def write_win_cp_map(fs: io.TextIOWrapper, data: tuple[LanguageToken, ...]) -> None:
fs.write('static const std::map<std::u8string, UINT> c_WinCPMap {\n')
for i in data:
if i.m_CodePage is not None:
fs.write(f'\t{{ u8"{i.m_Name}", static_cast<UINT>({i.m_CodePage}u) }},\n')
fs.write('};\n')
def write_iconv_map(fs: io.TextIOWrapper, data: tuple[LanguageToken, ...]) -> None:
fs.write('static const std::map<std::u8string, std::string> c_IconvMap {\n')
for i in data:
if i.m_IconvCode is not None:
fs.write(f'\t{{ u8"{i.m_Name}", "{i.m_IconvCode}" }},\n')
fs.write('};\n')
if __name__ == '__main__':
with open('EncodingTable.csv', 'r', encoding='utf-8') as fr:
with open('UEncodingTable.cpp', 'w', encoding='utf-8') as fw:
data = extract_data(fr)
token = extract_token(data)
write_alias_map(fw, token)
write_win_cp_map(fw, token)
write_iconv_map(fw, token)

View File

@ -1,3 +1,2 @@
# Result
*.hpp