refactor: continue refactor to make the project can be built
This commit is contained in:
7
script/pycodec/README.md
Normal file
7
script/pycodec/README.md
Normal file
@ -0,0 +1,7 @@
|
||||
# PyCodec
|
||||
|
||||
This directory contains everything related to PyCodec.
|
||||
|
||||
PyCodec uses a different encoding backend on each OS: on Windows it uses Win32 functions, and on other systems it uses Iconv. So we need a table that converts PyCodec's universal encoding names to Windows Code Pages or Iconv Code Names. These relations are stored in a CSV file, and a Python script renders them into C++ source code.
|
||||
|
||||
For the format of the CSV file, each line is a record. The first item in a record is the standard PyCodec name. The second item is the corresponding Windows Code Page; if there is no corresponding Code Page, it can be empty. The third item is the corresponding Iconv Code Name; it can likewise be empty when there is none. From the fourth item onward (inclusive), the number of remaining columns is variable, and each of them is an alias of this standard PyCodec name.
|
@ -1,6 +1,7 @@
|
||||
import typing
|
||||
import csv
|
||||
from pathlib import Path
|
||||
import os
|
||||
import jinja2
|
||||
|
||||
|
||||
class LanguageToken:
|
||||
name: str
|
||||
@ -8,56 +9,46 @@ class LanguageToken:
|
||||
code_page: str | None
|
||||
iconv_code: str | None
|
||||
|
||||
def __init__(self, name: str, alias: typing.Iterator[str], code_page: str, iconv_code: str):
|
||||
self.name = name.lower()
|
||||
self.alias = tuple(map(lambda x: x.lower(), alias))
|
||||
self.code_page = None if code_page == '' else code_page
|
||||
self.iconv_code = None if iconv_code == '' else iconv_code
|
||||
def __init__(self, row: list[str]):
|
||||
"""Init language token from CSV row."""
|
||||
self.name = row[0].lower()
|
||||
code_page = row[1]
|
||||
self.code_page = None if len(code_page) == 0 else code_page
|
||||
iconv_code = row[2]
|
||||
self.iconv_code = None if len(iconv_code) == 0 else iconv_code
|
||||
# For alias, we strip and to lower them first, and remove all empty entries
|
||||
alias = row[3:]
|
||||
self.alias = tuple(
|
||||
filter(lambda x: len(x) != 0,
|
||||
map(lambda x: x.strip().lower(), alias)))
|
||||
|
||||
def extract_data(fs: typing.TextIO) -> list[str]:
    """Read all lines from ``fs`` and return them without the first one.

    The first line is the table header, so it is discarded. The remaining
    lines are returned untouched, line endings included.
    """
    lines = fs.readlines()
    # Drop the table header; a no-op when the stream was empty.
    del lines[:1]
    return lines
|
||||
|
||||
def extract_token(csv_data: list[str]) -> tuple[LanguageToken, ...]:
    """Parse tab-separated table lines into language tokens.

    Each line is read as four tab-separated fields: encoding name,
    comma-separated alias list, Windows code page and Iconv name.
    Missing trailing fields are treated as empty and blank lines are skipped.

    :param csv_data: Table lines without the header line.
    :return: One token per non-blank line, in input order.
    """
    ret: list[LanguageToken] = []
    for line in csv_data:
        line = line.strip('\n')
        if not line.strip():
            # Nothing to parse on a blank line.
            continue

        fields = line.split('\t')
        # Pad short lines so unpacking cannot fail on a missing column.
        fields += [''] * (4 - len(fields))
        name, alias_field, code_page, iconv_code = fields[:4]

        # LanguageToken is built from a single row laid out as name, code
        # page, Iconv name and then the aliases; it strips the aliases and
        # drops the empty ones itself.
        ret.append(LanguageToken(
            [name, code_page, iconv_code, *alias_field.split(',')]))
    return tuple(ret)
|
||||
def _get_self_dir() -> Path:
    """Return the resolved directory that contains this script file."""
    script_path = Path(__file__).resolve()
    return script_path.parent
|
||||
|
||||
def write_alias_map(fs: typing.TextIO, data: tuple[LanguageToken, ...]) -> None:
    """Write the C++ ``ALISA_MAP`` initializer to ``fs``.

    One entry is emitted per alias of every token, pairing the alias with
    the name of the token it belongs to.
    """
    fs.write('static const std::map<NS_YYCC_STRING::u8string, NS_YYCC_STRING::u8string> ALISA_MAP {\n')
    for token in data:
        fs.writelines(
            f'\t{{ YYCC_U8("{alias}"), YYCC_U8("{token.name}") }},\n'
            for alias in token.alias
        )
    fs.write('};\n')
|
||||
|
||||
def write_win_cp_map(fs: typing.TextIO, data: tuple[LanguageToken, ...]) -> None:
    """Write the C++ ``WINCP_MAP`` initializer to ``fs``.

    Only tokens whose ``code_page`` is not ``None`` produce an entry; the
    code page text is emitted as an unsigned literal cast to ``CodePage``.
    """
    fs.write('static const std::map<NS_YYCC_STRING::u8string, CodePage> WINCP_MAP {\n')
    for token in data:
        if token.code_page is None:
            # No code page recorded for this token, so it gets no entry.
            continue
        fs.write(f'\t{{ YYCC_U8("{token.name}"), static_cast<CodePage>({token.code_page}u) }},\n')
    fs.write('};\n')
|
||||
def _extract_tokens() -> list[LanguageToken]:
    """Load every record of ``encoding_table.csv`` as a language token.

    The table is read from the directory of this script as UTF-8,
    tab-delimited text. Rows whose cells are all blank are skipped.

    :return: One token per non-blank row, in file order.
    """
    csv_file = _get_self_dir() / 'encoding_table.csv'

    # newline='' leaves line-ending handling to the csv module.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # csv.reader yields an empty list for a blank line, which has no
        # name cell and cannot be turned into a token.
        return [
            LanguageToken(row)
            for row in reader
            if any(cell.strip() for cell in row)
        ]
|
||||
|
||||
|
||||
def _render_cpp(tokens: list[LanguageToken]) -> None:
    """Render the Jinja template with ``tokens`` and write the C++ file.

    The template ``encoding_table.cpp.jinja`` is loaded from the directory
    of this script, and the output ``encoding_table.cpp`` is written to the
    same directory as UTF-8.
    """
    self_dir = _get_self_dir()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(self_dir))
    cpp_template = env.get_template('encoding_table.cpp.jinja')

    with open(self_dir / 'encoding_table.cpp', 'w', encoding='utf-8') as out:
        out.write(cpp_template.render(tokens=tokens))
|
||||
|
||||
def write_iconv_map(fs: typing.TextIO, data: tuple[LanguageToken, ...]) -> None:
    """Write the C++ ``ICONV_MAP`` initializer to ``fs``.

    Tokens whose ``iconv_code`` is ``None`` are left out; every other token
    contributes one entry pairing its name with its Iconv code.
    """
    fs.write('static const std::map<NS_YYCC_STRING::u8string, std::string> ICONV_MAP {\n')
    fs.writelines(
        f'\t{{ YYCC_U8("{token.name}"), "{token.iconv_code}" }},\n'
        for token in data
        if token.iconv_code is not None
    )
    fs.write('};\n')
|
||||
|
||||
if __name__ == '__main__':
    # Load the encoding table, then render it into encoding_table.cpp
    # through the Jinja template. This is the only pipeline that runs, so
    # the output file is written exactly once.
    tokens = _extract_tokens()
    _render_cpp(tokens)
|
||||
|
23
script/pycodec/encoding_table.cpp.jinja
Normal file
23
script/pycodec/encoding_table.cpp.jinja
Normal file
@ -0,0 +1,23 @@
|
||||
{# Expects `tokens`: objects exposing name, alias, code_page and iconv_code. -#}
{# ALIAS_MAP: one entry per alias, mapping the alias to its token's name. -#}
static const std::map<std::u8string_view, std::u8string_view> ALIAS_MAP {
{% for token in tokens -%}
{% for alias in token.alias -%}
{ u8"{{ alias }}"sv, u8"{{ token.name }}"sv },
{% endfor -%}
{% endfor -%}
};

{# WINCP_MAP: token name to code page; tokens whose code_page is none are skipped. -#}
static const std::map<std::u8string_view, CodePage> WINCP_MAP {
{% for token in tokens -%}
{% if token.code_page is not none -%}
{ u8"{{ token.name }}"sv, static_cast<CodePage>({{ token.code_page }}u) },
{% endif -%}
{% endfor -%}
};

{# ICONV_MAP: token name to Iconv code; tokens whose iconv_code is none are skipped. -#}
static const std::map<std::u8string_view, std::string_view> ICONV_MAP {
{% for token in tokens -%}
{% if token.iconv_code is not none -%}
{ u8"{{ token.name }}"sv, "{{ token.iconv_code }}"sv },
{% endif -%}
{% endfor -%}
};
|
@ -1,98 +1,97 @@
|
||||
Encoding Alias Code Page Iconv Identifier
|
||||
ascii 646, us-ascii 437 ASCII
|
||||
big5 big5-tw, csbig5 950 BIG5
|
||||
big5hkscs big5-hkscs, hkscs BIG5-HKSCS
|
||||
cp037 IBM037, IBM039 037
|
||||
cp273 273, IBM273, csIBM273
|
||||
cp424 EBCDIC-CP-HE, IBM424
|
||||
cp437 437, IBM437 437
|
||||
cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 500
|
||||
cp720 720
|
||||
cp737 737
|
||||
cp775 IBM775 775
|
||||
cp850 850, IBM850 850 CP850
|
||||
cp852 852, IBM852 852
|
||||
cp855 855, IBM855 855
|
||||
cp856
|
||||
cp857 857, IBM857 857
|
||||
cp858 858, IBM858 858
|
||||
cp860 860, IBM860 860
|
||||
cp861 861, CP-IS, IBM861 861
|
||||
cp862 862, IBM862 862 CP862
|
||||
cp863 863, IBM863 863
|
||||
cp864 IBM864 864
|
||||
cp865 865, IBM865 865
|
||||
cp866 866, IBM866 866 CP866
|
||||
cp869 869, CP-GR, IBM869 869
|
||||
cp874 874 CP874
|
||||
cp875 875
|
||||
cp932 932, ms932, mskanji, ms-kanji, windows-31j 932 CP932
|
||||
cp949 949, ms949, uhc 949 CP949
|
||||
cp950 950, ms950 950 CP950
|
||||
cp1006
|
||||
cp1026 ibm1026 1026
|
||||
cp1125 1125, ibm1125, cp866u, ruscii
|
||||
cp1140 ibm1140 1140
|
||||
cp1250 windows-1250 1250 CP1250
|
||||
cp1251 windows-1251 1251 CP1251
|
||||
cp1252 windows-1252 1252 CP1252
|
||||
cp1253 windows-1253 1253 CP1253
|
||||
cp1254 windows-1254 1254 CP1254
|
||||
cp1255 windows-1255 1255 CP1255
|
||||
cp1256 windows-1256 1256 CP1256
|
||||
cp1257 windows-1257 1257 CP1257
|
||||
cp1258 windows-1258 1258 CP1258
|
||||
euc_jp eucjp, ujis, u-jis 20932 EUC-JP
|
||||
euc_jis_2004 jisx0213, eucjis2004
|
||||
euc_jisx0213 eucjisx0213
|
||||
euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 51949 EUC-KR
|
||||
gb2312 chinese, csiso58gb231280, euc-cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, iso-ir-58 936 CP936
|
||||
gbk 936, cp936, ms936 936 GBK
|
||||
gb18030 gb18030-2000 54936 GB18030
|
||||
hz hzgb, hz-gb, hz-gb-2312 52936 HZ
|
||||
iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp 50220 ISO-2022-JP
|
||||
iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 ISO-2022-JP-1
|
||||
iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 ISO-2022-JP-2
|
||||
iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004
|
||||
iso2022_jp_3 iso2022jp-3, iso-2022-jp-3
|
||||
iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext
|
||||
iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr 50225 ISO-2022-KR
|
||||
latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 28591 ISO-8859-1
|
||||
iso8859_2 iso-8859-2, latin2, L2 28592 ISO-8859-2
|
||||
iso8859_3 iso-8859-3, latin3, L3 28593 ISO-8859-3
|
||||
iso8859_4 iso-8859-4, latin4, L4 28594 ISO-8859-4
|
||||
iso8859_5 iso-8859-5, cyrillic 28595 ISO-8859-5
|
||||
iso8859_6 iso-8859-6, arabic 28596 ISO-8859-6
|
||||
iso8859_7 iso-8859-7, greek, greek8 28597 ISO-8859-7
|
||||
iso8859_8 iso-8859-8, hebrew 28598 ISO-8859-8
|
||||
iso8859_9 iso-8859-9, latin5, L5 28599 ISO-8859-9
|
||||
iso8859_10 iso-8859-10, latin6, L6 ISO-8859-10
|
||||
iso8859_11 iso-8859-11, thai ISO-8859-11
|
||||
iso8859_13 iso-8859-13, latin7, L7 28603 ISO-8859-13
|
||||
iso8859_14 iso-8859-14, latin8, L8 ISO-8859-14
|
||||
iso8859_15 iso-8859-15, latin9, L9 28605 ISO-8859-15
|
||||
iso8859_16 iso-8859-16, latin10, L10 ISO-8859-16
|
||||
johab cp1361, ms1361 1361 JOHAB
|
||||
koi8_r
|
||||
koi8_t KOI8-T
|
||||
koi8_u
|
||||
kz1048 kz_1048, strk1048_2002, rk1048
|
||||
mac_cyrillic maccyrillic 10007 MacCyrillic
|
||||
mac_greek macgreek 10006 MacGreek
|
||||
mac_iceland maciceland 10079 MacIceland
|
||||
mac_latin2 maclatin2, maccentraleurope, mac_centeuro
|
||||
mac_roman macroman, macintosh MacRoman
|
||||
mac_turkish macturkish 10081 MacTurkish
|
||||
ptcp154 csptcp154, pt154, cp154, cyrillic-asian PT154
|
||||
shift_jis csshiftjis, shiftjis, sjis, s_jis 932 SHIFT_JIS
|
||||
shift_jis_2004 shiftjis2004, sjis_2004, sjis2004
|
||||
shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213
|
||||
utf_32 U32, utf32 UTF-32
|
||||
utf_32_be UTF-32BE UTF-32BE
|
||||
utf_32_le UTF-32LE UTF-32LE
|
||||
utf_16 U16, utf16 UTF16
|
||||
utf_16_be UTF-16BE UTF-16BE
|
||||
utf_16_le UTF-16LE UTF-16LE
|
||||
utf_7 U7, unicode-1-1-utf-7 65000 UTF-7
|
||||
utf_8 U8, UTF, utf8, utf-8, cp65001 65001 UTF-8
|
||||
utf_8_sig
|
||||
ascii 437 ASCII 646 us-ascii
|
||||
big5 950 BIG5 big5-tw csbig5
|
||||
big5hkscs BIG5-HKSCS big5-hkscs hkscs
|
||||
cp037 037 IBM037 IBM039
|
||||
cp273 273 IBM273 csIBM273
|
||||
cp424 EBCDIC-CP-HE IBM424
|
||||
cp437 437 437 IBM437
|
||||
cp500 500 EBCDIC-CP-BE EBCDIC-CP-CH IBM500
|
||||
cp720 720
|
||||
cp737 737
|
||||
cp775 775 IBM775
|
||||
cp850 850 CP850 850 IBM850
|
||||
cp852 852 852 IBM852
|
||||
cp855 855 855 IBM855
|
||||
cp856
|
||||
cp857 857 857 IBM857
|
||||
cp858 858 858 IBM858
|
||||
cp860 860 860 IBM860
|
||||
cp861 861 861 CP-IS IBM861
|
||||
cp862 862 CP862 862 IBM862
|
||||
cp863 863 863 IBM863
|
||||
cp864 864 IBM864
|
||||
cp865 865 865 IBM865
|
||||
cp866 866 CP866 866 IBM866
|
||||
cp869 869 869 CP-GR IBM869
|
||||
cp874 874 CP874
|
||||
cp875 875
|
||||
cp932 932 CP932 932 ms932 mskanji ms-kanji windows-31j
|
||||
cp949 949 CP949 949 ms949 uhc
|
||||
cp950 950 CP950 950 ms950
|
||||
cp1006
|
||||
cp1026 1026 ibm1026
|
||||
cp1125 1125 ibm1125 cp866u ruscii
|
||||
cp1140 1140 ibm1140
|
||||
cp1250 1250 CP1250 windows-1250
|
||||
cp1251 1251 CP1251 windows-1251
|
||||
cp1252 1252 CP1252 windows-1252
|
||||
cp1253 1253 CP1253 windows-1253
|
||||
cp1254 1254 CP1254 windows-1254
|
||||
cp1255 1255 CP1255 windows-1255
|
||||
cp1256 1256 CP1256 windows-1256
|
||||
cp1257 1257 CP1257 windows-1257
|
||||
cp1258 1258 CP1258 windows-1258
|
||||
euc_jp 20932 EUC-JP eucjp ujis u-jis
|
||||
euc_jis_2004 jisx0213 eucjis2004
|
||||
euc_jisx0213 eucjisx0213
|
||||
euc_kr 51949 EUC-KR euckr korean ksc5601 ks_c-5601 ks_c-5601-1987 ksx1001 ks_x-1001
|
||||
gb2312 936 CP936 chinese csiso58gb231280 euc-cn euccn eucgb2312-cn gb2312-1980 gb2312-80 iso-ir-58
|
||||
gbk 936 GBK 936 cp936 ms936
|
||||
gb18030 54936 GB18030 gb18030-2000
|
||||
hz 52936 HZ hzgb hz-gb hz-gb-2312
|
||||
iso2022_jp 50220 ISO-2022-JP csiso2022jp iso2022jp iso-2022-jp
|
||||
iso2022_jp_1 ISO-2022-JP-1 iso2022jp-1 iso-2022-jp-1
|
||||
iso2022_jp_2 ISO-2022-JP-2 iso2022jp-2 iso-2022-jp-2
|
||||
iso2022_jp_2004 iso2022jp-2004 iso-2022-jp-2004
|
||||
iso2022_jp_3 iso2022jp-3 iso-2022-jp-3
|
||||
iso2022_jp_ext iso2022jp-ext iso-2022-jp-ext
|
||||
iso2022_kr 50225 ISO-2022-KR csiso2022kr iso2022kr iso-2022-kr
|
||||
latin_1 28591 ISO-8859-1 iso-8859-1 iso8859-1 8859 cp819 latin latin1 L1
|
||||
iso8859_2 28592 ISO-8859-2 iso-8859-2 latin2 L2
|
||||
iso8859_3 28593 ISO-8859-3 iso-8859-3 latin3 L3
|
||||
iso8859_4 28594 ISO-8859-4 iso-8859-4 latin4 L4
|
||||
iso8859_5 28595 ISO-8859-5 iso-8859-5 cyrillic
|
||||
iso8859_6 28596 ISO-8859-6 iso-8859-6 arabic
|
||||
iso8859_7 28597 ISO-8859-7 iso-8859-7 greek greek8
|
||||
iso8859_8 28598 ISO-8859-8 iso-8859-8 hebrew
|
||||
iso8859_9 28599 ISO-8859-9 iso-8859-9 latin5 L5
|
||||
iso8859_10 ISO-8859-10 iso-8859-10 latin6 L6
|
||||
iso8859_11 ISO-8859-11 iso-8859-11 thai
|
||||
iso8859_13 28603 ISO-8859-13 iso-8859-13 latin7 L7
|
||||
iso8859_14 ISO-8859-14 iso-8859-14 latin8 L8
|
||||
iso8859_15 28605 ISO-8859-15 iso-8859-15 latin9 L9
|
||||
iso8859_16 ISO-8859-16 iso-8859-16 latin10 L10
|
||||
johab 1361 JOHAB cp1361 ms1361
|
||||
koi8_r
|
||||
koi8_t KOI8-T
|
||||
koi8_u
|
||||
kz1048 kz_1048 strk1048_2002 rk1048
|
||||
mac_cyrillic 10007 MacCyrillic maccyrillic
|
||||
mac_greek 10006 MacGreek macgreek
|
||||
mac_iceland 10079 MacIceland maciceland
|
||||
mac_latin2 maclatin2 maccentraleurope mac_centeuro
|
||||
mac_roman MacRoman macroman macintosh
|
||||
mac_turkish 10081 MacTurkish macturkish
|
||||
ptcp154 PT154 csptcp154 pt154 cp154 cyrillic-asian
|
||||
shift_jis 932 SHIFT_JIS csshiftjis shiftjis sjis s_jis
|
||||
shift_jis_2004 shiftjis2004 sjis_2004 sjis2004
|
||||
shift_jisx0213 shiftjisx0213 sjisx0213 s_jisx0213
|
||||
utf_32 UTF-32 U32 utf32
|
||||
utf_32_be UTF-32BE UTF-32BE
|
||||
utf_32_le UTF-32LE UTF-32LE
|
||||
utf_16 UTF16 U16 utf16
|
||||
utf_16_be UTF-16BE UTF-16BE
|
||||
utf_16_le UTF-16LE UTF-16LE
|
||||
utf_7 65000 UTF-7 U7 unicode-1-1-utf-7
|
||||
utf_8 65001 UTF-8 U8 UTF utf8 utf-8 cp65001
|
||||
utf_8_sig
|
||||
|
|
Reference in New Issue
Block a user