refactor: merge multiple projects into one and create a new project
This commit is contained in:
199
dl-exp/exp3/source/dataset.py
Normal file
199
dl-exp/exp3/source/dataset.py
Normal file
@@ -0,0 +1,199 @@
|
||||
#ANLI College of Artificial Intelligence
|
||||
|
||||
from collections import Counter
|
||||
import math
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import settings
|
||||
|
||||
|
||||
class Tokenizer:
    """
    Character-level tokenizer backed by a fixed token -> id vocabulary.

    The vocabulary is expected to contain the special tokens ``[UNK]``,
    ``[CLS]`` and ``[SEP]``: ``[UNK]`` is the fallback for unknown tokens and
    ``[CLS]`` / ``[SEP]`` are the start / end markers added by ``encode``.
    """

    def __init__(self, token_dict):
        """
        :param token_dict: mapping from token to integer id
        """
        # token -> id
        self.token_dict = token_dict
        # id -> token
        self.token_dict_rev = {value: key for key, value in self.token_dict.items()}
        # Vocabulary size
        self.vocab_size = len(self.token_dict)

    def id_to_token(self, token_id):
        """
        Look up the token that belongs to an id.

        :param token_id: id of the token to look up
        :return: the token stored under that id
        :raises KeyError: if the id is not part of the vocabulary
        """
        return self.token_dict_rev[token_id]

    def token_to_id(self, token):
        """
        Look up the id of a token, falling back to the id of ``[UNK]``.

        :param token: token to look up
        :return: id of the token, or the id of ``[UNK]`` for unknown tokens
        :raises KeyError: if the token is unknown and the vocabulary has no ``[UNK]``
        """
        # Resolve [UNK] lazily: a known token must not fail just because the
        # vocabulary happens to lack an [UNK] entry.
        try:
            return self.token_dict[token]
        except KeyError:
            return self.token_dict['[UNK]']

    def encode(self, tokens):
        """
        Convert a string into ids, wrapped in the start and end markers.

        :param tokens: string (or any iterable of tokens) to encode
        :return: list of ids: ``[CLS]``, one id per token, ``[SEP]``
        """
        token_ids = [self.token_to_id('[CLS]')]
        token_ids.extend(self.token_to_id(token) for token in tokens)
        token_ids.append(self.token_to_id('[SEP]'))
        return token_ids

    def decode(self, token_ids):
        """
        Convert a sequence of ids back into a string.

        :param token_ids: ids to decode
        :return: decoded string with the ``[CLS]`` / ``[SEP]`` markers removed
        :raises KeyError: if an id is not part of the vocabulary
        """
        # Only the start/end markers are dropped; every other token
        # (including [PAD] and [UNK]) is emitted verbatim.
        spec_tokens = {'[CLS]', '[SEP]'}
        tokens = (self.id_to_token(token_id) for token_id in token_ids)
        return ''.join(token for token in tokens if token not in spec_tokens)
|
||||
|
||||
|
||||
# Tokens that disqualify a poem when they occur in its body
disallowed_words = settings.DISALLOWED_WORDS
# Maximum sequence length, counting the [CLS] and [SEP] markers
max_len = settings.MAX_LEN
# Minimum number of occurrences for a character to get its own id
min_word_frequency = settings.MIN_WORD_FREQUENCY
# Mini-batch size
batch_size = settings.BATCH_SIZE

# Load the corpus; each usable line has the form "<title>:<body>".
with open(settings.DATASET_PATH, 'r', encoding='utf-8') as f:
    # Unify the title separator: map the full-width colon (U+FF1A) to the
    # ASCII colon. Written as an escape so the two cannot be confused; the
    # previous replace(':', ':') was a no-op.
    lines = [line.replace('\uff1a', ':') for line in f]

# Poem bodies that pass all filters
poetry = []
for line in lines:
    # Exactly one colon must separate the title from the body
    if line.count(':') != 1:
        continue
    __, last_part = line.split(':')
    # Strip the line break BEFORE filtering, so the length check measures the
    # poem itself and not the poem plus its trailing newline.
    last_part = last_part.replace('\n', '')
    # The body must not contain any disallowed token
    if any(dis_word in last_part for dis_word in disallowed_words):
        continue
    # Leave room for the [CLS] and [SEP] markers added by the tokenizer
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Character frequencies over all accepted poems
counter = Counter()
for line in poetry:
    counter.update(line)

# Keep characters that are frequent enough, most frequent first.
# most_common() orders equal counts by first occurrence, which matches a
# stable sort on descending count.
_tokens = [token for token, count in counter.most_common() if count >= min_word_frequency]

# Special tokens take ids 0-3. The generation code relies on this order:
# it skips the first three ids and treats id 3 as the end marker.
_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + _tokens
# token -> id mapping
token_id_dict = {token: index for index, token in enumerate(_tokens)}
# Tokenizer built on the corpus vocabulary
tokenizer = Tokenizer(token_id_dict)
# Shuffle the poems once at import time (in place)
np.random.shuffle(poetry)
|
||||
|
||||
|
||||
class PoetryDataGenerator:
    """
    Mini-batch generator over the poetry corpus.

    Iterating over an instance yields one epoch of ``(x, y)`` batches, where
    ``x`` is the padded id matrix without its last column and ``y`` is the
    one-hot encoding of the same matrix without its first column.
    """

    def __init__(self, data, random=False):
        """
        :param data: sequence of texts; each item is passed to ``tokenizer.encode``
        :param random: shuffle ``data`` in place at the start of every epoch
        """
        # Corpus
        self.data = data
        # Batch size, taken from the module-level setting
        self.batch_size = batch_size
        # Batches per epoch. Rounded UP so that it matches what __iter__
        # yields, which includes the final, possibly smaller, batch.
        self.steps = self._count_batches(len(self.data), self.batch_size)
        # Whether to shuffle at the start of every epoch
        self.random = random

    @staticmethod
    def _count_batches(total, size):
        """Return how many batches of at most ``size`` items cover ``total`` items."""
        return int(math.ceil(total / size))

    def sequence_padding(self, data, length=None, padding=None):
        """
        Pad (or truncate) every sequence to the same length.

        :param data: sequences to pad
        :param length: target length; defaults to the longest sequence in ``data``
        :param padding: fill value; defaults to the id of ``[PAD]``
        :return: 2-D numpy array with one row per input sequence
        """
        if length is None:
            length = max(map(len, data))
        if padding is None:
            padding = tokenizer.token_to_id('[PAD]')
        outputs = []
        for line in data:
            # Cut sequences that are too long, then fill up the short ones
            row = list(line[:length])
            row.extend([padding] * (length - len(row)))
            outputs.append(row)
        return np.array(outputs)

    def __len__(self):
        """Return the number of batches in one epoch."""
        return self.steps

    def __iter__(self):
        """Yield one epoch of ``(x, y)`` batches."""
        # NOTE: this shuffles the caller's sequence in place
        if self.random:
            np.random.shuffle(self.data)
        total = len(self.data)
        for start in range(0, total, self.batch_size):
            # Slicing past the end is fine: the last batch is simply shorter
            batch_data = [tokenizer.encode(single_data)
                          for single_data in self.data[start:start + self.batch_size]]
            batch_data = self.sequence_padding(batch_data)
            # x: every position but the last; y: the following token, one-hot
            yield batch_data[:, :-1], tf.one_hot(batch_data[:, 1:], tokenizer.vocab_size)

    def for_fit(self):
        """
        Return an endless generator for training: epochs are replayed forever.

        :raises ValueError: if the dataset is empty (raised on first ``next``)
        """
        # Without this guard an empty dataset would spin forever without
        # ever yielding a batch.
        if len(self.data) == 0:
            raise ValueError('PoetryDataGenerator has no data to iterate over')
        while True:
            yield from self
|
||||
16
dl-exp/exp3/source/eval.py
Normal file
16
dl-exp/exp3/source/eval.py
Normal file
@@ -0,0 +1,16 @@
|
||||
#ANLI College of Artificial Intelligence
|
||||
|
||||
|
||||
import tensorflow as tf
|
||||
from dataset import tokenizer
|
||||
import settings
|
||||
import utils
|
||||
|
||||
# Load the model saved during training
model = tf.keras.models.load_model(settings.BEST_MODEL_PATH)

# 1) A poem generated from scratch
random_poem = utils.generate_random_poetry(tokenizer, model)
print(random_poem)

# 2) A poem continued from a given opening
# NOTE(review): the opening ends with an ASCII comma. If the corpus uses the
# full-width comma, this character is encoded as [UNK] — confirm against the dataset.
continued_poem = utils.generate_random_poetry(tokenizer, model, s='床前明月光,')
print(continued_poem)

# 3) An acrostic whose clauses start with the given characters
acrostic_poem = utils.generate_acrostic(tokenizer, model, head='好好学习天天向上')
print(acrostic_poem)
|
||||
19
dl-exp/exp3/source/settings.py
Normal file
19
dl-exp/exp3/source/settings.py
Normal file
@@ -0,0 +1,19 @@
|
||||
#ANLI College of Artificial Intelligence
|
||||
|
||||
|
||||
# Disallowed tokens: a poem whose body contains any of them is skipped.
# The full-width parentheses are written as escapes so they cannot be confused
# with their ASCII look-alikes; the list previously held '(' and ')' twice.
DISALLOWED_WORDS = [
    '\uff08', '\uff09',  # full-width parentheses
    '(', ')',
    '__',
    '《', '》',
    '【', '】',
    '[', ']',
]
# Maximum sequence length
MAX_LEN = 64
# Minimum character frequency; rarer characters get no id of their own
MIN_WORD_FREQUENCY = 8
# Training batch size
BATCH_SIZE = 16
# Path of the dataset file
DATASET_PATH = './poetry.txt'
# Number of sample poems printed after every epoch
SHOW_NUM = 5
# Number of training epochs
TRAIN_EPOCHS = 10
# Where the best model (lowest training loss so far) is saved
BEST_MODEL_PATH = './best_model.h5'
|
||||
35
dl-exp/exp3/source/train.py
Normal file
35
dl-exp/exp3/source/train.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import tensorflow as tf
|
||||
from dataset import PoetryDataGenerator, tokenizer, poetry
|
||||
import settings
|
||||
import utils
|
||||
|
||||
# Network: embedding -> two stacked LSTMs -> per-time-step softmax over the
# vocabulary. The input length is left open (None).
_layers = [
    tf.keras.layers.Input((None,)),
    tf.keras.layers.Embedding(input_dim=tokenizer.vocab_size, output_dim=128),
    tf.keras.layers.LSTM(128, dropout=0.5, return_sequences=True),
    tf.keras.layers.LSTM(128, dropout=0.5, return_sequences=True),
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(tokenizer.vocab_size, activation='softmax')
    ),
]
model = tf.keras.Sequential(_layers)
# Print the layer overview
model.summary()
# Targets are one-hot encoded, hence categorical cross-entropy
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
)
|
||||
|
||||
class Evaluate(tf.keras.callbacks.Callback):
    """
    Callback that keeps the best model on disk and prints sample poems.

    After every epoch the model is saved to ``settings.BEST_MODEL_PATH`` when
    the training loss is not higher than the lowest seen so far, and
    ``settings.SHOW_NUM`` randomly generated poems are printed.
    """

    def __init__(self):
        super().__init__()
        # Lowest training loss seen so far; starts above any realistic loss
        self.lowest = 1e10

    def on_epoch_end(self, epoch, logs=None):
        """
        :param epoch: index of the epoch that just finished
        :param logs: metric dict provided by Keras; may be None or lack 'loss'
        """
        # `logs` defaults to None, so it must not be indexed blindly
        loss = (logs or {}).get('loss')
        if loss is not None and loss <= self.lowest:
            self.lowest = loss
            # self.model is the model this callback is attached to
            self.model.save(settings.BEST_MODEL_PATH)
        # Blank line to separate the samples from the progress output
        print()
        for _ in range(settings.SHOW_NUM):
            print(utils.generate_random_poetry(tokenizer, self.model))
|
||||
|
||||
# Batch generator over the corpus; no per-epoch shuffling
data_generator = PoetryDataGenerator(poetry, random=False)
# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated and no longer available in recent Keras releases.
model.fit(data_generator.for_fit(),
          steps_per_epoch=data_generator.steps,
          epochs=settings.TRAIN_EPOCHS,
          callbacks=[Evaluate()])
|
||||
86
dl-exp/exp3/source/utils.py
Normal file
86
dl-exp/exp3/source/utils.py
Normal file
@@ -0,0 +1,86 @@
|
||||
|
||||
import numpy as np
|
||||
import settings
|
||||
|
||||
|
||||
def generate_random_poetry(tokenizer, model, s=''):
    """
    Generate one poem by sampling the model token by token.

    :param tokenizer: tokenizer used to encode the opening and decode the result
    :param model: model that returns a per-position distribution over the vocabulary
    :param s: opening text of the poem; empty by default
    :return: the generated poem as a string
    """
    # The first three ids ([PAD], [UNK], [CLS]) are never sampled
    offset = 3
    # Sample only among the most probable candidates
    top_k = 100
    # encode() yields [CLS] ... [SEP]; the closing [SEP] is cut off so the
    # model continues the sequence
    ids = tokenizer.encode(s)[:-1]
    while len(ids) < settings.MAX_LEN:
        # Batch of one; keep the distribution predicted for the last position
        batch = np.array([ids, ], dtype=np.int32)
        _probas = model(batch).numpy()[0, -1, offset:]
        # Candidate indices, most probable first
        ranked = _probas.argsort()[::-1][:top_k]
        # Renormalise the kept probabilities so they sum to one
        weights = _probas[ranked]
        weights = weights / sum(weights)
        # Draw one candidate and shift its index back to a vocabulary id
        choice = np.random.choice(len(weights), p=weights)
        next_id = ranked[choice] + offset
        ids.append(next_id)
        # id 3 is [SEP]: the poem is finished
        if next_id == 3:
            break
    return tokenizer.decode(ids)
|
||||
|
||||
|
||||
def generate_acrostic(tokenizer, model, head, max_clause_len=None):
    """
    Generate an acrostic: one clause per character of ``head``.

    Each clause starts with its head character and is extended by sampling
    the model until a punctuation mark is produced.

    :param tokenizer: tokenizer used to map between characters and ids
    :param model: model that returns a per-position distribution over the vocabulary
    :param head: characters that start the successive clauses
    :param max_clause_len: maximum number of tokens sampled per clause;
        defaults to ``settings.MAX_LEN``. Guarantees termination even if the
        model never emits a punctuation mark.
    :return: the generated poem as a string
    """
    if max_clause_len is None:
        max_clause_len = settings.MAX_LEN
    # Start from [CLS] only: encode('') gives [CLS], [SEP]; drop the [SEP]
    token_ids = tokenizer.encode('')[:-1]
    # Clause-ending punctuation: ASCII and full-width comma (U+FF0C), plus
    # the ideographic full stop
    punctuations = [',', '\uff0c', '。']
    punctuation_ids = {tokenizer.token_to_id(token) for token in punctuations}
    # Characters of the poem generated so far
    poetry = []
    for ch in head:
        # The head character opens the clause
        poetry.append(ch)
        token_ids.append(tokenizer.token_to_id(ch))
        # Bounded loop instead of `while True`
        for _ in range(max_clause_len):
            # Batch of one; keep the distribution for the last position,
            # without the first three ids ([PAD], [UNK], [CLS])
            output = model(np.array([token_ids, ], dtype=np.int32))
            _probas = output.numpy()[0, -1, 3:]
            # The 100 most probable candidates, most probable first
            p_args = _probas.argsort()[::-1][:100]
            # Renormalise in float64 so the probabilities sum to one
            p = _probas[p_args].astype(np.float64)
            p = p / p.sum()
            # Draw one candidate and shift its index back to a vocabulary id
            target_index = np.random.choice(len(p), p=p)
            target = p_args[target_index] + 3
            token_ids.append(target)
            # id 3 is [SEP]; only real characters become part of the poem
            if target > 3:
                poetry.append(tokenizer.id_to_token(target))
            if target in punctuation_ids:
                break
    return ''.join(poetry)
|
||||
Reference in New Issue
Block a user