Table of Contents
- 1. Data Processing
- 2. Encoder and Decoder Data
- 2.1 Encoder
- 2.2 Decoder
- 2.3 Model
- 3. Training
- 4. Inference Models
- 5. Sampling
Reference: 基于深度学习的自然语言处理 (Deep Learning for Natural Language Processing)
1. Data Processing
- Read the data
with open('deu.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
print("The file has {} lines.".format(len(lines)))
num_samples = 20000  # number of corpus lines to use
lines_to_use = lines[: min(num_samples, len(lines) - 1)]
print(lines_to_use)
- Replace digits
import re

print(lines_to_use[19516])
for i in range(len(lines_to_use)):
    lines_to_use[i] = re.sub(r'\d', ' _NUMBER_ ', lines_to_use[i])  # replace every digit (\d) with ' _NUMBER_ '
print(lines_to_use[19516])
Output (the digit has been replaced):
Turn to channel 1. Wechsle auf Kanal eins.
Turn to channel _NUMBER_ . Wechsle auf Kanal eins.
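Note that re.sub replaces each digit separately, so a multi-digit number expands into several _NUMBER_ tokens. A quick standalone check (the sentence here is made up):

import re
print(re.sub(r'\d', ' _NUMBER_ ', 'Turn to channel 12.'))
# Turn to channel  _NUMBER_  _NUMBER_ .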
- Split each line into input and target
input_texts = []      # input (English) sentences
target_texts = []     # target (German) sentences
input_words = set()   # input vocabulary
target_words = set()  # target vocabulary
for line in lines_to_use:
    x, y = line.split('\t')
    y = 'BEGIN_ ' + y + ' _END'  # add begin/end markers to the target
    input_texts.append(x)
    target_texts.append(y)
    for word in x.split():
        if word not in input_words:
            input_words.add(word)
    for word in y.split():
        if word not in target_words:
            target_words.add(word)
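Each line of deu.txt holds an English sentence and its German translation separated by a tab, which is why line.split('\t') yields exactly two fields. A hypothetical line:

line = 'Hi.\tHallo!'
x, y = line.split('\t')  # x = 'Hi.', y = 'Hallo!'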
- Maximum input and target sentence lengths
max_input_seq_len = max([len(seq.split()) for seq in input_texts])
# 11
max_target_seq_len = max([len(seq.split()) for seq in target_texts])
# 15
- Number of input and target tokens
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words) # 5724
num_decoder_tokens = len(target_words) # 9126
- Build the mappings between tokens and ids
inputToken_idx = {token : i for (i, token) in enumerate(input_words)}
outputToken_idx = {token : i for (i, token) in enumerate(target_words)}
idx_inputToken = {i : token for (i, token) in enumerate(input_words)}
idx_outputToken = {i : token for (i, token) in enumerate(target_words)}
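The two pairs of dictionaries are inverses of each other, so a word can be mapped to its id and back. A quick sanity check (assuming 'you' occurs in the English corpus):

idx = inputToken_idx['you']          # word -> id
assert idx_inputToken[idx] == 'you'  # id -> word round-trips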
2. Encoder and Decoder Data
- Pay attention to what each dimension means
import numpy as np

encoder_input_data = np.zeros(
    (len(input_texts), max_input_seq_len),    # number of sentences, max input length
    dtype=np.float32)
decoder_input_data = np.zeros(
    (len(target_texts), max_target_seq_len),  # number of sentences, max target length
    dtype=np.float32)
decoder_output_data = np.zeros(
    (len(target_texts), max_target_seq_len, num_decoder_tokens),  # number of sentences, max target length, target vocabulary size
    dtype=np.float32)
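With num_samples = 20000 and the lengths and vocabulary sizes computed above, the arrays come out as follows (note that id 0 does double duty here, as both the padding value and the id of the first vocabulary word):

print(encoder_input_data.shape)   # (20000, 11)        token ids, zero-padded
print(decoder_input_data.shape)   # (20000, 15)        token ids, zero-padded
print(decoder_output_data.shape)  # (20000, 15, 9126)  one-hot targets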
- Fill the matrices
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = inputToken_idx[word]
    for t, word in enumerate(target_text.split()):
        decoder_input_data[i, t] = outputToken_idx[word]
        if t > 0:
            # the decoder target is one time step ahead of the decoder input
            decoder_output_data[i, t - 1, outputToken_idx[word]] = 1.
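The t-1 index implements teacher forcing: the target at step t-1 is the ground-truth word the decoder receives as input at step t, so at every step the model learns to predict the next word. A quick check of the alignment on the first sample:

# decoder_output_data at step t-1 holds the one-hot vector of the word
# that the decoder is fed as input at step t
words = target_texts[0].split()  # ['BEGIN_', ..., '_END']
for t in range(1, len(words)):
    assert decoder_output_data[0, t - 1, outputToken_idx[words[t]]] == 1.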
2.1 Encoder
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

embedding_size = 256  # embedding dimension
rnn_size = 64
# encoder
encoder_inputs = Input(shape=(None,))
encoder_after_embedding = Embedding(input_dim=num_encoder_tokens,  # input vocabulary size
                                    output_dim=embedding_size)(encoder_inputs)
encoder_lstm = LSTM(units=rnn_size, return_state=True)
# return_state: Boolean. Whether to return
# the last state in addition to the output.
_, state_h, state_c = encoder_lstm(encoder_after_embedding)
encoder_states = [state_h, state_c]  # the thought vector
2.2 Decoder
# decoder
decoder_inputs = Input(shape=(None,))
decoder_after_embedding = Embedding(input_dim=num_decoder_tokens,  # target vocabulary size
                                    output_dim=embedding_size)(decoder_inputs)
decoder_lstm = LSTM(units=rnn_size, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_after_embedding,
                                     initial_state=encoder_states)
# the decoder LSTM's initial state is the thought vector produced by the encoder
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
# target vocabulary size: a multi-class prediction of the next word
decoder_outputs = decoder_dense(decoder_outputs)
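Since return_sequences=True, the LSTM emits one 64-dimensional vector per time step, and the softmax Dense layer is applied at every step, so decoder_outputs has shape (batch, time steps, 9126), matching decoder_output_data. This can be verified symbolically:

from keras import backend as K
print(K.int_shape(decoder_outputs))  # (None, None, 9126)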
2.3 Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

from keras.utils import plot_model
plot_model(model, to_file='model.png')
Output:
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_1 (InputLayer)            (None, None)         0
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    1465344     input_1[0][0]
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    2336256     input_2[0][0]
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 64), (None,  82176       embedding_1[0][0]
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 64), ( 82176       embedding_2[0][0]
                                                                 lstm_1[0][1]
                                                                 lstm_1[0][2]
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, None, 9126)   593190      lstm_2[0][0]
==================================================================================================
Total params: 4,559,142
Trainable params: 4,559,142
Non-trainable params: 0
__________________________________________________________________________________________________
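The parameter counts can be verified by hand: each embedding has vocab_size * embedding_size weights, an LSTM has four gates (each with a kernel, a recurrent kernel, and a bias), and the Dense layer has one weight per input unit plus a bias:

print(5724 * 256)                  # 1465344  embedding_1
print(9126 * 256)                  # 2336256  embedding_2
print(4 * (64 * (256 + 64) + 64))  # 82176    lstm_1 and lstm_2
print((64 + 1) * 9126)             # 593190   dense_1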
3. Training
- Train, with a callback that saves the best model
from keras.callbacks import ModelCheckpoint

filepath = 'weights.best.h5'
# save_best_only=True: the file is overwritten each time the monitored metric improves
checkpoint = ModelCheckpoint(filepath, monitor='accuracy', verbose=1,
                             save_best_only=True, mode='max', save_freq=2)
callbacks_list = [checkpoint]
# https://keras.io/api/callbacks/model_checkpoint/
history = model.fit(x=[encoder_input_data, decoder_input_data],
                    y=decoder_output_data,
                    batch_size=128,
                    epochs=200,
                    validation_split=0.1,
                    callbacks=callbacks_list)
model.save('model.h5')
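Before running inference it can make sense to restore the best checkpoint written by the callback, rather than keep the weights from the final epoch (a small sketch):

model.load_weights('weights.best.h5')  # restore the best weights saved by ModelCheckpoint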
- Plot the training curves
from matplotlib import pyplot as plt

loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

plt.plot(loss, label='train Loss')
plt.plot(val_loss, label='valid Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid()
plt.show()

plt.plot(acc, label='train Acc')
plt.plot(val_acc, label='valid Acc')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.grid()
plt.show()
4. Inference Models
- Encoder
encoder_model = Model(encoder_inputs, encoder_states)  # input (through the embedding) in, thought vector out
- Decoder
# the encoder's output states serve as the decoder's initial state
decoder_state_input_h = Input(shape=(rnn_size,))
decoder_state_input_c = Input(shape=(rnn_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# initial state + embedding go through the LSTM, which outputs decoder_outputs_inf, state_h_inf, state_c_inf
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(decoder_after_embedding,
                                                             initial_state=decoder_states_inputs)
# h, c are fed back as the state input of the next inference step
decoder_states_inf = [state_h_inf, state_c_inf]
# the LSTM output goes through the FC layer to predict the next word
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs_inf] + decoder_states_inf)
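A single decoding step can be sanity-checked by pushing one training sentence through both inference models (a sketch, reusing the arrays built in section 2):

states = encoder_model.predict(encoder_input_data[:1])  # [h, c], each 1 * rnn_size
start = np.zeros((1, 1))
start[0, 0] = outputToken_idx['BEGIN_']                 # begin-of-sentence token
probs, h, c = decoder_model.predict([start] + states)
print(probs.shape)                                      # (1, 1, 9126): distribution over the next word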
5. Sampling
def decode_sequence(input_seq):
    # encoder_states = [state_h, state_c]
    states_value = encoder_model.predict(input_seq)  # list of 2 arrays, each 1 * rnn_size
    target_seq = np.zeros((1, 1))
    # the initial target sequence is the index of 'BEGIN_'
    target_seq[0, 0] = outputToken_idx['BEGIN_']
    stop = False
    decoded_sentence = ''
    while not stop:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # output_tokens: 1 * 1 * 9126; h, c: 1 * rnn_size
        sampled_token_idx = np.argmax(output_tokens)
        sampled_word = idx_outputToken[sampled_token_idx]
        decoded_sentence += ' ' + sampled_word
        if sampled_word == '_END' or len(decoded_sentence) > 60:
            stop = True
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_idx  # the sampled word is the input of the next step
        # update the states for the next step
        states_value = [h, c]
    return decoded_sentence

# a quick sampling test
text_to_translate = 'Are you happy ?'
encoder_input_to_translate = np.zeros((1, max_input_seq_len), dtype=np.float32)
for t, word in enumerate(text_to_translate.split()):
    encoder_input_to_translate[0, t] = inputToken_idx[word]
# encoder_input_to_translate: [[ids, ..., 0, 0, 0, 0]]
print(decode_sequence(encoder_input_to_translate))
Output:
text_to_translate = 'Are you happy?'
Output: Sind Sie glücklich? _END  # Are you happy?
text_to_translate = 'Where is my car?'
Output: Wo ist mein Auto? _END  # Where is my car?
text_to_translate = 'When I see you, I fall in love with you!'
Output: Sind Sie mit uns gehen. _END  # roughly: Are you going with us?
Notes:
- The sentence to be translated must not be longer than the maximum input length.
- It must not contain any word that never appeared in the corpus. For example, dear appears, but dear! (with the punctuation attached) does not, and looking it up will raise a KeyError.
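One way to guard against both pitfalls is to preprocess the sentence exactly like the training data and validate every token before building the id matrix. A minimal sketch (the translate helper is not part of the original code):

def translate(text):
    text = re.sub(r'\d', ' _NUMBER_ ', text)  # same digit replacement as in training
    words = text.split()
    if len(words) > max_input_seq_len:
        raise ValueError('sentence is longer than max_input_seq_len')
    unknown = [w for w in words if w not in inputToken_idx]
    if unknown:
        raise ValueError('out-of-vocabulary tokens: {}'.format(unknown))
    seq = np.zeros((1, max_input_seq_len), dtype=np.float32)
    for t, w in enumerate(words):
        seq[0, t] = inputToken_idx[w]
    return decode_sequence(seq)

print(translate('Where is my car ?'))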