I built a seq2seq chatbot with the help of this guide: https://medium.com/predict/creating-a-chatbot-from-scratch-using-keras-and-tensorflow-59e8fc76be79. I set up the model and the data processing like this:
import os
import yaml
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import preprocessing
tokenizer = Tokenizer(num_words=5000)
dir_path = 'raw_data'
files_list = os.listdir(dir_path + os.sep)
questions = list()
answers = list()
# build question/answer pairs; every answer gets an 'end' tag,
# and a copy with a leading 'start' tag is used as the decoder input
for filepath in files_list:
    stream = open(dir_path + os.sep + filepath, 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            questions.append(con[0])
            replies = con[1:]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append(str(ans) + " end")
        elif len(con) > 1:
            questions.append(con[0])
            answers.append(str(con[1]) + " end")
a = []
for i in answers:
    a.append("start " + i)
tokenizer.fit_on_texts(questions + a)
encoder_input_data = pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=22)
decoder_input_data = pad_sequences(tokenizer.texts_to_sequences(a), maxlen=74)
decoder_target_data = to_categorical(pad_sequences(tokenizer.texts_to_sequences(answers), maxlen=74))
num_tokens = len(tokenizer.word_index) + 1
word_dict = tokenizer.word_index
max_question_len = encoder_input_data.shape[1]
max_answer_len = decoder_input_data.shape[1]
print('Max length of question is {}'.format(max_question_len))
print('Max length of answer is {}'.format(max_answer_len))
print(num_tokens)
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)
# encoder
encoder_inputs = tf.keras.layers.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(num_tokens, 200, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# decoder, initialized with the encoder's final states
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(num_tokens, 200, mask_zero=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_tokens, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=100, epochs=100)
model.save('model.h5')
def make_inference_models():
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    decoder_state_input_h = tf.keras.layers.Input(shape=(200,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(200,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    return encoder_model, decoder_model
def str_to_tokens(sentence: str):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append(word_dict[word])
    return preprocessing.sequence.pad_sequences([tokens_list], maxlen=max_question_len, padding='post')
enc_model, dec_model = make_inference_models()
for _ in range(10):
    states_values = enc_model.predict(str_to_tokens(input('Enter question : ')))
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        for word, index in word_dict.items():
            if sampled_word_index == index:
                decoded_translation += ' {}'.format(word)
                sampled_word = word
        if sampled_word == 'end' or len(decoded_translation.split()) > max_answer_len:
            stop_condition = True
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]
    print(decoded_translation)
It runs without errors, but the output when chatting is very poor. Here is one example:
Enter question : how are you
capacity normally again again again often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often
Judging from the way it repeats the same word until it hits the length limit, I think the model is not learning the labels. But I checked the data being processed into the decoder_target_data variable, and every sentence ends with the end token. I don't understand why it never produces the end tag. How can I fix this?
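Roughly, that check looked like this (a minimal sketch, not my exact code; it only uses the tokenizer, answers and word_dict defined above, and end_id is just an illustrative name):

# Sanity check: every raw answer string and its tokenized form
# should end with the 'end' token before padding is applied.
end_id = word_dict['end']
for text, seq in zip(answers, tokenizer.texts_to_sequences(answers)):
    assert text.endswith('end')
    assert seq and seq[-1] == end_id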