Simple attention-based text prediction model from scratch using PyTorch

data-mining  nlp  pytorch  sequence-to-sequence  attention-mechanism  language-model
2022-02-22 12:41:51

I first asked this question on Code Review SE, but a user there suggested posting it here instead.

I have created a simple self-attention based text prediction model using PyTorch. The attention formula used to build the attention layer is:

Attention(Q, K, V) = softmax(QKᵀ / √d_k) · V

I want to verify whether the whole code is implemented correctly, especially my custom Attention layer implementation. (A small sanity-check sketch is included after the full code below.)

Full code

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import random
random.seed(0)
torch.manual_seed(0)

# Sample text for Training
test_sentence = """Thomas Edison. The famed American inventor rose to prominence in the late
19th century because of his successes, yes, but even he felt that these successes
were the result of his many failures. He did not succeed in his work on one of his
most famous inventions, the lightbulb, on his first try nor even on his hundred and
first try. In fact, it took him more than 1,000 attempts to make the first incandescent
bulb but, along the way, he learned quite a deal. As he himself said,
"I did not fail a thousand times but instead succeeded in finding a thousand ways it would not work." 
Thus Edison demonstrated both in thought and action how instructive mistakes can be. 
""".lower().split()

# Build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]

# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = list(set(test_sentence))
word_to_ix2 = {word: i for i, word in enumerate(vocab)}

# Number of Epochs
EPOCHS = 25

# SEQ_SIZE is the number of words we are using as a context for the next word we want to predict
SEQ_SIZE = 2

# Embedding dimension is the size of the embedding vector
EMBEDDING_DIM = 10

# Size of the hidden layer
HIDDEN_DIM = 256

class Attention(nn.Module):
    """
    A custom self attention layer
    """
    def __init__(self, in_feat,out_feat):
        super().__init__()             
        self.Q = nn.Linear(in_feat,out_feat) # Query
        self.K = nn.Linear(in_feat,out_feat) # Key
        self.V = nn.Linear(in_feat,out_feat) # Value
        self.softmax = nn.Softmax(dim=1)

    def forward(self,x):
        Q = self.Q(x)
        K = self.K(x)
        V = self.V(x)
        d = K.shape[1] # dimension of the key vectors (d_k)
        QK_d = (Q @ K.T) / d ** 0.5
        prob = self.softmax(QK_d)
        attention = prob @ V
        return attention

class Model(nn.Module):
    def __init__(self,vocab_size,embed_size,seq_size,hidden):
        super().__init__()
        self.embed = nn.Embedding(vocab_size,embed_size)
        self.attention = Attention(embed_size,hidden)
        self.fc1 = nn.Linear(hidden*seq_size,vocab_size) # converting n rows to 1
        self.softmax = nn.Softmax(dim=1)

    def forward(self,x):
        x = self.embed(x)
        x = self.attention(x).view(1,-1)
        x = self.fc1(x)
        log_probs = F.log_softmax(x,dim=1)
        return log_probs

learning_rate = 0.001
loss_function = nn.NLLLoss()  # negative log likelihood

model = Model(len(vocab), EMBEDDING_DIM, SEQ_SIZE, HIDDEN_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Training
for i in range(EPOCHS):
    total_loss = 0
    for context, target in trigrams:
        # e.g. context = ['thomas', 'edison.'], target = 'the'
        
        # step 1: context id generation
        context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)

        # step 2: setting zero gradient for models
        model.zero_grad()

        # step 3: Forward propagation for calculating log probs
        log_probs = model(context_idxs)

        # step 4: calculating loss
        loss = loss_function(log_probs, torch.tensor([word_to_ix2[target]], dtype=torch.long))

        # step 5: finding the gradients
        loss.backward()

        # step 6: updating the weights
        optimizer.step()

        total_loss += loss.item()
    if i % 2 == 0:
        print(f"Epoch: {i}  Loss: {total_loss}")

# Prediction
with torch.no_grad():
    # Fetching a random context and target 
    rand_val = trigrams[random.randrange(len(trigrams))]
    print(rand_val)
    context = rand_val[0]
    target = rand_val[1]
    
    # Getting the context and target indices
    context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
    target_idxs = torch.tensor([word_to_ix2[w] for w in [target]], dtype=torch.long)
    print("Acutal indices: ", context_idxs, target_idxs)
    log_preds = model(context_idxs)
    print("Predicted indices: ",torch.argmax(log_preds))
1 Answer

Just for fun, run it to get a long stretch of generated output. Here is some code to tack on at the end. You might also want to change it to n-tuples or n-grams (a sketch of that follows the code below). This is a nice toy language model!

with torch.no_grad():
    # Seed the generator with the first trigram's context
    context = trigrams[0][0][:]
    output_txt = context[:]

    for i in range(30):
        # Getting the context indices
        context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
        log_preds = model(context_idxs)

        # Sample the next word from the predicted distribution
        ixp = torch.multinomial(log_preds.exp(), 1).item()
        # ixp = torch.argmax(log_preds).item()  # greedy alternative

        # Slide the context window and record the sampled word
        context = context[1:] + [vocab[ixp]]
        output_txt += [vocab[ixp]]

print(" ".join(output_txt))