Keras ValueError: operands could not be broadcast together with shapes (100, 100) - GRU

data-mining keras nlp word-embeddings attention-mechanism
2022-02-18 01:20:28

I am trying to classify news articles with a Hierarchical Attention Network, using the 20 Newsgroups dataset that I downloaded from the internet. I came across this implementation and, since I was eager to see the results, applied it to 20 Newsgroups following the same steps the author shows in his example. I get the error "ValueError: operands could not be broadcast together with shapes (100, 200) (None, Elemwise{mul,no_inplace}.0)". I have never used Keras before; can anyone help me work out the dimensions of the word encoding and the sentence encoding? (I have also added a small shape sketch after my code below.)

import re
import numpy as np
import pandas as pd
import sys
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from keras_han.model import HAN

MAX_WORDS_PER_SENT = 100
MAX_SENT = 15
MAX_VOC_SIZE = 20000
GLOVE_DIM = 100
TEST_SPLIT = 0.2

def remove_quotations(text):
    # Strip backslashes, quote characters, and finally any remaining
    # non-alphanumeric characters from the raw post text.
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)
    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
    return text

def remove_html(text):
    # Drop anything that looks like an HTML tag.
    tags_regex = re.compile(r'<.*?>')
    return tags_regex.sub('', text)

print('Processing text dataset')
TEXT_DATA_DIR = "E:/Thesis/20news-bydate-test"
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)
print('Found %s texts.' % len(texts))
df = pd.DataFrame({'Text': texts, 'Label': labels})
df['Text'] = df['Text'].apply(remove_quotations)
df['Text'] = df['Text'].apply(remove_html)
df = df.replace('\n', '', regex=True)
news = df['Text'].values
labels = df['Label'].values
#print(news)
#print(labels)

print("Tokenization.")

# Build a Keras Tokenizer that can encode every token
word_tokenizer = Tokenizer(num_words=MAX_VOC_SIZE)
word_tokenizer.fit_on_texts(news)

# Construct the input matrix. This should be an nd-array of
# shape (n_samples, MAX_SENT, MAX_WORDS_PER_SENT).
# We zero-pad this matrix (this does not influence any
# predictions due to the attention mechanism).
X = np.zeros((len(news), MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')

for i, review in enumerate(news):
    sentences = sent_tokenize(review)
    tokenized_sentences = word_tokenizer.texts_to_sequences(
        sentences
    )
    tokenized_sentences = pad_sequences(
        tokenized_sentences, maxlen=MAX_WORDS_PER_SENT
    )

    pad_size = MAX_SENT - tokenized_sentences.shape[0]

    if pad_size < 0:
        tokenized_sentences = tokenized_sentences[0:MAX_SENT]
    else:
        tokenized_sentences = np.pad(
            tokenized_sentences, ((0,pad_size),(0,0)),
            mode='constant', constant_values=0
        )

    # Store this observation as the i-th observation in
    # the data matrix
    X[i] = tokenized_sentences

# Transform the labels into a format Keras can handle
y = to_categorical(labels)

# We make a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT)
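
# (sanity check I added while debugging; not part of the original example)
# The model will be fed documents shaped (MAX_SENT, MAX_WORDS_PER_SENT)
# with one-hot labels, so these should print (n, 15, 100) and (n, 20):
print('X_train:', X_train.shape, 'y_train:', y_train.shape)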

embeddings = {}
with open('./embeddings(100).txt',encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs
# Initialize a matrix to hold the word embeddings
embedding_matrix = np.random.random(
    (len(word_tokenizer.word_index) + 1, GLOVE_DIM)
)
embedding_matrix[0] = 0

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector      
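
# (debugging aid I added; not part of the original example) report how much
# of the tokenizer vocabulary the GloVe file covers -- words without a
# pretrained vector keep the random initialization from above:
n_covered = sum(1 for w in word_tokenizer.word_index if w in embeddings)
print('GloVe coverage: %d / %d words' % (n_covered, len(word_tokenizer.word_index)))
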
print("Training the model")
han_model = HAN(
    MAX_WORDS_PER_SENT, MAX_SENT, 20, embedding_matrix,  # 20 = number of classes
    word_encoding_dim=100, sentence_encoding_dim=100
)

han_model.summary()

checkpoint_saver = ModelCheckpoint(
    filepath='./tmp/model.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1, save_best_only=True
)

han_model.compile(
    optimizer='adagrad', loss='categorical_crossentropy',
    metrics=['acc']
)

han_model.fit(
    X_train, y_train, batch_size=20, epochs=10,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint_saver]
)
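
To check my understanding of the shapes, I also put together a minimal HAN-style skeleton using only stock Keras layers, continuing from the variables defined above. This is just a sketch, not the keras_han internals: I replaced the attention layers with the GRUs' final states so the shape logic is easy to follow, and ENC_DIM is my own name. What stands out is that a bidirectional GRU with 100 units outputs 200 features, which matches the 200 in the broadcast error:

from keras.layers import Input, Embedding, Bidirectional, GRU, Dense, TimeDistributed
from keras.models import Model

ENC_DIM = 100  # per-direction GRU units

# Word-level encoder: one sentence of token ids -> one sentence vector.
word_in = Input(shape=(MAX_WORDS_PER_SENT,), dtype='int32')
emb = Embedding(embedding_matrix.shape[0], GLOVE_DIM,
                weights=[embedding_matrix], trainable=False)(word_in)
sent_vec = Bidirectional(GRU(ENC_DIM))(emb)         # 2 * 100 = 200 features
word_encoder = Model(word_in, sent_vec)

# Sentence-level encoder: run the word encoder over every sentence.
doc_in = Input(shape=(MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')
sent_vecs = TimeDistributed(word_encoder)(doc_in)   # (MAX_SENT, 200)
doc_vec = Bidirectional(GRU(ENC_DIM))(sent_vecs)    # (200,)
out = Dense(20, activation='softmax')(doc_vec)      # 20 newsgroup classes

sketch = Model(doc_in, out)
sketch.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['acc'])
sketch.summary()

If that reading is right, the (100, 200) vs. (100, 100) mismatch comes from a 200-wide bidirectional output meeting a (100, 100) weight matrix inside keras_han's attention layer, rather than from my preprocessing.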

Here is the stack trace: [posted as an image; not reproduced here]

0 Answers