I am trying to classify news articles with a Hierarchical Attention Network, using the 20 Newsgroups dataset I downloaded from the internet. I came across this implementation and tried it on the 20 Newsgroups data because I was eager to see the results. I followed the same steps the author shows in his example, but I get the error "ValueError: Operands could not be broadcast together with shapes (100, 200) (None, Elemwise{mul,no_inplace}.0)". I have never used Keras before. Can anyone help me figure out the dimensions of the word encoding and the sentence encoding? For reference, I have also added a quick shape check of my inputs after the code below.
import re
import numpy as np
import pandas as pd
import sys
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from keras_han.model import HAN
MAX_WORDS_PER_SENT = 100
MAX_SENT = 15
MAX_VOC_SIZE = 20000
GLOVE_DIM = 100
TEST_SPLIT = 0.2
def remove_quotations(text):
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)
    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
    return text

def remove_html(text):
    tags_regex = re.compile(r'<.*?>')
    return tags_regex.sub('', text)
print('Processing text dataset')
TEXT_DATA_DIR = "E:/Thesis/20news-bydate-test"
df = pd.DataFrame(columns=['Text','Labels'])
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)
print('Found %s texts.' % len(texts))
df = pd.DataFrame({'Text': texts, 'Label': labels})
df['Text'] = df['Text'].apply(remove_quotations)
df['Text'] = df['Text'].apply(remove_html)
df = df.replace('\n','', regex=True)
news = df['Text'].values
labels = df['Label'].values
#print(news)
#print(labels)
print("Tokenization.")
# Build a Keras Tokenizer that can encode every token
word_tokenizer = Tokenizer(num_words=MAX_VOC_SIZE)
word_tokenizer.fit_on_texts(news)
# Construct the input matrix. This should be an nd-array of
# shape (n_samples, MAX_SENT, MAX_WORDS_PER_SENT).
# We zero-pad this matrix (this does not influence
# any predictions due to the attention mechanism).
X = np.zeros((len(news), MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')
for i, review in enumerate(news):
    sentences = sent_tokenize(review)
    tokenized_sentences = word_tokenizer.texts_to_sequences(
        sentences
    )
    tokenized_sentences = pad_sequences(
        tokenized_sentences, maxlen=MAX_WORDS_PER_SENT
    )
    pad_size = MAX_SENT - tokenized_sentences.shape[0]
    if pad_size < 0:
        tokenized_sentences = tokenized_sentences[0:MAX_SENT]
    else:
        tokenized_sentences = np.pad(
            tokenized_sentences, ((0, pad_size), (0, 0)),
            mode='constant', constant_values=0
        )
    # Store this observation as the i-th observation in
    # the data matrix
    X[i] = tokenized_sentences[None, ...]
# Transform the labels into a format Keras can handle
y = to_categorical(labels)
# We make a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT)
embeddings = {}
with open('./embeddings(100).txt', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs
# Initialize a matrix to hold the word embeddings
embedding_matrix = np.random.random(
    (len(word_tokenizer.word_index) + 1, GLOVE_DIM)
)
embedding_matrix[0] = 0
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
print("Training the model")
han_model = HAN(
    MAX_WORDS_PER_SENT, MAX_SENT, 20, embedding_matrix,
    word_encoding_dim=100, sentence_encoding_dim=100
)
han_model.summary()
checkpoint_saver = ModelCheckpoint(
    filepath='./tmp/model.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1, save_best_only=True
)
han_model.compile(
    optimizer='adagrad', loss='categorical_crossentropy',
    metrics=['acc']
)
han_model.fit(
    X_train, y_train, batch_size=20, epochs=10,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint_saver]
)
