You can one-hot encode your domains as arrays of characters. Your training samples will then have shape (samples, longest domain length, number of unique characters used). Here is a code example:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
# make some fake samples
urls = ['https://datascience.stackexchange.com/',
'https://github.com/',
'www.google.com/']
labels = [1,0,1]
df = pd.DataFrame(zip(urls, labels),
                  columns=['domain', 'label'])
# Make it all to a long string
concat_domains = '\n'.join(df['domain']).lower()
# Find all unique characters by using set()
chars = sorted(list(set(concat_domains)))
num_chars = len(chars)
# Build translation dictionaries, 'a' -> 0, 0 -> 'a'
char2idx = dict((c, i) for i, c in enumerate(chars))
idx2char = dict((i, c) for i, c in enumerate(chars))
# Use longest name length as our sequence window
max_sequence_length = max([len(name) for name in df['domain']])
# build dataset with domains as one-hot encoded chars
X = np.zeros((df.shape[0], max_sequence_length, num_chars), dtype=bool)
y = df['label'].values
for i, sequence in enumerate(df['domain'].str.lower()):
    for j, char in enumerate(sequence):
        X[i, j, char2idx[char]] = 1
# build a model with input dim: (length of longest domain name, number of unique chars found)
model = Sequential()
model.add(LSTM(64, input_shape=(max_sequence_length, num_chars)))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
# train for 10 epochs
model.fit(X, y, epochs=10)
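To score an unseen domain, you would encode it with the same char2idx mapping and window length. A minimal sketch (the domain www.example.org/ is just a made-up example; characters not seen during training are skipped and longer inputs are truncated):
# encode a new domain with the same dictionary and window as the training data
new_domain = 'www.example.org/'.lower()
x_new = np.zeros((1, max_sequence_length, num_chars), dtype=bool)
for j, char in enumerate(new_domain[:max_sequence_length]):
    if char in char2idx:  # skip characters unseen during training
        x_new[0, j, char2idx[char]] = 1
# sigmoid output in [0, 1]
print(model.predict(x_new))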