How do I correctly use keras.layers.AdditiveAttention?

data-mining python keras attention-mechanism
2022-02-19 10:09:57

My understanding of this topic is superficial at best, so please bear with me. I have a few questions (specifically about how to use keras.layers.AdditiveAttention) that I hope are reasonable to ask together. I know many similar questions have been posted; I apologize if I have failed to understand them well enough to solve my own problem.

  1. For the attention mechanism, why must query and value have the same dimension? E.g. Stacked 1a and Stacked 3a below. (See the small shape sketch right after this list.)
  2. My understanding is that query := the decoder's last hidden state, and values := all of the encoder's hidden states. Is that correct?
  3. For my other examples that raise no errors (Stacked 1b and 2a), is the attention layer actually implemented correctly? If not, how should I do it?
  4. In the Single network, Stacked 3a, and Stacked 3b cases, what should their respective query and value inputs be?
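
A minimal, standalone shape sketch of how I currently understand the layer's contract (my assumption, please correct me if it is wrong):

import numpy as np
from keras.layers import AdditiveAttention

q = np.random.rand(2, 1, 8).astype('float32')  # query: (batch, Tq, dim)
v = np.random.rand(2, 5, 8).astype('float32')  # value: (batch, Tv, dim)
# AdditiveAttention adds query and key element-wise inside a tanh, so their
# last dimensions must match; this call returns shape (2, 1, 8).
out = AdditiveAttention()([q, v])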

Background: I am trying to use several stacked networks to extract features, and then use the processed features for prediction.

I realize much of this is repetitive, but I hope the examples help illustrate my questions/confusion better.

##-------------------------------------------------------------------
## Some imports and functions
import numpy as np
import pandas as pd

from keras import Model
from keras.layers import Input, Dense, Dropout, RepeatVector, AdditiveAttention, GRU

def temporalize(X, y, past_records):
    """
    Taken from https://towardsdatascience.com/lstm-autoencoder-for-extreme-rare-event-classification-in-keras-ce209a224cfb      (I edited it by a tiny bit for my use case).
    """
    output_X = []
    output_y = []
    for i in range(len(X)-past_records):
        t = []
        t.append(X[i:(i+past_records+1)])
        output_X.append(t)
        output_y.append(y[i+past_records])
    return np.squeeze(np.array(output_X)), np.array(output_y)
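
# Quick sanity check of temporalize (what I expect it to return):
#   X = np.arange(12).reshape(6, 2), y = np.arange(6), past_records = 2
#   -> X out: shape (4, 3, 2), each sample = a row plus its 2 predecessors
#   -> y out: shape (4,), the label aligned with the last row of each window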

##-------------------------------------------------------------------
## Random data
df = pd.concat([
    pd.Series(np.arange(30)), 
    pd.Series((np.arange(30))**2), 
    pd.Series((np.arange(30))**3),
    pd.Series((np.arange(30))**4)
    ], axis=1)
df.columns = ['A','B','Label_1','Label_2']

past_records = 4
n_batch      = 5

index_train = df.index[:20]
index_test  = df.index[20:]

X_train          = df.drop(['Label_1','Label_2'], axis=1).loc[index_train]
y_train          = df[['Label_1','Label_2']].loc[index_train]
X_train, y_train = np.array(X_train), np.array(y_train)

# Prepend the last `past_records` training rows so the first test window
# has a complete history
test_rows      = np.concatenate((index_train[-past_records:], index_test), axis=0)
X_test         = df.drop(['Label_1','Label_2'], axis=1).iloc[test_rows]
y_test         = df[['Label_1','Label_2']].iloc[test_rows]
X_test, y_test = np.array(X_test), np.array(y_test)

n_features = X_train.shape[1]
n_outputs  = y_train.shape[1]
X_train, y_train = temporalize(X=X_train, y=y_train, past_records=past_records)
X_train = X_train.reshape(X_train.shape[0], past_records+1, n_features)
X_test, y_test   = temporalize(X=X_test, y=y_test, past_records=past_records)
X_test = X_test.reshape(X_test.shape[0], past_records+1, n_features)
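# Resulting shapes for this toy data (as I understand it):
#   X_train (16, 5, 2), y_train (16, 2); X_test (10, 5, 2), y_test (10, 2)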

input_shape = (past_records+1, n_features)
inputs = Input(shape=input_shape, name='inputs')

##-------------------------------------------------------------------
## Single network
enc = GRU(8, return_sequences=True)(inputs)
# Attention
attn = AdditiveAttention()([enc, enc]) # query, value
out  = Dense(n_outputs, activation='relu')(attn)
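# As far as I can tell, attn here is (batch, 5, 8) (self-attention over the
# full sequence), so out is (batch, 5, 2) -- 3-D, while y_train is (batch, 2).
# It fits without error, but I am not sure this is the intended usage.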

# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)

##-------------------------------------------------------------------
## Stacked 1a (Error)
# InvalidArgumentError: Dimension must be equal, but are 4 and 8
enc = GRU(8, return_sequences=True)(inputs)
dec = GRU(4, return_sequences=False)(enc)

# Attention
attn = AdditiveAttention()([dec, enc]) # query, value
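# My reading of the error: dec is (batch, 4) but enc is (batch, 5, 8); the
# additive score tanh(query + key) adds them element-wise, so the last
# dimensions must match, and 4 != 8.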
out  = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)

##-------------------------------------------------------------------
## Stacked 1b (In response to Stacked 1a)
enc = GRU(8, return_sequences=True)(inputs)
dec = GRU(8, return_sequences=False)(enc)

# Attention
attn = AdditiveAttention()([dec, enc]) # query, value
out  = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)
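
##-------------------------------------------------------------------
## Stacked 1b, a variant I have been considering (my own assumption, not
## from the docs): make the query explicitly a length-1 sequence instead
## of passing a 2-D tensor.
from keras.layers import Flatten  # extra import for this sketch

enc = GRU(8, return_sequences=True)(inputs)
dec = GRU(8, return_sequences=False)(enc)

# Attention
query = RepeatVector(1)(dec)               # (batch, 1, 8)
attn  = AdditiveAttention()([query, enc])  # query, value -> (batch, 1, 8)
attn  = Flatten()(attn)                    # (batch, 8), so predictions are 2-D like y_train
out   = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)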


##-------------------------------------------------------------------
## Stacked 2a
enc = GRU(16, return_sequences=True)(inputs)
enc = GRU(8, return_sequences=True)(enc)
dec = GRU(8, return_sequences=False)(enc)

# Attention
attn = AdditiveAttention()([dec, enc]) # query, value
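# Runs without error: dec and enc both end in 8 units, as in Stacked 1b.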
out  = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)

##-------------------------------------------------------------------
## Stacked 3a (dimension error again)
enc = GRU(16, return_sequences=True)(inputs)
enc = GRU(8, return_sequences=True)(enc)
dec = GRU(8, return_sequences=True)(enc)
dec = GRU(16, return_sequences=False)(dec)

# Attention
attn = AdditiveAttention()([dec, enc]) # query, value
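# Same last-dimension mismatch as Stacked 1a, now 16 (dec) vs 8 (enc).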
out  = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)


##-------------------------------------------------------------------
## Stacked 3b
gru      = GRU(64, return_sequences=True)(inputs)
gru      = GRU(32, return_sequences=True)(gru)
gru      = GRU(16, return_sequences=True)(gru)
gru_last = GRU(8, return_sequences=False)(gru)

# Attention
attn = AdditiveAttention()([gru_last, ?]) # query, value
out  = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)
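
##-------------------------------------------------------------------
## Stacked 3b, my best guess at the missing inputs (an assumption, please
## correct me): use the last return_sequences=True output as value, and the
## final state, expanded and projected to the same width, as query.
query = RepeatVector(1)(gru_last)          # (batch, 1, 8)
query = Dense(16)(query)                   # match value's last dim -> (batch, 1, 16)
attn  = AdditiveAttention()([query, gru])  # query, value -> (batch, 1, 16)
attn  = Flatten()(attn)                    # (batch, 16)
out   = Dense(n_outputs, activation='relu')(attn)
# Compile
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, np.array(y_train), epochs=3, batch_size=n_batch, shuffle=False)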

