感谢阅读!
我有一个自动编码器,我试图将其用于异常检测。我有 2 个日志文件,logfile.log 和 testfile.log。它们本质上是相同的日志文件,我只是将它们拆分用于训练和测试目的。日志文件由数百万行组成,其中每一行是一个日志条目或观察。
我已经对这些文件进行了一些预处理,使它们成为 JSON 字典,其中每一行都是代表观察的字典,每个键代表字典中的一个值。它们是嵌套的,但在我的代码中,为了简单起见,我将其展平。
我的目标是在 logfile.log 上训练自动编码器,然后逐行针对 testfile.log 运行模型并输出重建错误。根据重建错误的大小,我会标记它。(这部分还没有开始)
模型:
import keras
from keras.layers import Input, Dense
from keras.models import Model
import numpy as np
from tensorflow import set_random_seed
import os
import json
from flatten_json import flatten
import time
from sklearn.preprocessing import LabelEncoder
vartime=time.time()  # wall-clock start time; used by the progress prints inside AutoEncoder.__init__
log_file = './logfile.log'  # training input: one flattened-JSON log entry per line
def seedy(s):
    """Make runs reproducible by seeding both numpy's and TensorFlow's RNGs."""
    for seed_fn in (np.random.seed, set_random_seed):
        seed_fn(s)
decshape = 0  # NOTE(review): never read — _decoder assigns its own local decshape; appears safe to delete
class AutoEncoder:
    """Dense autoencoder over one-hot encoded log features.

    __init__ reads `log_file` (one JSON object per line), flattens each
    record, extracts 7 fields per row, label-encodes the categorical
    columns, and one-hot encodes the result into ``self.x`` of shape
    (rows, 7, num_classes).
    """

    def __init__(self, encoding_dim=31):
        self.encoding_dim = encoding_dim
        newdata = []
        with open(log_file, 'r') as file:
            counter = 0
            # NOTE: readlines(40000000) stops after ~40MB of input;
            # any remaining lines in the file are silently ignored.
            text = file.readlines(40000000)
            for line in text:
                counter = counter + 1
                record = flatten(json.loads(line))
                if counter % 50000 == 0:
                    print(counter)
                    print(time.time() - vartime)
                row = [
                    record.get("Key1", "None"),
                    record.get("Key2", "None"),
                    record.get("Key3", "None"),
                    # assumes Key4 is present and >= 12 chars — on the "None"
                    # default this slices to '' and int('') raises ValueError;
                    # TODO confirm upstream guarantees Key4
                    int(record.get("Key4", "None")[11:12]),
                    # NOTE(review): the companion test script reads
                    # Key5/Key6/Key7 for these positions — the lowercase
                    # key4/key5/key6 here look inconsistent; verify against
                    # the real log schema before relying on test-time errors.
                    record.get("key4", "None"),
                    record.get("key5", "None"),
                    record.get("key6", "None"),
                ]
                newdata.append(row)
        self.x = np.array(newdata)
        # Label-encode every categorical column (column 3 is already numeric).
        for col in (0, 1, 2, 4, 5, 6):
            self.x[:, col] = LabelEncoder().fit_transform(self.x[:, col])
        # One-hot encode the whole matrix: shape becomes (rows, 7, num_classes).
        # NOTE(review): the test script re-fits its own encoders, so encodings
        # (and num_classes) can differ between train and test — persist these
        # encoders (e.g. pickle) and reuse them at test time.
        self.x = keras.utils.to_categorical(self.x, dtype='float32')

    def _encoder(self):
        """Build the encoder model: (7, num_classes) -> encoding_dim."""
        inputs = Input(shape=(self.x[0].shape))
        encoded = Dense(self.encoding_dim, activation='relu')(inputs)
        model = Model(inputs, encoded)
        self.encoder = model
        return model

    def _decoder(self):
        """Build the decoder model: encoding_dim -> num_classes."""
        n_classes = self.x.shape[2]
        inputs = Input(shape=(self.encoding_dim,))
        decoded = Dense(n_classes)(inputs)
        model = Model(inputs, decoded)
        self.decoder = model
        return model

    def encoder_decoder(self):
        """Compose encoder and decoder into the full autoencoder model."""
        ec = self._encoder()
        dc = self._decoder()
        inputs = Input(shape=self.x[0].shape)
        model = Model(inputs, dc(ec(inputs)))
        self.model = model
        return model

    def fit(self, batch_size=10000, epochs=300):
        """Train the autoencoder to reconstruct its own input (x -> x)."""
        self.model.compile(optimizer='adam', loss='MSE')
        log_dir = './log/'
        tbCallBack = keras.callbacks.TensorBoard(
            log_dir=log_dir, histogram_freq=0,
            write_graph=True, write_images=True)
        self.model.fit(self.x, self.x,
                       epochs=epochs,
                       batch_size=batch_size,
                       callbacks=[tbCallBack])

    def save(self):
        """Save encoder, decoder, and full model under ./weights.

        Bug fix: the original placed the save calls in the ``else`` branch,
        so on a first run the directory was created but no weights were
        ever written. Now the directory is created if missing and the
        models are always saved.
        """
        if not os.path.exists(r'./weights'):
            os.mkdir(r'./weights')
        self.encoder.save(r'./weights/encoder_weights.h5')
        self.decoder.save(r'./weights/decoder_weights.h5')
        self.model.save(r'./weights/ae_weights.h5')
if __name__ == '__main__':
    # Fix RNG seeds so training runs are reproducible.
    seedy(2)
    # encoding_dim=2: compress each (7, num_classes) observation down to 2 values.
    ae = AutoEncoder(encoding_dim=2)
    ae.encoder_decoder()
    ae.fit(batch_size=10000, epochs=300)
    ae.save()
测试:
import keras
from keras.models import load_model
import numpy as np
import json
from flatten_json import flatten
from sklearn.preprocessing import LabelEncoder
import time
vartime = time.time()  # wall-clock start for the progress prints below

# Load the encoder/decoder halves saved by the training script.
encoder = load_model(r'./weights/encoder_weights.h5')
decoder = load_model(r'./weights/decoder_weights.h5')

log_file = "./testfile.log"
newdata = []
with open(log_file, 'r') as file:
    counter = 0
    # NOTE: readlines(40000000) stops after ~40MB; remaining lines are ignored.
    text = file.readlines(40000000)
    for line in text:
        counter = counter + 1
        record = flatten(json.loads(line))
        if counter % 50000 == 0:
            print(counter)
            print(time.time() - vartime)
        newdata.append([
            record.get("Key1", "None"),
            record.get("Key2", "None"),
            record.get("Key3", "None"),
            # assumes Key4 is present and >= 12 chars; int('') raises otherwise
            int(record.get("Key4", "None")[11:12]),
            record.get("Key5", "None"),
            record.get("Key6", "None"),
            record.get("Key7", "None"),
        ])

newdata = np.array(newdata)
# NOTE(review): re-fitting LabelEncoders on the test file can produce
# different integer codes (and a different one-hot width) than training —
# load and reuse the encoders fitted on the training data instead, or the
# model input shape may not match and reconstruction errors are meaningless.
for col in (0, 1, 2, 4, 5, 6):
    newdata[:, col] = LabelEncoder().fit_transform(newdata[:, col])
newdata = keras.utils.to_categorical(newdata, dtype='float32')

# Bug fix: newdata[0] has shape (7, num_classes) but Model.predict expects a
# batch axis, i.e. (batch, 7, num_classes) — this caused the reported
# "expected 3 dimensions, but got array with shape (7, 31)" error.
# Slicing keeps the leading batch axis: shape (1, 7, num_classes).
inputs = newdata[0:1]
x = encoder.predict(inputs)
y = decoder.predict(x)
print('Input: {}'.format(inputs))
print('Encoded: {}'.format(x))
print('Decoded: {}'.format(y))
运行测试部分时出错:
x = encoder.predict(inputs)
'with shape ' + str(data_shape))
ValueError:检查输入时出错:预期 input_8 有 3 个维度,但得到的数组形状为 (7, 31)
注意:当我执行 "inputs = newdata" 以使其成为 3 维时,我得到了同样的错误,但提示应该是 2 维数组。我甚至尝试在测试部分直接使用 logfile.log 作为测试文件,因为它实际上与训练时使用的文件完全相同。
我还把实际的键名改成了 Key1、Key2、Key3…… 等等,因为如果不改,代码看起来会非常混乱。我确定我正确读取了文件并正确编码了所有内容,这应该不是问题所在;但如果确实需要,我可以恢复使用原始键名。
任何帮助将不胜感激。让我知道是否需要更多信息,或者这是否是这个问题的错误位置。
谢谢!