I have an imbalanced dataset: the abnormal class accounts for only 5% of the samples. To deal with this, I gave the abnormal class extra weight during training. However, it did not change anything. Here is my code:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
import os
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from keras import optimizers
from sklearn.preprocessing import normalize, StandardScaler
from keras import regularizers
from sklearn.utils.class_weight import compute_sample_weight
def GenerateData(w, t, normal_size, abnormal_size):
    # w: window length
    # t: parameter of the abnormal pattern (t=0.6: separable, t=0.06: partially separable, t=0.006: inseparable)
    data1 = []
    data2 = []
    mu, sigma = 0, 1
    for i in range(normal_size):
        x = np.random.normal(mu, sigma, w)
        data1.append(x)
    for i in range(abnormal_size):
        y = np.random.normal(mu, sigma, w) + t * (np.arange(w) + 1)
        data2.append(y)
    data1 = np.array(data1)
    data2 = np.array(data2)
    data = np.concatenate((data1, data2), axis=0)
    labels = np.concatenate((np.ones(normal_size), np.zeros(abnormal_size)), axis=0)
    labels = labels.reshape(-1, 1)
    Final_Data = np.concatenate((data, labels), axis=1)
    return Final_Data
Final_Data=GenerateData(20,0.06,950,50)
df=pd.DataFrame(Final_Data)
df = df.sample(frac=1).reset_index(drop=True)
X=df.iloc[:,:-1]
y=df.iloc[:,-1]
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform( X_train )
X_test = scaler.transform( X_test )
class_weight = class_weight.compute_class_weight('balanced', np.unique(y[:,-1]),y[:,-1])
#sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
model = Sequential()
model.add(Dense(8, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(y_train.shape[1],activation='softmax'))
opt=optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-3, amsgrad=False)
model.compile(loss='categorical_crossentropy', optimizer=opt)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-8, patience=20, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=0, save_best_only=True)
history=model.fit(X_train, y_train,validation_data=(X_test, y_test),verbose=2,class_weight=class_weight,callbacks=[monitor,checkpointer],epochs=2000)#classes are weighted
#history=model.fit(X_train, y_train,validation_data=(X_test, y_test),verbose=2,sample_weight=sample_weight,callbacks=[monitor,checkpointer],epochs=2000)# samples are weighted
#history=model.fit(X_train, y_train,validation_data=(X_test, y_test),verbose=2,callbacks=[monitor,checkpointer],epochs=2000)# no weighting
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
model.load_weights('best_weights.hdf5') # load weights from best model
# Calculate accuracy
pred = model.predict(X_test)
pred = np.argmax(pred,axis=1)
y_compare = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_compare, pred)
print("Accuracy score: {}".format(score))
cnf_matrix = confusion_matrix(y_compare, pred)
Based on compute_class_weight, the class weights come out to 10 for the abnormal class and 0.52 for the normal class. Applying these weights (or not) makes no difference to the model's performance. I also tried giving the abnormal class a much larger weight (1e+6), but nothing changed; the model is unable to learn.
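For reference, a minimal sketch of how such a manual per-class weighting would be passed to fit as a dict (Keras accepts a mapping from class index to weight; with my labeling, class 0 is the abnormal class and class 1 the normal class). The values below are just placeholders for the computed weights and the extreme 1e+6 experiment mentioned above:

# Sketch: explicit per-class weights passed as a dict (0 = abnormal, 1 = normal).
manual_class_weight = {0: 10.0, 1: 0.52}   # or {0: 1e6, 1: 1.0} for the extreme case
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    verbose=2,
                    class_weight=manual_class_weight,
                    callbacks=[monitor, checkpointer],
                    epochs=2000)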
Instead of the class_weight approach, I also tried compute_sample_weight (the commented-out lines above), but again nothing changed.
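For completeness, a minimal sketch of that variant, assuming the one-hot training labels are converted back to integer form before computing the weights (the commented-out line above passes y_train directly):

# Sketch: one weight per training row instead of one weight per class.
sample_weight = compute_sample_weight(class_weight='balanced', y=np.argmax(y_train, axis=1))
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    verbose=2,
                    sample_weight=sample_weight,
                    callbacks=[monitor, checkpointer],
                    epochs=2000)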
So, what am I doing wrong, and why isn't the weighting strategy working in my case?