TL;DR: My network is trained on pairs, so instead of 10^6 samples there are effectively 10^12 samples (the number of samples squared). A dataset that large shouldn't overfit, yet mine overfits after only a few epochs. I can't find the cause. Any help is appreciated, thanks.
I'm trying to implement a deep-learning chess model following the paper "DeepChess: End-to-End Deep Neural Network for Automatic Learning in Chess" (https://www.cs.tau.ac.il/~wolf/papers/deepchess.pdf).
It uses a siamese neural network to compare two chess positions and predict which one is better, and it uses autoencoders for feature extraction. I implemented everything in Keras with TensorFlow as the backend. My problem is that the siamese model overfits: after a few epochs it reaches about 87% accuracy (the paper reports up to 97%), and then the validation loss starts to rise while the training loss and accuracy keep improving.
The paper states: "since the number of potential training pairs is 6.5 × 10^12, virtually all training samples in each epoch are new, thus guaranteeing that no overfitting would occur."
So in theory it shouldn't overfit, and no regularization should be needed.
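To sanity-check that reasoning, here is a back-of-the-envelope calculation (my own sketch; the 10^6-positions-per-side and 10^6-pairs-per-epoch figures match my setup below, and uniform random pairing is my assumption):

# Rough check: with ~1e6 positions per side and ~1e6 uniformly sampled
# (win, loss) pairs per epoch, how often do pairs repeat across epochs?
n_white = n_black = 10**6
total_pairs = n_white * n_black        # ~1e12 possible (win, loss) pairs
pairs_per_epoch = 10**6
# Expected fraction of one epoch's pairs already seen in another epoch:
repeat_fraction = pairs_per_epoch / total_pairs
print(repeat_fraction)                 # 1e-06, so epochs are essentially disjoint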
Does anyone have an idea how to deal with this?
This is my first deep-learning project, so I don't really know what to do. Maybe someone can point me in the right direction, or tell me where to look for the problem (my guess is a bug in the siamese model or in the data generator, but everything looks fine to me).
Thanks in advance!
I've included the project's code here:
If you want to build and try it yourself, you'll need to download a chess PGN game database (http://ccrl.chessdom.com/ccrl/4040/games.html).
Generating the data:
dataPath = "ChessDataBase.pgn"
num_white_moves = 1000000
num_black_moves = 1000000
num_white_moves_per_arr = 100000
num_black_moves_per_arr = 100000
def get_valid_moves(game):
valid_moves = []
for i, move in enumerate(game.mainline_moves()):
if not game.board().is_capture(move) and i >= 5:
# Append the move index to the valid_moves list
valid_moves.append(i)
return valid_moves
# Get bit representation of chess board
def get_bitboard(board):
bitboard = np.zeros(2 * 6 * 64 + 5, dtype='float32')
piece_indices = {
'p': 0,
'n': 1,
'b': 2,
'r': 3,
'q': 4,
'k': 5}
for i in range(64):
if board.piece_at(i):
color = int(board.piece_at(i).color)
bitboard[(6 * color + piece_indices[board.piece_at(i).symbol().lower()] + 12 * i)] = 1
bitboard[-1] = int(board.turn)
bitboard[-2] = int(board.has_kingside_castling_rights(True))
bitboard[-3] = int(board.has_kingside_castling_rights(False))
bitboard[-4] = int(board.has_queenside_castling_rights(True))
bitboard[-5] = int(board.has_queenside_castling_rights(False))
return bitboard
# Adds 10 moves from game to move_array at location move_index
def add_moves(game, move_array, move_index):
valid_moves = get_valid_moves(game)
moves_count = 0
selected_moves = []
for i in range(8):
if not valid_moves:
break
move = random.choice(valid_moves)
valid_moves.remove(move)
selected_moves.append(move)
moves_count = moves_count + 1
board = chess.Board()
for i, move in enumerate(game.mainline_moves()):
board.push(move)
if move_index >= move_array.shape[0]:
break
if i in selected_moves:
move_array[move_index] = get_bitboard(board)
move_index += 1
return move_index, moves_count
def iterate_over_data():
white_moves = np.zeros((num_white_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
black_moves = np.zeros((num_black_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
# _white and black move counts store how many white and black moves have been stored
white_move_index = 0
black_move_index = 0
black_moves_count = 0
white_moves_count = 0
count = 0
white_count = 1
black_count = 1
white_empty = True
black_empty = True
pgn = open(dataPath)
while True:
# Debug printing
if count % 1000 == 0:
print("Game Number: {count}\twhite moves: {white_moves}\tblack moves: {black_moves}".format(
count=count,
black_moves=black_moves_count,
white_moves=white_moves_count))
game = chess.pgn.read_game(pgn)
if not game or white_moves_count >= num_white_moves and black_moves_count >= num_black_moves:
break
if game.headers["Result"] == "1-0" and white_moves_count < num_white_moves:
white_move_index, moves_count = add_moves(game, white_moves, white_move_index % num_white_moves_per_arr)
white_moves_count = white_moves_count + moves_count
if game.headers["Result"] == "0-1" and black_moves_count < num_black_moves:
black_move_index, moves_count = add_moves(game, black_moves, black_move_index % num_black_moves_per_arr)
black_moves_count = black_moves_count + moves_count
if white_moves_count > num_white_moves_per_arr:
print(len(white_moves))
w_str = str(white_count)
print("Saving white" + w_str + " array")
np.save('data4/white' + w_str + '.npy', white_moves[:num_white_moves_per_arr])
white_count = white_count + 1
white_moves = np.zeros((num_white_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
white_move_index = 0
white_moves_count = 0
if black_moves_count > num_black_moves_per_arr:
b_str = str(black_count)
print("Saving black" + b_str + " array")
np.save('data4/black' + b_str + '.npy', black_moves[:num_black_moves_per_arr])
black_count = black_count + 1
black_moves = np.zeros((num_black_moves_per_arr, 2 * 6 * 64 + 5), dtype='float32')
black_moves_count = 0
black_move_index = 0
count += 1
iterate_over_data()
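As a quick sanity check for the encoding above (my own test, not from the paper): the starting position should set exactly 32 piece bits plus the five extra bits (side to move and four castling rights), i.e. 37 ones in total. It assumes get_bitboard from the script above is in scope:

import chess

bb = get_bitboard(chess.Board())
assert bb.shape == (2 * 6 * 64 + 5,)   # 773 features
assert int(bb.sum()) == 32 + 5         # 32 pieces + turn bit + 4 castling bits
print("get_bitboard sanity check passed")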
Splitting the data into training and validation set files:
import numpy as np

num_of_positions = 2800000
num_of_positions_per_file = 100000
num_of_train_positions = num_of_positions - 100000

whites = np.zeros((num_of_positions, 773), dtype='float32')
blacks = np.zeros((num_of_positions, 773), dtype='float32')
for i in range(num_of_positions // num_of_positions_per_file):
    print(i + 1)
    start = i * num_of_positions_per_file
    whites[start:start + num_of_positions_per_file] = np.load('./data4/white' + str(i + 1) + '.npy')
    blacks[start:start + num_of_positions_per_file] = np.load('./data4/black' + str(i + 1) + '.npy')

print("Shuffling white positions")
np.random.shuffle(whites)
print("Shuffling black positions")
np.random.shuffle(blacks)

train_whites = whites[:num_of_train_positions]
train_blacks = blacks[:num_of_train_positions]
val_whites = whites[num_of_train_positions:]
val_blacks = blacks[num_of_train_positions:]

for i in range(num_of_train_positions // num_of_positions_per_file):
    start = i * num_of_positions_per_file
    np.save('./data5/white_train' + str(i + 1) + '.npy', train_whites[start:start + num_of_positions_per_file])
    np.save('./data5/black_train' + str(i + 1) + '.npy', train_blacks[start:start + num_of_positions_per_file])
    if i < (num_of_positions - num_of_train_positions) // num_of_positions_per_file:
        np.save('./data5/white_val' + str(i + 1) + '.npy', val_whites[start:start + num_of_positions_per_file])
        np.save('./data5/black_val' + str(i + 1) + '.npy', val_blacks[start:start + num_of_positions_per_file])
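One check worth running on this split (my addition; note that exact duplicates can legitimately occur when the same position appears in several games, so a small count here is not automatically a bug). It assumes the arrays from the script above are still in memory:

# Count validation positions that also occur in the training split
train_rows = {row.tobytes() for row in train_whites}
overlap = sum(row.tobytes() in train_rows for row in val_whites)
print("white val positions also present in train:", overlap)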
Training the autoencoder:
import gc

import keras
import numpy as np
from keras.layers import Dense, Input
from keras.models import Model

from util import DenseTied


class AutoEncoder:
    def __init__(self):
        self.positions = []
        self.positions_val = []
        self.model = None
        self.encoder = None
        self.decoder = None

    def __encoder(self):
        input_layer = Input(shape=(773,))
        hidden_1 = Dense(600, activation='relu')(input_layer)
        hidden_2 = Dense(400, activation='relu')(hidden_1)
        hidden_3 = Dense(200, activation='relu')(hidden_2)
        code = Dense(100, activation='relu')(hidden_3)
        encoder = Model(input_layer, code, name='encoder')
        encoder.summary()
        self.encoder = encoder
        return encoder

    def __decoder(self):
        # Each decoder layer shares (the transpose of) the weights of the
        # corresponding encoder layer
        code_input = Input(shape=(100,))
        hidden_1 = DenseTied(200, activation='relu', tied_to=self.encoder.layers[4])(code_input)
        hidden_2 = DenseTied(400, activation='relu', tied_to=self.encoder.layers[3])(hidden_1)
        hidden_3 = DenseTied(600, activation='relu', tied_to=self.encoder.layers[2])(hidden_2)
        output_layer = DenseTied(773, activation='sigmoid', tied_to=self.encoder.layers[1])(hidden_3)
        decoder = Model(code_input, output_layer, name='decoder')
        decoder.summary()
        self.decoder = decoder
        return decoder

    def encoder_decoder(self, load=0):
        input_layer = Input(shape=(773,))
        if load:
            self.encoder = keras.models.load_model('./Pos2Vec/encoder_v1/encoder_epoch66')
        else:
            self.__encoder()
        self.__decoder()
        ec_out = self.encoder(input_layer)
        dc_out = self.decoder(ec_out)
        autoencoder = Model(input_layer, dc_out, name='autoencoder')
        self.model = autoencoder
        self.model.summary()
        return autoencoder

    def train(self, batch_size=256, epochs=20):
        self.model.compile(optimizer='adam', loss='binary_crossentropy')
        self.load_data3()
        for epoch in range(epochs):
            self.shuffle_positions()
            gc.collect()
            train = self.positions[:2000000]
            self.model.fit(train, train,
                           validation_data=(self.positions_val, self.positions_val),
                           epochs=1, batch_size=batch_size)
            train = []
            gc.collect()
            print('Saving ./Pos2Vec/encoder_v1/encoder_epoch' + str(epoch + 1))
            self.encoder.save('./Pos2Vec/encoder_v1/encoder_epoch' + str(epoch + 1))

    def save(self):
        self.encoder.save('./weights3/encoder_v8.h5')

    def load_data3(self):
        positions = 2000000
        val_positions = 200000
        num_per_file = 100000
        self.positions = np.zeros((2 * positions, 773), dtype='float32')
        self.positions_val = np.zeros((2 * val_positions, 773), dtype='float32')
        for i in range(positions // num_per_file):
            print(i + 1)
            start = i * num_per_file
            self.positions[start:start + num_per_file] = np.load('./data3/white_train' + str(i + 1) + '.npy')
            self.positions[positions + start:positions + start + num_per_file] = np.load('./data3/black_train' + str(i + 1) + '.npy')
            if i < val_positions / num_per_file:
                self.positions_val[start:start + num_per_file] = np.load('./data3/white_val' + str(i + 1) + '.npy')
                self.positions_val[val_positions + start:val_positions + start + num_per_file] = np.load('./data3/black_val' + str(i + 1) + '.npy')

    def shuffle_positions(self):
        print("Shuffling positions")
        np.random.shuffle(self.positions)
        gc.collect()

    def predict(self, data):
        return self.encoder.predict(data)


if __name__ == '__main__':
    ae = AutoEncoder()
    ae.encoder_decoder(load=0)
    ae.train(batch_size=256, epochs=100)
    ae.save()
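To verify that the autoencoder is actually learning something before moving on, here is a minimal reconstruction check (my own sketch; it reuses the ae object and np import from the script above):

# Fraction of input bits the autoencoder reproduces after thresholding at 0.5
sample = ae.positions_val[:1000]
recon = ae.model.predict(sample)
bit_accuracy = np.mean((recon > 0.5) == (sample > 0.5))
print("bitwise reconstruction accuracy:", bit_accuracy)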
The DenseTied layer class:
import keras
from keras import backend as K


# A Dense layer whose kernel is tied to (the transpose of) another Dense
# layer's kernel, so the shared weights are only trained once
class DenseTied(keras.layers.Layer):
    def __init__(self, units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 tied_to=None,
                 **kwargs):
        self.tied_to = tied_to
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super().__init__(**kwargs)
        self.units = units
        self.activation = keras.activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
        self.bias_regularizer = keras.regularizers.get(bias_regularizer)
        self.activity_regularizer = keras.regularizers.get(activity_regularizer)
        self.kernel_constraint = keras.constraints.get(kernel_constraint)
        self.bias_constraint = keras.constraints.get(bias_constraint)
        self.input_spec = keras.layers.InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]
        if self.tied_to is not None:
            # Reuse the transposed kernel of the tied layer; it is not a
            # trainable weight of this layer
            self.kernel = K.transpose(self.tied_to.kernel)
            self._non_trainable_weights.append(self.kernel)
        else:
            self.kernel = self.add_weight(shape=(input_dim, self.units),
                                          initializer=self.kernel_initializer,
                                          name='kernel',
                                          regularizer=self.kernel_regularizer,
                                          constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def call(self, inputs, **kwargs):
        output = K.dot(inputs, self.kernel)
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output
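A minimal usage sketch for the layer (my own example with made-up sizes), just to show how tied_to is meant to be wired; the tied layer must already be built when DenseTied's build runs:

import keras
from keras.layers import Dense, Input
from keras.models import Model

inp = Input(shape=(8,))
enc = Dense(4, activation='relu')                            # layer that owns the kernel
code = enc(inp)                                              # calling enc builds enc.kernel
out = DenseTied(8, activation='sigmoid', tied_to=enc)(code)  # reuses enc.kernel transposed
model = Model(inp, out)
model.summary()  # the tied layer should report no trainable kernel of its own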
Training the siamese model:
import random

import keras
import matplotlib.pyplot as plt
import numpy as np
from keras.layers import Dense, Input, concatenate
from keras.models import Model
from keras.utils import Sequence


# Turns a batch of white-win and black-win positions into training pairs:
# each pair is randomly ordered, and the two-bit label encodes which side of
# the pair holds the winning (white) position
def batch_to_data(white_batch, black_batch):
    x1 = []
    x2 = []
    y = []
    min_len = min(len(white_batch), len(black_batch))
    for i in range(min_len):
        if random.randint(0, 1) == 1:
            x1.append(white_batch[i])
            x2.append(black_batch[i])
            y.append([1, 0])
        else:
            x1.append(black_batch[i])
            x2.append(white_batch[i])
            y.append([0, 1])
    x1 = np.array(x1).reshape((-1, 773))
    x2 = np.array(x2).reshape((-1, 773))
    y = np.array(y).reshape((-1, 2))
    return [x1, x2], y


# A class for feeding the models via keras fit_generator
class DeepChessDataGenerator(Sequence):
    def __init__(self, batch_size, whites, blacks, train=1):
        print("---Initializing data generator---")
        self.batch_size = batch_size
        self.train = train
        self.whites = whites
        self.blacks = blacks
        if train:
            # Only the first 1M of the shuffled positions are used per epoch
            self.num_of_positions = 1000000
        else:
            self.num_of_positions = len(self.whites)

    def __len__(self):
        return int(np.floor(self.num_of_positions / self.batch_size))

    def __getitem__(self, index):
        white_batch = self.whites[index * self.batch_size:(index + 1) * self.batch_size]
        black_batch = self.blacks[index * self.batch_size:(index + 1) * self.batch_size]
        return batch_to_data(white_batch, black_batch)

    # Shuffle the whites and blacks independently, so each epoch produces
    # new pairings
    def on_epoch_end(self):
        np.random.shuffle(self.whites)
        np.random.shuffle(self.blacks)


class DeepChess:
    def __init__(self):
        self.model = None    # DeepChess model
        self.encoder = None  # Encoder model, maps 773 bits (chess board) to 100 features

    # Builds the deep neural network chess model; if load == 1, loads a saved
    # model instead of creating a new one
    def neural_chess(self, load=0):
        input_size = 773
        layer1_size = 400
        layer2_size = 200
        layer3_size = 100
        if load == 1:
            model = keras.models.load_model('dc_models/dropout/deepchess-18-0.281-0.846.h5')
            encoder = None
        else:
            input_layer0 = Input(shape=(input_size,))
            layer1 = Dense(600, activation='relu')(input_layer0)
            layer2 = Dense(400, activation='relu')(layer1)
            layer3 = Dense(200, activation='relu')(layer2)
            layer4 = Dense(100, activation='relu')(layer3)
            encoder = Model(input_layer0, layer4)
            # The two inputs share the same encoder (siamese weights)
            input_layer1 = Input(shape=(input_size,))
            input_layer2 = Input(shape=(input_size,))
            e1 = encoder(input_layer1)
            e2 = encoder(input_layer2)
            combined = concatenate([e1, e2])
            layer1 = Dense(layer1_size, activation='relu')(combined)
            layer2 = Dense(layer2_size, activation='relu')(layer1)
            layer3 = Dense(layer3_size, activation='relu')(layer2)
            output_layer = Dense(2, activation='softmax')(layer3)
            model = Model(inputs=[input_layer1, input_layer2], outputs=output_layer)
        self.model = model
        self.encoder = encoder
        model.summary()
        return model

    # Trains the model for a number of epochs
    def fit(self, epochs=50, batch_size=256):
        num_of_positions = 2800000
        num_of_positions_per_file = 100000
        num_of_train_positions = num_of_positions - 100000
        num_of_val_positions = num_of_positions - num_of_train_positions
        train_whites = np.zeros((num_of_train_positions, 773), dtype='float32')
        train_blacks = np.zeros((num_of_train_positions, 773), dtype='float32')
        val_whites = np.zeros((num_of_val_positions, 773), dtype='float32')
        val_blacks = np.zeros((num_of_val_positions, 773), dtype='float32')
        for i in range(num_of_train_positions // num_of_positions_per_file):
            print('Loading data ' + str(i + 1) + '/27', end='\r')
            start = i * num_of_positions_per_file
            train_whites[start:start + num_of_positions_per_file] = np.load('./data5/white_train' + str(i + 1) + '.npy')
            train_blacks[start:start + num_of_positions_per_file] = np.load('./data5/black_train' + str(i + 1) + '.npy')
            if i < num_of_val_positions // num_of_positions_per_file:
                val_whites[start:start + num_of_positions_per_file] = np.load('./data5/white_val' + str(i + 1) + '.npy')
                val_blacks[start:start + num_of_positions_per_file] = np.load('./data5/black_val' + str(i + 1) + '.npy')
        print()
        # Data generators
        train_generator = DeepChessDataGenerator(batch_size, whites=train_whites, blacks=train_blacks, train=1)
        val_generator = DeepChessDataGenerator(batch_size, whites=val_whites, blacks=val_blacks, train=0)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
        self.model.summary()
        history = self.model.fit_generator(train_generator, validation_data=val_generator, epochs=epochs, shuffle=True)
        # Plot training & validation accuracy values
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()
        # Plot training & validation loss values
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')
        plt.show()


if __name__ == '__main__':
    dc = DeepChess()
    dc.neural_chess(load=0)
    dc.fit(epochs=20, batch_size=256)
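Finally, the kind of sanity check I would run on the generator (my addition, using random stand-in data; it assumes DeepChessDataGenerator and np from the script above), to confirm batch shapes and that the two label classes come out roughly balanced:

# Generator sanity check with random stand-in positions
fake_whites = np.random.rand(1000, 773).astype('float32')
fake_blacks = np.random.rand(1000, 773).astype('float32')
gen = DeepChessDataGenerator(batch_size=256, whites=fake_whites, blacks=fake_blacks, train=0)
(x1, x2), y = gen[0]
print(x1.shape, x2.shape, y.shape)       # (256, 773) (256, 773) (256, 2)
print("label balance:", y.mean(axis=0))  # both entries should be near 0.5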