我的 Q 学习算法目前选择“次优”选项而不是最佳选项。
from pprint import pprint
from random import shuffle, choice, random
from operator import itemgetter
epsilon = .03 # Exploration
gamma = .9 # Learning rate
epochs = 100000 # Number of matches vs itself
states = {}
WON = 1
LOSE = 0
TIE = 0.5
COLS = 3
ROWS = 3
SIZE = COLS * ROWS
def buildBoard():
return [''] * SIZE
def prettyPrint(board):
if type(board) != list:
board = eval(board)
print('+-----+')
for i in range(SIZE):
print('|{}'.format(board[i] if board[i] != '' else ' '), end='')
if i in (2, 5, 8):
print('|')
print('+-----+')
def isFinished(board):
# +-----+
# |0|1|2|
# |3|4|5|
# |6|7|8|
# +-----+
for player in ('X', 'O'):
# Check rows
for i in range(0, SIZE, 3):
if board[i] == board[i + 1] == board[i + 2] == player:
return {'winner': player, 'finished': True}
# Check cols
for i in range(ROWS):
if board[i] == board[i + 3] == board[i + 6] == player:
return {'winner': player, 'finished': True}
# Check diagonals
if (board[0] == board[4] == board[8] or
board[2] == board[4] == board[6]) and board[4] != '':
return {'winner': board[4], 'finished': True}
# Is it a draw?
for i in range(SIZE):
if board[i] == '':
return {'winner': None, 'finished': False}
# Still running
return {'winner': None, 'finished': True}
def genStates(state, player='X'):
for i in range(SIZE):
tmp = state[:]
if tmp[i] == '':
tmp[i] = player
info = isFinished(tmp)
if not info['finished']:
reward = 0.5
genStates(tmp, 'O' if player == 'X' else 'X')
elif info["winner"] == 'X':
reward = WON
elif info["winner"] == 'O':
reward = LOSE
else:
reward = TIE
states[str(tmp)] = reward
def getAvalaibleStates(state, player):
availables = []
for i in range(SIZE):
tmp = state[:]
if tmp[i] == '':
tmp[i] = player
availables.append((tmp, states[str(tmp)]))
shuffle(availables)
return availables
def nextState(state, player, eps=epsilon):
availables = getAvalaibleStates(state, player)
if eps > random():
return availables[0][0]
# X player choices the max value, O the min value
if player == 'X':
availables.sort(key=itemgetter(1), reverse=True)
else:
availables.sort(key=itemgetter(1))
return availables[0][0]
def updateState(state, nextState):
states[str(state)] = states[str(state)] + gamma * states[str(nextState)]
def play():
while True:
state = buildBoard()
who = input("X/O: ")
player = 'X'
prettyPrint(state)
while not isFinished(state)['finished']: # Until finish
if player == who:
state[int(input("[0-8]: "))] = who
else:
next = nextState(state, player)
state = next
player = 'O' if player == 'X' else 'X'
prettyPrint(state)
print(isFinished(state))
def train(epochs=epochs):
wins, loses, draw = 0, 0, 0
for _ in range(epochs):
state = buildBoard()
player = 'X'
while not isFinished(state)['finished']: # Until finish
next = nextState(state, player)
updateState(state, next)
state = next
if player == 'X':
player = 'O'
else:
player = 'X'
if isFinished(state)['winner'] == 'X':
wins += 1
elif isFinished(state)['winner'] == 'O':
loses += 1
else:
draw += 1
if _ % 10000 == 0:
print("Epoch n°", _)
print("wins: {} - loses: {} - draw: {}\n".format(wins, loses, draw))
if wins == 0 and _ > 0:
break
wins, loses, draw = 0, 0, 0
state = buildBoard()
states[str(state)] = TIE
genStates(state)
train()
play()
我对 epsilon 或学习率有太大的价值吗?还是我需要一个不同的公式来更新 Q 状态?还是我以不好的方式分享状态?
谢谢