#!/usr/bin/env python3
"""
Tic-Tac-Toe with simple reinforcement learning (Monte Carlo control)
- Runs in a terminal (works in Pydroid on Android)
- Can train by self-play, play human vs AI, or AI vs AI
- Learns state-action values and improves over time
- Save/Load the learned policy to/from JSON

How it learns (short version):
  We record the sequence of (state, action, player) for a game. At the end,
  the winner's actions get +1, the loser's actions get -1, and draws get 0.
  We update a value table Q[(state, player)][action] by incremental averaging.
  The policy is epsilon-greedy w.r.t. Q, so it explores a bit while learning.
"""
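# Worked example of the update described above (illustrative numbers, not
# executed): suppose X wins a game in which it played action 4 from the empty
# board. That (state, player, action) triple gets return G = +1. If the triple
# has now been visited n times and the old estimate was q, the new estimate is
# the running mean  q + (G - q) / n , so each entry converges toward the
# average return of that move under the current (slowly improving) policy.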
import json
import os
import random
from collections import defaultdict
from typing import Dict, List, Tuple
# =========================
# Game mechanics
# =========================
EMPTY = " "
PLAYERS = ("X", "O")
WIN_LINES = [
    (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
    (0, 3, 6), (1, 4, 7), (2, 5, 8),  # cols
    (0, 4, 8), (2, 4, 6),             # diagonals
]
Board = List[str]
State = str                 # 9-char string of 'X', 'O', or ' '
Action = int                # index 0..8
QKey = Tuple[State, str]    # (state, current_player)
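# Example encoding: the position
#   X | O |
#     | X |
#     |   | O
# is stored as the state string "XO  X   O", and the key ("XO  X   O", "X")
# indexes the 9-element value vector used when it is X's turn in that position.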
def new_board() -> Board:
    return [EMPTY] * 9

def board_to_state(board: Board) -> State:
    return "".join(board)

def legal_actions(board: Board) -> List[Action]:
    return [i for i, c in enumerate(board) if c == EMPTY]

def check_winner(board: Board) -> str:
    for a, b, c in WIN_LINES:
        if board[a] != EMPTY and board[a] == board[b] == board[c]:
            return board[a]
    if EMPTY not in board:
        return "DRAW"
    return ""  # game continues

def print_board(board: Board) -> None:
    b = [c if c != EMPTY else str(i + 1) for i, c in enumerate(board)]
    rows = [f" {b[0]} | {b[1]} | {b[2]}", f" {b[3]} | {b[4]} | {b[5]}", f" {b[6]} | {b[7]} | {b[8]}"]
    print("\n" + "\n-----------\n".join(rows) + "\n")
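# For an empty board, print_board shows the cell numbers a player can type:
#
#    1 | 2 | 3
#   -----------
#    4 | 5 | 6
#   -----------
#    7 | 8 | 9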
# =========================
# RL bits (First-Visit Monte Carlo control)
# =========================
class MonteCarloAgent:
    def __init__(self, epsilon: float = 0.1):
        self.epsilon = float(epsilon)
        # Q maps (state, player) -> vector of 9 action values
        self.Q: Dict[QKey, List[float]] = defaultdict(lambda: [0.0] * 9)
        # N maps (state, player, action) -> visit count for incremental mean
        self.N: Dict[Tuple[State, str, Action], int] = defaultdict(int)

    def policy(self, board: Board, player: str) -> Action:
        acts = legal_actions(board)
        if not acts:
            return -1
        if random.random() < self.epsilon:
            return random.choice(acts)
        state = board_to_state(board)
        qvals = self.Q[(state, player)]
        # choose best legal; tie-break randomly among maxima
        best_val = max(qvals[a] for a in acts)
        best_actions = [a for a in acts if qvals[a] == best_val]
        return random.choice(best_actions)
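
    # Note on exploration: with the default epsilon=0.1 the agent plays a
    # uniformly random legal move on roughly 10% of turns and the
    # highest-valued legal move otherwise; random tie-breaking among equal
    # maxima keeps untrained (all-zero) states from always choosing square 0.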
    def episode(self, starting_player: str = "X", show: bool = False) -> str:
        """Play one game between two copies of this agent (self-play).
        Returns the result: 'X', 'O', or 'DRAW'."""
        board = new_board()
        current = starting_player
        # history of (state, player, action)
        trajectory: List[Tuple[State, str, Action]] = []
        while True:
            action = self.policy(board, current)
            if action == -1:
                # no legal moves (shouldn't happen)
                result = "DRAW"
                break
            # record state before taking action
            s = board_to_state(board)
            trajectory.append((s, current, action))
            # apply move
            board[action] = current
            outcome = check_winner(board)
            if show:
                print_board(board)
            if outcome == "X" or outcome == "O":
                result = outcome
                break
            if outcome == "DRAW":
                result = "DRAW"
                break
            # switch player
            current = "O" if current == "X" else "X"
        # Assign returns
        if result == "DRAW":
            G = {"X": 0.0, "O": 0.0}
        else:
            G = {"X": 1.0 if result == "X" else -1.0,
                 "O": 1.0 if result == "O" else -1.0}
        # First-visit MC update
        seen = set()
        for s, p, a in trajectory:
            key = (s, p, a)
            if key in seen:
                continue
            seen.add(key)
            qkey = (s, p)
            self.N[(s, p, a)] += 1
            n = self.N[(s, p, a)]
            old = self.Q[qkey][a]
            target = G[p]
            self.Q[qkey][a] = old + (target - old) / n
        return result
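
    # Because both sides of each self-play game are driven by the same agent,
    # the shared Q table accumulates values for playing as X and as O; keying
    # Q by (state, player) keeps the two roles from overwriting each other.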
    def choose_move(self, board: Board, player: str, greedy: bool = True) -> Action:
        acts = legal_actions(board)
        if not acts:
            return -1
        state = board_to_state(board)
        qvals = self.Q[(state, player)]
        if greedy:
            best_val = max(qvals[a] for a in acts)
            best_actions = [a for a in acts if qvals[a] == best_val]
            return random.choice(best_actions)
        else:
            return self.policy(board, player)
    # ---------- persistence ----------
    def save(self, path: str):
        data = {f"{k[0]}|{k[1]}": v for k, v in self.Q.items()}
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"epsilon": self.epsilon, "Q": data}, f)
        print(f"Saved model to {path} ({len(self.Q)} states)")

    def load(self, path: str):
        if not os.path.exists(path):
            print(f"No such file: {path}")
            return
        with open(path, "r", encoding="utf-8") as f:
            blob = json.load(f)
        self.epsilon = float(blob.get("epsilon", 0.1))
        rawQ = blob.get("Q", {})
        self.Q = defaultdict(lambda: [0.0] * 9)
        for k, v in rawQ.items():
            state, player = k.split("|")
            vec = [float(x) for x in v]
            if len(vec) != 9:
                vec = (vec + [0.0] * 9)[:9]
            self.Q[(state, player)] = vec
        print(f"Loaded model from {path} ({len(self.Q)} states), epsilon={self.epsilon}")
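
# Quick usage sketch of the agent on its own (the interactive menu below is
# the normal entry point); note that only Q and epsilon are persisted, so the
# visit counts N restart from zero after a load:
#
#     agent = MonteCarloAgent(epsilon=0.1)
#     for _ in range(5000):
#         agent.episode()                  # one self-play training game
#     agent.save("tictactoe_q.json")
#     agent.choose_move(new_board(), "X")  # greedy move for X on an empty board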
# =========================
# Human interaction
# =========================
MODEL_PATH = "tictactoe_q.json"

def human_turn(board: Board, player: str) -> None:
    while True:
        try:
            move = input(f"Your move ({player}). Enter 1-9: ").strip()
            idx = int(move) - 1
            if idx < 0 or idx > 8:
                raise ValueError
            if board[idx] != EMPTY:
                print("That spot is taken. Try again.")
                continue
            board[idx] = player
            return
        except ValueError:
            print("Please type a number 1-9 corresponding to the board position.")

def ai_turn(board: Board, player: str, agent: MonteCarloAgent, greedy: bool = True) -> None:
    a = agent.choose_move(board, player, greedy=greedy)
    if a == -1:
        return
    board[a] = player
def play_human_vs_ai(agent: MonteCarloAgent):
    print("\n=== Human vs AI ===")
    while True:
        choice = input("Do you want to be X (first) or O (second)? [X/O]: ").strip().upper()
        if choice in ("X", "O"):
            human = choice
            ai = "O" if human == "X" else "X"
            break
        print("Please answer X or O.")
    greedy = input("Should the AI play greedily (y) or allow exploration (n)? [y/n]: ").strip().lower() != 'n'
    board = new_board()
    current = "X"
    print_board(board)
    while True:
        if current == human:
            human_turn(board, human)
        else:
            ai_turn(board, ai, agent, greedy=greedy)
            print("AI moves:")
            print_board(board)
        outcome = check_winner(board)
        if outcome in ("X", "O"):
            print(f"{outcome} wins!" + (" (You win!)" if outcome == human else " (AI wins!)"))
            break
        if outcome == "DRAW":
            print("It's a draw.")
            break
        current = "O" if current == "X" else "X"
def watch_ai_vs_ai(agent: MonteCarloAgent, games: int = 1, greedy: bool = True):
    print("\n=== AI vs AI ===")
    # episode() always follows the epsilon-greedy policy, so honour the
    # greedy flag by temporarily zeroing epsilon for the exhibition games
    old_eps = agent.epsilon
    if greedy:
        agent.epsilon = 0.0
    try:
        for g in range(1, games + 1):
            print(f"Game {g}:")
            result = agent.episode(starting_player="X", show=True)
            if result == "DRAW":
                print("Result: draw\n")
            else:
                print(f"Result: {result} wins\n")
    finally:
        agent.epsilon = old_eps
def train_self_play(agent: MonteCarloAgent, episodes: int = 10000, report_every: int = 1000):
    print(f"Training for {episodes} self-play games (epsilon={agent.epsilon})...")
    w = {"X": 0, "O": 0, "DRAW": 0}
    for i in range(1, episodes + 1):
        # alternate starting player to avoid bias
        starter = "X" if i % 2 else "O"
        result = agent.episode(starting_player=starter, show=False)
        w[result] += 1
        if report_every and i % report_every == 0:
            total = i
            wx, wo, d = w["X"], w["O"], w["DRAW"]
            print(f"  After {total:6d}: X={wx} O={wo} Draw={d} | states learned={len(agent.Q)}")
    print("Done.")
# =========================
# Menu / main
# =========================
def main():
    agent = MonteCarloAgent(epsilon=0.1)
    if os.path.exists(MODEL_PATH):
        try:
            agent.load(MODEL_PATH)
        except Exception as e:
            print(f"(Could not auto-load model: {e})")
    while True:
        print("""
==============================
  Tic-Tac-Toe RL (console)
==============================
1) Train by self-play
2) Play Human vs AI
3) Watch AI vs AI
4) Set exploration epsilon
5) Save model
6) Load model
7) Reset model
0) Exit
""")
        choice = input("Select an option: ").strip()
        if choice == "1":
            try:
                n = int(input("How many self-play games? (e.g., 5000): ").strip())
            except ValueError:
                print("Please enter a number.")
                continue
            report = max(1000, n // 10)
            train_self_play(agent, episodes=n, report_every=report)
        elif choice == "2":
            play_human_vs_ai(agent)
        elif choice == "3":
            try:
                g = int(input("How many games to show? ").strip())
            except ValueError:
                g = 1
            greedy = input("Play greedily (y) or with exploration (n)? [y/n]: ").strip().lower() != 'n'
            watch_ai_vs_ai(agent, games=g, greedy=greedy)
        elif choice == "4":
            try:
                e = float(input("Set epsilon (0 = greedy, 0.1 default, 1 = random): ").strip())
                e = min(max(e, 0.0), 1.0)
                agent.epsilon = e
                print(f"epsilon set to {agent.epsilon}")
            except ValueError:
                print("Please enter a number (e.g., 0.1)")
        elif choice == "5":
            path = input(f"Save path [{MODEL_PATH}]: ").strip() or MODEL_PATH
            agent.save(path)
        elif choice == "6":
            path = input(f"Load path [{MODEL_PATH}]: ").strip() or MODEL_PATH
            agent.load(path)
        elif choice == "7":
            agent = MonteCarloAgent(epsilon=agent.epsilon)
            print("Model reset. (Learning starts fresh.)")
        elif choice == "0":
            print("Goodbye!")
            break
        else:
            print("Unknown option. Please choose from the menu.")

if __name__ == "__main__":
    main()