Tic-Tac-Toe Python test

#!/usr/bin/env python3
"""
Tic-Tac-Toe with simple reinforcement learning (Monte Carlo control)
- Runs in a terminal (works in Pydroid on Android)
- Can train by self-play, play human vs AI, or AI vs AI
- Learns state-action values and improves over time
- Save/Load the learned policy to/from JSON

How it learns (short version):
We record the sequence of (state, action, player) for a game. At the end:
    winner's actions get +1, loser's actions get -1, draws get 0
We update a value table Q[(state, player)][action] by incremental averaging.
Policy is epsilon-greedy w.r.t. Q, so it explores a bit while learning.
"""
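
# A quick worked example of the update used in episode() (illustrative numbers,
# not taken from a real run): suppose some (state, player, action) has been
# visited 3 times with a running mean Q = 0.2, and the new game's return for
# that player is G = 1.0. The visit count becomes n = 4, and the incremental
# average gives Q <- 0.2 + (1.0 - 0.2) / 4 = 0.4, i.e. the same result as
# averaging all four returns directly.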

import json
import os
import random
from collections import defaultdict
from typing import Dict, List, Tuple

# =========================
# Game mechanics
# =========================

EMPTY = " "
PLAYERS = ("X", "O")
WIN_LINES = [
    (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
    (0, 3, 6), (1, 4, 7), (2, 5, 8),  # cols
    (0, 4, 8), (2, 4, 6)              # diagonals
]

Board = List[str]
State = str  # 9-char string of 'X', 'O', or ' '
Action = int  # index 0..8
QKey = Tuple[State, str]  # (state, current_player)

def new_board() -> Board:
    return [EMPTY] * 9

def board_to_state(board: Board) -> State:
    return "".join(board)

def legal_actions(board: Board) -> List[Action]:
    return [i for i, c in enumerate(board) if c == EMPTY]

def check_winner(board: Board) -> str:
    for a, b, c in WIN_LINES:
        if board[a] != EMPTY and board[a] == board[b] == board[c]:
            return board[a]
    if EMPTY not in board:
        return "DRAW"
    return ""  # game continues

def print_board(board: Board) -> None:
    b = [c if c != EMPTY else str(i+1) for i, c in enumerate(board)]
    rows = [f" {b[0]} | {b[1]} | {b[2]}", f" {b[3]} | {b[4]} | {b[5]}", f" {b[6]} | {b[7]} | {b[8]}"]
    print("\n" + "\n-----------\n".join(rows) + "\n")

# =========================
# RL bits (First-Visit Monte Carlo control)
# =========================

class MonteCarloAgent:
    def __init__(self, epsilon: float = 0.1):
        self.epsilon = float(epsilon)
        # Q maps (state, player) -> vector of 9 action values
        self.Q: Dict[QKey, List[float]] = defaultdict(lambda: [0.0]*9)
        # N maps (state, player, action) -> visit count for incremental mean
        self.N: Dict[Tuple[State, str, Action], int] = defaultdict(int)

    def policy(self, board: Board, player: str) -> Action:
        acts = legal_actions(board)
        if not acts:
            return -1
        if random.random() < self.epsilon:
            return random.choice(acts)
        state = board_to_state(board)
        qvals = self.Q[(state, player)]
        # choose best legal; tie-break randomly among maxima
        best_val = max(qvals[a] for a in acts)
        best_actions = [a for a in acts if qvals[a] == best_val]
        return random.choice(best_actions)

    def episode(self, starting_player: str = "X", show: bool = False) -> str:
        """Play one game between two copies of this agent (self-play).
        Returns the result: 'X', 'O', or 'DRAW'."""
        board = new_board()
        current = starting_player
        # history of (state, player, action)
        trajectory: List[Tuple[State, str, Action]] = []

        while True:
            action = self.policy(board, current)
            if action == -1:
                # no legal moves (shouldn't happen)
                result = "DRAW"
                break
            # record state before taking action
            s = board_to_state(board)
            trajectory.append((s, current, action))
            # apply move
            board[action] = current
            outcome = check_winner(board)
            if show:
                print_board(board)
            if outcome == "X" or outcome == "O":
                result = outcome
                break
            if outcome == "DRAW":
                result = "DRAW"
                break
            # switch player
            current = "O" if current == "X" else "X"

        # Assign returns
        if result == "DRAW":
            G = {"X": 0.0, "O": 0.0}
        else:
            G = {"X": 1.0 if result == "X" else -1.0,
                 "O": 1.0 if result == "O" else -1.0}

        # First-visit MC update
        seen = set()
        for s, p, a in trajectory:
            key = (s, p, a)
            if key in seen:
                continue
            seen.add(key)
            qkey = (s, p)
            self.N[(s, p, a)] += 1
            n = self.N[(s, p, a)]
            old = self.Q[qkey][a]
            target = G[p]
            self.Q[qkey][a] = old + (target - old) / n
        return result

    def choose_move(self, board: Board, player: str, greedy: bool = True) -> Action:
        acts = legal_actions(board)
        if not acts:
            return -1
        state = board_to_state(board)
        qvals = self.Q[(state, player)]
        if greedy:
            best_val = max(qvals[a] for a in acts)
            best_actions = [a for a in acts if qvals[a] == best_val]
            return random.choice(best_actions)
        else:
            return self.policy(board, player)

    # ———- persistence ———-
    def save(self, path: str):
        data = {f"{k[0]}|{k[1]}": v for k, v in self.Q.items()}
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"epsilon": self.epsilon, "Q": data}, f)
        print(f"Saved model to {path} ({len(self.Q)} states)")

    def load(self, path: str):
        if not os.path.exists(path):
            print(f"No such file: {path}")
            return
        with open(path, "r", encoding="utf-8") as f:
            blob = json.load(f)
        self.epsilon = float(blob.get("epsilon", 0.1))
        rawQ = blob.get("Q", {})
        self.Q = defaultdict(lambda: [0.0]*9)
        for k, v in rawQ.items():
            state, player = k.split("|")
            vec = [float(x) for x in v]
            if len(vec) != 9:
                vec = (vec + [0.0]*9)[:9]
            self.Q[(state, player)] = vec
        print(f"Loaded model from {path} ({len(self.Q)} states), epsilon={self.epsilon}")
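
    # For reference, the JSON written by save() has this shape (the numbers here
    # are illustrative, not from a trained model):
    #   {"epsilon": 0.1,
    #    "Q": {"    X    |O": [0.0, 0.25, 0.0, ...], ...}}
    # Each key is the 9-character board state, a "|" separator, and the player to
    # move; each value is the list of 9 action values for that (state, player).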

# =========================
# Human interaction
# =========================

MODEL_PATH = "tictactoe_q.json"

def human_turn(board: Board, player: str) -> None:
    while True:
        try:
            move = input(f"Your move ({player}). Enter 1-9: ").strip()
            idx = int(move) - 1
            if idx < 0 or idx > 8:
                raise ValueError
            if board[idx] != EMPTY:
                print("That spot is taken. Try again.")
                continue
            board[idx] = player
            return
        except ValueError:
            print("Please type a number 1-9 corresponding to the board position.")

def ai_turn(board: Board, player: str, agent: MonteCarloAgent, greedy: bool = True) -> None:
    a = agent.choose_move(board, player, greedy=greedy)
    if a == -1:
        return
    board[a] = player

def play_human_vs_ai(agent: MonteCarloAgent):
    print("\n=== Human vs AI ===")
    while True:
        choice = input("Do you want to be X (first) or O (second)? [X/O]: ").strip().upper()
        if choice in ("X", "O"):
            human = choice
            ai = "O" if human == "X" else "X"
            break
        print("Please answer X or O.")

    greedy = input("Should the AI play greedily (y) or allow exploration (n)? [y/n]: ").strip().lower() != 'n'

    board = new_board()
    current = "X"
    print_board(board)

    while True:
        if current == human:
            human_turn(board, human)
        else:
            ai_turn(board, ai, agent, greedy=greedy)
            print("AI moves:")
        print_board(board)
        outcome = check_winner(board)
        if outcome in ("X", "O"):
            print(f"{outcome} wins!" + (" (You win!)" if outcome == human else " (AI wins!)"))
            break
        if outcome == "DRAW":
            print("It's a draw.")
            break
        current = "O" if current == "X" else "X"

def watch_ai_vs_ai(agent: MonteCarloAgent, games: int = 1, greedy: bool = True):
    print("\n=== AI vs AI ===")
    for g in range(1, games+1):
        print(f"Game {g}:")
        result = agent.episode(starting_player="X", show=True)
        if result == "DRAW":
            print("Result: draw\n")
        else:
            print(f"Result: {result} wins\n")

def train_self_play(agent: MonteCarloAgent, episodes: int = 10000, report_every: int = 1000):
    print(f"Training for {episodes} self-play games (epsilon={agent.epsilon})...")
    w = {"X": 0, "O": 0, "DRAW": 0}
    for i in range(1, episodes+1):
        # alternate starting player to avoid bias
        starter = "X" if i % 2 else "O"
        result = agent.episode(starting_player=starter, show=False)
        w[result] += 1
        if report_every and i % report_every == 0:
            total = i
            wx, wo, d = w["X"], w["O"], w["DRAW"]
            print(f"  After {total:6d}: X={wx} O={wo} Draw={d} | states learned={len(agent.Q)}")
    print("Done.")

# =========================
# Menu / main
# =========================

def main():
    agent = MonteCarloAgent(epsilon=0.1)
    if os.path.exists(MODEL_PATH):
        try:
            agent.load(MODEL_PATH)
        except Exception as e:
            print(f"(Could not auto-load model: {e})")

    while True:
        print("""
==============================
Tic-Tac-Toe RL (console)
==============================
1) Train by self-play
2) Play Human vs AI
3) Watch AI vs AI
4) Set exploration epsilon
5) Save model
6) Load model
7) Reset model
0) Exit
""")
        choice = input("Select an option: ").strip()

        if choice == "1":
            try:
                n = int(input("How many self-play games? (e.g., 5000): ").strip())
            except ValueError:
                print("Please enter a number.")
                continue
            report = max(1000, n // 10)
            train_self_play(agent, episodes=n, report_every=report)
        elif choice == "2":
            play_human_vs_ai(agent)
        elif choice == "3":
            try:
                g = int(input("How many games to show? ").strip())
            except ValueError:
                g = 1
            greedy = input("Play greedily (y) or with exploration (n)? [y/n]: ").strip().lower() != 'n'
            watch_ai_vs_ai(agent, games=g, greedy=greedy)
        elif choice == "4":
            try:
                e = float(input("Set epsilon (0 = greedy, 0.1 default, 1 = random): ").strip())
                e = min(max(e, 0.0), 1.0)
                agent.epsilon = e
                print(f"epsilon set to {agent.epsilon}")
            except ValueError:
                print("Please enter a number (e.g., 0.1)")
        elif choice == "5":
            path = input(f"Save path [{MODEL_PATH}]: ").strip() or MODEL_PATH
            agent.save(path)
        elif choice == "6":
            path = input(f"Load path [{MODEL_PATH}]: ").strip() or MODEL_PATH
            agent.load(path)
        elif choice == "7":
            agent = MonteCarloAgent(epsilon=agent.epsilon)
            print("Model reset. (Learning starts fresh.)")
        elif choice == "0":
            print("Goodbye!")
            break
        else:
            print("Unknown option. Please choose from the menu.")

if __name__ == "__main__":
    main()
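
If you want to skip the menu, the pieces can also be used directly. A minimal sketch (the module name tictactoe_rl is only an assumption for whatever filename the script is saved under):

# Hypothetical standalone usage, bypassing the menu loop.
from tictactoe_rl import MonteCarloAgent, train_self_play

agent = MonteCarloAgent(epsilon=0.2)        # explore a bit while training
train_self_play(agent, episodes=20000, report_every=2000)
agent.epsilon = 0.0                         # act greedily once trained
agent.save("tictactoe_q.json")              # same file format that load() reads back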

Day 20654

Afternoon carried me and my companions into Evie’s Bakery, quiet hum of ovens and glass cases shining with sugar. Amanda, smiling and steady, steered us through the tasting like she’d been waiting just for us.

Better Than Sex cake first. Devil’s food rich and shadow dark, chocolate chips hidden in the crumb. Whipped white frosting smoothed over, tangled with chocolate and caramel. Each forkful heavy with sweetness, melt and bite together, little bursts of joy.

Then sunshine cake. Yellow crumb, tender as a cloud, holding its weight under a plate of milk chocolate icing. The balance of bright and sweet, golden light under a darker crown, lifted everything a little higher.

We lingered, fork tapping the empty plate, warm air rising from the ovens downstairs. We put in an order for both full cakes for a birthday party later this month.

When we stepped back outside, the sidewalk carried its own invitation. Hopscotch chalked in pastel squares, the word dance scrawled in the final box. Sunlight caught it just so, and for a moment it felt like the cakes had followed us out, turning the street into something just as sweet.

#EviesBakery #CakeTasting #BetterThanSexCake #SunshineCake #SweetLife #ChalkHopscotch #DanceAtTheEnd