AlphaZero/alpha_zero.py at main · xtreamsrl/AlphaZero · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
from tqdm.notebook import trange
from path_vars import model_path, optimizer_path
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from global_vars import device
from MCTS import AlphaZeroMCTS, AlphaZeroParallelMCTS
import torch.nn.functional as F
import numpy as np
import torch
from self_play_game import SPG

class AlphaZero:
    """
    This class implements the AlphaZero training loop. It uses a neural network model,
    an optimizer, and an AlphaZeroMCTS instance to:
      1) Generate self-play games (collect training data).
      2) Train the model on the collected data.
      3) Save and load models and optimizers.
    """

    def __init__(self, game, model, optimizer, args):
        """
        Initialize the AlphaZero class with the given game, model, optimizer, and arguments.

        :param game: An instance of a Game class (e.g., TicTacToe, ConnectFour).
        :param model: A PyTorch neural network module that outputs policy and value.
        :param optimizer: A PyTorch optimizer (e.g., Adam).
        :param args: A dictionary containing configuration parameters.
                     Possible keys:
                       - num_iterations: how many outer training loops
                       - num_self_play_games: how many self-play games per iteration
                       - num_epochs: training epochs per iteration
                       - batch_size: batch size for training
                       - temperature: softmax temperature for action selection
                       - show_iterations, print_loss: boolean flags for output control
        """
        self.game = game
        self.model = model
        self.optimizer = optimizer
        self.args = args

        # Create an AlphaZero MCTS object using the provided game and model
        self.mcts = AlphaZeroMCTS(game=game, model=model, args=args)

        # Move the model to the specified device (CPU, CUDA, or MPS)
        self.model.to(device)

        # Keep track of training losses for analysis and visualization
        self.losses = []

    def self_play(self):
        """
        Perform a single self-play game using the current model (via MCTS).
        Collect the (state, action_probs, value) triplets (memory) for training.

        :return: A list of tuples (encoded_state, action_probs, outcome),
                 where outcome is from the perspective of the player in each state.
        """
        # We'll store state transitions in memory
        memory = []

        # Start with player=1 (X in TicTacToe or first mover in ConnectFour, etc.)
        player = 1
        state = self.game.get_initial_state()

        while True:
            # Convert the current board state into the perspective of the current player
            neutral_state = self.game.change_perspective(state, player)

            # Use MCTS to get a probability distribution over actions
            action_probs = self.mcts.search(neutral_state)

            # Store the data (state from the player's perspective, action_probs, player)
            memory.append((neutral_state, action_probs, player))

            # Apply a temperature to soften or sharpen the policy distribution
            temperature_action_probs = action_probs ** (1 / self.args.get("temperature", 0.1))
            temperature_action_probs /= np.sum(temperature_action_probs)

            # Sample an action from the temperature-adjusted distribution
            action = np.random.choice(self.game.action_size, p=temperature_action_probs)

            # Execute the chosen action on the board
            state = self.game.get_next_state(state, action, player)

            # Check if the game is over (win/loss/draw)
            value, is_terminal = self.game.get_value_and_terminated(state, action)
            if is_terminal:
                # For every state in this game's history, assign a final outcome
                # from that player's perspective
                ret_memory = []
                for hist_neutral_state, hist_action_probs, hist_player in memory:
                    # If hist_player == the winner, outcome = value; otherwise, opponent's value
                    hist_outcome = value if hist_player == player else self.game.get_opponent_value(value)
                    # Encode the state for training
                    encoded_state = self.game.encode_state(hist_neutral_state)
                    ret_memory.append((encoded_state, hist_action_probs, hist_outcome))
                return ret_memory

            # Switch to the other player
            player = self.game.get_opponent(player)

    def train(self, memory):
        """
        Train the model on a dataset of (state, action_probs, values)
        using a standard supervised approach with MSE + cross-entropy.

        :param memory: A list/tuple of samples from self-play:
                       (encoded_state, action_probs, outcome_value).
        """
        # Create a DataLoader to batch and shuffle the training data
        dataloader = DataLoader(
            memory,
            batch_size=self.args.get("batch_size", 32),
            shuffle=True
        )

        # Iterate over the batches of data
        for batch in dataloader:
            states, action_probs, values = batch

            # Move data to the chosen device (CPU, CUDA, MPS)
            states, action_probs, values = self.move_to_device(states, action_probs, values)

            # Reset gradients
            self.optimizer.zero_grad()

            # Forward pass through the model
            policy, value = self.model(states)

            # value shape: [batch_size, 1] -> squeeze to [batch_size]
            value = value.squeeze(1)

            # Compute loss: MSE for values + cross-entropy for policy
            loss = F.mse_loss(value, values) + F.cross_entropy(policy, action_probs)

            # Backprop and update parameters
            loss.backward()
            self.optimizer.step()

            # Keep track of the training loss
            self.losses.append(loss.item())

    def move_to_device(self, states, action_probs, values):
        """
        Move the given tensors to the configured device (CPU/GPU/MPS).

        :param states: Tensor of shape (batch_size, channels, height, width).
        :param action_probs: Tensor of shape (batch_size, action_size).
        :param values: Tensor of shape (batch_size).
        :return: The three tensors on the correct device.
        """
        if device.type == "cuda":
            states = states.to(device)
            action_probs = action_probs.to(device)
            values = values.to(device)
        if device.type == "cpu":
            states = states.float().to(device)
            action_probs = action_probs.float().to(device)
            values = values.float().to(device)
        if device.type == "mps":
            states = states.float().to(device)
            action_probs = action_probs.float().to(device)
            values = values.float().to(device)
        return states, action_probs, values

    def save_model(self, iteration):
        """
        Save the model parameters to a file, using iteration count in the filename.

        :param iteration: Integer specifying the current iteration (epoch) count.
        """
        torch.save(self.model.state_dict(), model_path / f"model_{iteration}_{self.game}.pth")

    def save_optimizer(self, iteration):
        """
        Save the optimizer state to a file, using iteration count in the filename.

        :param iteration: Integer specifying the current iteration (epoch) count.
        """
        torch.save(self.optimizer.state_dict(), optimizer_path / f"optimizer_{iteration}_{self.game}.pth")

    def load_model(self, iteration):
        """
        Load the model parameters from a file.

        :param iteration: Integer specifying which iteration file to load.
        """
        self.model.load_state_dict(torch.load(model_path / f"model_{iteration}_{self.game}.pth"))

    def load_optimizer(self, iteration):
        """
        Load the optimizer state from a file.

        :param iteration: Integer specifying which iteration file to load.
        """
        self.optimizer.load_state_dict(torch.load(optimizer_path / f"optimizer_{iteration}_{self.game}.pth"))

    def learn(self):
        """
        The main training loop that:
          1) Generates self-play data.
          2) Trains the model on that data.
          3) Repeats for num_iterations.

        Saves the model/optimizer periodically if specified.
        """
        # Run for the specified number of iterations
        for iteration in trange(self.args["num_iterations"], desc="Iteration"):
            memory = []

            # Switch model to evaluation mode for self-play
            self.model.eval()

            # Optionally display progress with tqdm
            if self.args.get("show_iterations", False):
                # Use tqdm to track self-play
                for _ in trange(self.args["num_self_play_games"], desc="Self Play"):
                    memory += self.self_play()

                # Switch to train mode
                self.model.train()

                # Train for num_epochs
                for _ in trange(self.args["num_epochs"], desc="Training"):
                    self.train(memory)
            else:
                # Without tqdm, just do the self-play games in a loop
                for _ in range(self.args["num_self_play_games"]):
                    memory += self.self_play()

                # Train mode
                self.model.train()
                for _ in range(self.args["num_epochs"]):
                    self.train(memory)

            # Save model and optimizer states
            self.save_model(iteration)
            self.save_optimizer(iteration)

        # Optionally plot the loss after training completes
        if self.args.get("print_loss", False):
            plt.plot(self.losses)
            plt.show()


class AlphaZeroParallel:
    """
    A parallelized version of AlphaZero self-play, allowing multiple games
    to be played simultaneously (in batches) for faster data collection.
    """

    def __init__(self, game, model, optimizer, args):
        """
        Initialize the AlphaZeroParallel class.

        :param game: An instance of a Game class (e.g., TicTacToe, ConnectFour).
        :param model: A PyTorch neural network module that outputs policy and value.
        :param optimizer: A PyTorch optimizer (e.g., Adam).
        :param args: A dictionary containing configuration parameters.
                     Notable entries:
                       - num_parallel_games: how many games to run in parallel
                       - num_iterations, num_self_play_games, num_epochs
                       - batch_size, temperature, etc.
        """
        self.game = game
        self.model = model
        self.optimizer = optimizer
        self.args = args

        # Use a parallel MCTS variant that can handle multiple states at once
        self.mcts = AlphaZeroParallelMCTS(game=game, model=model, args=args)

        self.model.to(device)
        self.losses = []

    @torch.no_grad()
    def self_play(self):
        """
        Plays multiple games in parallel. Each game is represented by an SPG object
        storing its state and memory. Once a game finishes, its data is collected
        and returned.

        :return: A list of (encoded_state, action_probs, outcome) tuples for all finished games.
        """
        return_memory = []
        player = 1

        # Create multiple SPG (Self Play Game) objects
        spgs = [SPG(self.game) for _ in range(self.args["num_parallel_games"])]

        # Continue until all parallel games have ended
        while len(spgs) > 0:
            # Gather all states, then convert them to the current player's perspective
            states = np.stack([spg.state for spg in spgs])
            neutral_states = self.game.change_perspective(states, player)

            # Run parallel MCTS on all the current states at once
            self.mcts.search(neutral_states, spgs)

            # Process each SPG, remove finished games
            for i in range(len(spgs))[::-1]:
                spg = spgs[i]

                # Get action probabilities from the MCTS root's children
                action_probs = np.zeros(self.game.action_size)
                for child in spg.root.children:
                    action_probs[child.action_taken] = child.visit_count
                action_probs /= np.sum(action_probs)

                # Record the data in the SPG memory
                spg.memory.append((spg.root.state, action_probs, player))

                # Apply temperature to soften/sharpen the distribution
                temperature_action_probs = action_probs ** (1 / self.args.get("temperature", 0.1))
                temperature_action_probs /= np.sum(temperature_action_probs)

                # Sample an action
                action = np.random.choice(self.game.action_size, p=temperature_action_probs)

                # Update the state with the chosen action
                spg.state = self.game.get_next_state(spg.state, action, player)

                # Check if the game is now over
                value, is_terminal = self.game.get_value_and_terminated(spg.state, action)
                if is_terminal:
                    # Convert the entire SPG's memory to final training samples
                    for hist_neutral_state, hist_action_probs, hist_player in spg.memory:
                        hist_outcome = (
                            value if hist_player == player
                            else self.game.get_opponent_value(value)
                        )
                        encoded_state = self.game.encode_state(hist_neutral_state)
                        return_memory.append((encoded_state, hist_action_probs, hist_outcome))

                    # Remove this game from the active list
                    del spgs[i]

            # Switch to the other player
            player = self.game.get_opponent(player)

        return return_memory

    def train(self, memory):
        """
        Train the model using the collected memory samples.

        :param memory: A list of (encoded_state, action_probs, outcome_value) tuples.
        """
        dataloader = DataLoader(
            memory,
            batch_size=self.args.get("batch_size", 32),
            shuffle=True
        )

        for batch in dataloader:
            states, action_probs, values = batch
            states, action_probs, values = self.move_to_device(states, action_probs, values)

            self.optimizer.zero_grad()
            policy, value = self.model(states)
            value = value.squeeze(1)

            loss = F.mse_loss(value, values) + F.cross_entropy(policy, action_probs)
            loss.backward()
            self.optimizer.step()
            self.losses.append(loss.item())

    def move_to_device(self, states, action_probs, values):
        """
        Moves the given tensors to the configured device (CPU, CUDA, or MPS).

        :param states: Tensor of shape (batch_size, channels, height, width).
        :param action_probs: Tensor of shape (batch_size, action_size).
        :param values: Tensor of shape (batch_size,).
        :return: (states, action_probs, values) on the correct device.
        """
        if device.type == "cuda":
            states = states.to(device)
            action_probs = action_probs.to(device)
            values = values.to(device)
        if device.type == "cpu":
            states = states.float().to(device)
            action_probs = action_probs.float().to(device)
            values = values.float().to(device)
        if device.type == "mps":
            states = states.float().to(device)
            action_probs = action_probs.float().to(device)
            values = values.float().to(device)
        return states, action_probs, values

    def save_model(self, iteration):
        """
        Save the model parameters to a file, identified by the current iteration count.

        :param iteration: The current iteration or training round index.
        """
        torch.save(
            self.model.state_dict(),
            model_path / f"model_parallel_{iteration}_{self.game}.pth"
        )

    def save_optimizer(self, iteration):
        """
        Save the optimizer state to a file, identified by the current iteration count.

        :param iteration: The current iteration or training round index.
        """
        torch.save(
            self.optimizer.state_dict(),
            optimizer_path / f"optimizer_parallel_{iteration}_{self.game}.pth"
        )

    def load_model(self, iteration):
        """
        Load the model parameters from a file for the given iteration.

        :param iteration: The iteration index for which the model file was saved.
        """
        self.model.load_state_dict(
            torch.load(model_path / f"model_parallel_{iteration}_{self.game}.pth")
        )

    def load_optimizer(self, iteration):
        """
        Load the optimizer state from a file for the given iteration.

        :param iteration: The iteration index for which the optimizer file was saved.
        """
        self.optimizer.load_state_dict(
            torch.load(optimizer_path / f"optimizer_parallel_{iteration}_{self.game}.pth")
        )

    def learn(self):
        """
        The main training loop for parallel self-play:
          1) Collect data from multiple parallel self-play games.
          2) Train the model on that data.
          3) Repeat for num_iterations.
        """
        # Outer loop of the training
        for iteration in trange(self.args["num_iterations"], desc="Iteration"):
            memory = []

            # Determine whether to use tqdm for inner loops
            if self.args.get("show_iterations", False):
                self_play_iterator = trange(
                    self.args["num_self_play_games"] // self.args["num_parallel_games"],
                    desc="Self Play"
                )
                num_epochs_iterator = trange(self.args["num_epochs"], desc="Training")
            else:
                self_play_iterator = range(self.args["num_self_play_games"] // self.args["num_parallel_games"])
                num_epochs_iterator = range(self.args["num_epochs"])

            # Switch to eval mode for self-play
            self.model.eval()
            for _ in self_play_iterator:
                memory += self.self_play()

            # Switch to train mode for gradient updates
            self.model.train()
            for _ in num_epochs_iterator:
                self.train(memory)

            # Save the model and optimizer states
            self.save_model(iteration)
            self.save_optimizer(iteration)

        # Optionally display training loss curve
        if self.args.get("print_loss", False):
            plt.plot(self.losses)
            plt.show()