Source code for algorithms.util.action_selector

"""
This module defines several classes to select actions (ActionSelectors).

An ActionSelector takes a list of interests (e.g., Q-Values) and the time step,
to return a single identifier, which is considered the selected action.
They target the exploration-exploitation dilemma.

We consider 2 selectors:

- the Epsilon-Greedy selector selects the maximum interest action with a
  `(1-ε)` probability, e.g., 95%. Otherwise, it selects a random action.
- the Boltzmann selector applies a Boltzmann distribution over the interests.
  Interests that are closer have a similar probability, and higher interests
  yield higher probabilities. The distribution is controlled by a Boltzmann
  temperature, such that low interests can still yield significant probabilities.
"""

from random import random, randrange, choices

import numpy as np


class ActionSelector:
    """Common interface of every action selector.

    Subclasses override :meth:`choose`; instances are also callable, in which
    case the call is forwarded unchanged to :meth:`choose`.
    """

    def __call__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to :meth:`choose`."""
        return self.choose(*args, **kwargs)

    def choose(self, interests, step) -> int:
        """Return the identifier of the selected action.

        The base implementation is a placeholder and always raises.

        :param interests: The interests (e.g., Q-Values), one per action.
        :param step: The current time step.
        :raises NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError()
class EpsilonGreedyActionSelector(ActionSelector):
    """Implements the ε-greedy policy.

    With probability ``epsilon`` a uniformly random action index is returned
    (exploration); otherwise the index of the highest interest is returned
    (exploitation).
    """

    def __init__(self, epsilon=0.05):
        """
        :param epsilon: Probability of picking a random action instead of the
            one with the maximal interest.
        """
        self.epsilon = epsilon

    def choose(self, interests, step):
        """Select an action index from ``interests``.

        :param interests: The interests (e.g., Q-Values), one per action.
        :param step: The current time step; not used by this selector.
        :return: The selected index, always as a built-in ``int``.
        """
        if random() < self.epsilon:
            # Exploration: pick a random action.
            return randrange(0, len(interests))
        # Exploitation: pick the action with the maximal interest.
        # `np.argmax` yields a NumPy integer scalar; convert it so both
        # branches return the same built-in type.
        return int(np.argmax(interests))
class BoltzmannActionSelector(ActionSelector):
    """Implements the Boltzmann policy.

    Action ``i`` is drawn with a probability proportional to
    ``exp(values[i] / tau)``, where ``tau`` is the Boltzmann temperature.
    """

    def __init__(self, initial_tau: float, tau_decay: bool, tau_decay_coeff: float):
        """
        :param initial_tau: Temperature used at step 0 (and at every step when
            decay is disabled). NOTE(review): expected to be non-zero, since
            values are divided by it -- confirm against callers.
        :param tau_decay: Whether the temperature decays with the time step.
        :param tau_decay_coeff: Base of the exponential decay; the temperature
            at a given step is ``initial_tau * tau_decay_coeff ** step``.
        """
        self.initial_tau = initial_tau
        self.tau_decay = tau_decay
        self.tau_decay_coeff = tau_decay_coeff

    def choose(self, values, step):
        """Sample an action index from the Boltzmann distribution.

        :param values: The interests (e.g., Q-Values), one per action.
        :param step: The current time step; only used when decay is enabled.
        :return: The selected index, as a built-in ``int``.
        :raises IndexError: If ``values`` is empty.
        """
        # First, compute tau; when decaying, it is floored at 0.01.
        if self.tau_decay:
            tau = max(self.initial_tau * (self.tau_decay_coeff ** step), 0.01)
        else:
            tau = self.initial_tau

        scaled = np.asarray(values, dtype=float) / tau
        if scaled.size == 0:
            raise IndexError("Cannot choose an action from empty values")

        # Then, compute the weight of each value: exp(value / tau).
        # Subtracting the maximum multiplies every weight by the same positive
        # constant, so the distribution is unchanged, but the largest exponent
        # becomes 0: the weights can neither overflow to infinity nor all
        # underflow to zero, both of which make `choices` raise.
        weights = np.exp(scaled - scaled.max())

        return choices(range(len(weights)), weights=weights.tolist(), k=1)[0]