# Source code for smartgrid.wrappers.reward_aggregator

"""
RewardAggregators wrap the multi-objective env into a single-objective one by
aggregating rewards (e.g., using an average, min, weighted sum, ...).
"""

import warnings
from abc import ABC, abstractmethod
from typing import List, Dict

import numpy as np
from gymnasium.core import RewardWrapper

from smartgrid.environment import SmartGrid


class RewardAggregator(ABC, RewardWrapper):
    """
    Wraps the multi-objective env into a single-objective one by aggregating rewards.

    The :py:class:`smartgrid.environment.SmartGrid` environment supports
    multiple reward functions; its :py:meth:`.SmartGrid.step` method returns
    a list of dictionaries, one dict for each agent, containing the rewards
    indexed by their reward function's name.

    However, most Reinforcement Learning algorithms expect a scalar reward,
    or in this case, a list of scalar rewards, one for each agent. Classes
    that extend ``RewardAggregator`` bridge this gap by aggregating
    (scalarizing) the multiple rewards into a single one.
    """

    def __init__(self, env: SmartGrid):
        super().__init__(env)

    @abstractmethod
    def reward(self, rewards: List[Dict[str, float]]) -> List[float]:
        """
        Transform multi-objective rewards into single-objective rewards.

        :param rewards: A list of dicts, one dict for each learning agent.
            Each dict contains one or several rewards, indexed by their
            reward function's name, e.g., ``{'fct1': 0.8, 'fct2': 0.4}``.

        :return: A list of scalar rewards, one for each agent, scalarized
            from the dicts.
        """
        pass

    def __str__(self):
        return type(self).__name__
class SingleRewardAggregator(RewardAggregator):
    """
    Returns the single reward as-is.

    This wrapper can be used when a single reward function is used in the
    environment; although the environment still returns a dict, the dict
    consists of a single value, and thus the "aggregation" is in fact
    trivial.

    .. warning::
        This wrapper will emit a warning if multiple reward functions are
        used. In this case, only the first reward of the dict will be
        returned.
    """

    def __init__(self, env: SmartGrid):
        super().__init__(env)
        nb_rewards = len(env.reward_calculator.rewards)
        if nb_rewards > 1:
            warnings.warn(f'Expected 1 reward function, found {nb_rewards}')

    def reward(self, rewards: List[Dict[str, float]]) -> List[float]:
        return [
            list(agent_rewards.values())[0]
            for agent_rewards in rewards
        ]
class WeightedSumRewardAggregator(RewardAggregator):
    """
    Scalarizes multiple rewards through a weighted sum.

    By default, the coefficients are all equal to ``1/n``, where ``n`` is the
    number of rewards, i.e., this is equivalent to an average.
    """

    def __init__(self, env: SmartGrid, coefficients: dict = None):
        """
        Construct an instance of the Weighted Sum aggregator.

        :param env: The instance of the Smart Grid environment.

        :param coefficients: A dictionary describing the coefficients to use
            for each reward function. The keys must correspond to the names
            of the reward functions in the env (see its
            :py:attr:`.SmartGrid.reward_calculator`), and the values must be
            the weights (floats). Usually, the weights sum to ``1.0`` to
            obtain a weighted average, but this is not mandatory. By default,
            weights are set to ``1 / n`` to obtain a simple average.

        .. warning::
            This class will emit a warning if the ``coefficients`` do not
            correspond to the reward functions' names. In this case, any
            missing coefficient is assumed to be ``0.0`` during the
            computation, i.e., the corresponding reward function is ignored.
        """
        super().__init__(env)
        if coefficients is None:
            nb_rewards = len(env.reward_calculator.rewards)
            coefficients = {
                reward.name: 1.0 / nb_rewards
                for reward in env.reward_calculator.rewards
            }
        else:
            # We use sets instead of lists, because we do not care about the order.
            expected_keys = {
                reward.name for reward in env.reward_calculator.rewards
            }
            found_keys = set(coefficients.keys())
            if expected_keys != found_keys:
                warnings.warn(f'Expected {expected_keys}, found {found_keys}')
        self.coefficients = coefficients

    def reward(self, rewards: List[Dict[str, float]]) -> List[float]:
        scalarized_rewards = []
        for agent_rewards in rewards:
            scalar = 0.0
            for reward_name, reward_value in agent_rewards.items():
                # Default to 0.0 in case no coefficient was set for this reward.
                coeff = self.coefficients.get(reward_name, 0.0)
                scalar += reward_value * coeff
            scalarized_rewards.append(scalar)
        return scalarized_rewards
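# For example (re-using the purely illustrative reward names 'fct1' and
# 'fct2' from the docstrings above): with rewards ``{'fct1': 0.8, 'fct2': 0.4}``
# and the default coefficients ``{'fct1': 0.5, 'fct2': 0.5}``, the scalarized
# reward is ``0.8 * 0.5 + 0.4 * 0.5 = 0.6``, i.e., a simple average.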
class MinRewardAggregator(RewardAggregator):
    """
    Returns the minimum of the rewards to scalarize.

    This corresponds to some sort of "Aristotelian" ethics, in the sense
    that we put the focus on the reward function with the worst
    consequences.
    """

    def __init__(self, env: SmartGrid):
        super().__init__(env)

    def reward(self, rewards: List[Dict[str, float]]) -> List[float]:
        return [
            min(agent_rewards.values())
            for agent_rewards in rewards
        ]
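# For example, with rewards ``{'fct1': 0.8, 'fct2': 0.4}``, the scalarized
# reward is ``min(0.8, 0.4) = 0.4``: the agent is judged on its worst
# objective, and cannot compensate a low reward with a high one.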
class ProductRewardAggregator(RewardAggregator):
    """
    Scalarizes rewards by multiplying them together.

    This forces low rewards to have an important impact, because, e.g.,
    ``0.1 * 0.9`` equals ``0.09``. In other words, a low reward cannot be
    compensated by a high reward (as it would be in an average, for example).

    .. warning::
        This aggregation relies on assumptions that are **only** true when
        the reward range is set to ``[0, 1]``! Otherwise, the multiplication
        would still work mathematically, but would certainly not make sense
        as a reward function. For example, if the reward range is ``[0, 5]``,
        we could have ``5 * 5 = 25``. Or, if the reward range is ``[-1, 1]``,
        we could have ``-1 * -1 = 1``, i.e., two negative rewards giving a
        positive scalar...
    """

    def __init__(self, env: SmartGrid):
        super().__init__(env)

    def reward(self, rewards: List[Dict[str, float]]) -> List[float]:
        return [
            float(np.prod(list(agent_rewards.values()), axis=0))
            for agent_rewards in rewards
        ]
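

# --------------------------------------------------------------------------
# Minimal usage sketch (not part of the wrappers themselves): the aggregators
# only read ``env.reward_calculator.rewards`` (a list of reward functions,
# each exposing a ``name``), so their scalarization logic can be exercised
# with a dummy environment. The reward names 'comfort' and 'equity' and the
# dummy env below are purely illustrative; in practice, a real
# :py:class:`.SmartGrid` instance would be passed instead.
# --------------------------------------------------------------------------
if __name__ == '__main__':
    from types import SimpleNamespace

    import gymnasium as gym

    class _DummyEnv(gym.Env):
        """Illustrative stand-in exposing only ``reward_calculator.rewards``."""

        def __init__(self):
            self.reward_calculator = SimpleNamespace(
                rewards=[SimpleNamespace(name='comfort'),
                         SimpleNamespace(name='equity')]
            )

    # One dict of rewards per agent, indexed by reward function name.
    rewards = [{'comfort': 0.8, 'equity': 0.4}]

    aggregator = WeightedSumRewardAggregator(_DummyEnv())
    print(aggregator.reward(rewards))  # ~[0.6] with the default 1/n weights

    aggregator = MinRewardAggregator(_DummyEnv())
    print(aggregator.reward(rewards))  # [0.4]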