"""
RewardAggregators wrap the multi-objective env into a single-objective one by
aggregating rewards (e.g., using an average, min, weighted sum, ...).
"""
import warnings
from abc import ABC, abstractmethod
from typing import Dict, Any, Tuple
import numpy as np
from pettingzoo.utils.env import ActionDict, ObsDict
from smartgrid.environment import SmartGrid, RewardsDict, InfoDict, AgentID
class RewardAggregator(ABC, SmartGrid):
"""
Wraps the multi-objective env into a single-objective one by aggregating rewards.

The :py:class:`smartgrid.environment.SmartGrid` environment supports
multiple reward functions; its :py:meth:`.SmartGrid.step` method returns
a dict of dictionaries, one dict per agent, containing the rewards
indexed by their reward function's name.

However, most Reinforcement Learning algorithms expect a scalar reward,
or, in this case, a dict of scalar rewards, one per agent.
Classes that extend ``RewardAggregator`` bridge this gap by aggregating
(scalarizing) the multiple rewards into a single one.

.. note::
    PettingZoo only supports wrappers for AEC environments. AEC environments
    can be converted back and forth to Parallel environments, but that would
    hinder performance. This class is a simpler wrapper for Parallel
    environments, although it does not follow PettingZoo's
    :py:class:`~pettingzoo.utils.wrappers.base.BaseWrapper` conventions.
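
Typical usage (an illustrative sketch: how the env and the actions are
built is omitted here, and ``WeightedSumRewardAggregator`` is one of the
concrete aggregators defined below)::

    env = WeightedSumRewardAggregator(smartgrid_env)
    obs, rewards, terminated, truncated, infos = env.step(actions)
    # `rewards` is now a dict of scalars, e.g., {'agent_0': 0.6},
    # instead of a dict of dicts.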
"""
def __init__(self, env: SmartGrid):
self._env = env
@abstractmethod
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
"""
Transform multi-objective rewards into single-objective rewards.

:param rewards: A dict mapping each learning agent to its rewards.
    The rewards are themselves represented as a dict (a dict of dicts),
    containing one or several rewards, indexed by their reward
    function's name, e.g., ``{'fct1': 0.8, 'fct2': 0.4}``.

:return: A dict mapping each agent to its scalar reward, obtained by
    scalarizing the agent's dict of rewards.
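
A minimal example implementation (a sketch; ``MeanRewardAggregator`` is a
hypothetical subclass, not part of this module)::

    class MeanRewardAggregator(RewardAggregator):
        def reward(self, rewards):
            # Average each agent's dict of rewards into a single scalar.
            return {
                agent_name: sum(agent_rewards.values()) / len(agent_rewards)
                for agent_name, agent_rewards in rewards.items()
            }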
"""
pass
def step(self, actions: ActionDict) -> Tuple[
ObsDict, Dict[AgentID, float], Dict[AgentID, bool], Dict[AgentID, bool], InfoDict
]:
# Step the wrapped (multi-objective) env, then scalarize its rewards.
obs, rewards, terminated, truncated, infos = self._env.step(actions)
rewards = self.reward(rewards)
return obs, rewards, terminated, truncated, infos
def __getattribute__(self, name: str) -> Any:
# Allow using this wrapper exactly as the wrapped environment.
# `__getattribute__` is similar to `__getattr__` but is called for *any*
# attribute access, even attributes that can be found on the class (e.g.,
# through inheritance). `__getattr__` is only called when the attribute
# is not found by the normal lookup.
if name.startswith('_') or name in ['reward', 'step', 'unwrapped']:
# Private attribute or an attribute defined in this Wrapper class.
# We want to directly access it (from this instance), not from the
# wrapped env.
return object.__getattribute__(self, name)
else:
# Another attribute: try to access it from the wrapped env.
return object.__getattribute__(self._env, name)
@property
def unwrapped(self) -> SmartGrid:
return self._env
def __str__(self):
"""Return a name that looks like: ``Wrapper<WrappedEnv>``."""
return f'{type(self).__name__}<{type(self.unwrapped).__name__}>'
__repr__ = __str__
class SingleRewardAggregator(RewardAggregator):
"""
Returns each agent's single reward as-is, for simplicity.

This wrapper can be used when a single reward function is used in the
environment; although it still returns a dict per agent, the dict consists
of a single value, and thus the "aggregation" is in fact trivial.
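
For instance (illustrative values), ``{'agent_0': {'fct1': 0.8}}`` is simply
mapped to ``{'agent_0': 0.8}``.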

.. warning::
    This wrapper will emit a warning if multiple reward functions are used.
    In this case, only the first reward of each agent's dict is returned.
"""
def __init__(self, env: SmartGrid):
super().__init__(env)
nb_rewards = len(env.reward_calculator.rewards)
if nb_rewards > 1:
warnings.warn(f'Expected 1 reward function, found {nb_rewards}')
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
return {
agent_name: list(agent_rewards.values())[0]
for agent_name, agent_rewards in rewards.items()
}
class WeightedSumRewardAggregator(RewardAggregator):
"""
Scalarizes multiple rewards through a weighted sum.

By default, coefficients are all equal to ``1/n``, where ``n`` is the number
of rewards, i.e., this is equivalent to an average.
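
For instance (illustrative coefficients and values), with coefficients
``{'fct1': 0.7, 'fct2': 0.3}``, an agent receiving rewards
``{'fct1': 0.8, 'fct2': 0.4}`` gets the scalar
``0.7 * 0.8 + 0.3 * 0.4 = 0.68``.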
"""
def __init__(self, env: SmartGrid, coefficients: dict = None):
"""
Construct an instance of the Weighted Sum aggregator.

:param env: The instance of the Smart Grid environment.

:param coefficients: A dictionary describing the coefficients to use
    for each reward function. The keys must correspond to the names
    of the reward functions in the env
    (see its :py:attr:`.SmartGrid.reward_calculator`), and the values
    must be the weights (floats).
    Usually, the sum of weights is set to ``1.0`` to obtain a weighted
    average, but this is not mandatory.
    By default, weights are set to ``1 / n`` to obtain a simple average.

.. warning::
    This class will emit a warning if the ``coefficients`` keys do not
    correspond to the reward functions' names. In this case, the missing
    coefficients are assumed to be ``0.0`` during the computation, i.e.,
    the corresponding reward functions are ignored.
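
Example (a sketch; ``'fct1'`` and ``'fct2'`` stand for whatever reward
function names are actually registered in the env)::

    aggregator = WeightedSumRewardAggregator(
        env,
        coefficients={'fct1': 0.7, 'fct2': 0.3},
    )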
"""
super().__init__(env)
if coefficients is None:
nb_rewards = len(env.reward_calculator.rewards)
coefficients = {
reward.name: 1.0 / nb_rewards
for reward in env.reward_calculator.rewards
}
else:
# We use sets instead of lists, because we do not care about the order.
expected_keys = {
reward.name for reward in env.reward_calculator.rewards
}
found_keys = set(coefficients.keys())
if expected_keys != found_keys:
warnings.warn(f'Expected {expected_keys}, found {found_keys}')
self._coefficients = coefficients
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
scalarized_rewards = {}
for agent_name, agent_rewards in rewards.items():
scalar = 0.0
for reward_name, reward_value in agent_rewards.items():
# We set a default in case the coefficient was not set.
coeff = self._coefficients.get(reward_name, 0.0)
scalar += reward_value * coeff
scalarized_rewards[agent_name] = scalar
return scalarized_rewards
class MinRewardAggregator(RewardAggregator):
"""
Scalarizes by returning the minimum of each agent's rewards.

This corresponds to some sort of "Aristotelian" ethics, in the sense that
the focus is put on the reward function with the worst consequences.
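
For instance (illustrative values), ``{'fct1': 0.2, 'fct2': 0.9}`` scalarizes
to ``0.2``: the high ``fct2`` reward does not compensate for the low ``fct1``
one.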
"""
def __init__(self, env: SmartGrid):
super().__init__(env)
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
return {
agent_name: min(agent_rewards.values())
for agent_name, agent_rewards in rewards.items()
}
class ProductRewardAggregator(RewardAggregator):
"""
Scalarizes rewards by multiplying them together.

This forces low rewards to have an important impact: e.g.,
``0.1 * 0.9`` equals ``0.09``. In other words, a low reward cannot be
compensated by a high reward (as it would be in an average, for example).

.. warning::
    This aggregation relies on assumptions that are **only** true when the
    reward range is set to ``[0,1]``!
    Otherwise, the multiplication would still work mathematically, but it
    would certainly not make sense in terms of a reward function. For
    example, if the reward range is ``[0,5]``, we could have ``5 * 5 = 25``.
    Or, if the reward range is ``[-1,1]``, we could have ``-1 * -1 = 1``,
    i.e., two negative rewards giving a positive scalar...
"""
def __init__(self, env: SmartGrid):
super().__init__(env)
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
return {
agent_name: np.prod(list(agent_rewards.values()), axis=0)
for agent_name, agent_rewards in rewards.items()
}