"""
RewardAggregators wrap the multi-objective env into a single-objective one by
aggregating rewards (e.g., using an average, min, weighted sum, ...).
"""
import warnings
from abc import ABC, abstractmethod
from typing import Dict, Any, Tuple
import numpy as np
from pettingzoo.utils.env import ActionDict, ObsDict
from smartgrid.environment import SmartGrid, RewardsDict, InfoDict, AgentID
class RewardAggregator(ABC, SmartGrid):
"""
Wraps the multi-objective env into a single-objective one by aggregating rewards.

The :py:class:`smartgrid.environment.SmartGrid` environment supports
multiple reward functions; its :py:meth:`.SmartGrid.step` method returns
a dict of dictionaries, one dict per agent, containing the rewards
indexed by their reward function's name.

However, most Reinforcement Learning algorithms expect a scalar reward,
or, in this case, a dict of scalar rewards, one per agent.
Classes that extend ``RewardAggregator`` bridge this gap by aggregating
(scalarizing) the multiple rewards into a single one.

.. note::
    PettingZoo only supports wrappers for AEC environments. AEC environments
    can be converted back and forth to Parallel environments, but that would
    hinder performance. This class is a simpler wrapper for Parallel
    environments, although it does not follow PettingZoo's
    :py:class:`~pettingzoo.utils.wrappers.base.BaseWrapper` conventions.
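
Typical usage (an illustrative sketch: how the env and the actions are
built is omitted here, and ``WeightedSumRewardAggregator`` is one of the
concrete aggregators defined below)::

    env = WeightedSumRewardAggregator(smartgrid_env)
    obs, rewards, terminated, truncated, infos = env.step(actions)
    # `rewards` is now a dict of scalars, e.g., {'agent_0': 0.6},
    # instead of a dict of dicts.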
"""
def __init__(self, env: SmartGrid):
self._env = env
@abstractmethod
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
"""
Transform multi-objective rewards into single-objective rewards.

:param rewards: A dict mapping each learning agent to its rewards.
    The rewards are themselves represented as a dict (a dict of dicts),
    containing one or several rewards, indexed by their reward
    function's name, e.g., ``{'fct1': 0.8, 'fct2': 0.4}``.

:return: A dict mapping each agent to its scalar reward, obtained by
    scalarizing the agent's dict of rewards.
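
A minimal example implementation (a sketch; ``MeanRewardAggregator`` is a
hypothetical subclass, not part of this module)::

    class MeanRewardAggregator(RewardAggregator):
        def reward(self, rewards):
            # Average each agent's dict of rewards into a single scalar.
            return {
                agent_name: sum(agent_rewards.values()) / len(agent_rewards)
                for agent_name, agent_rewards in rewards.items()
            }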
"""
pass
def step(self, actions: ActionDict) -> Tuple[
ObsDict, Dict[AgentID, float], Dict[AgentID, bool], Dict[AgentID, bool], InfoDict
]:
# Step the wrapped (multi-objective) env, then scalarize its rewards.
obs, rewards, terminated, truncated, infos = self._env.step(actions)
rewards = self.reward(rewards)
return obs, rewards, terminated, truncated, infos
def __getattribute__(self, name: str) -> Any:
# Allow using this wrapper exactly as the wrapped environment.
# `__getattribute__` is similar to `__getattr__` but is called for *any*
# attribute access, even attributes that can be found on the class (e.g.,
# through inheritance). `__getattr__` is only called when the attribute
# is not found by the normal lookup.
if name.startswith('_') or name in ['reward', 'step', 'unwrapped']:
# Private attribute or an attribute defined in this Wrapper class.
# We want to directly access it (from this instance), not from the
# wrapped env.
return object.__getattribute__(self, name)
else:
# Another attribute: try to access it from the wrapped env.
return object.__getattribute__(self._env, name)
@property
def unwrapped(self) -> SmartGrid:
return self._env
def __str__(self):
"""Return a name that looks like: ``Wrapper<WrappedEnv>``."""
return f'{type(self).__name__}<{type(self.unwrapped).__name__}>'
__repr__ = __str__
class SingleRewardAggregator(RewardAggregator):
"""
Returns each agent's single reward as-is, for simplicity.

This wrapper can be used when a single reward function is used in the
environment; although it still returns a dict per agent, the dict consists
of a single value, and thus the "aggregation" is in fact trivial.
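
For instance (illustrative values), ``{'agent_0': {'fct1': 0.8}}`` is simply
mapped to ``{'agent_0': 0.8}``.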

.. warning::
    This wrapper will emit a warning if multiple reward functions are used.
    In this case, only the first reward of each agent's dict is returned.
"""
def __init__(self, env: SmartGrid):
super().__init__(env)
nb_rewards = len(env.reward_calculator.rewards)
if nb_rewards > 1:
warnings.warn(f'Expected 1 reward function, found {nb_rewards}')
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
return {
agent_name: list(agent_rewards.values())[0]
for agent_name, agent_rewards in rewards.items()
}
class WeightedSumRewardAggregator(RewardAggregator):
"""
Scalarizes multiple rewards through a weighted sum.

By default, coefficients are all equal to ``1/n``, where ``n`` is the number
of rewards, i.e., this is equivalent to an average.
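
For instance (illustrative coefficients and values), with coefficients
``{'fct1': 0.7, 'fct2': 0.3}``, an agent receiving rewards
``{'fct1': 0.8, 'fct2': 0.4}`` gets the scalar
``0.7 * 0.8 + 0.3 * 0.4 = 0.68``.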
"""
def __init__(self, env: SmartGrid, coefficients: dict = None):
"""
Construct an instance of the Weighted Sum aggregator.

:param env: The instance of the Smart Grid environment.

:param coefficients: A dictionary describing the coefficients to use
    for each reward function. The keys must correspond to the names
    of the reward functions in the env
    (see its :py:attr:`.SmartGrid.reward_calculator`), and the values
    must be the weights (floats).
    Usually, the sum of weights is set to ``1.0`` to obtain a weighted
    average, but this is not mandatory.
    By default, weights are set to ``1 / n`` to obtain a simple average.

.. warning::
    This class will emit a warning if the ``coefficients`` keys do not
    correspond to the reward functions' names. In this case, the missing
    coefficients are assumed to be ``0.0`` during the computation, i.e.,
    the corresponding reward functions are ignored.
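
Example (a sketch; ``'fct1'`` and ``'fct2'`` stand for whatever reward
function names are actually registered in the env)::

    aggregator = WeightedSumRewardAggregator(
        env,
        coefficients={'fct1': 0.7, 'fct2': 0.3},
    )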
"""
super().__init__(env)
if coefficients is None:
nb_rewards = len(env.reward_calculator.rewards)
coefficients = {
reward.name: 1.0 / nb_rewards
for reward in env.reward_calculator.rewards
}
else:
# We use sets instead of lists, because we do not care about the order.
expected_keys = {
reward.name for reward in env.reward_calculator.rewards
}
found_keys = set(coefficients.keys())
if expected_keys != found_keys:
warnings.warn(f'Expected {expected_keys}, found {found_keys}')
self._coefficients = coefficients
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
scalarized_rewards = {}
for agent_name, agent_rewards in rewards.items():
scalar = 0.0
for reward_name, reward_value in agent_rewards.items():
# We set a default in case the coefficient was not set.
coeff = self._coefficients.get(reward_name, 0.0)
scalar += reward_value * coeff
scalarized_rewards[agent_name] = scalar
return scalarized_rewards
class MinRewardAggregator(RewardAggregator):
"""
Scalarizes by returning the minimum of each agent's rewards.

This corresponds to some sort of "Aristotelian" ethics, in the sense that
the focus is put on the reward function with the worst consequences.
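
For instance (illustrative values), ``{'fct1': 0.2, 'fct2': 0.9}`` scalarizes
to ``0.2``: the high ``fct2`` reward does not compensate for the low ``fct1``
one.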
"""
def __init__(self, env: SmartGrid):
super().__init__(env)
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
return {
agent_name: min(agent_rewards.values())
for agent_name, agent_rewards in rewards.items()
}
class ProductRewardAggregator(RewardAggregator):
"""
Scalarizes rewards by multiplying them together.

This forces low rewards to have an important impact: e.g.,
``0.1 * 0.9`` equals ``0.09``. In other words, a low reward cannot be
compensated by a high reward (as it would be in an average, for example).

.. warning::
    This aggregation relies on assumptions that are **only** true when the
    reward range is set to ``[0,1]``!
    Otherwise, the multiplication would still work mathematically, but it
    would certainly not make sense in terms of a reward function. For
    example, if the reward range is ``[0,5]``, we could have ``5 * 5 = 25``.
    Or, if the reward range is ``[-1,1]``, we could have ``-1 * -1 = 1``,
    i.e., two negative rewards giving a positive scalar...
"""
def __init__(self, env: SmartGrid):
super().__init__(env)
def reward(self, rewards: RewardsDict) -> Dict[AgentID, float]:
return {
agent_name: np.prod(list(agent_rewards.values()), axis=0)
for agent_name, agent_rewards in rewards.items()
}