
Learning a Reward Function using Kernel Density

This demo shows how to train a Pendulum agent (exciting!) with our simple density-based imitation learning baseline, DensityAlgorithm. The trainer has a few interesting parameters, but the key ones are:

  1. density_type: this governs whether density is measured on \((s,s')\) pairs (db.DensityType.STATE_STATE_DENSITY), \((s,a)\) pairs (db.DensityType.STATE_ACTION_DENSITY), or single states (db.DensityType.STATE_DENSITY).

  2. is_stationary: determines whether a separate density model is used for each time step \(t\) (False), or the same model is used for transitions at all times (True).

  3. standardise_inputs: if True, each dimension of the agent state vectors will be normalised to have zero mean and unit variance over the training dataset. This can be useful when not all elements of the demonstration vector are on the same scale, or when some elements have too wide a variation to be captured by the fixed kernel width (1 for Gaussian kernel).

  4. kernel: changes the kernel used for non-parametric density estimation. gaussian and exponential are the best bets; see the sklearn docs for the rest (a small standalone sketch of this kind of kernel density scoring appears right after this list).
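To make the idea concrete, here is a minimal standalone sketch (not part of the original notebook) of kernel density estimation on flattened \((s,a)\) pairs using scikit-learn's KernelDensity, which is where the kernel names above come from. The array shapes and the bandwidth are illustrative assumptions, not values taken from the trainer below.

import numpy as np
from sklearn.neighbors import KernelDensity

# Toy stand-ins for expert data: 3-dim states and 1-dim actions,
# the same shapes as Pendulum-v1 observations and actions.
rng = np.random.default_rng(0)
expert_states = rng.normal(size=(500, 3))
expert_actions = rng.normal(size=(500, 1))

# STATE_ACTION_DENSITY: fit one density model over concatenated (s, a) vectors.
expert_sa = np.concatenate([expert_states, expert_actions], axis=1)
density_model = KernelDensity(kernel="gaussian", bandwidth=0.4)
density_model.fit(expert_sa)

# The imitation "reward" for a new transition is then (up to standardisation)
# the log-density of its (s, a) vector under the expert model.
new_sa = rng.normal(size=(5, 4))
print(density_model.score_samples(new_sa))  # log-likelihood of each sample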

import pprint

from imitation.algorithms import density as db
from imitation.data import types
from imitation.util import util
# Set FAST = False for longer training. Use True for testing and CI.
FAST = True

if FAST:
    N_VEC = 1
    N_TRAJECTORIES = 1
    N_ITERATIONS = 1
    N_RL_TRAIN_STEPS = 100

else:
    N_VEC = 8
    N_TRAJECTORIES = 10
    N_ITERATIONS = 10
    N_RL_TRAIN_STEPS = 100_000
from imitation.policies.serialize import load_policy
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3 import PPO
from imitation.data import rollout
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from imitation.data.wrappers import RolloutInfoWrapper
import gymnasium as gym
import numpy as np

SEED = 42

rng = np.random.default_rng(seed=SEED)
env_name = "Pendulum-v1"
rollout_env = DummyVecEnv(
    [lambda: RolloutInfoWrapper(gym.make(env_name)) for _ in range(N_VEC)]
)
expert = load_policy(
    "ppo-huggingface",
    organization="HumanCompatibleAI",
    env_name=env_name,
    venv=rollout_env,
)
rollouts = rollout.rollout(
    expert,
    rollout_env,
    rollout.make_sample_until(min_timesteps=2000, min_episodes=57),
    rng=rng,
)
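As a quick sanity check (not part of the original notebook), you can summarise the collected demonstrations before training; rollout.rollout_stats reports trajectory counts, lengths, and returns.

print(f"Collected {len(rollouts)} expert trajectories.")
pprint.pprint(rollout.rollout_stats(rollouts))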

env = util.make_vec_env(env_name, n_envs=N_VEC, rng=rng)


# The PPO learner whose policy will be trained against the density-based reward.
imitation_trainer = PPO(
    ActorCriticPolicy, env, learning_rate=3e-4, gamma=0.95, ent_coef=1e-4, n_steps=2048
)
density_trainer = db.DensityAlgorithm(
    venv=env,
    rng=rng,
    demonstrations=rollouts,
    rl_algo=imitation_trainer,
    density_type=db.DensityType.STATE_ACTION_DENSITY,
    is_stationary=True,
    kernel="gaussian",
    kernel_bandwidth=0.4,  # found using divination & some palm reading
    standardise_inputs=True,
)
density_trainer.train()
# evaluate the expert
expert_rewards, _ = evaluate_policy(expert, env, 100, return_episode_rewards=True)

# evaluate the learner before training
learner_rewards_before_training, _ = evaluate_policy(
    density_trainer.policy, env, 100, return_episode_rewards=True
)
def print_stats(density_trainer, n_trajectories, epoch=""):
    stats = density_trainer.test_policy(n_trajectories=n_trajectories)
    print("True reward function stats:")
    pprint.pprint(stats)
    stats_im = density_trainer.test_policy(
        true_reward=False,
        n_trajectories=n_trajectories,
    )
    print(f"Imitation reward function stats, epoch {epoch}:")
    pprint.pprint(stats_im)


print("Stats before training:")
print_stats(density_trainer, 1)

print("Starting the training!")
for i in range(N_ITERATIONS):
    density_trainer.train_policy(N_RL_TRAIN_STEPS)
    print_stats(density_trainer, 1, epoch=str(i))
Stats before training:
True reward function stats:
{'len_max': 200,
 'len_mean': 200.0,
 'len_min': 200,
 'len_std': 0.0,
 'monitor_return_len': 1,
 'monitor_return_max': -1493.001723,
 'monitor_return_mean': -1493.001723,
 'monitor_return_min': -1493.001723,
 'monitor_return_std': 0.0,
 'n_traj': 1,
 'return_max': -1493.001723766327,
 'return_mean': -1493.001723766327,
 'return_min': -1493.001723766327,
 'return_std': 0.0}
Imitation reward function stats, epoch :
{'len_max': 200,
 'len_mean': 200.0,
 'len_min': 200,
 'len_std': 0.0,
 'monitor_return_len': 1,
 'monitor_return_max': -1749.369344,
 'monitor_return_mean': -1749.369344,
 'monitor_return_min': -1749.369344,
 'monitor_return_std': 0.0,
 'n_traj': 1,
 'return_max': -2212.1580998897552,
 'return_mean': -2212.1580998897552,
 'return_min': -2212.1580998897552,
 'return_std': 0.0}
Starting the training!
True reward function stats:
{'len_max': 200,
 'len_mean': 200.0,
 'len_min': 200,
 'len_std': 0.0,
 'monitor_return_len': 1,
 'monitor_return_max': -908.535786,
 'monitor_return_mean': -908.535786,
 'monitor_return_min': -908.535786,
 'monitor_return_std': 0.0,
 'n_traj': 1,
 'return_max': -908.5357865467668,
 'return_mean': -908.5357865467668,
 'return_min': -908.5357865467668,
 'return_std': 0.0}
Imitation reward function stats, epoch 0:
{'len_max': 200,
 'len_mean': 200.0,
 'len_min': 200,
 'len_std': 0.0,
 'monitor_return_len': 1,
 'monitor_return_max': -855.283381,
 'monitor_return_mean': -855.283381,
 'monitor_return_min': -855.283381,
 'monitor_return_std': 0.0,
 'n_traj': 1,
 'return_max': -2239.7023117542267,
 'return_mean': -2239.7023117542267,
 'return_min': -2239.7023117542267,
 'return_std': 0.0}
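If you want to reuse the learned policy outside this notebook, the underlying PPO learner can be saved with Stable Baselines 3's standard save/load methods. The file name here is just an example.

# Save the PPO learner that was trained on the density reward.
imitation_trainer.save("ppo_pendulum_density")  # writes ppo_pendulum_density.zip
# Later it can be restored with: PPO.load("ppo_pendulum_density", env=env)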
# evaluate the learner after training
learner_rewards_after_training, _ = evaluate_policy(
    density_trainer.policy, env, 100, return_episode_rewards=True
)

Here are the final results. If you set FAST = False in one of the initial cells, you should see that performance after training approaches that of the expert.

print("Mean expert reward:", np.mean(expert_rewards))
print("Mean reward before training:", np.mean(learner_rewards_before_training))
print("Mean reward after training:", np.mean(learner_rewards_after_training))
Mean expert reward: -212.67203443999998
Mean reward before training: -1235.5171938299998
Mean reward after training: -1145.53928535
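For a visual comparison (not part of the original notebook), you can plot the distribution of episode rewards before and after training with matplotlib:

import matplotlib.pyplot as plt

plt.hist(
    [learner_rewards_before_training, learner_rewards_after_training],
    label=["reward before training", "reward after training"],
)
plt.legend()
plt.show()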