#!/usr/bin/env python3
# coding: utf-8

"""
=======================
PPO on Wall Environment
=======================

This example shows how to train a PPO agent on the Wall Environment.
"""

###############################################################################
# Import required packages

import matplotlib.pyplot as plt
from rlenv.envs.wall.core import AccentaEnv

import torch as th
from stable_baselines3 import PPO


###############################################################################
# Make the environment

env = AccentaEnv()


###############################################################################
# Make the agent

# Custom actor (pi) and value-function (vf) networks: each one is made of
# two layers of 32 units and uses the ReLU activation function.
# https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html#custom-network-architecture
# NOTE(review): recent Stable-Baselines3 releases document ``net_arch`` as a
# plain dict instead of a one-element list -- confirm against the installed
# version before changing the format used here.
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": [{"pi": [32, 32], "vf": [32, 32]}],
}

# Instantiate PPO with an MLP policy on the environment created above
model = PPO("MlpPolicy", env, policy_kwargs=policy_kwargs, verbose=1)


###############################################################################
# Train the agent

# Train for a total of 10,000 timesteps
model.learn(total_timesteps=10_000)


###############################################################################
# Save the agent (optional)

# Single location used by both the save step and the load step below.
# The path is relative, so it is resolved against the current working
# directory of the process running this script.
TRAINED_MODEL_PATH = "../rlagent/data/trained_model"
model.save(TRAINED_MODEL_PATH)


###############################################################################
# Load the agent (optional)
# the policy_kwargs are automatically loaded

# Discard the in-memory agent first so that the object used from here on is
# the one restored from disk.
del model
model = PPO.load(TRAINED_MODEL_PATH, env=env)


###############################################################################
# Assess the agent

# Evaluate the (reloaded) agent with the environment's own scoring routine
# and display the result.
score = AccentaEnv.eval(model)
print(score)

# Generate one episode with the agent and plot what the environment returns.
# NOTE(review): ``df`` is presumably a pandas DataFrame (it exposes
# ``.plot()``) -- confirm the columns against ``AccentaEnv.gen_one_episode``.
df = AccentaEnv.gen_one_episode(model)
df.plot()
# Display the figure produced by ``df.plot()``
plt.show()