#ifndef _PUDDLEWORLD_HPP_
#define _PUDDLEWORLD_HPP_

#include <random>
#include "Environment.hpp"

// Canonical puddleworld domain.
// I'm not sure which paper is the right one to cite for this, or where the official definition lies.
// One definition of the domain can be found in "Kernel-Based Models for Reinforcement Learning" by Jong and Stone.

// Puddleworld environment, exposed through the Environment interface.
// Every public member function other than the constructor is declared `override`.
class PuddleWorld : public Environment {
public:
    // Construct the environment. Takes a random number generator by reference.
    PuddleWorld(std::mt19937_64 & generator);

    ~PuddleWorld() override;

    // Get the number of allowed actions. The actions are integers from 0 to numActions-1.
    int getNumActions() const override;

    // Get the number of state variables.
    int getStateDim() const override;

    // Get the current state as a vector of *normalized* variables.
    // That is, the state variables should all be in the range [0,1].
    Eigen::VectorXd getState() const override;

    // Set the state.
    void setState(const Eigen::VectorXd & state, std::mt19937_64 & generator) override;

    // Reset the environment for a new episode.
    void newEpisode(std::mt19937_64 & generator) override;

    // Is the agent in a terminal state currently?
    // I.e., would the next state always be the absorbing zero-reward state?
    bool terminate() const override;

    // Apply the specified action and return the resulting reward.
    double update(int action, std::mt19937_64 & generator) override;

    // How should the value function be initialized? To zero?
    double getInitialValue() const override;

    // Get the reward discount parameter.
    double getGamma() const override;

    // Get the number of episodes that should be sampled when evaluating a policy
    // by Monte Carlo returns. If the environment is deterministic, it should be 1.
    int getNumMCSamplesForPolicyEvaluation() const override;

    // How long should episodes run, maximum, when doing Monte Carlo performance estimation?
    int getMaxTForPolicyEvaluation() const override;

    // How many samples should be generated from each state in value iteration?
    // This is to sample stochastic state transitions. It should be one if state
    // transitions are deterministic.
    int getNumSamplesPerState() const override;

    // For some environments we don't want to plot return (it's hard to visualize).
    // At the end of a trajectory, before newEpisode is called, this function can be
    // called to get a statistic for how good that episode was.
    double getPlottableStatistic() const override;

    // A string to say what the plottable statistic encodes.
    std::string getPlottableStatisticName() const override;

private:
    // NOTE(review): the two distributions below are held through raw pointers.
    // Presumably they are allocated in the constructor and released in the
    // destructor (definitions are not visible in this header) -- confirm, and if so
    // be aware that the implicitly generated copy operations would share them.

    // Used to randomize agent movement a little.
    std::normal_distribution<double> * distribution;

    // Used to pick initial agent position.
    std::uniform_real_distribution<double> * initialDistribution;

    // Position.
    Eigen::VectorXd pos;

    // Track the sum of rewards as the plottable statistic.
    double rSum;

    // Get the distance from the point (px,py) to the segment with endpoints
    // (x1,y1) and (x2,y2).
    static double distToSegment(const double & px, const double & py, const double & x1, const double & y1, const double & x2, const double & y2);
};

#endif
