StoneT2000 · StoneT2000 · May 23, 2021 · May 2, 2021 · May 2, 2021 · May 23, 2021
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ docs
 # test artifacts
 coverage
 .nyc_output
+coverage.lcov
 
 # for macs
 .DS_Store

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -10,14 +10,18 @@
     "build": "npm run build:es5 && npm run build:es6",
     "lint": "eslint . --ext .ts",
     "lint:fix": "eslint . --fix --ext .ts",
-    "prettier": "prettier --write ."
+    "report": "nyc report",
+    "codecov": "nyc report --reporter=text-lcov > coverage.lcov",
+    "prettier": "prettier --write .",
+    "docs": "typedoc src/index.ts"
   },
   "keywords": [],
   "author": "",
   "license": "ISC",
   "devDependencies": {
     "@types/chai": "^4.2.15",
     "@types/mocha": "^8.2.2",
+    "@types/node": "^14.14.37",
     "@types/seedrandom": "^3.0.0",
     "@typescript-eslint/eslint-plugin": "^4.20.0",
     "chai": "^4.3.0",
@@ -33,8 +37,6 @@
     "lib/"
   ],
   "dependencies": {
-    "@types/node": "^14.14.37",
-    "@types/numjs": "^0.14.5",
     "seedrandom": "^3.0.5"
   }
 }
diff --git a/src/RL/Agent/index.ts b/src/RL/Agent/index.ts
@@ -7,4 +7,12 @@ export abstract class Agent<State, Action> {
    * @param observation
    */
   abstract action(observation: State): Action;
+
+  /**
+   * Override this function to let users seed the agent's rng
+   * @param seed - the value used to seed the agent's rng
+   */
+  public seed(seed: number): void {
+    return;
+  }
 }
diff --git a/src/RL/DP/index.ts b/src/RL/DP/index.ts
@@ -1,5 +1,7 @@
 import { IterativePolicyEvaluation } from './iterativePolicyEvaluation';
+import { PolicyIteration } from './policyIteration';
 
 export class DP {
   static IterativePolicyEvaluation = IterativePolicyEvaluation;
+  static PolicyIteration = PolicyIteration;
 }
diff --git a/src/RL/DP/iterativePolicyEvaluation.ts b/src/RL/DP/iterativePolicyEvaluation.ts
@@ -1,43 +1,46 @@
 import { Agent } from '../Agent';
-import { Environment } from '../Environments';
+import { Dynamics, Environment } from '../Environments';
 import { Space } from '../Spaces';
 
 //TODO: Handle stochastic environments
+
 export class IterativePolicyEvaluation<
   ActionSpace extends Space<Action>,
   ObservationSpace extends Space<State>,
   Action,
   State
 > {
-  public env: Environment<ActionSpace, ObservationSpace, Action, State, number>;
   public valueFunction: Map<any, number> = new Map();
   public valueActionFunction: Map<any, { value: number; action: Action }> = new Map();
-  public dynamics: (sucessorState: State, reward: number, state: State, action: Action) => number;
+  public dynamics: null | Dynamics<State, Action>;
   constructor(
-    env: Environment<ActionSpace, ObservationSpace, Action, State, number>,
-    /** Function to map environment to a hashable state representation */
-    public envToStateRep: (envToConvert: any) => any,
-    /** Function to map state representation to a usable environment of the same class as this evaluator was constructed with */
-    public envFromStateRep: (stateString: any) => typeof env,
-    /** A list of all possible state representations */
-    public allStateReps: any[],
-    /** The policy function to evaluate */
-    public policy: (action: Action, observation: State) => number,
-    /** A list of all possible valid actions */
-    public allPossibleActions: Action[],
-    /** The dynamics of the environment. Does not to be given if environment has predefined dynamics */
-    dynamics?: (sucessorState: State, reward: number, state: State, action: Action) => number,
+    public configs: {
+      /** Function to map an observation (state) to a hashable state representation */
+      obsToStateRep: (state: State) => any;
+      /** Function to map state representation to a usable environment of the same class as this evaluator was constructed with */
+      envFromStateRep: (stateString: any) => Environment<ActionSpace, ObservationSpace, Action, State, number>;
+      /** A list of all possible state representations */
+      allStateReps: any[];
+      /** The policy function to evaluate */
+      policy: (action: Action, observation: State) => number;
+      /** A list of all possible valid actions */
+      allPossibleActions: Action[];
+      /** The dynamics of the environment. Does not need to be given if the environment has predefined dynamics */
+      dynamics?: (sucessorState: State, reward: number, state: State, action: Action) => number;
+    }
   ) {
-    this.env = env;
-    allStateReps.forEach((s) => {
+    this.configs.allStateReps.forEach((s) => {
       this.valueFunction.set(s, 0);
     });
-    if (!dynamics) {
-      this.dynamics = this.env.dynamics;
+    if (!this.configs.dynamics) {
+      this.dynamics = null;
     } else {
-      this.dynamics = dynamics;
+      this.dynamics = this.configs.dynamics;
     }
   }
+  setPolicy(policy: (action: Action, observation: State) => number) {
+    this.configs.policy = policy;
+  }
   /**
    * Estimates the value function of the given policy
    * @param params - the parameters object
@@ -63,24 +66,30 @@ export class IterativePolicyEvaluation<
         console.log(`Step ${step}`);
       }
       let delta = 0;
-      for (let stateString of this.allStateReps) {
+      for (let stateString of this.configs.allStateReps) {
         let val = 0;
-        let s = this.envFromStateRep(stateString);
+        let s = this.configs.envFromStateRep(stateString);
         let v_pi_s = 0;
-        for (let action of this.allPossibleActions) {
+        for (let action of this.configs.allPossibleActions) {
           let observation = s.reset();
           let stepOut = s.step(action);
-          let p_srsa = this.policy(action, observation);
+          let p_srsa = this.configs.policy(action, observation);
           let reward = stepOut.reward;
           let done = stepOut.done;
 
-          let sp_stateString = this.envToStateRep(s);
+          let sp_stateString = this.configs.obsToStateRep(stepOut.observation);
 
           let v_pi_sp = this.valueFunction.get(sp_stateString)!;
 
           // bind dynamics function to the current used environment
-          this.dynamics = this.dynamics.bind(s);
-          let p_sp_s_r = this.dynamics(stepOut.observation, reward, observation, action);
+
+          let p_sp_s_r = 0;
+          if (this.dynamics) {
+            p_sp_s_r = this.dynamics(stepOut.observation, reward, observation, action);
+          } else {
+            p_sp_s_r = s.dynamics(stepOut.observation, reward, observation, action);
+          }
+
           v_pi_s += p_srsa * p_sp_s_r * (reward + 1 * v_pi_sp);
         }