Adjust the rewards
parent 88f13d7d0d
commit 3c0fe20132
@@ -59,7 +59,7 @@ class ResourceGenerator:
         world.delete_entity(resource_ent, immediate=True)
         if world.has_component(player, LearningComponent):
             learning = world.component_for_entity(player, LearningComponent)
-            learning.reward = 10
+            learning.reward += 10
             learning.score += 1
             ResourceGenerator.resources_amount -= 1
             if ResourceGenerator.resources_amount == 0:
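Note on the change above: with `=`, a pickup reward would overwrite any reward already accumulated in the same step, while `+=` lets several reward events stack. A minimal sketch of the idea, not project code (the `LearningComponent` fields are assumptions inferred from the attributes visible in this diff):

    # Sketch: why accumulation matters when two reward events can fire
    # before the learner consumes the value.
    class LearningComponent:
        def __init__(self):
            self.reward = 0
            self.score = 0
            self.done = False

    learning = LearningComponent()
    learning.reward += 10  # resource picked up
    learning.reward += 1   # move also matched the A* suggestion (see below)
    assert learning.reward == 11  # `=` would have kept only the last event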
|
@@ -20,7 +20,7 @@ class ConsumptionSystem(esper.Processor):
         # If no item was picked up
         if cons.last_inventory_state == inventory.total_items_count():
             learning: LearningComponent = self.world.component_for_entity(ent, LearningComponent)
-            learning.reward = -10
+            learning.reward += -10
             learning.done = True
             cons.last_inventory_state = inventory.total_items_count()
         else:
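Same fix on the penalty path: `learning.reward += -10` (equivalent to `learning.reward -= 10`) subtracts from the running reward instead of discarding whatever was already earned this step. A tiny sketch, not project code:

    # Sketch: `+= -10` is just `-= 10`, written in the additive style so
    # the penalty combines with any reward already granted this step.
    reward = 1           # e.g. a shaping bonus earned earlier in the step
    reward += -10        # failed pickup
    assert reward == -9  # with `=` the -10 would have erased the +1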
|
@@ -17,7 +17,7 @@ from survival.model import LinearQNetwork, QTrainer
 MAX_MEMORY = 100_000
 BATCH_SIZE = 1000
 LR = 0.001
-LEARN = True
+LEARN = False


 class NeuralSystem(esper.Processor):
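`LEARN` is flipped to `False`, so this commit ships with training disabled; how the flag is consumed is outside this diff. A common pattern for such a switch, sketched here with assumed method names (only `remember` is confirmed by this diff; `train_short_memory` is hypothetical):

    # Sketch, assumed usage: gate weight updates on LEARN so the agent
    # can run inference-only with the trained LinearQNetwork.
    LEARN = False

    def on_step(agent, transition):
        if not LEARN:
            return                              # exploit only; no updates
        agent.remember(*transition)             # push into replay memory
        agent.train_short_memory(*transition)   # hypothetical online update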
@@ -34,6 +34,7 @@ class NeuralSystem(esper.Processor):
         self.starting_epsilon = -1
         self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)
         self.utils = LearningUtils()
+        self.best_action = None

     def remember(self, state, action, reward, next_state, done):
         self.memory.append((state, action, reward, next_state, done))
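`remember` pushes transitions into `self.memory`; together with `MAX_MEMORY = 100_000` and `BATCH_SIZE = 1000` from the constants above, this points at a bounded replay buffer. A sketch of that structure (the buffer's construction is not shown in this diff, so `deque(maxlen=MAX_MEMORY)` and the sampling rule are assumptions):

    import random
    from collections import deque

    MAX_MEMORY = 100_000
    BATCH_SIZE = 1000

    memory = deque(maxlen=MAX_MEMORY)  # oldest transitions fall off

    def remember(state, action, reward, next_state, done):
        memory.append((state, action, reward, next_state, done))

    def sample_batch():
        # Uniformly sample once enough transitions exist, else use all.
        if len(memory) > BATCH_SIZE:
            return random.sample(memory, BATCH_SIZE)
        return list(memory)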
@@ -68,10 +69,15 @@ class NeuralSystem(esper.Processor):
                                                  TimeComponent, LearningComponent):
             if not learning.made_step:
                 learning.reset()
+                self.best_action = None

                 # Get the closest resource | [entity, path, cost]
                 resource: [int, list, int] = self.game_map.find_nearest_resource(self.world, ent, pos)

+                if resource is not None:
+                    # If resource was found get the best move chosen by A*
+                    self.best_action = resource[1][0]
+
                 # Get current entity state
                 old_state = get_state(self, ent, resource)
                 # Predict the action
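The inline comment documents `find_nearest_resource` as returning `[entity, path, cost]`, so `resource[1][0]` is the first move of the A* path. (As an aside, `[int, list, int]` is not a valid typing annotation; `tuple[int, list, int]` would be the checked form.) A sketch of the unpacking under that assumed shape:

    # Sketch, assumed return shape [entity, path, cost]:
    resource = [42, ["up", "up", "left"], 3]  # hypothetical A* result

    entity, path, cost = resource
    best_action = path[0]       # == resource[1][0]: the next step toward
    assert best_action == "up"  # the nearest resource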
@@ -81,6 +87,10 @@ class NeuralSystem(esper.Processor):
             # Perform the action
             act = Action.perform(self.world, ent, Action.from_array(action))
             self.utils.append_action(act, pos)

+            # Add reward if chosen action was the best action
+            if act == self.best_action:
+                learning.reward += 1
+                continue

             # Wait for the action to complete
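The new block is reward shaping: the A* move acts as a teacher signal, paying +1 whenever the network's chosen action matches it, on top of the sparse +10 pickup / -10 miss rewards adjusted above. A worked example of one step's accumulated reward under these rules:

    # Sketch: one step where the agent follows A* and lands on a resource.
    reward = 0
    reward += 1    # act == best_action (shaping bonus added here)
    reward += 10   # resource picked up (ResourceGenerator hunk above)
    assert reward == 11  # the accumulating `+=` keeps both signals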
|