Train the agent over a specified number of episodes.
def train(self, n_episodes=2000, N=5, max_steps_per_episode=200, train_on_old_models=False, start_learn_after=500, use_permutation=False):
    """!
    @brief Train the agent over a specified number of episodes
    @param n_episodes Number of training episodes
    @param N Frequency of learning steps
    @param max_steps_per_episode Maximum number of steps per episode
    @param train_on_old_models Whether to load existing models
    @param start_learn_after Number of steps after which learning should begin
    @param use_permutation Whether observations should be permuted (can stabilize training but costs time)
    """
    if train_on_old_models:
        self.ppo_agent.load_models()
    self.total_steps = 0

    for episode in range(n_episodes):
        obs = self.env.reset()
        isTerminal = False
        total_reward = 0
        steps = 0
        last_trailer_id = None
        last_rack_id = None
        last_action = None
        positive_actions_reward = 0
        while not isTerminal:
            bool_heuristic = False
            reward = 0

            # Exponentially decaying exploration rate (epsilon-greedy)
            epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                np.exp(-self.epsilon_decay * self.total_steps)

            if np.random.random() < epsilon:
                # Exploration: pick a random valid action, but still query the policy
                # so that prob, val and dist are available for the rollout buffer
                valid_actions = self.get_valid_actions(obs)
                if valid_actions:
                    high_level_action = np.random.choice(valid_actions)
                    high_level_action_str = self.action_mapping[high_level_action]

                    if use_permutation:
                        _, prob, val, dist = self.ppo_agent.choose_action(
                            permute_high_level_observation(np.random.permutation(10), obs))
                    else:
                        _, prob, val, dist = self.ppo_agent.choose_action(obs)
                else:
                    # No known valid action: fall back to the policy's choice
                    if use_permutation:
                        obs_ = None
                        permutation = np.random.permutation(10)
                        permuted_obs = permute_high_level_observation(permutation, obs)
                        high_level_action, prob, val, dist = self.ppo_agent.choose_action(permuted_obs)
                    else:
                        high_level_action, prob, val, dist = self.ppo_agent.choose_action(obs)
                    high_level_action_str = self.action_mapping[high_level_action]
            else:
                # Exploitation: let the policy choose the action
                if use_permutation:
                    obs_ = None
                    permutation = np.random.permutation(10)
                    permuted_obs = permute_high_level_observation(permutation, obs)
                    high_level_action, prob, val, dist = self.ppo_agent.choose_action(permuted_obs)
                else:
                    high_level_action, prob, val, dist = self.ppo_agent.choose_action(obs)
                high_level_action_str = self.action_mapping[high_level_action]
            probs = dist.probs.detach().cpu().numpy()

            valid_actions = self.get_valid_actions(obs)

            # Abort the episode if the environment offers no valid action at all
            if not valid_actions:
                print(f"No valid actions possible for this state. Problem: {self.env.problem_name}")
                isTerminal = True
                reward -= 5000.0
                break
            # If unstack actions dominate the invalid-action statistics,
            # damp their probabilities and re-pick the action greedily
            if episode > 100 and self.total_steps > 500:
                total_invalid = sum(self.invalid_action_counts.values())
                if total_invalid > 0:
                    unstack_percentage = (self.invalid_action_counts[6] + self.invalid_action_counts[7]) / total_invalid
                    if unstack_percentage > 0.4:
                        unstack_idx = [6, 7]
                        scale_factor = 0.5
                        for idx in unstack_idx:
                            if idx < len(probs):
                                probs[idx] *= scale_factor

                        if np.sum(probs) > 0:
                            probs = probs / np.sum(probs)

                        high_level_action = np.argmax(probs)
                        high_level_action_str = self.action_mapping[high_level_action]
                        prob = probs[high_level_action]
            if self.debug and steps % 10 == 0:
                valid_action_names = [self.action_mapping[idx] for idx in valid_actions]
                print(f"Valid actions: {valid_action_names}")
            # Resample until an executable action is found; give up with a large
            # penalty once every high-level action has been tried
            tried_actions = set()
            while not self.env.check_action_execution(high_level_action_str, obs):
                self.invalid_action_counts[high_level_action] += 1
                tried_actions.add(high_level_action)

                if len(tried_actions) >= len(self.action_mapping):
                    print(f"All actions tried, none is valid. Problem: {self.env.problem_name}")
                    isTerminal = True
                    reward -= 10000.0
                    break

                probs[high_level_action] = 0.0

                if np.all(probs == 0):
                    untried_actions = [i for i in range(len(self.action_mapping)) if i not in tried_actions]
                    if untried_actions:
                        high_level_action = np.random.choice(untried_actions)
                    else:
                        print("No valid action found, stopping episode.")
                        isTerminal = True
                        reward -= 10000.0
                        break
                else:
                    high_level_action = np.argmax(probs)

                high_level_action_str = self.action_mapping[high_level_action]
                prob = probs[high_level_action]
            if not isTerminal:
                # Choose low-level parameters: heuristic first, MCTS as a fallback
                action_name, params = decide_parameters(obs, high_level_action_str)

                if action_name == "None":
                    root = MCTSNode(state=self.env.state, action=(high_level_action_str, None))
                    mcts = MCTS(root, depth=5, n_simulations=60)
                    best_node = mcts.search()

                    if best_node:
                        params = best_node.action[1]
                    else:
                        bool_heuristic = True

                # Penalize immediately undoing the previous stack/unstack move
                # on the same rack/trailer pair
                params_check = list(params.values()) if isinstance(params, dict) else list(params)
                if params_check != []:
                    if (high_level_action == 4 and last_action == 6) or (high_level_action == 5 and last_action == 7) \
                            or (high_level_action == 6 and last_action == 4) or (high_level_action == 7 and last_action == 5):
                        if last_trailer_id == params_check[1] and last_rack_id == params_check[0]:
                            reward -= 200.0
                    last_action = high_level_action
                    if last_action in [4, 5, 6, 7]:
                        last_trailer_id = params_check[1]
                        last_rack_id = params_check[0]
                obs_, reward_main, isTerminal = self.env.step(high_level_action_str, params)
                reward += reward_main

                # If neither the heuristic nor MCTS produced parameters, chain a
                # heuristic follow-up action and shape the reward accordingly
                if bool_heuristic:
                    if high_level_action_str == "right_unstack_rack":
                        action_name, params = decide_parameters(obs_, "deliver_to_hangar")
                        if action_name != "None":
                            obs_, reward_heuristic, isTerminal = self.env.step("deliver_to_hangar", params)
                            reward += reward_heuristic
                            reward += 50.0
                        else:
                            reward -= 20.0
                    elif high_level_action_str == "left_unstack_rack":
                        action_name, params = decide_parameters(obs_, "load_beluga")
                        if action_name != "None":
                            obs_, reward_heuristic, isTerminal = self.env.step("load_beluga", params)
                            reward += reward_heuristic
                            reward += 50.0
                        else:
                            reward -= 20.0
                    else:
                        reward += 5.0
                print_action = high_level_action_str

                self.ppo_agent.remember(obs, high_level_action, prob, val, reward, isTerminal)

                if reward > 0:
                    positive_actions_reward += reward
                if obs_ is not None:
                    obs = obs_
                total_reward += reward
                steps += 1
                self.total_steps += 1

                # Trigger a PPO update every N*2 steps once enough experience is collected
                if self.total_steps >= start_learn_after and self.total_steps % (N * 2) == 0:
                    self.ppo_agent.learn()
                    self.learn_iters += 1

                # Hard episode cut-off on step budget or catastrophic reward
                if steps >= self.env.get_max_steps() or total_reward <= -10000:
                    isTerminal = True
        # End-of-episode bookkeeping
        self.episode_rewards.append(total_reward)
        avg_reward = np.mean(self.episode_rewards[-10:])
        self.avg_rewards.append(avg_reward)
        self.steps_per_episode.append(steps)

        # Reset exploration if the last few episodes were very poor
        if len(self.episode_rewards) >= 6:
            recent_rewards = self.episode_rewards[-6:]
            if all(reward <= -10000 for reward in recent_rewards):
                print("\nVery poor performance over the last 6 episodes. Resetting epsilon to explore more.")
                self.epsilon_start = 0.9
                self.epsilon_decay = 0.00001
                self.total_steps = 0
        if avg_reward > self.best_score:
            self.ppo_agent.save_models()
            self.best_score = avg_reward

        solved = self.env.state.is_terminal()
        status_symbol = "✅" if solved else " "

        print(f'{status_symbol} episode {episode}, score {total_reward:.1f}, avg score {avg_reward:.1f}, Best avg score {self.best_score:.1f}',
              f'time_steps {steps}/{self.env.get_max_steps()}, learn_iters {self.learn_iters}, positive reward {positive_actions_reward:.1f}, problem {self.env.problem_name}, {self.env.base_index}')

        if episode > 0 and episode % 100 == 0:
            self.ppo_agent.save_models()
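For orientation, a minimal usage sketch of this training loop follows. The trainer class name and its construction are assumptions for illustration only; the train() signature and the meaning of its parameters are the ones documented above.

# Hypothetical usage sketch -- only the train() signature is taken from the
# listing above; the trainer/environment construction below is assumed.
trainer = HighLevelTrainer(env=env, debug=False)  # assumed wrapper holding ppo_agent, env, and epsilon settings
trainer.train(
    n_episodes=2000,            # number of training episodes
    N=5,                        # a PPO update is triggered every N*2 environment steps
    train_on_old_models=False,  # start from freshly initialized PPO networks
    start_learn_after=500,      # collect this many steps before the first update
    use_permutation=True,       # permute observations (slower, but can stabilize training)
)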