FoPra Beluga Challenge - Reinforcement Learning v1.0
Deep Reinforcement Learning solution for the Beluga Challenge shipping container optimization problem using PPO and MCTS
rl.training.trainer.Trainer Class Reference

Main training orchestrator for the Beluga Challenge. More...

Public Member Functions

 __init__ (self, Env env, PPOAgent ppo_agent, mcts_params=None, debug=False)
 Initialize the trainer.
 
 get_valid_actions (self, obs)
 Check which actions are valid in the current state.
 
 train (self, n_episodes=2000, N=5, max_steps_per_episode=200, train_on_old_models=False, start_learn_after=500, use_permutation=False)
 Train the agent over a specified number of episodes.
 
 evaluateModel (self, n_eval_episodes=10, max_steps_per_episode=200, plot=False)
 
 evaluateProblem (self, problem, max_steps=2000, loop_detection=True, exploration_rate=0.1, save_to_file=False)
 

Public Attributes

 env = env
 
PPOAgent ppo_agent = ppo_agent
 
 mcts = None
 
 debug = debug
 
list episode_rewards = []
 
list avg_rewards = []
 
list steps_per_episode = []
 
int best_score = -90000
 
list score_history = []
 
int learn_iters = 0
 
dict invalid_action_counts = {i: 0 for i in range(8)}
 
float epsilon_start = 0.9
 
float epsilon_end = 0.2
 
float epsilon_decay = 0.00001
 
int total_steps = 0
 
dict action_mapping
 

Protected Member Functions

 _save_results_to_file (self, problem, steps, max_steps, is_terminal, action_trace, action_counts, optimized_steps, original_steps, execution_time, formatted_time)
 
 _format_parameters (self, action, params)
 

Detailed Description

Main training orchestrator for the Beluga Challenge.

This class manages the training process, coordinating between the RL agent, MCTS, and environment components.
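
A minimal usage sketch; the import path is taken from the module name above, while the construction of the Env and PPOAgent instances is project-specific and assumed to have happened elsewhere:

    from rl.training.trainer import Trainer

    # env and ppo_agent are assumed to be fully configured instances
    trainer = Trainer(env, ppo_agent, mcts_params=None, debug=False)

    # Train with the documented defaults, then evaluate the saved model
    trainer.train(n_episodes=2000, N=5, max_steps_per_episode=200)
    trainer.evaluateModel(n_eval_episodes=10, max_steps_per_episode=200, plot=False)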

Constructor & Destructor Documentation

◆ __init__()

rl.training.trainer.Trainer.__init__ ( self,
Env env,
PPOAgent ppo_agent,
mcts_params = None,
debug = False )

Initialize the trainer.

Parameters
    env          Environment instance
    ppo_agent    PPO agent for high-level decisions
    mcts_params  Parameters for MCTS (optional)
    debug        Enable debug output
25 def __init__(self, env: Env, ppo_agent: PPOAgent, mcts_params=None, debug=False):
26 """!
27 @brief Initialize the trainer
28 @param env Environment instance
29 @param ppo_agent PPO agent for high-level decisions
30 @param mcts_params Parameters for MCTS (optional)
31 @param debug Enable debug output
32 """
33 self.env = env
34 self.ppo_agent: PPOAgent = ppo_agent # High-Level-Agent
35 self.mcts = None
36 self.debug = debug # Debug mode for additional output
37
38 # Tracking metrics
39 self.episode_rewards = []
40 self.avg_rewards = []
41 self.steps_per_episode = []
42 self.best_score = -90000
43 self.score_history = []
44 self.learn_iters = 0
45 self.invalid_action_counts = {i: 0 for i in range(8)} # Counter for invalid actions by type
46
47 # Exploration parameters
48 self.epsilon_start = 0.9 # Initial value for epsilon (exploration probability)
49 self.epsilon_end = 0.2 # Final value for epsilon
50 self.epsilon_decay = 0.00001 # Rate at which epsilon is reduced
51 self.total_steps = 0 # Total number of steps taken
52
53 # Number to Action Mapping
54 self.action_mapping = {
55 0 : "load_beluga",
56 1 : "unload_beluga",
57 2 : "get_from_hangar",
58 3 : "deliver_to_hangar",
59 4 : "left_stack_rack",
60 5 : "right_stack_rack",
61 6 : "left_unstack_rack",
62 7 : "right_unstack_rack"
63 }
64

Member Function Documentation

◆ _format_parameters()

rl.training.trainer.Trainer._format_parameters ( self,
action,
params )
protected
@brief Formats parameters for better readability in output
@param action The action name
@param params The parameters to format
@return Dictionary with formatted parameters

Converts tuples and lists into meaningful dictionary formats.
Filters out None values and 'none' keys.
799 def _format_parameters(self, action, params):
800 """
801 @brief Formats parameters for better readability in output
802 @param action The action name
803 @param params The parameters to format
804 @return Dictionary with formatted parameters
805
806 Converts tuples and lists into meaningful dictionary formats.
807 Filters out None values and 'none' keys.
808 """
809 # If params is already a dictionary, filter out None values and 'none' keys
810 if isinstance(params, dict):
811 # Filter out None values and 'none' keys
812 filtered_params = {k: v for k, v in params.items()
813 if v is not None and k.lower() != 'none'}
814 return filtered_params
815
816 # If params is a list or tuple, convert depending on action
817 if isinstance(params, (list, tuple)):
818 if len(params) == 0:
819 return {}
820 elif action in ["left_stack_rack", "right_stack_rack"]:
821 if len(params) >= 2:
822 result = {"rack": params[0], "trailer": params[1]}
823 else:
824 result = {"rack": params[0] if len(params) > 0 else None}
825 # Filter out None values
826 return {k: v for k, v in result.items() if v is not None}
827 elif action in ["left_unstack_rack", "right_unstack_rack"]:
828 if len(params) >= 2:
829 result = {"rack": params[0], "trailer": params[1]}
830 else:
831 result = {"rack": params[0] if len(params) > 0 else None}
832 # Filter out None values
833 return {k: v for k, v in result.items() if v is not None}
834 elif action == "load_beluga":
835 if len(params) >= 1:
836 result = {"trailer": params[0]}
837 else:
838 result = {}
839 # Filter out None values
840 return {k: v for k, v in result.items() if v is not None}
841 elif action == "unload_beluga":
842 return {}
843 elif action in ["get_from_hangar", "deliver_to_hangar"]:
844 if len(params) >= 2:
845 result = {"hangar": params[0], "trailer": params[1]}
846 else:
847 result = {"hangar": params[0] if len(params) > 0 else None}
848 # Filter out None values
849 return {k: v for k, v in result.items() if v is not None}
850 else:
851 # Fallback for unknown actions
852 return {"params": params}
853
854 # Fallback for other types
855 return params
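
The following calls illustrate the conversion rules implemented above; the parameter values are hypothetical and only meant to show the resulting dictionaries:

    trainer._format_parameters("left_stack_rack", (3, 1))         # {'rack': 3, 'trailer': 1}
    trainer._format_parameters("deliver_to_hangar", ("h1", 2))    # {'hangar': 'h1', 'trailer': 2}
    trainer._format_parameters("unload_beluga", ())               # {}
    trainer._format_parameters("load_beluga", {"trailer": None})  # {} (None values are filtered out)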

◆ _save_results_to_file()

rl.training.trainer.Trainer._save_results_to_file ( self,
problem,
steps,
max_steps,
is_terminal,
action_trace,
action_counts,
optimized_steps,
original_steps,
execution_time,
formatted_time )
protected
@brief Saves the results of problem solving to a formatted TXT file
@param problem Path to the problem JSON file
@param steps Number of steps performed
@param max_steps Maximum number of steps
@param is_terminal Whether the problem was successfully solved
@param action_trace List of performed actions with parameters
@param action_counts Dictionary with action counts (after optimization)
@param optimized_steps Number of steps after optimization
@param original_steps Original number of steps
@param execution_time Required time in seconds
@param formatted_time Formatted time as string
719 def _save_results_to_file(self, problem, steps, max_steps, is_terminal, action_trace, action_counts, optimized_steps, original_steps, execution_time, formatted_time):
720 """
721 @brief Saves the results of problem solving to a formatted TXT file
722 @param problem Path to the problem JSON file
723 @param steps Number of steps performed
724 @param max_steps Maximum number of steps
725 @param is_terminal Whether the problem was successfully solved
726 @param action_trace List of performed actions with parameters
727 @param action_counts Dictionary with action counts (after optimization)
728 @param optimized_steps Number of steps after optimization
729 @param original_steps Original number of steps
730 @param execution_time Required time in seconds
731 @param formatted_time Formatted time as string
732 """
733 import os
734 from datetime import datetime
735
736 # Create output directory if it doesn't exist
737 output_dir = "results"
738 if not os.path.exists(output_dir):
739 os.makedirs(output_dir)
740
741 # Extract problem name for filename
742 problem_name = os.path.basename(problem).replace('.json', '')
743 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
744 filename = f"{output_dir}/solution_{problem_name}_{timestamp}.txt"
745
746 with open(filename, 'w', encoding='utf-8') as f:
747 # Header
748 f.write("="*70 + "\n")
749 f.write("BELUGA CHALLENGE - LÖSUNGSPROTOKOLL\n")
750 f.write("="*70 + "\n\n")
751
752 # Problem information
753 f.write(f"Problem: {problem}\n")
754 f.write(f"Lösungsdatum: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}\n")
755 f.write(f"Anzahl Schritte: {steps}/{max_steps}\n")
756 f.write(f"Erfolgreicher Abschluss: {'Ja' if is_terminal else 'Nein - Maximale Schritte erreicht'}\n")
757 f.write(f"Benötigte Zeit: {formatted_time}\n\n")
758
759 # Action statistics (after optimization)
760 f.write("="*70 + "\n")
761 f.write("AKTIONSSTATISTIK (NACH OPTIMIERUNG)\n")
762 f.write("="*70 + "\n\n")
763
764 for action, count in action_counts.items():
765 percentage = count/len(action_trace)*100 if len(action_trace) > 0 else 0
766 f.write(f"{action:<25}: {count:>4} ({percentage:>5.1f}%)\n")
767
768 # Optimization
769 f.write(f"\n{'='*70}\n")
770 f.write("OPTIMIERUNG\n")
771 f.write(f"{'='*70}\n\n")
772 f.write(f"Ursprüngliche Anzahl Schritte: {original_steps}\n")
773 f.write(f"Optimierte Anzahl Schritte: {optimized_steps}\n")
774 optimization_percentage = (1 - optimized_steps/original_steps) * 100 if original_steps > 0 else 0
775 f.write(f"Optimierung/Reduktion: {optimization_percentage:.2f}%\n\n")
776
777 # Optimized action sequence
778 f.write("="*70 + "\n")
779 f.write("OPTIMIERTE AKTIONSSEQUENZ\n")
780 f.write("="*70 + "\n\n")
781
782 for i, (action, params) in enumerate(action_trace, 1):
783 # Format parameters for better readability
784 formatted_params = self._format_parameters(action, params)
785
786 # Format output
787 if formatted_params:
788 params_str = ", ".join([f"{k}={v}" for k, v in formatted_params.items()])
789 f.write(f"{i:>3}: {action:<25} | Parameter: {params_str}\n")
790 else:
791 f.write(f"{i:>3}: {action:<25} | Parameter: -\n")
792
793 f.write(f"\n{'='*70}\n")
794 f.write("ENDE DES PROTOKOLLS\n")
795 f.write(f"{'='*70}\n")
796
797 print(f"\nErgebnisse wurden gespeichert in: {filename}")
798
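
For orientation, the listing above writes a file under results/ whose name combines the problem name and a timestamp. A hypothetical call with placeholder inputs could look like this (paths and values are illustrative, not from a real run):

    trace = [("unload_beluga", {}), ("left_stack_rack", {"rack": 0, "trailer": 1})]
    counts = {"unload_beluga": 1, "left_stack_rack": 1}
    trainer._save_results_to_file(
        "problems/problem_01.json", steps=2, max_steps=2000, is_terminal=True,
        action_trace=trace, action_counts=counts, optimized_steps=2,
        original_steps=2, execution_time=1.5, formatted_time="1.50 Sekunden")
    # -> writes results/solution_problem_01_<timestamp>.txt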

◆ evaluateModel()

rl.training.trainer.Trainer.evaluateModel ( self,
n_eval_episodes = 10,
max_steps_per_episode = 200,
plot = False )
@brief Evaluates the model over a specific number of episodes
@param n_eval_episodes Number of episodes for evaluation (default: 10)
@param max_steps_per_episode Maximum steps per episode (default: 200)
@param plot Whether to plot results (default: False)
@return tuple containing average reward, standard deviation, and steps data
327 def evaluateModel(self, n_eval_episodes=10, max_steps_per_episode=200, plot = False):
328 """
329 @brief Evaluates the model over a specific number of episodes
330 @param n_eval_episodes Number of episodes for evaluation (default: 10)
331 @param max_steps_per_episode Maximum steps per episode (default: 200)
332 @param plot Whether to plot results (default: False)
333 @return tuple containing average reward, standard deviation, and steps data
334 """
335 self.ppo_agent.load_models()
336 total_rewards = []
337 steps_list = []
338
339 for ep in range(n_eval_episodes):
340 obs = self.env.reset()
341 isTerminal = False
342 total_reward = 0
343 steps = 0
344 last_trailer_id = None
345 last_rack_id = None
346 last_action = None
347
348 while not isTerminal and steps < max_steps_per_episode:
349
350 # Choose action without learning
351 _, _, _, dist = self.ppo_agent.choose_action(obs)
352
353 probs = dist.probs.detach().cpu().numpy()
354 high_level_action = np.argmax(probs)
355 high_level_action_str = self.action_mapping[high_level_action]
356
357 # Check which actions are valid
358 valid_actions = self.get_valid_actions(obs)
359
360 # If no valid actions exist, end episode
361 if not valid_actions:
362 print(f"[Eval] Keine gültigen Aktionen für diesen Zustand. Problem: {self.env.problem_name}")
363 isTerminal = True
364 break
365
366 # Try at most all actions
367 tried_actions = set()
368 while not self.env.check_action_execution(high_level_action_str, obs):
369 tried_actions.add(high_level_action)
370
371 if len(tried_actions) >= len(self.action_mapping):
372 print(f"[Eval] Alle Aktionen probiert, keine ist gültig. Problem: {self.env.problem_name}")
373 isTerminal = True
374 break
375
376 # Set probability of current action to 0
377 probs[high_level_action] = 0.0
378
379 # If all remaining probabilities are 0
380 if np.all(probs == 0):
381 # Choose randomly from actions not yet tried
382 untried_actions = [i for i in range(len(self.action_mapping)) if i not in tried_actions]
383 if untried_actions:
384 high_level_action = np.random.choice(untried_actions)
385 else:
386 isTerminal = True
387 break
388 else:
389 # Choose action with highest probability
390 high_level_action = np.argmax(probs)
391
392 high_level_action_str = self.action_mapping[high_level_action]
393
394 if isTerminal:
395 break
396
397 # Low-Level Agent
398 action_name, params = decide_parameters(obs, high_level_action_str)
399 if action_name == "None":
400 root = MCTSNode(state=self.env.state, action=(high_level_action_str, None))
401 mcts = MCTS(root, depth=3, n_simulations=3) # Reduced parameters for faster execution
402 best_node = mcts.search()
403 if best_node:
404 params = best_node.action[1]
405
406 # Loop prevention
407 params_check = list(params.values()) if isinstance(params, dict) else list(params)
408 if params_check != []:
409 if (high_level_action == 4 and last_action == 6) or (high_level_action == 5 and last_action == 7) \
410 or (high_level_action == 6 and last_action == 4) or (high_level_action == 7 and last_action == 5):
411 if last_trailer_id == params_check[1] and last_rack_id == params_check[0]:
412 total_reward -= 1000.0
413 last_action = high_level_action
414 if last_action in [4, 5, 6, 7]:
415 last_trailer_id = params_check[1]
416 last_rack_id = params_check[0]
417
418 obs, reward, isTerminal = self.env.step(high_level_action_str, params)
419 total_reward += reward
420 steps += 1
421
422 total_rewards.append(total_reward)
423 steps_list.append(steps)
424 print(f"[Eval] Episode {ep+1}: Reward = {total_reward:.2f}, Steps = {steps}")
425
426 avg_reward = np.mean(total_rewards)
427 std_reward = np.std(total_rewards) # Standard deviation of rewards
428 avg_steps = np.mean(steps_list)
429
430 print(f"\n⮞ Durchschnittlicher Reward: {avg_reward:.2f} ± {std_reward:.2f}")
431 print(f"⮞ Durchschnittliche Schritte: {avg_steps:.2f}")
432
433 if plot:
434 plt.figure(figsize=(10, 5))
435 plt.subplot(2, 1, 1)
436 plt.plot(total_rewards, 'r-o', label='Episode Reward')
437 plt.fill_between(
438 range(len(total_rewards)),
439 np.array(total_rewards) - std_reward, # std_reward is already a scalar; np.std(std_reward) would collapse the band to zero
440 np.array(total_rewards) + std_reward,
441 color='red', alpha=0.1
442 )
443 plt.title('Model Evaluation Results')
444 plt.ylabel('Total Reward')
445 plt.legend()
446 plt.grid(True, linestyle='--', alpha=0.5)
447 plt.subplot(2, 1, 2)
448 plt.bar(range(len(steps_list)), steps_list, color='blue', alpha=0.6)
449 plt.xlabel('Episode')
450 plt.ylabel('Steps')
451 plt.grid(True, linestyle='--', alpha=0.3)
452 plt.tight_layout()
453 plt.show()
454 return avg_reward, std_reward, steps_list # return the values documented in the docstring
455
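
The retry loop above ("try at most all actions") also appears in evaluateProblem and train. A condensed, illustrative sketch of the pattern; the function name and the None return convention are not part of the class:

    import numpy as np

    def retry_until_executable(env, probs, action_mapping, obs):
        # Start from the most probable action and zero out actions the
        # environment rejects until an executable one remains (or none does).
        tried = set()
        action = int(np.argmax(probs))
        while not env.check_action_execution(action_mapping[action], obs):
            tried.add(action)
            if len(tried) >= len(action_mapping):
                return None  # every action was rejected in this state
            probs[action] = 0.0
            if np.all(probs == 0):
                untried = [i for i in range(len(action_mapping)) if i not in tried]
                if not untried:
                    return None
                action = int(np.random.choice(untried))
            else:
                action = int(np.argmax(probs))
        return action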

◆ evaluateProblem()

rl.training.trainer.Trainer.evaluateProblem ( self,
problem,
max_steps = 2000,
loop_detection = True,
exploration_rate = 0.1,
save_to_file = False )
@brief Solves a specific problem with the trained model
@param problem Path to the problem JSON file
@param max_steps Maximum number of steps to avoid infinite loops (default: 2000)
@param loop_detection Enables detection and avoidance of action loops (default: True)
@param exploration_rate Probability of choosing a random action to break out of loops (default: 0.1)
@param save_to_file Saves results to TXT file (default: False)
@return Tuple (is_terminal, number of actions after post-processing, list of visited state hashes)
456 def evaluateProblem(self, problem, max_steps=2000, loop_detection=True, exploration_rate=0.1, save_to_file=False):
457 """
458 @brief Solves a specific problem with the trained model
459 @param problem Path to the problem JSON file
460 @param max_steps Maximum number of steps to avoid infinite loops (default: 2000)
461 @param loop_detection Enables detection and avoidance of action loops (default: True)
462 @param exploration_rate Probability of choosing a random action to break out of loops (default: 0.1)
463 @param save_to_file Saves results to TXT file (default: False)
464 @return Tuple (is_terminal, number of actions after post-processing, list of visited state hashes)
465 """
466 import time
467
468 # Start time measurement
469 start_time = time.time()
470
471 obs = self.env.reset_specific_problem(problem)
472 self.ppo_agent.load_models()
473
474 isTerminal = False
475 action_trace = []
476 steps = 0
477
478 # List to capture hash values of all visited states
479 visited_states = []
480 # Store hash value of environment state instead of observation
481 visited_states.append(hash(str(self.env.state)))
482
483 # For loop detection
484 action_history = []
485 repetition_count = {}
486 last_action = None
487
488 # Temperature for Boltzmann exploration (increases with repeated actions)
489 temperature = 1.0
490
491 print("Problem wird gelöst: " + problem)
492
493 while not isTerminal and steps < max_steps:
494 steps += 1
495 # Get action probabilities from agent
496 _, _, _, dist = self.ppo_agent.choose_action(obs)
497 probs = dist.probs.detach().cpu().numpy()
498
499 # Loop detection: Check if we're stuck in an action loop
500 if loop_detection and len(action_history) >= 6:
501 # Check last 6 actions for repeated patterns
502 last_6_actions = ''.join([str(a) for a in action_history[-6:]])
503 for pattern_length in [2, 3]: # Search for 2- or 3-patterns
504 if len(last_6_actions) >= pattern_length*2:
505 pattern = last_6_actions[-pattern_length*2:-pattern_length]
506 if pattern == last_6_actions[-pattern_length:]:
507 #print(f"[LOOP DETECTED] Muster: {pattern}")
508 # Increase temperature to break out of loop
509 temperature = min(10.0, temperature * 1.5) # Increase temperature, but not above 10
510 #print(f"Temperatur auf {temperature:.2f} erhöht")
511
512 # Decide whether to explore (random action) or exploit (best action)
513 if np.random.random() < exploration_rate or temperature > 1.5: # Increased exploration at high temperature
514 # Exploration: Choose action based on Boltzmann distribution or randomly
515 valid_actions = self.get_valid_actions(obs)
516 if valid_actions:
517 if temperature > 1.2:
518 # Boltzmann exploration with current temperature
519 # Normalize probabilities and apply temperature
520 valid_probs = np.array([probs[a] for a in valid_actions])
521 if np.sum(valid_probs) > 0:
522 scaled_probs = np.exp(np.log(valid_probs + 1e-10) / temperature)
523 scaled_probs = scaled_probs / np.sum(scaled_probs)
524 high_level_action = np.random.choice(valid_actions, p=scaled_probs)
525 #print(f"[BOLTZMANN EXPLORATION] Temp={temperature:.2f}")
526 else:
527 high_level_action = np.random.choice(valid_actions)
528 else:
529 # Simple random exploration
530 high_level_action = np.random.choice(valid_actions)
531
532 high_level_action_str = self.action_mapping[high_level_action]
533 #print(f"[EXPLORATION] Wähle: {high_level_action_str}")
534 else:
535 # No valid actions available
536 print(f"Keine gültigen Aktionen für diesen Zustand möglich. Problem: {problem}")
537 return
538 else:
539 # Exploitation: Normal process with best action
540 # Probabilities were already retrieved above
541
542 # Check which actions are valid
543 valid_actions = self.get_valid_actions(obs)
544
545 # If no valid actions exist, end episode
546 if not valid_actions:
547 print(f"Keine gültigen Aktionen für diesen Zustand möglich. Problem: {problem}")
548 return
549
550 # Choose best valid action from available ones
551 # Create mask for valid actions
552 valid_mask = np.zeros_like(probs)
553 for valid_action in valid_actions:
554 valid_mask[valid_action] = 1
555
556 # Multiply probabilities with mask and choose best
557 masked_probs = probs * valid_mask
558 if np.sum(masked_probs) > 0:
559 high_level_action = np.argmax(masked_probs)
560 else:
561 # Fallback: Choose randomly from valid actions
562 high_level_action = np.random.choice(valid_actions)
563
564 high_level_action_str = self.action_mapping[high_level_action]
565
566 # Try at most all actions
567 tried_actions = set()
568 while not self.env.check_action_execution(high_level_action_str, obs):
569 tried_actions.add(high_level_action)
570
571 if len(tried_actions) >= len(self.action_mapping):
572 print(f"Alle Aktionen probiert, keine ist gültig. Problem: {problem}")
573 return
574
575 # Set probability of current action to 0
576 probs[high_level_action] = 0.0
577
578 # If all remaining probabilities are 0
579 if np.all(probs == 0):
580 # Choose randomly from actions not yet tried
581 untried_actions = [i for i in range(len(self.action_mapping)) if i not in tried_actions]
582 if untried_actions:
583 high_level_action = np.random.choice(untried_actions)
584 else:
585 print("Keine gültige Aktion verfügbar. PROBLEM STUCK!")
586 return
587 else:
588 # Choose action with highest probability
589 high_level_action = np.argmax(probs)
590
591 high_level_action_str = self.action_mapping[high_level_action]
592
593 # Heuristic parameter decision
594 action_name, params = decide_parameters(obs, high_level_action_str)
595 if action_name == "None":
596 root = MCTSNode(state=self.env.state, action=(high_level_action_str, None))
597 mcts = MCTS(root, depth=3, n_simulations=3) # Reduced parameters for faster execution
598 best_node = mcts.search()
599 if best_node:
600 params = best_node.action[1]
601
602 # Execute action
603 obs, reward, isTerminal = self.env.step(high_level_action_str, params)
604
605 # Add current state as hash to list
606 visited_states.append(hash(str(self.env.state)))
607
608 # Store action and parameters
609 action_trace.append((high_level_action_str, params))
610
611 # For loop detection: Store action in history
612 action_history.append(high_level_action)
613
614 # Detect special patterns (e.g., alternating stack/unstack)
615 if last_action is not None:
616 action_pair = (last_action, high_level_action)
617 if action_pair in repetition_count:
618 repetition_count[action_pair] += 1
619 # If pattern is repeated too often, increase temperature
620 if repetition_count[action_pair] > 3: # After 3 repetitions
621 temperature = min(5.0, temperature + 0.5)
622 #print(f"[PATTERN DETECTED] {self.action_mapping[action_pair[0]]} -> {self.action_mapping[action_pair[1]]}")
623 #print(f"Temperatur auf {temperature:.2f} erhöht")
624 else:
625 repetition_count[action_pair] = 1
626
627 # Store current action for next iteration
628 last_action = high_level_action
629
630 # Cool down temperature over time if no patterns are detected
631 if temperature > 1.0:
632 temperature = max(1.0, temperature - 0.1)
633
634 # Output results
635 print("\n" + "="*50)
636 print(f"ERGEBNIS FÜR PROBLEM: {problem}")
637 print(f"Anzahl Schritte: {steps}/{max_steps}")
638 print(f"Erfolgreicher Abschluss: {'Ja' if isTerminal else 'Nein - Maximale Schritte erreicht'}")
639 print("="*50)
640
641 # Statistics of actions
642 action_counts = {}
643 for action, _ in action_trace:
644 if action in action_counts:
645 action_counts[action] += 1
646 else:
647 action_counts[action] = 1
648
649 print("\nAktionsstatistik:")
650 for action, count in action_counts.items():
651 print(f"{action}: {count} ({count/len(action_trace)*100:.1f}%)")
652
653 initial_state_count = len(visited_states)
654 # Loop-Detection and removal of unnecessary states
655 if loop_detection:
656 state_count = len(visited_states)
657
658 for i in range(state_count):
659 if i >= state_count:
660 break
661 index = -1
662 for j in range(1, state_count - i -1):
663 #print(j+i)
664 if j + i >= len(visited_states):
665 break
666 if visited_states[i] == visited_states[j + i]:
667 index = j + i
668 if index != -1:
669 del visited_states[i : index]
670 del action_trace[i : index]
671 state_count -= (index - i - 1)
672
673 print("\n" + "="*50)
674 print("Anzahl der Aktionen nach Post-Processing:", len(action_trace), "\nOptimierung/Reduktion:" , f"{(1 - len(action_trace)/steps) * 100: .2f}", "%")
675 print("="*50)
676
677 # End time measurement
678 end_time = time.time()
679 execution_time = end_time - start_time
680
681 # Format time readably
682 def format_time(seconds):
683 if seconds < 60:
684 return f"{seconds:.2f} Sekunden"
685 elif seconds < 3600:
686 minutes = int(seconds // 60)
687 secs = seconds % 60
688 return f"{minutes} Min {secs:.1f} Sek"
689 else:
690 hours = int(seconds // 3600)
691 minutes = int((seconds % 3600) // 60)
692 secs = seconds % 60
693 return f"{hours} Std {minutes} Min {secs:.1f} Sek"
694
695 formatted_time = format_time(execution_time)
696 print(f"\nBenötigte Zeit: {formatted_time}")
697
698 # Calculate optimized action statistics
699 optimized_action_counts = {}
700 for action, _ in action_trace:
701 if action in optimized_action_counts:
702 optimized_action_counts[action] += 1
703 else:
704 optimized_action_counts[action] = 1
705
706 print("\nOptimierte Aktionsstatistik:")
707 for action, count in optimized_action_counts.items():
708 percentage = count/len(action_trace)*100 if len(action_trace) > 0 else 0
709 print(f"{action}: {count} ({percentage:.1f}%)")
710
711 # Save results to file if desired
712 if save_to_file:
713 self._save_results_to_file(problem, steps, max_steps, isTerminal, action_trace, optimized_action_counts, len(action_trace), steps, execution_time, formatted_time)
714
715 # Return extended with visited_states (all visited states)
716 return isTerminal, len(action_trace), visited_states
717
718
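
The temperature-scaled exploration used above can be read as a Boltzmann (softmax) re-weighting of the policy probabilities over the valid actions. A condensed sketch with the same 1e-10 numerical guard as in the listing; the function name is illustrative:

    import numpy as np

    def boltzmann_pick(probs, valid_actions, temperature):
        # Higher temperature flattens the distribution, making loop-breaking
        # choices more likely; temperature 1.0 essentially reproduces the
        # policy restricted to the valid actions.
        valid_probs = np.array([probs[a] for a in valid_actions])
        if valid_probs.sum() <= 0:
            return int(np.random.choice(valid_actions))
        scaled = np.exp(np.log(valid_probs + 1e-10) / temperature)
        scaled = scaled / scaled.sum()
        return int(np.random.choice(valid_actions, p=scaled))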

◆ get_valid_actions()

rl.training.trainer.Trainer.get_valid_actions ( self,
obs )

Check which actions are valid in the current state.

Parameters
    obs  Current observation
Returns
    List of valid action indices
65 def get_valid_actions(self, obs):
66 """!
67 @brief Check which actions are valid in the current state
68 @param obs Current observation
69 @return List of valid action indices
70 """
71 valid_actions = []
72 for action_idx in range(len(self.action_mapping)):
73 if self.env.check_action_execution(self.action_mapping[action_idx], obs):
74 valid_actions.append(action_idx)
75 return valid_actions
76
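
A short usage example; the second line mirrors the debug output in train() and maps the returned indices back to action names:

    valid = trainer.get_valid_actions(obs)
    valid_names = [trainer.action_mapping[idx] for idx in valid]
    # e.g. valid == [1, 4] would correspond to ["unload_beluga", "left_stack_rack"]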

◆ train()

rl.training.trainer.Trainer.train ( self,
n_episodes = 2000,
N = 5,
max_steps_per_episode = 200,
train_on_old_models = False,
start_learn_after = 500,
use_permutation = False )

Train the agent over a specified number of episodes.

Parameters
    n_episodes             Number of training episodes
    N                      Frequency of learning steps
    max_steps_per_episode  Maximum number of steps per episode
    train_on_old_models    Whether to load existing models
    start_learn_after      After how many steps learning should begin
    use_permutation        Whether observations should be permuted (can stabilize training but costs time)
77 def train(self, n_episodes=2000, N=5, max_steps_per_episode = 200, train_on_old_models = False, start_learn_after = 500, use_permutation = False):
78 """!
79 @brief Train the agent over a specified number of episodes
80 @param n_episodes Number of training episodes
81 @param N Frequency of learning steps
82 @param max_steps_per_episode Maximum number of steps per episode
83 @param train_on_old_models Whether to load existing models
84 @param start_learn_after After how many steps learning should begin
85 @param use_permutation Whether observations should be permuted (can stabilize training but costs time)
86 """
87 if train_on_old_models:
88 self.ppo_agent.load_models() # Load the PPO agent's models
89 self.total_steps = 0
90
91 for episode in range(n_episodes):
92 obs = self.env.reset()
93 isTerminal = False
94 total_reward = 0
95 steps = 0
96 last_trailer_id = None
97 last_rack_id = None
98 last_action = None
99 positive_actions_reward = 0
100
101 while not isTerminal:
102 bool_heuristic = False
103 reward = 0
104 # High-Level decision (PPO)
105 # Calculate current epsilon value for exploration
106 epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
107 np.exp(-self.epsilon_decay * self.total_steps)
108
109 # Epsilon-Greedy strategy for exploration
110 if np.random.random() < epsilon:
111 # Explorative action: Choose a random valid action
112 valid_actions = self.get_valid_actions(obs)
113 if valid_actions:
114 high_level_action = np.random.choice(valid_actions)
115 high_level_action_str = self.action_mapping[high_level_action]
116
117 # To maintain PPO logic, we need the distribution
118 if use_permutation:
119 _, prob, val, dist = self.ppo_agent.choose_action(permute_high_level_observation(np.random.permutation(10), obs))
120 else:
121 _, prob, val, dist = self.ppo_agent.choose_action(obs)
122 else:
123 # If no valid actions are available, use normal strategies
124 if use_permutation:
125 obs_ = None # Reset observation for next iteration
126 permutation = np.random.permutation(10)
127 permuted_obs = permute_high_level_observation(permutation, obs)
128 high_level_action, prob, val, dist = self.ppo_agent.choose_action(permuted_obs)
129 else:
130 high_level_action, prob, val, dist = self.ppo_agent.choose_action(obs)
131 high_level_action_str = self.action_mapping[high_level_action]
132 else:
133 # Exploitative action: Use PPO policy
134 if use_permutation:
135 obs_ = None # Reset observation for next iteration
136 permutation = np.random.permutation(10)
137 permuted_obs = permute_high_level_observation(permutation, obs)
138 high_level_action, prob, val, dist = self.ppo_agent.choose_action(permuted_obs)
139 else:
140 high_level_action, prob, val, dist = self.ppo_agent.choose_action(obs)
141 high_level_action_str = self.action_mapping[high_level_action] # Action mapping
142
143 probs = dist.probs.detach().cpu().numpy()
144
145 # Check which actions are valid
146 valid_actions = self.get_valid_actions(obs)
147
148 # If no valid actions exist, error message and end episode
149 if not valid_actions:
150 print(f"Keine gültigen Aktionen für diesen Zustand möglich. Problem: {self.env.problem_name}")
151 isTerminal = True
152 reward -= 5000.0 # Reduced penalty since it's really impossible
153 break
154
155 # Balance actions - if too many unstack actions were used,
156 # reduce their probability in favor of other actions
157 if episode > 100 and self.total_steps > 500: # After a warm-up phase
158 total_invalid = sum(self.invalid_action_counts.values())
159 if total_invalid > 0:
160 unstack_percentage = (self.invalid_action_counts[6] + self.invalid_action_counts[7]) / total_invalid
161 if unstack_percentage > 0.4: # If more than 40% of invalid actions are unstacks
162 # Reduce probability for unstack actions
163 unstack_idx = [6, 7] # left_unstack_rack, right_unstack_rack
164 scale_factor = 0.5 # Scale probability down
165 for idx in unstack_idx:
166 if idx < len(probs):
167 probs[idx] *= scale_factor
168 # Normalize probabilities again
169 if np.sum(probs) > 0:
170 probs = probs / np.sum(probs)
171
172 # Update action choice
173 high_level_action = np.argmax(probs)
174 high_level_action_str = self.action_mapping[high_level_action]
175 prob = probs[high_level_action]
176
177 # Debug output for valid actions, if enabled
178 if self.debug and steps % 10 == 0: # Don't output too often
179 valid_action_names = [self.action_mapping[idx] for idx in valid_actions]
180 print(f"Gültige Aktionen: {valid_action_names}")
181
182 # Try at most all actions
183 tried_actions = set()
184 while not self.env.check_action_execution(high_level_action_str, obs):
185 # Count invalid actions for later analysis
186 self.invalid_action_counts[high_level_action] += 1
187 tried_actions.add(high_level_action)
188
189 if len(tried_actions) >= len(self.action_mapping):
190 print(f"Alle Aktionen probiert, keine ist gültig. Problem: {self.env.problem_name}")
191 isTerminal = True
192 reward -= 10000.0
193 break
194
195 probs[high_level_action] = 0.0
196
197 # If all remaining probabilities are 0, choose randomly from untried actions
198 if np.all(probs == 0):
199 untried_actions = [i for i in range(len(self.action_mapping)) if i not in tried_actions]
200 if untried_actions:
201 high_level_action = np.random.choice(untried_actions)
202 else:
203 print("No valid action found, stopping episode.")
204 isTerminal = True
205 reward -= 10000.0
206 break
207 else:
208 high_level_action = np.argmax(probs)
209
210 high_level_action_str = self.action_mapping[high_level_action]
211 prob = probs[high_level_action]
212
213 if not isTerminal:
214 # Low-Level-Agent:
215 # Heuristic parameter decision
216 action_name, params = decide_parameters(obs, high_level_action_str)
217 # If no heuristic found, use MCTS
218 if action_name == "None":
219 root = MCTSNode(state=self.env.state, action=(high_level_action_str, None))
220 mcts = MCTS(root, depth=5, n_simulations=60)
221 best_node = mcts.search()
222
223 if best_node:
224 params = best_node.action[1]
225 else:
226 bool_heuristic = True
227
228
229 # Check if there is a loop in the actions
230 params_check = list(params.values()) if isinstance(params, dict) else list(params)
231 if params_check != []:
232 if (high_level_action == 4 and last_action == 6) or (high_level_action == 5 and last_action == 7) \
233 or (high_level_action == 6 and last_action == 4) or (high_level_action == 7 and last_action == 5):
234 if last_trailer_id == params_check[1] and last_rack_id == params_check[0]:
235 reward -= 200.0
236 last_action = high_level_action
237 if last_action in [4, 5, 6, 7]:
238 last_trailer_id = params_check[1]
239 last_rack_id = params_check[0]
240
241
242 # Execute the action
243 obs_ , reward_main, isTerminal = self.env.step(high_level_action_str, params)
244 reward += reward_main
245
246 # If the special heuristic case applies, execute the follow-up action: left_unstack -> load_beluga; right_unstack -> deliver_to_hangar
247 if bool_heuristic:
248 # Reduce the reward for "unstack" actions if the follow-up action cannot be executed
249 if high_level_action_str == "right_unstack_rack":
250 action_name, params = decide_parameters(obs_, "deliver_to_hangar")
251 if not action_name == "None":
252 obs_ , reward_heuristic, isTerminal = self.env.step("deliver_to_hangar", params)
253 reward += reward_heuristic
254 reward += 50.0 # Increased reward for a successful action chain
255 else:
256 # Penalize unstacking without a follow-up action
257 reward -= 20.0
258 elif high_level_action_str == "left_unstack_rack":
259 action_name, params = decide_parameters(obs_, "load_beluga")
260 if not action_name == "None":
261 obs_ , reward_heuristic, isTerminal = self.env.step("load_beluga", params)
262 reward += reward_heuristic
263 reward += 50.0 # Increased reward for a successful action chain
264 else:
265 # Penalize unstacking without follow-up action
266 reward -= 20.0
267 else:
268 # Other heuristics receive smaller rewards
269 reward += 5.0
270
271
272 print_action = high_level_action_str # Store last action for debugging
273 # Store experience for PPO
274 self.ppo_agent.remember(obs, high_level_action, prob, val, reward, isTerminal)
275
276 if reward > 0:
277 positive_actions_reward += reward
278 if not obs_ is None:
279 obs = obs_
280 total_reward += reward
281 steps += 1
282 self.total_steps += 1
283
284 # PPO learning step at end of episode (optimized frequency)
285 if self.total_steps >= start_learn_after and self.total_steps % (N*2) == 0:
286 self.ppo_agent.learn()
287 self.learn_iters += 1
288
289 # debuglog(steps) # Debug output disabled
290 if steps >= self.env.get_max_steps() or total_reward <= -10000:
291 isTerminal = True # Adjusted termination condition with less strict reward limit
292
293 # Save metrics
294 self.episode_rewards.append(total_reward)
295 avg_reward = np.mean(self.episode_rewards[-10:])
296 self.avg_rewards.append(avg_reward)
297 self.steps_per_episode.append(steps)
298
299 # Check if epsilon reset is needed
300 # If the last 6 episodes all have very bad rewards, reset epsilon
301 if len(self.episode_rewards) >= 6:
302 recent_rewards = self.episode_rewards[-6:]
303 if all(reward <= -10000 for reward in recent_rewards):
304 print("\nSehr schlechte Performance in den letzten 6 Episoden. Setze Epsilon zurück, um mehr zu explorieren.")
305 self.epsilon_start = 0.9 # Reset to the initial value
306 self.epsilon_decay = 0.00001 # Reset the decay rate
307 self.total_steps = 0 # Reset the step counter used for the epsilon calculation
308
309 # Save model if average reward improves
310 if avg_reward > self.best_score:
311 self.ppo_agent.save_models()
312 self.best_score = avg_reward
313
314 # Check if the problem is solved
315 solved = self.env.state.is_terminal()
316 status_symbol = "✅" if solved else " "
317
318 print(f'{status_symbol} episode {episode}, score {total_reward:.1f}, avg score {avg_reward:.1f}, Best avg score {self.best_score:.1f}',
319 f'time_steps {steps}/{self.env.get_max_steps()}, learn_iters {self.learn_iters}, positive reward {positive_actions_reward:.1f}, problem {self.env.problem_name}, {self.env.base_index}')
320
321 # Save model every 100 episodes
322 if episode > 0 and episode % 100 == 0:
323 self.ppo_agent.save_models()
324
325
326
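
The epsilon-greedy schedule in the loop above decays exponentially from epsilon_start toward epsilon_end. A standalone sketch using the default values set in __init__; the helper function is illustrative:

    import numpy as np

    def epsilon_at(total_steps, eps_start=0.9, eps_end=0.2, decay=0.00001):
        # Same formula as in train(): exponential decay toward eps_end
        return eps_end + (eps_start - eps_end) * np.exp(-decay * total_steps)

    # With the defaults, exploration stays high for a long time:
    # roughly 0.83 after 10k steps, 0.46 after 100k, and close to 0.2 near 1M steps.
    for steps in (0, 10_000, 100_000, 1_000_000):
        print(steps, round(epsilon_at(steps), 3))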

Member Data Documentation

◆ action_mapping

rl.training.trainer.Trainer.action_mapping
Initial value:
= {
0 : "load_beluga",
1 : "unload_beluga",
2 : "get_from_hangar",
3 : "deliver_to_hangar",
4 : "left_stack_rack",
5 : "right_stack_rack",
6 : "left_unstack_rack",
7 : "right_unstack_rack"
}

◆ avg_rewards

list rl.training.trainer.Trainer.avg_rewards = []

◆ best_score

rl.training.trainer.Trainer.best_score = -90000

◆ debug

rl.training.trainer.Trainer.debug = debug

◆ env

rl.training.trainer.Trainer.env = env

◆ episode_rewards

rl.training.trainer.Trainer.episode_rewards = []

◆ epsilon_decay

float rl.training.trainer.Trainer.epsilon_decay = 0.00001

◆ epsilon_end

float rl.training.trainer.Trainer.epsilon_end = 0.2

◆ epsilon_start

float rl.training.trainer.Trainer.epsilon_start = 0.9

◆ invalid_action_counts

dict rl.training.trainer.Trainer.invalid_action_counts = {i: 0 for i in range(8)}

◆ learn_iters

int rl.training.trainer.Trainer.learn_iters = 0

◆ mcts

rl.training.trainer.Trainer.mcts = None

◆ ppo_agent

PPOAgent rl.training.trainer.Trainer.ppo_agent = ppo_agent

◆ score_history

list rl.training.trainer.Trainer.score_history = []

◆ steps_per_episode

list rl.training.trainer.Trainer.steps_per_episode = []

◆ total_steps

int rl.training.trainer.Trainer.total_steps = 0

The documentation for this class was generated from the following file: