Skip to content

🏋️ ODETrainer API

The ODETrainer class acts as the high-level orchestrator for the training pipeline. It wraps PyTorch Lightning to provide a standardized interface for training Neural ODEs on flight data.

It functions as the central bridge in the pipeline: it validates the configuration, retrieves the specific model architecture from the registry, initializes the training environment, and manages the lifecycle of model checkpoints and metadata artifacts.


📘 Class Reference

ode_trainer

Training utilities for neural ODE-based flight dynamics models.

ODETrainer

Handle data preparation, training loops, and checkpointing for ODE models.

Source code in src/node_fdm/ode_trainer.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
class ODETrainer:
    """Handle data preparation, training loops, and checkpointing for ODE models."""

    def __init__(
        self,
        data_df: pd.DataFrame,
        model_config: Dict[str, Any],
        model_dir: Any,
        num_workers: int = 4,
        load_parallel: bool = True,
        train_val_num: Tuple[int, int] = (5000, 500),
    ) -> None:
        """Initialize trainer with data, model configuration, and I/O paths.

        Args:
            data_df: DataFrame containing file paths and split labels.
            model_config: Dictionary describing architecture, hyperparameters, and loader settings.
            model_dir: Base directory to store checkpoints and metadata.
            num_workers: Number of workers for DataLoaders.
            load_parallel: Whether to load flights in parallel.
            train_val_num: Tuple specifying how many train/val files to load.
        """

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        architecture, self.model_cols, custom_fn = get_architecture_from_name(
            model_config["architecture_name"]
        )
        self.x_cols, self.u_cols, self.e0_cols, self.e_cols, self.dx_cols = (
            self.model_cols
        )
        # model_dir is expected to be a pathlib.Path-like object (uses `/`).
        self.model_dir = model_dir / model_config["model_name"]
        os.makedirs(self.model_dir, exist_ok=True)
        self.architecture = architecture
        self.model_config = model_config
        self.architecture_name = model_config["architecture_name"]
        self.model_params = model_config["model_params"]

        self.train_dataset, self.val_dataset = get_train_val_data(
            data_df,
            self.model_cols,
            shift=model_config["shift"],
            seq_len=model_config["seq_len"],
            custom_fn=custom_fn,
            load_parallel=load_parallel,
            train_val_num=train_val_num,
        )
        self.step = model_config["step"]
        self.num_workers = num_workers

        self.stats_dict = self.train_dataset.stats_dict

        # Default starting epoch. Set BEFORE model creation so that a
        # checkpoint restored inside get_or_create_model() can overwrite it.
        # (Previously this assignment came after the load and silently
        # clobbered the restored epoch back to 1.)
        self.epoch = 1
        self.model = self.get_or_create_model(*model_config["loading_args"])

        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=model_config["lr"],
            weight_decay=model_config["weight_decay"],
        )
        self.save_meta()

    def get_or_create_model(
        self, load: bool = False, load_loss: bool = False
    ) -> FlightDynamicsModel:
        """Instantiate a new model or load existing checkpoints.

        Args:
            load: Whether to attempt loading existing checkpoints.
            load_loss: Whether to restore tracked best validation loss when loading.

        Returns:
            Initialized or restored `FlightDynamicsModel` instance.
        """
        self.best_val_loss = float("inf")
        # meta.json is written on every run (see save_meta), so its presence
        # marks a previously initialized model directory.
        if load and os.path.exists(self.model_dir / "meta.json"):
            model = self.load_best_checkpoint(load_loss=load_loss)
        else:
            print("Creating new model.")
            model = FlightDynamicsModel(
                self.architecture,
                self.stats_dict,
                self.model_cols,
                model_params=self.model_params,
            ).to(self.device)
        return model

    def load_best_checkpoint(self, load_loss: bool = False) -> FlightDynamicsModel:
        """Create and populate a model from saved checkpoints.

        Args:
            load_loss: Whether to restore tracked best validation loss.

        Returns:
            Model with layer weights loaded when available.
        """
        model = FlightDynamicsModel(
            self.architecture,
            self.stats_dict,
            self.model_cols,
            model_params=self.model_params,
        ).to(self.device)

        # Aggregate across layers instead of keeping whatever the last loop
        # iteration left behind (the old code also raised NameError on
        # `best_val_loss` when `layers_name` was empty). All layers are saved
        # with the same tracked loss, so `min` recovers that shared value.
        best_val_loss = float("inf")
        restored_epoch = 0
        for name in model.layers_name:
            checkpoint = self.load_layer_checkpoint(name)
            if checkpoint is not None:
                # strict=False tolerates architecture additions since the save.
                model.layers_dict[name].load_state_dict(
                    checkpoint["layer_state"], strict=False
                )
                best_val_loss = min(
                    best_val_loss, checkpoint.get("best_val_loss", float("inf"))
                )
                restored_epoch = max(restored_epoch, checkpoint.get("epoch", 0))
        self.epoch = restored_epoch

        if load_loss:
            self.best_val_loss = best_val_loss

        print("Best val loss:", self.best_val_loss)
        print(f"Loaded modular model from {self.model_dir}")

        return model

    def load_layer_checkpoint(self, layer_name: str) -> Optional[Dict[str, Any]]:
        """Load checkpoint dictionary for a specific layer if available.

        Args:
            layer_name: Name of the layer to load.

        Returns:
            Checkpoint dictionary if found, otherwise None.
        """
        path = os.path.join(self.model_dir, f"{layer_name}.pt")
        if not os.path.exists(path):
            print(f"No checkpoint found for layer {layer_name}, skipping load.")
            return None
        else:
            print(f"checkpoint found for layer {layer_name}")
        # map_location keeps CPU-saved checkpoints loadable on GPU and vice versa.
        checkpoint = torch.load(path, map_location=self.device)
        return checkpoint

    def save_meta(self) -> None:
        """Persist training metadata and statistics to disk.

        Creates or updates `meta.json` within the model directory.
        """
        # Column objects are not JSON-serializable; key by their str() form.
        saved_stats_dict = {str(col): value for col, value in self.stats_dict.items()}

        meta_dict = {
            "architecture_name": self.architecture_name,
            "model_params": self.model_config["model_params"],
            "step": self.model_config["step"],
            "shift": self.model_config["shift"],
            "lr": self.model_config["lr"],
            "seq_len": self.model_config["seq_len"],
            "batch_size": self.model_config["batch_size"],
            "stats_dict": saved_stats_dict,
        }
        print(self.model_dir / "meta.json")
        with open(self.model_dir / "meta.json", "w") as f:
            json.dump(meta_dict, f, indent=4)

    def save_layer_checkpoint(self, layer_name: str, epoch: int) -> None:
        """Save checkpoint for an individual layer.

        Args:
            layer_name: Name of the layer to checkpoint.
            epoch: Current epoch offset for tracking.
        """
        layer = self.model.layers_dict[layer_name]
        save_dict = {
            "layer_state": layer.state_dict(),
            "optimizer_state": self.optimizer.state_dict(),
            "best_val_loss": self.best_val_loss,
            # Stored as absolute progress: restored epoch + offset this run.
            "epoch": self.epoch + epoch,
        }
        torch.save(save_dict, self.model_dir / f"{layer_name}.pt")

    def save_model(self, epoch: int) -> None:
        """Save checkpoints for all layers.

        Args:
            epoch: Epoch index used when saving checkpoints.
        """
        for name in self.model.layers_name:
            self.save_layer_checkpoint(name, epoch)

    def norm_vect(self, vect: torch.Tensor, col: Any) -> torch.Tensor:
        """Normalize tensor using stored statistics for a column.

        Args:
            vect: Tensor to normalize.
            col: Column identifier used to fetch statistics.

        Returns:
            Normalized tensor.
        """
        # 1e-3 floors the denominator so near-constant columns don't explode.
        return (vect - self.stats_dict[col]["mean"]) / (
            self.stats_dict[col]["std"] + 1e-3
        )

    def cat_to_dict_vects(
        self,
        vect_list: Sequence[torch.Tensor],
        col_list: Sequence[Any],
        alpha_dict: Dict[Any, float],
        normalize: bool = True,
    ) -> Dict[Any, torch.Tensor]:
        """Concatenate vectors and build a dict keyed by column definitions.

        Args:
            vect_list: Sequence of tensors to concatenate along the feature axis.
            col_list: Column identifiers matching the concatenated tensors.
            alpha_dict: Optional scaling factors applied per column.
            normalize: Whether to normalize columns that request it.

        Returns:
            Dictionary mapping columns to (optionally) scaled and normalized tensors.
        """

        def modifier(el: torch.Tensor, col: Any) -> torch.Tensor:
            if (col.normalize_mode == "normal") & (normalize):
                return self.norm_vect(el, col)
            return el

        # Columns absent from alpha_dict are zeroed out, effectively
        # excluding them from any loss computed on the result.
        coeff_list = [
            alpha_dict[col] if col in alpha_dict.keys() else 0.0 for col in col_list
        ]

        vects = torch.cat(vect_list, dim=2)
        vects_dict = {
            col: coeff * modifier(vects[..., i], col).unsqueeze(-1)
            for i, (col, coeff) in enumerate(zip(col_list, coeff_list))
        }
        return vects_dict

    def ode_step(
        self,
        x_seq: torch.Tensor,
        u_seq: torch.Tensor,
        e_seq: torch.Tensor,
        method: str,
        alpha_dict: Dict[Any, float],
    ) -> Tuple[torch.Tensor, torch.Tensor, Sequence[Any]]:
        """Integrate one ODE step and return true/predicted trajectories.

        Args:
            x_seq: State sequences for the batch.
            u_seq: Control sequences for the batch.
            e_seq: Environment sequences for the batch.
            method: ODE solver method passed to `odeint`.
            alpha_dict: Scaling factors per monitored column.

        Returns:
            Tuple of (true trajectories, predicted trajectories, monitored columns).
        """
        seq_len = x_seq.shape[1]

        assert not torch.isnan(x_seq).any(), "NaN in x_seq"
        assert not torch.isnan(u_seq).any(), "NaN in u_seq"
        assert not torch.isnan(e_seq).any(), "NaN in e_seq"

        x0 = x_seq[:, 0, :]

        t_grid = torch.arange(
            0, seq_len * self.step, self.step, dtype=torch.float32, device=self.device
        )

        func = BatchNeuralODE(self.model, u_seq, e_seq, t_grid)

        # Return value intentionally discarded: predictions are read back
        # below from self.model.history, which the integration populates as a
        # side effect — TODO(review) confirm against BatchNeuralODE.
        odeint(func, x0, t_grid, method=method)

        vects = torch.cat([x_seq, u_seq, e_seq], dim=2)
        vect_dict = {
            col: vects[..., i].unsqueeze(-1)
            for i, col in enumerate(
                self.x_cols + self.u_cols + self.e0_cols + self.e_cols
            )
        }

        vects_dict = dict()

        monitor_cols = self.x_cols + self.e_cols

        for case in ["true", "pred"]:
            if case == "pred":
                vect_list = [
                    self.model.history[col].unsqueeze(-1) for col in monitor_cols
                ]
            else:
                # Drop t=0: predictions start one step after the initial state.
                vect_list = [vect_dict[col][:, 1:] for col in monitor_cols]

            vects_dict[case] = self.cat_to_dict_vects(
                vect_list,
                monitor_cols,
                alpha_dict=alpha_dict,
            )
        true_vect = torch.cat([vects_dict["true"][col] for col in monitor_cols], dim=2)
        pred_vect = torch.cat([vects_dict["pred"][col] for col in monitor_cols], dim=2)
        return true_vect, pred_vect, monitor_cols

    def compute_loss_ode_step(
        self,
        batch: Sequence[torch.Tensor],
        alpha_dict: Dict[Any, float],
        method: str = "rk4",
    ) -> torch.Tensor:
        """Compute loss for a single ODE rollout batch.

        Args:
            batch: Tuple of tensors `(x_seq, u_seq, e_seq, dx_seq)` from the DataLoader.
            alpha_dict: Scaling factors per monitored column.
            method: ODE solver method.

        Returns:
            Scalar loss tensor for the batch.
        """
        x_seq, u_seq, e_seq, _ = [b.to(self.device) for b in batch]
        true_vect, pred_vect, monitor_cols = self.ode_step(
            x_seq,
            u_seq,
            e_seq,
            method,
            alpha_dict,
        )

        loss = 0.0
        for i, col in enumerate(monitor_cols):
            if col in alpha_dict.keys():
                loss_fn = get_loss(col.loss_name)
                assert not torch.isnan(pred_vect[..., i]).any(), "NaN in pred_vect"
                assert not torch.isnan(true_vect[..., i]).any(), "NaN in true_vect"
                res = loss_fn(pred_vect[..., i], true_vect[..., i])
                loss += res

        # If no monitored column is weighted, `loss` is still the float 0.0;
        # torch.isnan/torch.isinf only accept tensors, so guard the check.
        if isinstance(loss, torch.Tensor) and (torch.isnan(loss) or torch.isinf(loss)):
            print("NaN or Inf in loss!")

        return loss

    def train(
        self,
        epochs: int = 800,
        batch_size: int = 512,
        val_batch_size: int = 10000,
        scheduler: Optional[Any] = None,
        method: str = "rk4",
        alpha_dict: Optional[Dict[Any, float]] = None,
    ) -> None:
        """Train the ODE model and persist checkpoints/metrics.

        Args:
            epochs: Number of training epochs.
            batch_size: Training batch size.
            val_batch_size: Validation batch size.
            scheduler: Optional learning-rate scheduler.
            method: ODE solver method.
            alpha_dict: Optional scaling factors per monitored column.
        """
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=val_batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

        # Default weighting: every state column counts equally; environment
        # columns get coefficient 0 inside cat_to_dict_vects.
        if alpha_dict is None:
            alpha_dict = {col: 1.0 for col in self.x_cols}

        self.stats_dict = self.train_dataset.stats_dict

        losses = []
        loss_csv_path = os.path.join(self.model_dir, "training_losses.csv")
        fig_path = os.path.join(self.model_dir, "training_curve.png")

        for epoch in range(epochs):
            # --- TRAIN LOOP ---
            self.model.train()
            total_loss, total_batches = 0, 0
            for batch in self.train_loader:
                loss = self.compute_loss_ode_step(
                    batch, alpha_dict=alpha_dict, method=method
                )
                self.optimizer.zero_grad()
                loss.backward()
                # Gradient clipping stabilizes backprop through the ODE solve.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()
                total_loss += loss.item()
                total_batches += 1
            avg_train_loss = total_loss / total_batches

            # --- VALIDATION LOOP ---
            self.model.eval()
            val_loss, val_batches = 0, 0
            with torch.no_grad():
                for batch in self.val_loader:
                    loss = self.compute_loss_ode_step(
                        batch, alpha_dict=alpha_dict, method=method
                    )
                    val_loss += loss.item()
                    val_batches += 1
            avg_val_loss = val_loss / val_batches

            # Assumes a metric-driven scheduler (e.g. ReduceLROnPlateau).
            if scheduler is not None:
                scheduler.step(avg_val_loss)

            losses.append(
                {
                    "epoch": epoch + 1,
                    "train_loss": avg_train_loss,
                    "val_loss": avg_val_loss,
                }
            )

            print(
                f"Epoch {epoch+1}/{epochs} | train loss: {avg_train_loss:.5f} | val loss: {avg_val_loss:.5f}"
            )

            # --- SAVE BEST MODEL ---
            if avg_val_loss < self.best_val_loss:
                print(f"  New best validation loss: {avg_val_loss:.5f}. Saving model.")
                self.best_val_loss = avg_val_loss
                self.save_model(epoch)

        df_losses = pd.DataFrame(losses)
        df_losses.to_csv(loss_csv_path, index=False)
        print(f"✅ Saved training log to {loss_csv_path}")

        plt.figure(figsize=(7, 4))
        plt.semilogy(
            df_losses["epoch"],
            df_losses["train_loss"],
            label="Training loss",
            color="#1f77b4",
            linewidth=2,
        )
        plt.semilogy(
            df_losses["epoch"],
            df_losses["val_loss"],
            label="Validation loss",
            color="#ff7f0e",
            linewidth=2,
            linestyle="--",
        )

        plt.title("Training and validation losses", fontsize=13)
        plt.xlabel("Epoch", fontsize=11)
        plt.ylabel("Loss (log scale)", fontsize=11)
        plt.grid(True, which="both", linestyle=":", linewidth=0.8, alpha=0.7)
        plt.legend(frameon=False, fontsize=10)
        plt.tight_layout()
        plt.savefig(fig_path, dpi=200)
        plt.close()

        print(f"✅ Saved training curve to {fig_path}")
__init__(data_df, model_config, model_dir, num_workers=4, load_parallel=True, train_val_num=(5000, 500))

Initialize trainer with data, model configuration, and I/O paths.

Parameters:

Name Type Description Default
data_df DataFrame

DataFrame containing file paths and split labels.

required
model_config Dict[str, Any]

Dictionary describing architecture, hyperparameters, and loader settings.

required
model_dir Any

Base directory to store checkpoints and metadata.

required
num_workers int

Number of workers for DataLoaders.

4
load_parallel bool

Whether to load flights in parallel.

True
train_val_num Tuple[int, int]

Tuple specifying how many train/val files to load.

(5000, 500)
Source code in src/node_fdm/ode_trainer.py
def __init__(
    self,
    data_df: pd.DataFrame,
    model_config: Dict[str, Any],
    model_dir: Any,
    num_workers: int = 4,
    load_parallel: bool = True,
    train_val_num: Tuple[int, int] = (5000, 500),
) -> None:
    """Initialize trainer with data, model configuration, and I/O paths.

    Args:
        data_df: DataFrame containing file paths and split labels.
        model_config: Dictionary describing architecture, hyperparameters, and loader settings.
        model_dir: Base directory to store checkpoints and metadata.
        num_workers: Number of workers for DataLoaders.
        load_parallel: Whether to load flights in parallel.
        train_val_num: Tuple specifying how many train/val files to load.
    """

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    architecture, self.model_cols, custom_fn = get_architecture_from_name(
        model_config["architecture_name"]
    )
    self.x_cols, self.u_cols, self.e0_cols, self.e_cols, self.dx_cols = (
        self.model_cols
    )
    self.model_dir = model_dir / model_config["model_name"]
    os.makedirs(self.model_dir, exist_ok=True)
    self.architecture = architecture
    self.model_config = model_config
    self.architecture_name = model_config["architecture_name"]
    self.model_params = model_config["model_params"]

    self.train_dataset, self.val_dataset = get_train_val_data(
        data_df,
        self.model_cols,
        shift=model_config["shift"],
        seq_len=model_config["seq_len"],
        custom_fn=custom_fn,
        load_parallel=load_parallel,
        train_val_num=train_val_num,
    )
    self.step = model_config["step"]
    self.num_workers = num_workers

    self.stats_dict = self.train_dataset.stats_dict

    self.model = self.get_or_create_model(*model_config["loading_args"])

    self.optimizer = torch.optim.AdamW(
        self.model.parameters(),
        lr=model_config["lr"],
        weight_decay=model_config["weight_decay"],
    )
    self.epoch = 1
    self.save_meta()

cat_to_dict_vects(vect_list, col_list, alpha_dict, normalize=True)

Concatenate vectors and build a dict keyed by column definitions.

Parameters:

Name Type Description Default
vect_list Sequence[Tensor]

Sequence of tensors to concatenate along the feature axis.

required
col_list Sequence[Any]

Column identifiers matching the concatenated tensors.

required
alpha_dict Dict[Any, float]

Optional scaling factors applied per column.

required
normalize bool

Whether to normalize columns that request it.

True

Returns:

Type Description
Dict[Any, Tensor]

Dictionary mapping columns to (optionally) scaled and normalized tensors.

Source code in src/node_fdm/ode_trainer.py
def cat_to_dict_vects(
    self,
    vect_list: Sequence[torch.Tensor],
    col_list: Sequence[Any],
    alpha_dict: Dict[Any, float],
    normalize: bool = True,
) -> Dict[Any, torch.Tensor]:
    """Concatenate vectors and build a dict keyed by column definitions.

    Args:
        vect_list: Sequence of tensors to concatenate along the feature axis.
        col_list: Column identifiers matching the concatenated tensors.
        alpha_dict: Optional scaling factors applied per column.
        normalize: Whether to normalize columns that request it.

    Returns:
        Dictionary mapping columns to (optionally) scaled and normalized tensors.
    """

    def modifier(el: torch.Tensor, col: Any) -> torch.Tensor:
        if (col.normalize_mode == "normal") & (normalize):
            return self.norm_vect(el, col)
        return el

    coeff_list = [
        alpha_dict[col] if col in alpha_dict.keys() else 0.0 for col in col_list
    ]

    vects = torch.cat(vect_list, dim=2)
    vects_dict = {
        col: coeff * modifier(vects[..., i], col).unsqueeze(-1)
        for i, (col, coeff) in enumerate(zip(col_list, coeff_list))
    }
    return vects_dict

compute_loss_ode_step(batch, alpha_dict, method='rk4')

Compute loss for a single ODE rollout batch.

Parameters:

Name Type Description Default
batch Sequence[Tensor]

Tuple of tensors (x_seq, u_seq, e_seq, dx_seq) from the DataLoader.

required
alpha_dict Dict[Any, float]

Scaling factors per monitored column.

required
method str

ODE solver method.

'rk4'

Returns:

Type Description
Tensor

Scalar loss tensor for the batch.

Source code in src/node_fdm/ode_trainer.py
def compute_loss_ode_step(
    self,
    batch: Sequence[torch.Tensor],
    alpha_dict: Dict[Any, float],
    method: str = "rk4",
) -> torch.Tensor:
    """Compute loss for a single ODE rollout batch.

    Args:
        batch: Tuple of tensors `(x_seq, u_seq, e_seq, dx_seq)` from the DataLoader.
        alpha_dict: Scaling factors per monitored column.
        method: ODE solver method.

    Returns:
        Scalar loss tensor for the batch.
    """
    x_seq, u_seq, e_seq, _ = [b.to(self.device) for b in batch]
    true_vect, pred_vect, monitor_cols = self.ode_step(
        x_seq,
        u_seq,
        e_seq,
        method,
        alpha_dict,
    )

    loss = 0.0
    for i, col in enumerate(monitor_cols):
        if col in alpha_dict.keys():
            loss_fn = get_loss(col.loss_name)
            assert not torch.isnan(pred_vect[..., i]).any(), "NaN in pred_vect"
            assert not torch.isnan(true_vect[..., i]).any(), "NaN in true_vect"
            res = loss_fn(pred_vect[..., i], true_vect[..., i])
            loss += res

    if torch.isnan(loss) or torch.isinf(loss):
        print("NaN or Inf in loss!")

    return loss

get_or_create_model(load=False, load_loss=False)

Instantiate a new model or load existing checkpoints.

Parameters:

Name Type Description Default
load bool

Whether to attempt loading existing checkpoints.

False
load_loss bool

Whether to restore tracked best validation loss when loading.

False

Returns:

Type Description
FlightDynamicsModel

Initialized or restored FlightDynamicsModel instance.

Source code in src/node_fdm/ode_trainer.py
def get_or_create_model(
    self, load: bool = False, load_loss: bool = False
) -> FlightDynamicsModel:
    """Instantiate a new model or load existing checkpoints.

    Args:
        load: Whether to attempt loading existing checkpoints.
        load_loss: Whether to restore tracked best validation loss when loading.

    Returns:
        Initialized or restored `FlightDynamicsModel` instance.
    """
    self.best_val_loss = float("inf")
    if load and os.path.exists(self.model_dir / "meta.json"):
        model = self.load_best_checkpoint(load_loss=load_loss)
    else:
        print("Creating new model.")
        model = FlightDynamicsModel(
            self.architecture,
            self.stats_dict,
            self.model_cols,
            model_params=self.model_params,
        ).to(self.device)
    return model

load_best_checkpoint(load_loss=False)

Create and populate a model from saved checkpoints.

Parameters:

Name Type Description Default
load_loss bool

Whether to restore tracked best validation loss.

False

Returns:

Type Description
FlightDynamicsModel

Model with layer weights loaded when available.

Source code in src/node_fdm/ode_trainer.py
def load_best_checkpoint(self, load_loss: bool = False) -> FlightDynamicsModel:
    """Create and populate a model from saved checkpoints.

    Args:
        load_loss: Whether to restore tracked best validation loss.

    Returns:
        Model with layer weights loaded when available.
    """
    model = FlightDynamicsModel(
        self.architecture,
        self.stats_dict,
        self.model_cols,
        model_params=self.model_params,
    ).to(self.device)

    for name in model.layers_name:
        checkpoint = self.load_layer_checkpoint(name)
        if checkpoint is not None:
            model.layers_dict[name].load_state_dict(
                checkpoint["layer_state"], strict=False
            )
            best_val_loss = checkpoint.get("best_val_loss", float("inf"))
            self.epoch = checkpoint.get("epoch", 0)
        else:
            best_val_loss = float("inf")
            self.epoch = 0

    if load_loss:
        self.best_val_loss = best_val_loss

    print("Best val loss per layer:", self.best_val_loss)
    print(f"Loaded modular model from {self.model_dir}")

    return model

load_layer_checkpoint(layer_name)

Load checkpoint dictionary for a specific layer if available.

Parameters:

Name Type Description Default
layer_name str

Name of the layer to load.

required

Returns:

Type Description
Optional[Dict[str, Any]]

Checkpoint dictionary if found, otherwise None.

Source code in src/node_fdm/ode_trainer.py
def load_layer_checkpoint(self, layer_name: str) -> Optional[Dict[str, Any]]:
    """Load checkpoint dictionary for a specific layer if available.

    Args:
        layer_name: Name of the layer to load.

    Returns:
        Checkpoint dictionary if found, otherwise None.
    """
    path = os.path.join(self.model_dir, f"{layer_name}.pt")
    if not os.path.exists(path):
        print(f"No checkpoint found for layer {layer_name}, skipping load.")
        return None
    else:
        print(f"checkpoint found for layer {layer_name}")
    checkpoint = torch.load(path, map_location=self.device)
    return checkpoint

norm_vect(vect, col)

Normalize tensor using stored statistics for a column.

Parameters:

Name Type Description Default
vect Tensor

Tensor to normalize.

required
col Any

Column identifier used to fetch statistics.

required

Returns:

Type Description
Tensor

Normalized tensor.

Source code in src/node_fdm/ode_trainer.py
def norm_vect(self, vect: torch.Tensor, col: Any) -> torch.Tensor:
    """Normalize tensor using stored statistics for a column.

    Args:
        vect: Tensor to normalize.
        col: Column identifier used to fetch statistics.

    Returns:
        Normalized tensor.
    """
    return (vect - self.stats_dict[col]["mean"]) / (
        self.stats_dict[col]["std"] + 1e-3
    )

ode_step(x_seq, u_seq, e_seq, method, alpha_dict)

Integrate one ODE step and return true/predicted trajectories.

Parameters:

Name Type Description Default
x_seq Tensor

State sequences for the batch.

required
u_seq Tensor

Control sequences for the batch.

required
e_seq Tensor

Environment sequences for the batch.

required
method str

ODE solver method passed to odeint.

required
alpha_dict Dict[Any, float]

Scaling factors per monitored column.

required

Returns:

Type Description
Tuple[Tensor, Tensor, Sequence[Any]]

Tuple of (true trajectories, predicted trajectories, monitored columns).

Source code in src/node_fdm/ode_trainer.py
def ode_step(
    self,
    x_seq: torch.Tensor,
    u_seq: torch.Tensor,
    e_seq: torch.Tensor,
    method: str,
    alpha_dict: Dict[Any, float],
) -> Tuple[torch.Tensor, torch.Tensor, Sequence[Any]]:
    """Integrate one ODE step and return true/predicted trajectories.

    Args:
        x_seq: State sequences for the batch.
        u_seq: Control sequences for the batch.
        e_seq: Environment sequences for the batch.
        method: ODE solver method passed to `odeint`.
        alpha_dict: Scaling factors per monitored column.

    Returns:
        Tuple of (true trajectories, predicted trajectories, monitored columns).
    """
    seq_len = x_seq.shape[1]

    assert not torch.isnan(x_seq).any(), "NaN in x_seq"
    assert not torch.isnan(u_seq).any(), "NaN in u_seq"
    assert not torch.isnan(e_seq).any(), "NaN in e_seq"

    x0 = x_seq[:, 0, :]

    t_grid = torch.arange(
        0, seq_len * self.step, self.step, dtype=torch.float32, device=self.device
    )

    func = BatchNeuralODE(self.model, u_seq, e_seq, t_grid)

    odeint(func, x0, t_grid, method=method)

    vects = torch.cat([x_seq, u_seq, e_seq], dim=2)
    vect_dict = {
        col: vects[..., i].unsqueeze(-1)
        for i, col in enumerate(
            self.x_cols + self.u_cols + self.e0_cols + self.e_cols
        )
    }

    vects_dict = dict()

    monitor_cols = self.x_cols + self.e_cols

    for case in ["true", "pred"]:
        if case == "pred":
            vect_list = [
                self.model.history[col].unsqueeze(-1) for col in monitor_cols
            ]
        else:
            vect_list = [vect_dict[col][:, 1:] for col in monitor_cols]

        vects_dict[case] = self.cat_to_dict_vects(
            vect_list,
            monitor_cols,
            alpha_dict=alpha_dict,
        )
    true_vect = torch.cat([vects_dict["true"][col] for col in monitor_cols], dim=2)
    pred_vect = torch.cat([vects_dict["pred"][col] for col in monitor_cols], dim=2)
    return true_vect, pred_vect, monitor_cols

save_layer_checkpoint(layer_name, epoch)

Save checkpoint for an individual layer.

Parameters:

Name Type Description Default
layer_name str

Name of the layer to checkpoint.

required
epoch int

Current epoch offset for tracking.

required
Source code in src/node_fdm/ode_trainer.py
def save_layer_checkpoint(self, layer_name: str, epoch: int) -> None:
    """Serialize one layer's weights plus training state to disk.

    The checkpoint is written to ``<model_dir>/<layer_name>.pt`` and bundles
    the layer's parameters, the optimizer state, the best validation loss
    seen so far, and the cumulative epoch count so training can resume.

    Args:
        layer_name: Name of the layer to checkpoint.
        epoch: Current epoch offset for tracking.
    """
    checkpoint = {
        "layer_state": self.model.layers_dict[layer_name].state_dict(),
        "optimizer_state": self.optimizer.state_dict(),
        "best_val_loss": self.best_val_loss,
        # Cumulative count: epochs completed before this run plus the offset.
        "epoch": self.epoch + epoch,
    }
    torch.save(checkpoint, self.model_dir / f"{layer_name}.pt")

save_meta()

Persist training metadata and statistics to disk.

Creates or updates meta.json within the model directory.

Source code in src/node_fdm/ode_trainer.py
def save_meta(self) -> None:
    """Persist training metadata and statistics to disk.

    Creates or updates `meta.json` within the model directory. Stat keys
    are coerced to `str` so the payload is JSON-serializable (keys may be
    non-string objects such as tuples or column identifiers).
    """
    saved_stats_dict = {str(col): value for col, value in self.stats_dict.items()}

    meta_dict = {
        "architecture_name": self.architecture_name,
        "model_params": self.model_config["model_params"],
        "step": self.model_config["step"],
        "shift": self.model_config["shift"],
        "lr": self.model_config["lr"],
        "seq_len": self.model_config["seq_len"],
        "batch_size": self.model_config["batch_size"],
        "stats_dict": saved_stats_dict,
    }
    meta_path = self.model_dir / "meta.json"
    with open(meta_path, "w") as f:
        json.dump(meta_dict, f, indent=4)
    # Replaces a leftover bare debug print of the path with an informative
    # log line, consistent with the other save messages in this file.
    print(f"✅ Saved training metadata to {meta_path}")

save_model(epoch)

Save checkpoints for all layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `epoch` | `int` | Epoch index used when saving checkpoints. | *required* |
Source code in src/node_fdm/ode_trainer.py
def save_model(self, epoch: int) -> None:
    """Write a checkpoint file for every layer of the model.

    Args:
        epoch: Epoch index used when saving checkpoints.
    """
    checkpoint = self.save_layer_checkpoint
    for layer_name in self.model.layers_name:
        checkpoint(layer_name, epoch)

train(epochs=800, batch_size=512, val_batch_size=10000, scheduler=None, method='rk4', alpha_dict=None)

Train the ODE model and persist checkpoints/metrics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `epochs` | `int` | Number of training epochs. | `800` |
| `batch_size` | `int` | Training batch size. | `512` |
| `val_batch_size` | `int` | Validation batch size. | `10000` |
| `scheduler` | `Optional[Any]` | Optional learning-rate scheduler. | `None` |
| `method` | `str` | ODE solver method. | `'rk4'` |
| `alpha_dict` | `Optional[Dict[Any, float]]` | Optional scaling factors per monitored column. | `None` |
Source code in src/node_fdm/ode_trainer.py
def train(
    self,
    epochs: int = 800,
    batch_size: int = 512,
    val_batch_size: int = 10000,
    scheduler: Optional[Any] = None,
    method: str = "rk4",
    alpha_dict: Optional[Dict[Any, float]] = None,
) -> None:
    """Train the ODE model and persist checkpoints/metrics.

    Runs the epoch loop, checkpointing whenever the validation loss
    improves, then writes the per-epoch loss log (CSV) and the training
    curve (PNG) into the model directory.

    Args:
        epochs: Number of training epochs.
        batch_size: Training batch size.
        val_batch_size: Validation batch size.
        scheduler: Optional learning-rate scheduler; stepped with the
            average validation loss each epoch (ReduceLROnPlateau-style).
        method: ODE solver method.
        alpha_dict: Optional scaling factors per monitored column.
            Defaults to 1.0 for every state column.
    """
    self.train_loader = DataLoader(
        self.train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=self.num_workers,
    )
    self.val_loader = DataLoader(
        self.val_dataset,
        batch_size=val_batch_size,
        shuffle=False,
        num_workers=self.num_workers,
    )

    if alpha_dict is None:
        # NOTE(review): the default only covers state columns (x_cols);
        # monitored columns elsewhere also include e_cols — confirm the
        # loss helper tolerates missing keys.
        alpha_dict = {col: 1.0 for col in self.x_cols}

    self.stats_dict = self.train_dataset.stats_dict

    losses = []
    # pathlib `/` joins for consistency with the checkpoint/meta paths.
    loss_csv_path = self.model_dir / "training_losses.csv"
    fig_path = self.model_dir / "training_curve.png"

    for epoch in range(epochs):
        avg_train_loss = self._run_train_epoch(alpha_dict, method)
        avg_val_loss = self._run_validation_epoch(alpha_dict, method)

        if scheduler is not None:
            scheduler.step(avg_val_loss)

        losses.append(
            {
                "epoch": epoch + 1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
            }
        )

        print(
            f"Epoch {epoch+1}/{epochs} | train loss: {avg_train_loss:.5f} | val loss: {avg_val_loss:.5f}"
        )

        # Checkpoint only when validation improves.
        if avg_val_loss < self.best_val_loss:
            print(f"  New best validation loss: {avg_val_loss:.5f}. Saving model.")
            self.best_val_loss = avg_val_loss
            self.save_model(epoch)

    self._save_loss_artifacts(losses, loss_csv_path, fig_path)

def _run_train_epoch(self, alpha_dict: Dict[Any, float], method: str) -> float:
    """Run one optimization pass over the training loader; return mean batch loss."""
    self.model.train()
    total_loss, total_batches = 0.0, 0
    for batch in self.train_loader:
        loss = self.compute_loss_ode_step(
            batch, alpha_dict=alpha_dict, method=method
        )
        self.optimizer.zero_grad()
        loss.backward()
        # Clip gradients to keep the ODE rollout from destabilizing training.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
        self.optimizer.step()
        total_loss += loss.item()
        total_batches += 1
    return total_loss / total_batches

def _run_validation_epoch(self, alpha_dict: Dict[Any, float], method: str) -> float:
    """Run one no-grad pass over the validation loader; return mean batch loss."""
    self.model.eval()
    val_loss, val_batches = 0.0, 0
    with torch.no_grad():
        for batch in self.val_loader:
            loss = self.compute_loss_ode_step(
                batch, alpha_dict=alpha_dict, method=method
            )
            val_loss += loss.item()
            val_batches += 1
    return val_loss / val_batches

def _save_loss_artifacts(self, losses, loss_csv_path, fig_path) -> None:
    """Write the per-epoch loss log to CSV and the semilog loss curves to PNG."""
    df_losses = pd.DataFrame(losses)
    df_losses.to_csv(loss_csv_path, index=False)
    print(f"✅ Saved training log to {loss_csv_path}")

    plt.figure(figsize=(7, 4))
    plt.semilogy(
        df_losses["epoch"],
        df_losses["train_loss"],
        label="Training loss",
        color="#1f77b4",
        linewidth=2,
    )
    plt.semilogy(
        df_losses["epoch"],
        df_losses["val_loss"],
        label="Validation loss",
        color="#ff7f0e",
        linewidth=2,
        linestyle="--",
    )

    plt.title("Training and validation losses", fontsize=13)
    plt.xlabel("Epoch", fontsize=11)
    plt.ylabel("Loss (log scale)", fontsize=11)
    plt.grid(True, which="both", linestyle=":", linewidth=0.8, alpha=0.7)
    plt.legend(frameon=False, fontsize=10)
    plt.tight_layout()
    plt.savefig(fig_path, dpi=200)
    plt.close()

    print(f"✅ Saved training curve to {fig_path}")