Skip to content

🏋️ ODETrainer API

The ODETrainer class acts as the high-level orchestrator for the training pipeline. It wraps PyTorch Lightning to provide a standardized interface for training Neural ODEs on flight data.

It functions as the central bridge in the pipeline: it validates the configuration, retrieves the specific model architecture from the registry, initializes the training environment, and manages the lifecycle of model checkpoints and metadata artifacts.


📘 Class Reference

ode_trainer

Training utilities for neural ODE-based flight dynamics models.

ODETrainer

Handle data preparation, training loops, and checkpointing for ODE models.

Source code in src/node_fdm/ode_trainer.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
class ODETrainer:
    """Handle data preparation, training loops, and checkpointing for ODE models."""

    def __init__(
        self,
        data_df: pd.DataFrame,
        model_config: Dict[str, Any],
        model_dir: Any,
        num_workers: int = 4,
        load_parallel: bool = True,
        train_val_num: Tuple[int, int] = (5000, 500),
    ) -> None:
        """Initialize trainer with data, model configuration, and I/O paths.

        Args:
            data_df: DataFrame containing file paths and split labels.
            model_config: Dictionary describing architecture, hyperparameters, and loader settings.
            model_dir: Base directory to store checkpoints and metadata.
            num_workers: Number of workers for DataLoaders.
            load_parallel: Whether to load flights in parallel.
            train_val_num: Tuple specifying how many train/val files to load.
        """

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        architecture, self.model_cols, custom_fn = get_architecture_from_name(
            model_config["architecture_name"]
        )
        self.x_cols, self.u_cols, self.e0_cols, self.e_cols, self.dx_cols = (
            self.model_cols
        )
        # model_dir is expected to be a pathlib.Path-like object (uses `/`).
        self.model_dir = model_dir / model_config["model_name"]
        os.makedirs(self.model_dir, exist_ok=True)
        self.architecture = architecture
        self.model_config = model_config
        self.architecture_name = model_config["architecture_name"]
        self.model_params = model_config["model_params"]

        self.train_dataset, self.val_dataset = get_train_val_data(
            data_df,
            self.model_cols,
            shift=model_config["shift"],
            seq_len=model_config["seq_len"],
            custom_fn=custom_fn,
            load_parallel=load_parallel,
            train_val_num=train_val_num,
        )
        self.step = model_config["step"]
        self.num_workers = num_workers

        self.stats_dict = self.train_dataset.stats_dict

        # Default starting epoch. Set BEFORE model creation so that a
        # checkpoint restored inside get_or_create_model() can overwrite it.
        # (Previously this assignment came after the load and silently
        # clobbered the restored epoch back to 1.)
        self.epoch = 1
        self.model = self.get_or_create_model(*model_config["loading_args"])

        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=model_config["lr"],
            weight_decay=model_config["weight_decay"],
        )
        self.save_meta()

    def get_or_create_model(
        self, load: bool = False, load_loss: bool = False
    ) -> FlightDynamicsModel:
        """Instantiate a new model or load existing checkpoints.

        Args:
            load: Whether to attempt loading existing checkpoints.
            load_loss: Whether to restore tracked best validation loss when loading.

        Returns:
            Initialized or restored `FlightDynamicsModel` instance.
        """
        self.best_val_loss = float("inf")
        # meta.json is written on every run (see save_meta), so its presence
        # marks a previously initialized model directory.
        if load and os.path.exists(self.model_dir / "meta.json"):
            model = self.load_best_checkpoint(load_loss=load_loss)
        else:
            print("Creating new model.")
            model = FlightDynamicsModel(
                self.architecture,
                self.stats_dict,
                self.model_cols,
                model_params=self.model_params,
            ).to(self.device)
        return model

    def load_best_checkpoint(self, load_loss: bool = False) -> FlightDynamicsModel:
        """Create and populate a model from saved checkpoints.

        Args:
            load_loss: Whether to restore tracked best validation loss.

        Returns:
            Model with layer weights loaded when available.
        """
        model = FlightDynamicsModel(
            self.architecture,
            self.stats_dict,
            self.model_cols,
            model_params=self.model_params,
        ).to(self.device)

        # Aggregate across layers instead of keeping whatever the last loop
        # iteration left behind (the old code also raised NameError on
        # `best_val_loss` when `layers_name` was empty). All layers are saved
        # with the same tracked loss, so `min` recovers that shared value.
        best_val_loss = float("inf")
        restored_epoch = 0
        for name in model.layers_name:
            checkpoint = self.load_layer_checkpoint(name)
            if checkpoint is not None:
                # strict=False tolerates architecture additions since the save.
                model.layers_dict[name].load_state_dict(
                    checkpoint["layer_state"], strict=False
                )
                best_val_loss = min(
                    best_val_loss, checkpoint.get("best_val_loss", float("inf"))
                )
                restored_epoch = max(restored_epoch, checkpoint.get("epoch", 0))
        self.epoch = restored_epoch

        if load_loss:
            self.best_val_loss = best_val_loss

        print("Best val loss:", self.best_val_loss)
        print(f"Loaded modular model from {self.model_dir}")

        return model

    def load_layer_checkpoint(self, layer_name: str) -> Optional[Dict[str, Any]]:
        """Load checkpoint dictionary for a specific layer if available.

        Args:
            layer_name: Name of the layer to load.

        Returns:
            Checkpoint dictionary if found, otherwise None.
        """
        path = os.path.join(self.model_dir, f"{layer_name}.pt")
        if not os.path.exists(path):
            print(f"No checkpoint found for layer {layer_name}, skipping load.")
            return None
        else:
            print(f"checkpoint found for layer {layer_name}")
        # map_location keeps CPU-saved checkpoints loadable on GPU and vice versa.
        checkpoint = torch.load(path, map_location=self.device)
        return checkpoint

    def save_meta(self) -> None:
        """Persist training metadata and statistics to disk.

        Creates or updates `meta.json` within the model directory.
        """
        # Column objects are not JSON-serializable; key by their str() form.
        saved_stats_dict = {str(col): value for col, value in self.stats_dict.items()}

        meta_dict = {
            "architecture_name": self.architecture_name,
            "model_params": self.model_config["model_params"],
            "step": self.model_config["step"],
            "shift": self.model_config["shift"],
            "lr": self.model_config["lr"],
            "seq_len": self.model_config["seq_len"],
            "batch_size": self.model_config["batch_size"],
            "stats_dict": saved_stats_dict,
        }
        print(self.model_dir / "meta.json")
        with open(self.model_dir / "meta.json", "w") as f:
            json.dump(meta_dict, f, indent=4)

    def save_layer_checkpoint(self, layer_name: str, epoch: int) -> None:
        """Save checkpoint for an individual layer.

        Args:
            layer_name: Name of the layer to checkpoint.
            epoch: Current epoch offset for tracking.
        """
        layer = self.model.layers_dict[layer_name]
        save_dict = {
            "layer_state": layer.state_dict(),
            "optimizer_state": self.optimizer.state_dict(),
            "best_val_loss": self.best_val_loss,
            # Stored as absolute progress: restored epoch + offset this run.
            "epoch": self.epoch + epoch,
        }
        torch.save(save_dict, self.model_dir / f"{layer_name}.pt")

    def save_model(self, epoch: int) -> None:
        """Save checkpoints for all layers.

        Args:
            epoch: Epoch index used when saving checkpoints.
        """
        for name in self.model.layers_name:
            self.save_layer_checkpoint(name, epoch)

    def norm_vect(self, vect: torch.Tensor, col: Any) -> torch.Tensor:
        """Normalize tensor using stored statistics for a column.

        Args:
            vect: Tensor to normalize.
            col: Column identifier used to fetch statistics.

        Returns:
            Normalized tensor.
        """
        # 1e-3 floors the denominator so near-constant columns don't explode.
        return (vect - self.stats_dict[col]["mean"]) / (
            self.stats_dict[col]["std"] + 1e-3
        )

    def cat_to_dict_vects(
        self,
        vect_list: Sequence[torch.Tensor],
        col_list: Sequence[Any],
        alpha_dict: Dict[Any, float],
        normalize: bool = True,
    ) -> Dict[Any, torch.Tensor]:
        """Concatenate vectors and build a dict keyed by column definitions.

        Args:
            vect_list: Sequence of tensors to concatenate along the feature axis.
            col_list: Column identifiers matching the concatenated tensors.
            alpha_dict: Optional scaling factors applied per column.
            normalize: Whether to normalize columns that request it.

        Returns:
            Dictionary mapping columns to (optionally) scaled and normalized tensors.
        """

        def modifier(el: torch.Tensor, col: Any) -> torch.Tensor:
            if (col.normalize_mode == "normal") & (normalize):
                return self.norm_vect(el, col)
            return el

        # Columns absent from alpha_dict are zeroed out, effectively
        # excluding them from any loss computed on the result.
        coeff_list = [
            alpha_dict[col] if col in alpha_dict.keys() else 0.0 for col in col_list
        ]

        vects = torch.cat(vect_list, dim=2)
        vects_dict = {
            col: coeff * modifier(vects[..., i], col).unsqueeze(-1)
            for i, (col, coeff) in enumerate(zip(col_list, coeff_list))
        }
        return vects_dict

    def ode_step(
        self,
        x_seq: torch.Tensor,
        u_seq: torch.Tensor,
        e_seq: torch.Tensor,
        method: str,
        alpha_dict: Dict[Any, float],
    ) -> Tuple[torch.Tensor, torch.Tensor, Sequence[Any]]:
        """Integrate one ODE step and return true/predicted trajectories.

        Args:
            x_seq: State sequences for the batch.
            u_seq: Control sequences for the batch.
            e_seq: Environment sequences for the batch.
            method: ODE solver method passed to `odeint`.
            alpha_dict: Scaling factors per monitored column.

        Returns:
            Tuple of (true trajectories, predicted trajectories, monitored columns).
        """
        seq_len = x_seq.shape[1]

        assert not torch.isnan(x_seq).any(), "NaN in x_seq"
        assert not torch.isnan(u_seq).any(), "NaN in u_seq"
        assert not torch.isnan(e_seq).any(), "NaN in e_seq"

        x0 = x_seq[:, 0, :]

        t_grid = torch.arange(
            0, seq_len * self.step, self.step, dtype=torch.float32, device=self.device
        )

        func = BatchNeuralODE(self.model, u_seq, e_seq, t_grid)

        # Return value intentionally discarded: predictions are read back
        # below from self.model.history, which the integration populates as a
        # side effect — TODO(review) confirm against BatchNeuralODE.
        odeint(func, x0, t_grid, method=method)

        vects = torch.cat([x_seq, u_seq, e_seq], dim=2)
        vect_dict = {
            col: vects[..., i].unsqueeze(-1)
            for i, col in enumerate(
                self.x_cols + self.u_cols + self.e0_cols + self.e_cols
            )
        }

        vects_dict = dict()

        monitor_cols = self.x_cols + self.e_cols

        for case in ["true", "pred"]:
            if case == "pred":
                vect_list = [
                    self.model.history[col].unsqueeze(-1) for col in monitor_cols
                ]
            else:
                # Drop t=0: predictions start one step after the initial state.
                vect_list = [vect_dict[col][:, 1:] for col in monitor_cols]

            vects_dict[case] = self.cat_to_dict_vects(
                vect_list,
                monitor_cols,
                alpha_dict=alpha_dict,
            )
        true_vect = torch.cat([vects_dict["true"][col] for col in monitor_cols], dim=2)
        pred_vect = torch.cat([vects_dict["pred"][col] for col in monitor_cols], dim=2)
        return true_vect, pred_vect, monitor_cols

    def compute_loss_ode_step(
        self,
        batch: Sequence[torch.Tensor],
        alpha_dict: Dict[Any, float],
        method: str = "rk4",
    ) -> torch.Tensor:
        """Compute loss for a single ODE rollout batch.

        Args:
            batch: Tuple of tensors `(x_seq, u_seq, e_seq, dx_seq)` from the DataLoader.
            alpha_dict: Scaling factors per monitored column.
            method: ODE solver method.

        Returns:
            Scalar loss tensor for the batch.
        """
        x_seq, u_seq, e_seq, _ = [b.to(self.device) for b in batch]
        true_vect, pred_vect, monitor_cols = self.ode_step(
            x_seq,
            u_seq,
            e_seq,
            method,
            alpha_dict,
        )

        loss = 0.0
        for i, col in enumerate(monitor_cols):
            if col in alpha_dict.keys():
                loss_fn = get_loss(col.loss_name)
                assert not torch.isnan(pred_vect[..., i]).any(), "NaN in pred_vect"
                assert not torch.isnan(true_vect[..., i]).any(), "NaN in true_vect"
                res = loss_fn(pred_vect[..., i], true_vect[..., i])
                loss += res

        # If no monitored column is weighted, `loss` is still the float 0.0;
        # torch.isnan/torch.isinf only accept tensors, so guard the check.
        if isinstance(loss, torch.Tensor) and (torch.isnan(loss) or torch.isinf(loss)):
            print("NaN or Inf in loss!")

        return loss

    def train(
        self,
        epochs: int = 800,
        batch_size: int = 512,
        val_batch_size: int = 10000,
        scheduler: Optional[Any] = None,
        method: str = "rk4",
        alpha_dict: Optional[Dict[Any, float]] = None,
    ) -> None:
        """Train the ODE model and persist checkpoints/metrics.

        Args:
            epochs: Number of training epochs.
            batch_size: Training batch size.
            val_batch_size: Validation batch size.
            scheduler: Optional learning-rate scheduler.
            method: ODE solver method.
            alpha_dict: Optional scaling factors per monitored column.
        """
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=val_batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

        # Default weighting: every state column counts equally; environment
        # columns get coefficient 0 inside cat_to_dict_vects.
        if alpha_dict is None:
            alpha_dict = {col: 1.0 for col in self.x_cols}

        self.stats_dict = self.train_dataset.stats_dict

        losses = []
        loss_csv_path = os.path.join(self.model_dir, "training_losses.csv")
        fig_path = os.path.join(self.model_dir, "training_curve.png")

        for epoch in range(epochs):
            # --- TRAIN LOOP ---
            self.model.train()
            total_loss, total_batches = 0, 0
            for batch in self.train_loader:
                loss = self.compute_loss_ode_step(
                    batch, alpha_dict=alpha_dict, method=method
                )
                self.optimizer.zero_grad()
                loss.backward()
                # Gradient clipping stabilizes backprop through the ODE solve.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()
                total_loss += loss.item()
                total_batches += 1
            avg_train_loss = total_loss / total_batches

            # --- VALIDATION LOOP ---
            self.model.eval()
            val_loss, val_batches = 0, 0
            with torch.no_grad():
                for batch in self.val_loader:
                    loss = self.compute_loss_ode_step(
                        batch, alpha_dict=alpha_dict, method=method
                    )
                    val_loss += loss.item()
                    val_batches += 1
            avg_val_loss = val_loss / val_batches

            # Assumes a metric-driven scheduler (e.g. ReduceLROnPlateau).
            if scheduler is not None:
                scheduler.step(avg_val_loss)

            losses.append(
                {
                    "epoch": epoch + 1,
                    "train_loss": avg_train_loss,
                    "val_loss": avg_val_loss,
                }
            )

            print(
                f"Epoch {epoch+1}/{epochs} | train loss: {avg_train_loss:.5f} | val loss: {avg_val_loss:.5f}"
            )

            # --- SAVE BEST MODEL ---
            if avg_val_loss < self.best_val_loss:
                print(f"  New best validation loss: {avg_val_loss:.5f}. Saving model.")
                self.best_val_loss = avg_val_loss
                self.save_model(epoch)

        df_losses = pd.DataFrame(losses)
        df_losses.to_csv(loss_csv_path, index=False)
        print(f"✅ Saved training log to {loss_csv_path}")

        plt.figure(figsize=(7, 4))
        plt.semilogy(
            df_losses["epoch"],
            df_losses["train_loss"],
            label="Training loss",
            color="#1f77b4",
            linewidth=2,
        )
        plt.semilogy(
            df_losses["epoch"],
            df_losses["val_loss"],
            label="Validation loss",
            color="#ff7f0e",
            linewidth=2,
            linestyle="--",
        )

        plt.title("Training and validation losses", fontsize=13)
        plt.xlabel("Epoch", fontsize=11)
        plt.ylabel("Loss (log scale)", fontsize=11)
        plt.grid(True, which="both", linestyle=":", linewidth=0.8, alpha=0.7)
        plt.legend(frameon=False, fontsize=10)
        plt.tight_layout()
        plt.savefig(fig_path, dpi=200)
        plt.close()

        print(f"✅ Saved training curve to {fig_path}")
__init__(data_df, model_config, model_dir, num_workers=4, load_parallel=True, train_val_num=(5000, 500))

Initialize trainer with data, model configuration, and I/O paths.

Parameters:

Name Type Description Default
data_df DataFrame

DataFrame containing file paths and split labels.

required
model_config Dict[str, Any]

Dictionary describing architecture, hyperparameters, and loader settings.

required
model_dir Any

Base directory to store checkpoints and metadata.

required
num_workers int

Number of workers for DataLoaders.

4
load_parallel bool

Whether to load flights in parallel.

True
train_val_num Tuple[int, int]

Tuple specifying how many train/val files to load.

(5000, 500)
Source code in src/node_fdm/ode_trainer.py
def __init__(
    self,
    data_df: pd.DataFrame,
    model_config: Dict[str, Any],
    model_dir: Any,
    num_workers: int = 4,
    load_parallel: bool = True,
    train_val_num: Tuple[int, int] = (5000, 500),
) -> None:
    """Initialize trainer with data, model configuration, and I/O paths.

    Args:
        data_df: DataFrame containing file paths and split labels.
        model_config: Dictionary describing architecture, hyperparameters, and loader settings.
        model_dir: Base directory to store checkpoints and metadata.
        num_workers: Number of workers for DataLoaders.
        load_parallel: Whether to load flights in parallel.
        train_val_num: Tuple specifying how many train/val files to load.
    """

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    architecture, self.model_cols, custom_fn = get_architecture_from_name(
        model_config["architecture_name"]
    )
    self.x_cols, self.u_cols, self.e0_cols, self.e_cols, self.dx_cols = (
        self.model_cols
    )
    self.model_dir = model_dir / model_config["model_name"]
    os.makedirs(self.model_dir, exist_ok=True)
    self.architecture = architecture
    self.model_config = model_config
    self.architecture_name = model_config["architecture_name"]
    self.model_params = model_config["model_params"]

    self.train_dataset, self.val_dataset = get_train_val_data(
        data_df,
        self.model_cols,
        shift=model_config["shift"],
        seq_len=model_config["seq_len"],
        custom_fn=custom_fn,
        load_parallel=load_parallel,
        train_val_num=train_val_num,
    )
    self.step = model_config["step"]
    self.num_workers = num_workers

    self.stats_dict = self.train_dataset.stats_dict

    self.model = self.get_or_create_model(*model_config["loading_args"])

    self.optimizer = torch.optim.AdamW(
        self.model.parameters(),
        lr=model_config["lr"],
        weight_decay=model_config["weight_decay"],
    )
    self.epoch = 1
    self.save_meta()

cat_to_dict_vects(vect_list, col_list, alpha_dict, normalize=True)

Concatenate vectors and build a dict keyed by column definitions.

Parameters:

Name Type Description Default
vect_list Sequence[Tensor]

Sequence of tensors to concatenate along the feature axis.

required
col_list Sequence[Any]

Column identifiers matching the concatenated tensors.

required
alpha_dict Dict[Any, float]

Optional scaling factors applied per column.

required
normalize bool

Whether to normalize columns that request it.

True

Returns:

Type Description
Dict[Any, Tensor]

Dictionary mapping columns to (optionally) scaled and normalized tensors.

Source code in src/node_fdm/ode_trainer.py
def cat_to_dict_vects(
    self,
    vect_list: Sequence[torch.Tensor],
    col_list: Sequence[Any],
    alpha_dict: Dict[Any, float],
    normalize: bool = True,
) -> Dict[Any, torch.Tensor]:
    """Concatenate vectors and build a dict keyed by column definitions.

    Args:
        vect_list: Sequence of tensors to concatenate along the feature axis.
        col_list: Column identifiers matching the concatenated tensors.
        alpha_dict: Optional scaling factors applied per column.
        normalize: Whether to normalize columns that request it.

    Returns:
        Dictionary mapping columns to (optionally) scaled and normalized tensors.
    """

    def modifier(el: torch.Tensor, col: Any) -> torch.Tensor:
        if (col.normalize_mode == "normal") & (normalize):
            return self.norm_vect(el, col)
        return el

    coeff_list = [
        alpha_dict[col] if col in alpha_dict.keys() else 0.0 for col in col_list
    ]

    vects = torch.cat(vect_list, dim=2)
    vects_dict = {
        col: coeff * modifier(vects[..., i], col).unsqueeze(-1)
        for i, (col, coeff) in enumerate(zip(col_list, coeff_list))
    }
    return vects_dict

compute_loss_ode_step(batch, alpha_dict, method='rk4')

Compute loss for a single ODE rollout batch.

Parameters:

Name Type Description Default
batch Sequence[Tensor]

Tuple of tensors (x_seq, u_seq, e_seq, dx_seq) from the DataLoader.

required
alpha_dict Dict[Any, float]

Scaling factors per monitored column.

required
method str

ODE solver method.

'rk4'

Returns:

Type Description
Tensor

Scalar loss tensor for the batch.

Source code in src/node_fdm/ode_trainer.py
def compute_loss_ode_step(
    self,
    batch: Sequence[torch.Tensor],
    alpha_dict: Dict[Any, float],
    method: str = "rk4",
) -> torch.Tensor:
    """Compute loss for a single ODE rollout batch.

    Args:
        batch: Tuple of tensors `(x_seq, u_seq, e_seq, dx_seq)` from the DataLoader.
        alpha_dict: Scaling factors per monitored column.
        method: ODE solver method.

    Returns:
        Scalar loss tensor for the batch.
    """
    x_seq, u_seq, e_seq, _ = [b.to(self.device) for b in batch]
    true_vect, pred_vect, monitor_cols = self.ode_step(
        x_seq,
        u_seq,
        e_seq,
        method,
        alpha_dict,
    )

    loss = 0.0
    for i, col in enumerate(monitor_cols):
        if col in alpha_dict.keys():
            loss_fn = get_loss(col.loss_name)
            assert not torch.isnan(pred_vect[..., i]).any(), "NaN in pred_vect"
            assert not torch.isnan(true_vect[..., i]).any(), "NaN in true_vect"
            res = loss_fn(pred_vect[..., i], true_vect[..., i])
            loss += res

    if torch.isnan(loss) or torch.isinf(loss):
        print("NaN or Inf in loss!")

    return loss

get_or_create_model(load=False, load_loss=False)

Instantiate a new model or load existing checkpoints.

Parameters:

Name Type Description Default
load bool

Whether to attempt loading existing checkpoints.

False
load_loss bool

Whether to restore tracked best validation loss when loading.

False

Returns:

Type Description
FlightDynamicsModel

Initialized or restored FlightDynamicsModel instance.

Source code in src/node_fdm/ode_trainer.py
def get_or_create_model(
    self, load: bool = False, load_loss: bool = False
) -> FlightDynamicsModel:
    """Instantiate a new model or load existing checkpoints.

    Args:
        load: Whether to attempt loading existing checkpoints.
        load_loss: Whether to restore tracked best validation loss when loading.

    Returns:
        Initialized or restored `FlightDynamicsModel` instance.
    """
    self.best_val_loss = float("inf")
    if load and os.path.exists(self.model_dir / "meta.json"):
        model = self.load_best_checkpoint(load_loss=load_loss)
    else:
        print("Creating new model.")
        model = FlightDynamicsModel(
            self.architecture,
            self.stats_dict,
            self.model_cols,
            model_params=self.model_params,
        ).to(self.device)
    return model

load_best_checkpoint(load_loss=False)

Create and populate a model from saved checkpoints.

Parameters:

Name Type Description Default
load_loss bool

Whether to restore tracked best validation loss.

False

Returns:

Type Description
FlightDynamicsModel

Model with layer weights loaded when available.

Source code in src/node_fdm/ode_trainer.py
def load_best_checkpoint(self, load_loss: bool = False) -> FlightDynamicsModel:
    """Create and populate a model from saved checkpoints.

    Args:
        load_loss: Whether to restore tracked best validation loss.

    Returns:
        Model with layer weights loaded when available.
    """
    model = FlightDynamicsModel(
        self.architecture,
        self.stats_dict,
        self.model_cols,
        model_params=self.model_params,
    ).to(self.device)

    for name in model.layers_name:
        checkpoint = self.load_layer_checkpoint(name)
        if checkpoint is not None:
            model.layers_dict[name].load_state_dict(
                checkpoint["layer_state"], strict=False
            )
            best_val_loss = checkpoint.get("best_val_loss", float("inf"))
            self.epoch = checkpoint.get("epoch", 0)
        else:
            best_val_loss = float("inf")
            self.epoch = 0

    if load_loss:
        self.best_val_loss = best_val_loss

    print("Best val loss per layer:", self.best_val_loss)
    print(f"Loaded modular model from {self.model_dir}")

    return model

load_layer_checkpoint(layer_name)

Load checkpoint dictionary for a specific layer if available.

Parameters:

Name Type Description Default
layer_name str

Name of the layer to load.

required

Returns:

Type Description
Optional[Dict[str, Any]]

Checkpoint dictionary if found, otherwise None.

Source code in src/node_fdm/ode_trainer.py
def load_layer_checkpoint(self, layer_name: str) -> Optional[Dict[str, Any]]:
    """Load checkpoint dictionary for a specific layer if available.

    Args:
        layer_name: Name of the layer to load.

    Returns:
        Checkpoint dictionary if found, otherwise None.
    """
    path = os.path.join(self.model_dir, f"{layer_name}.pt")
    if not os.path.exists(path):
        print(f"No checkpoint found for layer {layer_name}, skipping load.")
        return None
    else:
        print(f"checkpoint found for layer {layer_name}")
    checkpoint = torch.load(path, map_location=self.device)
    return checkpoint

norm_vect(vect, col)

Normalize tensor using stored statistics for a column.

Parameters:

Name Type Description Default
vect Tensor

Tensor to normalize.

required
col Any

Column identifier used to fetch statistics.

required

Returns:

Type Description
Tensor

Normalized tensor.

Source code in src/node_fdm/ode_trainer.py
def norm_vect(self, vect: torch.Tensor, col: Any) -> torch.Tensor:
    """Normalize tensor using stored statistics for a column.

    Args:
        vect: Tensor to normalize.
        col: Column identifier used to fetch statistics.

    Returns:
        Normalized tensor.
    """
    return (vect - self.stats_dict[col]["mean"]) / (
        self.stats_dict[col]["std"] + 1e-3
    )

ode_step(x_seq, u_seq, e_seq, method, alpha_dict)

Integrate one ODE step and return true/predicted trajectories.

Parameters:

Name Type Description Default
x_seq Tensor

State sequences for the batch.

required
u_seq Tensor

Control sequences for the batch.

required
e_seq Tensor

Environment sequences for the batch.

required
method str

ODE solver method passed to odeint.

required
alpha_dict Dict[Any, float]

Scaling factors per monitored column.

required

Returns:

Type Description
Tuple[Tensor, Tensor, Sequence[Any]]

Tuple of (true trajectories, predicted trajectories, monitored columns).

Source code in src/node_fdm/ode_trainer.py
def ode_step(
    self,
    x_seq: torch.Tensor,
    u_seq: torch.Tensor,
    e_seq: torch.Tensor,
    method: str,
    alpha_dict: Dict[Any, float],
) -> Tuple[torch.Tensor, torch.Tensor, Sequence[Any]]:
    """Integrate one ODE step and return true/predicted trajectories.

    Args:
        x_seq: State sequences for the batch.
        u_seq: Control sequences for the batch.
        e_seq: Environment sequences for the batch.
        method: ODE solver method passed to `odeint`.
        alpha_dict: Scaling factors per monitored column.

    Returns:
        Tuple of (true trajectories, predicted trajectories, monitored columns).
    """
    seq_len = x_seq.shape[1]

    assert not torch.isnan(x_seq).any(), "NaN in x_seq"
    assert not torch.isnan(u_seq).any(), "NaN in u_seq"
    assert not torch.isnan(e_seq).any(), "NaN in e_seq"

    x0 = x_seq[:, 0, :]

    t_grid = torch.arange(
        0, seq_len * self.step, self.step, dtype=torch.float32, device=self.device
    )

    func = BatchNeuralODE(self.model, u_seq, e_seq, t_grid)

    odeint(func, x0, t_grid, method=method)

    vects = torch.cat([x_seq, u_seq, e_seq], dim=2)
    vect_dict = {
        col: vects[..., i].unsqueeze(-1)
        for i, col in enumerate(
            self.x_cols + self.u_cols + self.e0_cols + self.e_cols
        )
    }

    vects_dict = dict()

    monitor_cols = self.x_cols + self.e_cols

    for case in ["true", "pred"]:
        if case == "pred":
            vect_list = [
                self.model.history[col].unsqueeze(-1) for col in monitor_cols
            ]
        else:
            vect_list = [vect_dict[col][:, 1:] for col in monitor_cols]

        vects_dict[case] = self.cat_to_dict_vects(
            vect_list,
            monitor_cols,
            alpha_dict=alpha_dict,
        )
    true_vect = torch.cat([vects_dict["true"][col] for col in monitor_cols], dim=2)
    pred_vect = torch.cat([vects_dict["pred"][col] for col in monitor_cols], dim=2)
    return true_vect, pred_vect, monitor_cols

save_layer_checkpoint(layer_name, epoch)

Save checkpoint for an individual layer.

Parameters:

Name Type Description Default
layer_name str

Name of the layer to checkpoint.

required
epoch int

Current epoch offset for tracking.

required
Source code in src/node_fdm/ode_trainer.py
def save_layer_checkpoint(self, layer_name: str, epoch: int) -> None:
    """Serialize one layer's weights plus training state to disk.

    The checkpoint is written to ``<model_dir>/<layer_name>.pt`` and bundles
    the layer's parameters, the optimizer state, the best validation loss
    seen so far, and the cumulative epoch count so training can resume.

    Args:
        layer_name: Name of the layer to checkpoint.
        epoch: Current epoch offset for tracking.
    """
    checkpoint = {
        "layer_state": self.model.layers_dict[layer_name].state_dict(),
        "optimizer_state": self.optimizer.state_dict(),
        "best_val_loss": self.best_val_loss,
        # Cumulative count: epochs completed before this run plus the offset.
        "epoch": self.epoch + epoch,
    }
    torch.save(checkpoint, self.model_dir / f"{layer_name}.pt")

save_meta()

Persist training metadata and statistics to disk.

Creates or updates meta.json within the model directory.

Source code in src/node_fdm/ode_trainer.py
def save_meta(self) -> None:
    """Persist training metadata and statistics to disk.

    Creates or updates `meta.json` within the model directory. Stat keys
    are coerced to `str` so the payload is JSON-serializable (keys may be
    non-string objects such as tuples or column identifiers).
    """
    saved_stats_dict = {str(col): value for col, value in self.stats_dict.items()}

    meta_dict = {
        "architecture_name": self.architecture_name,
        "model_params": self.model_config["model_params"],
        "step": self.model_config["step"],
        "shift": self.model_config["shift"],
        "lr": self.model_config["lr"],
        "seq_len": self.model_config["seq_len"],
        "batch_size": self.model_config["batch_size"],
        "stats_dict": saved_stats_dict,
    }
    meta_path = self.model_dir / "meta.json"
    with open(meta_path, "w") as f:
        json.dump(meta_dict, f, indent=4)
    # Replaces a leftover bare debug print of the path with an informative
    # log line, consistent with the other save messages in this file.
    print(f"✅ Saved training metadata to {meta_path}")

save_model(epoch)

Save checkpoints for all layers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `epoch` | `int` | Epoch index used when saving checkpoints. | *required* |
Source code in src/node_fdm/ode_trainer.py
def save_model(self, epoch: int) -> None:
    """Write a checkpoint file for every layer of the model.

    Args:
        epoch: Epoch index used when saving checkpoints.
    """
    checkpoint = self.save_layer_checkpoint
    for layer_name in self.model.layers_name:
        checkpoint(layer_name, epoch)

train(epochs=800, batch_size=512, val_batch_size=10000, scheduler=None, method='rk4', alpha_dict=None)

Train the ODE model and persist checkpoints/metrics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `epochs` | `int` | Number of training epochs. | `800` |
| `batch_size` | `int` | Training batch size. | `512` |
| `val_batch_size` | `int` | Validation batch size. | `10000` |
| `scheduler` | `Optional[Any]` | Optional learning-rate scheduler. | `None` |
| `method` | `str` | ODE solver method. | `'rk4'` |
| `alpha_dict` | `Optional[Dict[Any, float]]` | Optional scaling factors per monitored column. | `None` |
Source code in src/node_fdm/ode_trainer.py
def train(
    self,
    epochs: int = 800,
    batch_size: int = 512,
    val_batch_size: int = 10000,
    scheduler: Optional[Any] = None,
    method: str = "rk4",
    alpha_dict: Optional[Dict[Any, float]] = None,
) -> None:
    """Train the ODE model and persist checkpoints/metrics.

    Runs the epoch loop, checkpointing whenever the validation loss
    improves, then writes the per-epoch loss log (CSV) and the training
    curve (PNG) into the model directory.

    Args:
        epochs: Number of training epochs.
        batch_size: Training batch size.
        val_batch_size: Validation batch size.
        scheduler: Optional learning-rate scheduler; stepped with the
            average validation loss each epoch (ReduceLROnPlateau-style).
        method: ODE solver method.
        alpha_dict: Optional scaling factors per monitored column.
            Defaults to 1.0 for every state column.
    """
    self.train_loader = DataLoader(
        self.train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=self.num_workers,
    )
    self.val_loader = DataLoader(
        self.val_dataset,
        batch_size=val_batch_size,
        shuffle=False,
        num_workers=self.num_workers,
    )

    if alpha_dict is None:
        # NOTE(review): the default only covers state columns (x_cols);
        # monitored columns elsewhere also include e_cols — confirm the
        # loss helper tolerates missing keys.
        alpha_dict = {col: 1.0 for col in self.x_cols}

    self.stats_dict = self.train_dataset.stats_dict

    losses = []
    # pathlib `/` joins for consistency with the checkpoint/meta paths.
    loss_csv_path = self.model_dir / "training_losses.csv"
    fig_path = self.model_dir / "training_curve.png"

    for epoch in range(epochs):
        avg_train_loss = self._run_train_epoch(alpha_dict, method)
        avg_val_loss = self._run_validation_epoch(alpha_dict, method)

        if scheduler is not None:
            scheduler.step(avg_val_loss)

        losses.append(
            {
                "epoch": epoch + 1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
            }
        )

        print(
            f"Epoch {epoch+1}/{epochs} | train loss: {avg_train_loss:.5f} | val loss: {avg_val_loss:.5f}"
        )

        # Checkpoint only when validation improves.
        if avg_val_loss < self.best_val_loss:
            print(f"  New best validation loss: {avg_val_loss:.5f}. Saving model.")
            self.best_val_loss = avg_val_loss
            self.save_model(epoch)

    self._save_loss_artifacts(losses, loss_csv_path, fig_path)

def _run_train_epoch(self, alpha_dict: Dict[Any, float], method: str) -> float:
    """Run one optimization pass over the training loader; return mean batch loss."""
    self.model.train()
    total_loss, total_batches = 0.0, 0
    for batch in self.train_loader:
        loss = self.compute_loss_ode_step(
            batch, alpha_dict=alpha_dict, method=method
        )
        self.optimizer.zero_grad()
        loss.backward()
        # Clip gradients to keep the ODE rollout from destabilizing training.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
        self.optimizer.step()
        total_loss += loss.item()
        total_batches += 1
    return total_loss / total_batches

def _run_validation_epoch(self, alpha_dict: Dict[Any, float], method: str) -> float:
    """Run one no-grad pass over the validation loader; return mean batch loss."""
    self.model.eval()
    val_loss, val_batches = 0.0, 0
    with torch.no_grad():
        for batch in self.val_loader:
            loss = self.compute_loss_ode_step(
                batch, alpha_dict=alpha_dict, method=method
            )
            val_loss += loss.item()
            val_batches += 1
    return val_loss / val_batches

def _save_loss_artifacts(self, losses, loss_csv_path, fig_path) -> None:
    """Write the per-epoch loss log to CSV and the semilog loss curves to PNG."""
    df_losses = pd.DataFrame(losses)
    df_losses.to_csv(loss_csv_path, index=False)
    print(f"✅ Saved training log to {loss_csv_path}")

    plt.figure(figsize=(7, 4))
    plt.semilogy(
        df_losses["epoch"],
        df_losses["train_loss"],
        label="Training loss",
        color="#1f77b4",
        linewidth=2,
    )
    plt.semilogy(
        df_losses["epoch"],
        df_losses["val_loss"],
        label="Validation loss",
        color="#ff7f0e",
        linewidth=2,
        linestyle="--",
    )

    plt.title("Training and validation losses", fontsize=13)
    plt.xlabel("Epoch", fontsize=11)
    plt.ylabel("Loss (log scale)", fontsize=11)
    plt.grid(True, which="both", linestyle=":", linewidth=0.8, alpha=0.7)
    plt.legend(frameon=False, fontsize=10)
    plt.tight_layout()
    plt.savefig(fig_path, dpi=200)
    plt.close()

    print(f"✅ Saved training curve to {fig_path}")