Module OPTIMA.lightning.inputs
Collection of classes and functions specific to the handling of the training input for Lightning models.
Expand source code
# -*- coding: utf-8 -*-
"""Collection of classes and functions specific to the handling of the training input for Lightning models."""
import lightning as pl
import torch
from typing import Optional, Tuple, Union, Any
from types import ModuleType
import numpy as np
import OPTIMA.builtin.inputs
model_config_type = dict[str, Union[int, float, str, Any]]
class LightningDataset(torch.utils.data.Dataset):
    """Tensor dataset with in-memory batching to improve performance.

    Each index corresponds to a whole batch served as contiguous tensor slices, avoiding the per-event indexing and
    collation overhead of the default ``TensorDataset`` + dataloader combination.
    """

    def __init__(
        self, inputs: torch.Tensor, targets: torch.Tensor, weights: Optional[torch.Tensor] = None, batch_size: int = 1
    ) -> None:
        """Constructor of ``LightningDataset``.

        Parameters
        ----------
        inputs : torch.Tensor
            Torch tensor of input features.
        targets : torch.Tensor
            Torch tensor of target labels.
        weights : Optional[torch.Tensor]
            Torch tensor of event weights. (Default value = None)
        batch_size : int
            The batch size of the dataset. (Default value = 1)
        """
        self.inputs = inputs
        self.targets = targets
        self.weights = weights
        self.batch_size = batch_size
        # number of complete batches; a trailing partial batch is dropped (like drop_last=True in a dataloader)
        self.len = len(inputs) // batch_size

    def __len__(self):
        """Return the number of full batches contained in the dataset.

        Returns
        -------
        int
            The batch count; events beyond the last complete batch are not served.
        """
        return self.len

    def __getitem__(
        self, idx: int
    ) -> Union[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """Fetch the batch with index ``idx`` as tensor slices.

        Parameters
        ----------
        idx : int
            Index of the batch to retrieve.

        Returns
        -------
        Union[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]
            ``(inputs, targets, weights)`` slices of length ``batch_size`` when weights are present, otherwise
            ``(inputs, targets)``.
        """
        window = slice(self.batch_size * idx, self.batch_size * (idx + 1))
        batch = (self.inputs[window], self.targets[window])
        if self.weights is not None:
            batch = batch + (self.weights[window],)
        return batch
def get_dataset(
    run_config: ModuleType, model_config: Optional[model_config_type] = None, *numpy_arrays: np.ndarray
) -> torch.utils.data.Dataset:
    """Helper function to build a Lightning dataset from numpy arrays provided as positional arguments.

    From the provided numpy arrays, Torch tensors are built.

    If a ``LightningDataset`` class is defined in the `run-config`, its constructor is provided with the
    ``model_config`` and the ``torch.Tensor`` arguments are passed as positional arguments. Otherwise, a Torch
    ``TensorDataset`` is built.

    Parameters
    ----------
    run_config : ModuleType
        Reference to the imported run-config file.
    model_config : Optional[model_config_type]
        The model-config of the Lightning model this dataset is to be built for. This is only provided to the
        ``LightningDataset``-class in the `run-config` (if present). (Default value = None)
    *numpy_arrays : np.ndarray
        The numpy arrays to be combined into a dataset.

    Returns
    -------
    torch.utils.data.Dataset
        The built dataset.
    """
    # each array is copied first: pytorch currently only supports converting from writable numpy arrays, and the
    # copy is guaranteed to be writable
    tensors = [torch.from_numpy(arr.copy()).type(torch.float) for arr in numpy_arrays]
    if hasattr(run_config, "LightningDataset"):
        return run_config.LightningDataset(model_config, *tensors)
    return torch.utils.data.TensorDataset(*tensors)
def get_dataloader(
    run_config: ModuleType, model_config: model_config_type, dataset: torch.utils.data.Dataset
) -> torch.utils.data.DataLoader:
    """Helper function to build a dataloader from a torch dataset.

    By default, a ``torch.utils.data.DataLoader`` is instantiated, with the ``batch_size`` set to the
    ``"batch_size"`` entry in the provided ``model_config``. To overwrite this behavior, a ``Dataloader``-class can
    be defined in the `run-config`. It is provided with the ``dataset`` and the ``model_config``.

    Parameters
    ----------
    run_config : ModuleType
        Reference to the imported run-config file.
    model_config : model_config_type
        The model-config of the Lightning model this dataset is to be built for. If a ``Dataloader``-class is
        defined in the `run-config`, it is provided directly to the constructor. If this is not the case, it is
        expected to contain a ``"batch_size"``-entry that is provided to the ``torch.utils.data.DataLoader``.
    dataset : torch.utils.data.Dataset
        The dataset to be wrapped in a dataloader.

    Returns
    -------
    torch.utils.data.DataLoader
        The built dataloader.
    """
    # a DataLoader class in the run-config takes precedence over the torch default
    if hasattr(run_config, "DataLoader"):
        return run_config.DataLoader(dataset, model_config)
    return torch.utils.data.DataLoader(dataset, batch_size=model_config["batch_size"])
class DefaultDataModule(pl.LightningDataModule):
    """A LightningDataModule that provides default functionality."""

    def __init__(
        self,
        inputs_train: np.ndarray,
        targets_train: np.ndarray,
        inputs_val: np.ndarray,
        targets_val: np.ndarray,
        run_config: ModuleType,
        model_config: model_config_type,
        weights_train: Optional[np.ndarray] = None,
        weights_val: Optional[np.ndarray] = None,
        inputs_test: Optional[np.ndarray] = None,
        targets_test: Optional[np.ndarray] = None,
        weights_test: Optional[np.ndarray] = None,
        input_handler: Optional[OPTIMA.builtin.inputs.InputHandler] = None,
    ):
        """Constructor of ``DefaultDataModule``.

        It represents the easiest possible implementation of a ``LightningDataModule``. The datasets are built by
        calling the ``get_dataset``-function. The dataloaders are provided by ``get_dataloader``.

        Parameters
        ----------
        inputs_train : np.ndarray
            Numpy array of training input features.
        targets_train : np.ndarray
            Numpy array of training target labels.
        inputs_val : np.ndarray
            Numpy array of validation input features.
        targets_val : np.ndarray
            Numpy array of validation target labels.
        run_config : ModuleType
            Reference to the imported run-config file.
        model_config : model_config_type
            The model-config of the Lightning model this data module is to be built for.
        weights_train : Optional[np.ndarray]
            Numpy array of training event weights. (Default value = None)
        weights_val : Optional[np.ndarray]
            Numpy array of validation event weights. (Default value = None)
        inputs_test : Optional[np.ndarray]
            Numpy array of testing input features. (Default value = None)
        targets_test : Optional[np.ndarray]
            Numpy array of testing target labels. (Default value = None)
        weights_test : Optional[np.ndarray]
            Numpy array of testing event weights. (Default value = None)
        input_handler : Optional[OPTIMA.builtin.inputs.InputHandler]
            Instance of the ``InputHandler``-class. While not needed for the built-in DataModule, a DataModule defined
            in the `run-config` may need to know the inputs they are provided with, and will, thus, be provided with the
            ``input_handler``. (Default value = None)
        """
        super().__init__()
        self.inputs_train = inputs_train
        self.targets_train = targets_train
        self.weights_train = weights_train
        self.inputs_val = inputs_val
        self.targets_val = targets_val
        self.weights_val = weights_val
        self.inputs_test = inputs_test
        self.targets_test = targets_test
        self.weights_test = weights_test
        self.run_config = run_config
        self.model_config = model_config
        # the datasets are created lazily in setup(); they are None until the corresponding stage is set up
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def prepare_data(self):
        """Do nothing; all data is already provided in memory as numpy arrays, so there is nothing to download."""
        pass

    def setup(self, stage: Optional[str] = None):
        """Build the datasets required for the given stage via ``get_dataset``.

        Parameters
        ----------
        stage : Optional[str]
            The Lightning stage to set up; ``"fit"`` builds the training and validation datasets, ``"test"`` the
            test dataset (if test data was provided), ``None`` builds all available datasets.
            (Default value = None)
        """
        # get the datasets
        if stage == "fit" or stage is None:
            self.train_dataset = get_dataset(
                self.run_config, self.model_config, self.inputs_train, self.targets_train, self.weights_train
            )
            self.val_dataset = get_dataset(
                self.run_config, self.model_config, self.inputs_val, self.targets_val, self.weights_val
            )
        # parentheses are required here: without them, `and` binds tighter than `or`, so stage == "test" would
        # bypass the None-checks and get_dataset could be called with missing test data
        if (stage == "test" or stage is None) and self.inputs_test is not None and self.targets_test is not None:
            self.test_dataset = get_dataset(
                self.run_config, self.model_config, self.inputs_test, self.targets_test, self.weights_test
            )

    def train_dataloader(self):
        """Wrap the training dataset (built in ``setup``) in a dataloader via ``get_dataloader``.

        Returns
        -------
        torch.utils.data.DataLoader
            The dataloader for the training dataset.
        """
        dataloader = get_dataloader(self.run_config, self.model_config, self.train_dataset)
        return dataloader

    def val_dataloader(self):
        """Wrap the validation dataset (built in ``setup``) in a dataloader via ``get_dataloader``.

        Returns
        -------
        torch.utils.data.DataLoader
            The dataloader for the validation dataset.
        """
        dataloader = get_dataloader(self.run_config, self.model_config, self.val_dataset)
        return dataloader

    def test_dataloader(self):
        """Wrap the test dataset (built in ``setup``) in a dataloader via ``get_dataloader``.

        Returns
        -------
        torch.utils.data.DataLoader
            The dataloader for the test dataset.
        """
        dataloader = get_dataloader(self.run_config, self.model_config, self.test_dataset)
        return dataloader
Functions
def get_dataloader(run_config: module, model_config: dict[str, typing.Union[int, float, str, typing.Any]], dataset: torch.utils.data.dataset.Dataset) ‑> torch.utils.data.dataloader.DataLoader-
Helper function to build a dataloader from a torch dataset.
By default, a `torch.utils.data.DataLoader` is instantiated, with the `batch_size` set to the `"batch_size"` entry in the provided `model_config`. To overwrite this behavior, a `Dataloader`-class can be defined in the run-config. It is provided with the `dataset` and the `model_config`.
Parameters
run_config:ModuleType- Reference to the imported run-config file.
model_config:model_config_type- The model-config of the Lightning model this dataset is to be built for. If a
Dataloader-class is defined in therun-config, it is provided directly to the constructor. If this is not the case, it is expected to contain a"batch_size"-entry that is provided to thetorch.utils.data.DataLoader. dataset:torch.utils.data.Dataset- The dataset to be wrapped in a dataloader.
Returns
torch.utils.data.DataLoader- The built dataloader.
Expand source code
def get_dataloader( run_config: ModuleType, model_config: model_config_type, dataset: torch.utils.data.Dataset ) -> torch.utils.data.DataLoader: """Helper function to build a dataloader from a torch dataset. By default, a ``torch.utils.data.DataLoader`` is instantiated, with the ``batch_size`` set to the ``"batch_size"`` entry in the provided ``model_config``. To overwrite this behavior, a ``Dataloader``-class can be defined in the `run-config`. It is provided with the ``dataset`` and the ``model_config``. Parameters ---------- run_config : ModuleType Reference to the imported run-config file. model_config : model_config_type The model-config of the Lightning model this dataset is to be built for. If a ``Dataloader``-class is defined in the `run-config`, it is provided directly to the constructor. If this is not the case, it is expected to contain a ``"batch_size"``-entry that is provided to the ``torch.utils.data.DataLoader``. dataset : torch.utils.data.Dataset The dataset to be wrapped in a dataloader. Returns ------- torch.utils.data.DataLoader The built dataloader. """ if hasattr(run_config, "DataLoader"): dataloader = run_config.DataLoader(dataset, model_config) else: dataloader = torch.utils.data.DataLoader(dataset, batch_size=model_config["batch_size"]) return dataloader def get_dataset(run_config: module, model_config: Optional[dict[str, typing.Union[int, float, str, typing.Any]]] = None, *numpy_arrays: numpy.ndarray) ‑> torch.utils.data.dataset.Dataset-
Helper function to build a Lightning dataset from numpy arrays provided as positional arguments.
From the provided numpy arrays, Torch tensors are built.
If a `LightningDataset` class is defined in the run-config, its constructor is provided with the `model_config` and the `torch.Tensor` arguments are passed as positional arguments. Otherwise, a Torch `TensorDataset` is built.
Parameters
run_config:ModuleType- Reference to the imported run-config file.
model_config:Optional[model_config_type]- The model-config of the Lightning model this dataset is to be built for. This is only provided to the
LightningDataset-class in therun-config(if present). (Default value = None) *numpy_arrays:np.ndarray- The numpy arrays to be combined into a dataset.
Returns
torch.utils.data.Dataset- The built dataset.
Expand source code
def get_dataset( run_config: ModuleType, model_config: Optional[model_config_type] = None, *numpy_arrays: np.ndarray ) -> torch.utils.data.Dataset: """Helper function to build a Lightning dataset from numpy arrays provided as positional arguments. From the provided numpy arrays, Torch tensors are built. If a ``LightningDataset`` class is defined in the `run-config`, its constructor is provided with the ``model_config`` and the ``torch.Tensor`` arguments are passed as positional arguments. Otherwise, a Torch ``TensorDataset`` is built. Parameters ---------- run_config : ModuleType Reference to the imported run-config file. model_config : Optional[model_config_type] The model-config of the Lightning model this dataset is to be built for. This is only provided to the ``LightningDataset``-class in the `run-config` (if present). (Default value = None) *numpy_arrays : np.ndarray The numpy arrays to be combined into a dataset. Returns ------- torch.utils.data.Dataset The built dataset. """ tensors = [] for array in numpy_arrays: # make a copy of the array to ensure it is writable (pytorch currently only supports converting from writable # numpy arrays) array = array.copy() tensors.append(torch.from_numpy(array).type(torch.float)) if hasattr(run_config, "LightningDataset"): dataset = run_config.LightningDataset(model_config, *tensors) else: dataset = torch.utils.data.TensorDataset(*tensors) return dataset
Classes
class DefaultDataModule (inputs_train: numpy.ndarray, targets_train: numpy.ndarray, inputs_val: numpy.ndarray, targets_val: numpy.ndarray, run_config: module, model_config: dict[str, typing.Union[int, float, str, typing.Any]], weights_train: Optional[numpy.ndarray] = None, weights_val: Optional[numpy.ndarray] = None, inputs_test: Optional[numpy.ndarray] = None, targets_test: Optional[numpy.ndarray] = None, weights_test: Optional[numpy.ndarray] = None, input_handler: Optional[InputHandler] = None)-
A LightningDataModule that provides default functionality.
Constructor of `DefaultDataModule`.
It represents the easiest possible implementation of a `LightningDataModule`. The datasets are built by calling the `get_dataset()`-function. The dataloaders are provided by `get_dataloader()`.
Parameters
inputs_train:np.ndarray- Numpy array of training input features.
targets_train:np.ndarray- Numpy array of training target labels.
inputs_val:np.ndarray- Numpy array of validation input features.
targets_val:np.ndarray- Numpy array of validation target labels.
run_config:ModuleType- Reference to the imported run-config file.
model_config:model_config_type- The model-config of the Lightning model this data module is to be built for.
weights_train:Optional[np.ndarray]- Numpy array of training event weights. (Default value = None)
weights_val:Optional[np.ndarray]- Numpy array of validation event weights. (Default value = None)
inputs_test:Optional[np.ndarray]- Numpy array of testing input features. (Default value = None)
targets_test:Optional[np.ndarray]- Numpy array of testing target labels. (Default value = None)
weights_test:Optional[np.ndarray]- Numpy array of testing event weights. (Default value = None)
input_handler:Optional[InputHandler]- Instance of the
InputHandler-class. While not needed for the built-in DataModule, a DataModule defined in therun-configmay need to know the inputs they are provided with, and will, thus, be provided with theinput_handler. (Default value = None)
Expand source code
class DefaultDataModule(pl.LightningDataModule): """A LightningDataModule that provides default functionality.""" def __init__( self, inputs_train: np.ndarray, targets_train: np.ndarray, inputs_val: np.ndarray, targets_val: np.ndarray, run_config: ModuleType, model_config: model_config_type, weights_train: Optional[np.ndarray] = None, weights_val: Optional[np.ndarray] = None, inputs_test: Optional[np.ndarray] = None, targets_test: Optional[np.ndarray] = None, weights_test: Optional[np.ndarray] = None, input_handler: Optional[OPTIMA.builtin.inputs.InputHandler] = None, ): """Constructor of ``DefaultDataModule``. It represents the easiest possible implementation of a ``LightningDataModule``. The datasets are built by calling the ``get_dataset``-function. The dataloaders are provided by ``get_dataloader``. Parameters ---------- inputs_train : np.ndarray Numpy array of training input features. targets_train : np.ndarray Numpy array of training target labels. inputs_val : np.ndarray Numpy array of validation input features. targets_val : np.ndarray Numpy array of validation target labels. run_config : ModuleType Reference to the imported run-config file. model_config : model_config_type The model-config of the Lightning model this data module is to be built for. weights_train : Optional[np.ndarray] Numpy array of training event weights. (Default value = None) weights_val : Optional[np.ndarray] Numpy array of validation event weights. (Default value = None) inputs_test : Optional[np.ndarray] Numpy array of testing input features. (Default value = None) targets_test : Optional[np.ndarray] Numpy array of testing target labels. (Default value = None) weights_test : Optional[np.ndarray] Numpy array of testing event weights. (Default value = None) input_handler : Optional[OPTIMA.builtin.inputs.InputHandler] Instance of the ``InputHandler``-class. 
While not needed for the built-in DataModule, a DataModule defined in the `run-config` may need to know the inputs they are provided with, and will, thus, be provided with the ``input_handler``. (Default value = None) """ super().__init__() self.inputs_train = inputs_train self.targets_train = targets_train self.weights_train = weights_train self.inputs_val = inputs_val self.targets_val = targets_val self.weights_val = weights_val self.inputs_test = inputs_test self.targets_test = targets_test self.weights_test = weights_test self.run_config = run_config self.model_config = model_config self.train_dataset = None self.val_dataset = None self.test_dataset = None def prepare_data(self): """_summary_. Returns ------- _type_ _description_ """ pass def setup(self, stage: str = None): """_summary_. Parameters ---------- stage : str _description_ (Default value = None) Returns ------- _type_ _description_ """ # get the datasets if stage == "fit" or stage is None: self.train_dataset = get_dataset( self.run_config, self.model_config, self.inputs_train, self.targets_train, self.weights_train ) self.val_dataset = get_dataset( self.run_config, self.model_config, self.inputs_val, self.targets_val, self.weights_val ) if stage == "test" or stage is None and self.inputs_test is not None and self.targets_test is not None: self.test_dataset = get_dataset( self.run_config, self.model_config, self.inputs_test, self.targets_test, self.weights_test ) def train_dataloader(self): """_summary_. Returns ------- _type_ _description_ """ dataloader = get_dataloader(self.run_config, self.model_config, self.train_dataset) return dataloader def val_dataloader(self): """_summary_. Returns ------- _type_ _description_ """ dataloader = get_dataloader(self.run_config, self.model_config, self.val_dataset) return dataloader def test_dataloader(self): """_summary_. 
Returns ------- _type_ _description_ """ dataloader = get_dataloader(self.run_config, self.model_config, self.test_dataset) return dataloaderAncestors
- lightning.pytorch.core.datamodule.LightningDataModule
- lightning.pytorch.core.hooks.DataHooks
- lightning.pytorch.core.mixins.hparams_mixin.HyperparametersMixin
Methods
def prepare_data(self)-
summary.
Returns
_type_- description
Expand source code
def prepare_data(self): """_summary_. Returns ------- _type_ _description_ """ pass def setup(self, stage: str = None)-
summary.
Parameters
stage:str- description (Default value = None)
Returns
_type_- description
Expand source code
def setup(self, stage: str = None): """_summary_. Parameters ---------- stage : str _description_ (Default value = None) Returns ------- _type_ _description_ """ # get the datasets if stage == "fit" or stage is None: self.train_dataset = get_dataset( self.run_config, self.model_config, self.inputs_train, self.targets_train, self.weights_train ) self.val_dataset = get_dataset( self.run_config, self.model_config, self.inputs_val, self.targets_val, self.weights_val ) if stage == "test" or stage is None and self.inputs_test is not None and self.targets_test is not None: self.test_dataset = get_dataset( self.run_config, self.model_config, self.inputs_test, self.targets_test, self.weights_test ) def test_dataloader(self)-
summary.
Returns
_type_- description
Expand source code
def test_dataloader(self): """_summary_. Returns ------- _type_ _description_ """ dataloader = get_dataloader(self.run_config, self.model_config, self.test_dataset) return dataloader def train_dataloader(self)-
summary.
Returns
_type_- description
Expand source code
def train_dataloader(self): """_summary_. Returns ------- _type_ _description_ """ dataloader = get_dataloader(self.run_config, self.model_config, self.train_dataset) return dataloader def val_dataloader(self)-
summary.
Returns
_type_- description
Expand source code
def val_dataloader(self): """_summary_. Returns ------- _type_ _description_ """ dataloader = get_dataloader(self.run_config, self.model_config, self.val_dataset) return dataloader
class LightningDataset (inputs: torch.Tensor, targets: torch.Tensor, weights: Optional[torch.Tensor] = None, batch_size: int = 1)-
Tensor dataset with in-memory batching to improve performance.
Constructor of
LightningDataset.Parameters
inputs:torch.Tensor- Torch tensor of input features.
targets:torch.Tensor- Torch tensor of target labels.
weights:Optional[torch.Tensor]- Torch tensor of event weights. (Default value = None)
batch_size:int- The batch size of the dataset. (Default value = 1)
Expand source code
class LightningDataset(torch.utils.data.Dataset): """Tensor dataset with in-memory batching to improve performance.""" def __init__( self, inputs: torch.Tensor, targets: torch.Tensor, weights: Optional[torch.Tensor] = None, batch_size: int = 1 ) -> None: """Constructor of ``LightningDataset``. Parameters ---------- inputs : torch.Tensor Torch tensor of input features. targets : torch.Tensor Torch tensor of target labels. weights : Optional[torch.Tensor] Torch tensor of event weights. (Default value = None) batch_size : int The batch size of the dataset. (Default value = 1) """ self.inputs = inputs self.targets = targets self.weights = weights self.len = len(inputs) // batch_size self.batch_size = batch_size def __len__(self): """_summary_. Returns ------- _type_ _description_ """ return self.len def __getitem__( self, idx: int ) -> Union[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: """_summary_. Parameters ---------- idx : int _description_ Returns ------- Union[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]] _description_ """ start = self.batch_size * idx end = self.batch_size * (idx + 1) if self.weights is not None: return self.inputs[start:end], self.targets[start:end], self.weights[start:end] else: return self.inputs[start:end], self.targets[start:end]Ancestors
- torch.utils.data.dataset.Dataset
- typing.Generic