Module OPTIMA.core.inputs
A module that provides functionality to handle the training inputs.
Expand source code
# -*- coding: utf-8 -*-
"""A module that provides functionality to handle the training inputs."""
from types import ModuleType
from typing import Optional, Union
import itertools
import os
from functools import partial
import numpy as np
import ray
import OPTIMA.builtin.inputs
train_val_splitting_type = tuple[Union[ray.ObjectRef, list[ray.ObjectRef]], Union[ray.ObjectRef, list[ray.ObjectRef]]]
train_val_test_splitting_type = tuple[
Union[ray.ObjectRef, list[ray.ObjectRef]],
Union[ray.ObjectRef, list[ray.ObjectRef]],
Union[ray.ObjectRef, list[ray.ObjectRef]],
]
def _event_nums_splitting_cond_kfold(
event_nums: np.ndarray,
run_config: ModuleType,
split: str = "val",
use_testing_set: bool = True,
fixed_testing_set: bool = True,
) -> list[np.ndarray]:
"""Does the k-fold event splitting based on the array of event numbers.
The following cases are distinguished:
- ``use_testing_set`` is ``False``: only a training/validation split is to be done, thus the only allowed
value of ``split`` is ``'val'``. The returned array is calculated using the condition:
``(event_nums - i + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N == 0``,
where ``i`` is varied between ``0`` and ``run_config.eventNums_splitting_N - 1``. This results in
``run_config.eventNums_splitting_N`` different folds.
- ``use_testing_set`` is ``True``:
- ``fixed_testing_set`` is ``True``: the same testing dataset is to be used for all folds. The conditions are:
- ``split == 'test'``: ``(event_nums + run_config.eventNums_splitting_offset_test) % run_config.eventNums_splitting_N == 0``
- ``split == 'val'``: ``(event_nums - i + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N == 0``,
where ``i`` is varied between ``0`` and ``run_config.eventNums_splitting_N - 1``. The iteration
``i == (run_config.eventNums_splitting_offset_val - run_config.eventNums_splitting_offset_test) % run_config.eventNums_splitting_N``
is skipped to ensure that the validation and testing datasets are always different.
This results in ``run_config.eventNums_splitting_N - 1`` different folds. Each event is either always
part of the testing dataset or exactly once part of the validation dataset and
``run_config.eventNums_splitting_N - 2`` times part of the training dataset.
- ``fixed_testing_set`` is ``False``: the subset used as the testing dataset is shifted in the same
way as the validation dataset, resulting in a different testing dataset for every fold. The conditions are:
- ``split == 'test'``: ``(event_nums - i + run_config.eventNums_splitting_offset_test) % run_config.eventNums_splitting_N == 0``
- ``split == 'val'``: ``(event_nums - i + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N == 0``
In both cases, ``i`` is varied between ``0`` and ``run_config.eventNums_splitting_N - 1``, resulting in
``run_config.eventNums_splitting_N`` different folds. Each event is exactly once part of the testing dataset,
exactly once part of the validation dataset and ``run_config.eventNums_splitting_N - 2`` times part of the
training dataset.
The return value is a list of boolean arrays of the same shape as ``event_nums``. Each list entry corresponds to a
fold.
Parameters
----------
event_nums : np.ndarray
1D array of integers giving the event number of each event.
run_config : ModuleType
Reference to the imported run-config file.
split : str
Specifies whether this is the training/validation split or the (training+validation)/testing split. Possible
values are ``'val'`` and ``'test'``. (Default value = 'val')
use_testing_set : bool
Specifies if a training / validation or a training / validation / testing split is to be done. (Default value = True)
fixed_testing_set : bool
Specifies if the same testing dataset should be used for all folds or if it should be varied like the
validation dataset. (Default value = True)
Returns
-------
list[np.ndarray]
List of boolean arrays, ``True`` where ``event_nums`` fulfills a condition.
"""
condition_list = []
if use_testing_set and fixed_testing_set:
if split == "val":
for i in range(run_config.eventNums_splitting_N):
# skip if validation set == test set, i.e. (EventNumber - i + val) % N == (EventNumber + test) % N
# <==> (-i + val) % N == test % N
if (
use_testing_set
and (run_config.eventNums_splitting_offset_val - i) % run_config.eventNums_splitting_N
== run_config.eventNums_splitting_offset_test % run_config.eventNums_splitting_N
):
continue
condition_list.append(
((event_nums - i) + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N
== 0
)
elif split == "test":
condition_list = [
(event_nums + run_config.eventNums_splitting_offset_test) % run_config.eventNums_splitting_N == 0
] * (run_config.eventNums_splitting_N - 1)
else:
raise NotImplementedError
else:
if split == "val":
for i in range(run_config.eventNums_splitting_N):
condition_list.append(
((event_nums - i) + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N
== 0
)
elif split == "test":
assert use_testing_set, "Split 'test' can only be used when use_testing_set is True."
for i in range(run_config.eventNums_splitting_N):
condition_list.append(
((event_nums - i) + run_config.eventNums_splitting_offset_test) % run_config.eventNums_splitting_N
== 0
)
else:
raise NotImplementedError
return condition_list
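# Illustration (hypothetical numbers, not part of OPTIMA): assume run_config defines
# eventNums_splitting_N = 5, eventNums_splitting_offset_val = 0 and
# eventNums_splitting_offset_test = 1, and _event_nums_splitting_cond_kfold is called with
# use_testing_set=True and fixed_testing_set=True. For split='test' every fold uses the same
# condition (event_nums + 1) % 5 == 0, i.e. event numbers 4, 9, 14, ... For split='val', i runs
# over 0..4 but i == 4 is skipped because (0 - 4) % 5 == 1 % 5, so that fold would coincide with
# the testing set; the remaining folds i = 0..3 select event numbers with event_nums % 5 == i.
# This gives the N - 1 = 4 folds described in the docstring above.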
def get_experiment_inputs(
run_config: ModuleType,
input_handler: OPTIMA.builtin.inputs.InputHandler,
output_dir: Optional[str] = None,
inputs_for_crossvalidation: bool = False,
disable_printing: bool = False,
) -> Union[
tuple[
train_val_test_splitting_type,
train_val_test_splitting_type,
train_val_test_splitting_type,
train_val_test_splitting_type,
],
tuple[
train_val_splitting_type,
train_val_splitting_type,
train_val_splitting_type,
train_val_splitting_type,
],
]:
"""Prepares the input data used for the training during the optimization and crossvalidation and copies them to Ray's object store.
This function depends on three functions that handle the loading and preprocessing (``get_inputs``), the splitting of
the dataset (``get_training_data``) and the plotting of the input variables (``plot_input_data``). If they are not
provided in the run-config, the defaults defined in ``OPTIMA.builtin.inputs`` are used. The expected behaviour of these
three functions is described in the corresponding documentation.
The input data is loaded by calling ``get_inputs`` and providing the ``run_config``, the desired number of events
and the input variables to include in the dataset. If the list of input variables is not specified in the `run-config`,
``None`` will be provided to ``get_inputs`` and it should use all available input variables. ``get_inputs`` is
expected to return an array of input features, an array of corresponding target labels, a 1D-array of event weights,
a 1D-array of normalized event weights and a 1D-array of event numbers. For all arrays, axis 0 is expected to separate
different events.
Once loaded, the inputs, targets, weights, normalized weights and event numbers are given to ``get_training_data`` to
be split into training, validation and (if requested) testing sets. The method of splitting and the sizes of the
respective datasets are controlled via the ``splitting_cond`` parameter of
``get_training_data``. Its value is controlled by various options discussed below. Depending on the value of
``inputs_for_crossvalidation``, which controls if a simple splitting or a k-fold splitting is to be done and is given to
``get_training_data`` as ``do_kfold``, ``get_training_data`` is expected to return tuples of arrays (if
``inputs_for_crossvalidation`` is ``False``) or tuples of lists of arrays (if ``inputs_for_crossvalidation`` is
``True``). Each tuple entry is expected to correspond to a different type of dataset (training/validation/testing),
thus is expected to have length ``2`` if no testing dataset is requested and ``3`` otherwise. If
``inputs_for_crossvalidation`` is ``True``, each list entry is expected to correspond to a different fold, and
subsequently all lists are expected to be of the same length.
If ``run_config.produce_inputs_plots`` is ``True`` and ``inputs_for_crossvalidation`` is ``False``, the
``plot_input_data``-function is called and the path to a subdirectory ``'inputs'`` in the provided ``output_dir``
is given as the directory to save the plots into.
Finally, the numpy arrays returned by ``get_training_data`` are copied to Ray's object store and the object references
are returned.
The behaviour of this function is controlled by various options that are expected to be present in the run-config:
- ``max_num_events``: controls the number of events to load from the dataset and will be provided as ``nevts`` to
``get_inputs``.
- ``use_testing_dataset``: if ``True``, the dataset will be split into training, validation and testing sets. If
``False``, only a training/validation split will be done.
- ``use_eventNums_splitting``: `bool` to choose if the dataset should be split randomly or based on the event numbers
- ``False``: the options ``run_config.validation_fraction`` and (if ``run_config.use_testing_dataset`` is ``True``)
``run_config.test_fraction`` are provided as ``splitting_cond`` to ``get_training_data``.
- ``True``: depending on ``inputs_for_crossvalidation``, a simple splitting or a k-fold splitting based on the
array of event numbers is performed.
- if ``inputs_for_crossvalidation`` is ``False``: a callable evaluating the condition
`(EventNumber + C_val) % N = 0` and (if ``use_testing_dataset`` is ``True``) a callable evaluating
`(EventNumber + C_test) % N = 0` is/are provided as ``splitting_cond`` to ``get_training_data``. `C_val`
and `C_test` are given by the options ``run_config.eventNums_splitting_offset_val`` and
``run_config.eventNums_splitting_offset_test`` and `N` is given by ``run_config.eventNums_splitting_N``.
- ``inputs_for_crossvalidation`` is ``True``: the callable(s) provided as ``splitting_cond`` to
``get_training_data`` return(s) a list of boolean arrays when given an array of event numbers. Each list
entry corresponds to a fold. The boolean arrays are calculated according to:
- if ``run_config.use_testing_dataset`` is ``True``:
- if ``run_config.fixed_testing_dataset`` is ``True``: a fixed testing dataset is used for all folds
while the remaining dataset is used for k-fold splitting to create `k = N-1` different training/
validation splits.
- testing dataset: `(EventNumber + C_test) % N = 0`
- validation dataset: `(EventNumber - i + C_val) % N = 0`, with
`0 <= i <= N-1` and `(-i + C_val) % N != C_test % N`
- ``run_config.fixed_testing_dataset`` is ``False``: the subset of the data used for testing is
shifted in the same way as the validation dataset, resulting in `k = N` different training/
validation/testing splits.
- testing dataset: `(EventNumber - i + C_test) % N = 0`
- validation dataset: `(EventNumber - i + C_val) % N = 0`
- ``run_config.use_testing_dataset`` is ``False``: `k = N` different training/validation splits are returned.
Validation dataset: `(EventNumber - i + C_val) % N = 0`
The size of the validation dataset and (if ``run_config.use_testing_dataset`` is ``True``) the testing dataset
are thus controlled via ``run_config.eventNums_splitting_N`` and the validation and testing set are always of the
same size.
- ``produce_inputs_plots``: controls if ``plot_input_data`` is called.
Parameters
----------
run_config : ModuleType
Reference to the imported run-config file.
input_handler : OPTIMA.builtin.inputs.InputHandler
Instance of the ``InputHandler``-class.
output_dir : Optional[str]
Directory to save the output plots to. (Default value = None)
inputs_for_crossvalidation : bool
If True, k-fold splitting will be performed according to the options given in the run-config. Otherwise, simple
splitting is done. (Default value = False)
disable_printing : bool
If True, no messages will be printed. (Default value = False)
Returns
-------
Union[
tuple[
train_val_test_splitting_type,
train_val_test_splitting_type,
train_val_test_splitting_type,
train_val_test_splitting_type,
],
tuple[
train_val_splitting_type,
train_val_splitting_type,
train_val_splitting_type,
train_val_splitting_type,
],
]
The Ray object references to the split inputs, targets, weights and normalized weights are returned.
"""
# define splitting conditions
if not run_config.use_eventNums_splitting:
if run_config.use_testing_dataset:
splitting_cond_trainVal_test = run_config.test_fraction
splitting_cond_train_val = run_config.validation_fraction
splitting_cond = (splitting_cond_trainVal_test, splitting_cond_train_val)
else:
splitting_cond = run_config.validation_fraction
elif run_config.use_eventNums_splitting and not inputs_for_crossvalidation:
if run_config.use_testing_dataset:
splitting_cond_trainVal_test = (
lambda x: (x + run_config.eventNums_splitting_offset_test) % run_config.eventNums_splitting_N == 0
)
splitting_cond_train_val = (
lambda x: (x + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N == 0
)
splitting_cond = (splitting_cond_trainVal_test, splitting_cond_train_val)
else:
splitting_cond = (
lambda x: (x + run_config.eventNums_splitting_offset_val) % run_config.eventNums_splitting_N == 0
)
elif run_config.use_eventNums_splitting and inputs_for_crossvalidation:
if run_config.use_testing_dataset:
splitting_cond_trainVal_test = partial(
_event_nums_splitting_cond_kfold,
run_config=run_config,
split="test",
use_testing_set=True,
fixed_testing_set=run_config.fixed_testing_dataset,
)
splitting_cond_train_val = partial(
_event_nums_splitting_cond_kfold,
run_config=run_config,
split="val",
use_testing_set=True,
fixed_testing_set=run_config.fixed_testing_dataset,
)
splitting_cond = (splitting_cond_trainVal_test, splitting_cond_train_val)
else:
splitting_cond = partial(
_event_nums_splitting_cond_kfold, run_config=run_config, split="val", use_testing_set=False
)
# load the input data and split it
if hasattr(run_config, "get_inputs"):
get_inputs = run_config.get_inputs
else:
get_inputs = OPTIMA.builtin.inputs.get_inputs
inputs, targets, weights, normalized_weights, event_nums = get_inputs(
run_config,
run_config.max_num_events,
input_handler.get_vars(as_indices=input_handler.as_indices),
disable_printing=disable_printing,
)
# if no input variables were specified in the run_config, we can use the indices of the inputs instead. For that,
# we need to provide all possible indices once to the input handler.
if input_handler.get_vars() is None:
index_slices = [
list(range(i)) for i in inputs[0].shape
] # get the shape of an input and get indices for each dimension
indices_list = list(itertools.product(*index_slices)) # calculate the cross product over the lists of indices
input_handler.set_vars(indices_list, as_indices=True)
if hasattr(run_config, "get_training_data"):
get_training_data = run_config.get_training_data
else:
get_training_data = OPTIMA.builtin.inputs.get_training_data
(
inputs_split,
targets_split,
weights_split,
normalized_weights_split,
) = get_training_data(
inputs,
targets,
weights,
normalized_weights,
splitting_cond,
event_nums=event_nums,
do_kfold=inputs_for_crossvalidation,
fixed_test_dataset=run_config.fixed_testing_dataset,
disable_printing=disable_printing,
)
if run_config.produce_inputs_plots and not inputs_for_crossvalidation:
if not disable_printing:
print("Plotting the input variables...")
if hasattr(run_config, "plot_input_data"):
plot_input_data = run_config.plot_input_data
else:
plot_input_data = OPTIMA.builtin.inputs.plot_input_data
plot_input_data(
run_config,
inputs,
targets,
input_handler.get_vars(),
outdir=os.path.join(output_dir, "inputs"),
weights=weights,
)
# copy the data to the object store
inputs_split = [[ray.put(arr) for arr in e] if isinstance(e, list) else ray.put(e) for e in inputs_split]
targets_split = [[ray.put(arr) for arr in e] if isinstance(e, list) else ray.put(e) for e in targets_split]
weights_split = [[ray.put(arr) for arr in e] if isinstance(e, list) else ray.put(e) for e in weights_split]
normalized_weights_split = [
[ray.put(arr) for arr in e] if isinstance(e, list) else ray.put(e) for e in normalized_weights_split
]
return inputs_split, targets_split, weights_split, normalized_weights_split
Functions
def get_experiment_inputs(run_config: ModuleType, input_handler: OPTIMA.builtin.inputs.InputHandler, output_dir: Optional[str] = None, inputs_for_crossvalidation: bool = False, disable_printing: bool = False) -> Union[tuple[train_val_test_splitting_type, train_val_test_splitting_type, train_val_test_splitting_type, train_val_test_splitting_type], tuple[train_val_splitting_type, train_val_splitting_type, train_val_splitting_type, train_val_splitting_type]] (with train_val_splitting_type and train_val_test_splitting_type as defined at the top of the module)
Prepares the input data used for the training during the optimization and crossvalidation and copies them to Ray's object store.
This function depends on three functions that handle the loading and preprocessing (get_inputs), the splitting of the dataset (get_training_data) and the plotting of the input variables (plot_input_data). If they are not provided in the run-config, the defaults defined in OPTIMA.builtin.inputs are used. The expected behaviour of these three functions is described in the corresponding documentation.
The input data is loaded by calling get_inputs and providing the run_config, the desired number of events and the input variables to include in the dataset. If the list of input variables is not specified in the run-config, None will be provided to get_inputs and it should use all available input variables. get_inputs is expected to return an array of input features, an array of corresponding target labels, a 1D-array of event weights, a 1D-array of normalized event weights and a 1D-array of event numbers. For all arrays, axis 0 is expected to separate different events.
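A hedged sketch of a run-config get_inputs following this contract (the toy data and the parameter names other than nevts and disable_printing are illustrative assumptions, not prescribed by OPTIMA):

import numpy as np

def get_inputs(run_config, nevts, input_vars, disable_printing=False):
    """Toy loader returning the five arrays described above; replace with real data loading."""
    rng = np.random.default_rng(0)
    n = nevts if nevts is not None else 1000                    # desired number of events
    n_vars = len(input_vars) if input_vars is not None else 4   # use all variables if None is given
    inputs = rng.normal(size=(n, n_vars))                       # input features; axis 0 separates events
    targets = rng.integers(0, 2, size=n)                        # corresponding target labels
    weights = rng.uniform(0.5, 1.5, size=n)                     # 1D event weights
    normalized_weights = weights / weights.sum()                # 1D normalized event weights
    event_nums = np.arange(n)                                   # 1D event numbers used for the splitting
    return inputs, targets, weights, normalized_weights, event_nums

In a real run-config this function would read events from the actual dataset and select the columns listed in input_vars; only the signature shape and the five returned arrays follow from the contract above.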
Once loaded, the inputs, targets, weights, normalized weights and event numbers are given to get_training_data to be split into training, validation and (if requested) testing sets. The method of splitting and the sizes of the respective datasets are controlled via the splitting_cond parameter of get_training_data. Its value is controlled by various options discussed below. Depending on the value of inputs_for_crossvalidation, which controls if a simple splitting or a k-fold splitting is to be done and is given to get_training_data as do_kfold, get_training_data is expected to return tuples of arrays (if inputs_for_crossvalidation is False) or tuples of lists of arrays (if inputs_for_crossvalidation is True). Each tuple entry is expected to correspond to a different type of dataset (training/validation/testing), thus is expected to have length 2 if no testing dataset is requested and 3 otherwise. If inputs_for_crossvalidation is True, each list entry is expected to correspond to a different fold, and subsequently all lists are expected to be of the same length.
If run_config.produce_inputs_plots is True and inputs_for_crossvalidation is False, the plot_input_data function is called and the path to a subdirectory 'inputs' in the provided output_dir is given as the directory to save the plots into.
Finally, the numpy arrays returned by get_training_data are copied to Ray's object store and the object references are returned.
The behaviour of this function is controlled by various options that are expected to be present in the run-config (a hedged run-config sketch follows the list below):
- max_num_events: controls the number of events to load from the dataset and will be provided as nevts to get_inputs.
- use_testing_dataset: if True, the dataset will be split into training, validation and testing sets. If False, only a training/validation split will be done.
- use_eventNums_splitting: bool to choose if the dataset should be split randomly or based on the event numbers.
  - False: the options run_config.validation_fraction and (if run_config.use_testing_dataset is True) run_config.test_fraction are provided as splitting_cond to get_training_data.
  - True: depending on inputs_for_crossvalidation, a simple splitting or a k-fold splitting based on the array of event numbers is performed.
    - if inputs_for_crossvalidation is False: a callable evaluating the condition (EventNumber + C_val) % N = 0 and (if use_testing_dataset is True) a callable evaluating (EventNumber + C_test) % N = 0 is/are provided as splitting_cond to get_training_data. C_val and C_test are given by the options run_config.eventNums_splitting_offset_val and run_config.eventNums_splitting_offset_test and N is given by run_config.eventNums_splitting_N.
    - if inputs_for_crossvalidation is True: the callable(s) provided as splitting_cond to get_training_data return(s) a list of boolean arrays when given an array of event numbers. Each list entry corresponds to a fold. The boolean arrays are calculated according to:
      - if run_config.use_testing_dataset is True:
        - if run_config.fixed_testing_dataset is True: a fixed testing dataset is used for all folds while the remaining dataset is used for k-fold splitting to create k = N-1 different training/validation splits.
          - testing dataset: (EventNumber + C_test) % N = 0
          - validation dataset: (EventNumber - i + C_val) % N = 0, with 0 <= i <= N-1 and (-i + C_val) % N != C_test % N
        - if run_config.fixed_testing_dataset is False: the subset of the data used for testing is shifted in the same way as the validation dataset, resulting in k = N different training/validation/testing splits.
          - testing dataset: (EventNumber - i + C_test) % N = 0
          - validation dataset: (EventNumber - i + C_val) % N = 0
      - if run_config.use_testing_dataset is False: k = N different training/validation splits are returned. Validation dataset: (EventNumber - i + C_val) % N = 0
      The size of the validation dataset and (if run_config.use_testing_dataset is True) the testing dataset are thus controlled via run_config.eventNums_splitting_N, and the validation and testing sets are always of the same size.
- produce_inputs_plots: controls if plot_input_data is called.
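A hedged run-config sketch collecting the options above (the option names are taken from this documentation; the values are placeholders only):

# excerpt of a hypothetical run-config module
max_num_events = 100_000                # passed as nevts to get_inputs
use_testing_dataset = True              # training / validation / testing split
use_eventNums_splitting = True          # split by event number instead of randomly
eventNums_splitting_N = 5               # N in the modulo conditions above
eventNums_splitting_offset_val = 0      # C_val
eventNums_splitting_offset_test = 1     # C_test
fixed_testing_dataset = True            # keep the same testing set for all folds
validation_fraction = 0.2               # only used when use_eventNums_splitting is False
test_fraction = 0.2                     # only used when use_eventNums_splitting is False
produce_inputs_plots = True             # call plot_input_data after loading and splitting

With these values and inputs_for_crossvalidation=False, the validation condition selects events with EventNumber % 5 == 0 and the testing condition selects events with (EventNumber + 1) % 5 == 0.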
Parameters

run_config : ModuleType
    Reference to the imported run-config file.
input_handler : InputHandler
    Instance of the InputHandler class.
output_dir : Optional[str]
    Directory to save the output plots to. (Default value = None)
inputs_for_crossvalidation : bool
    If True, k-fold splitting will be performed according to the options given in the run-config. Otherwise, simple splitting is done. (Default value = False)
disable_printing : bool
    If True, no messages will be printed. (Default value = False)
Returns

Union[tuple[train_val_test_splitting_type, train_val_test_splitting_type, train_val_test_splitting_type, train_val_test_splitting_type], tuple[train_val_splitting_type, train_val_splitting_type, train_val_splitting_type, train_val_splitting_type]]
    The Ray object references to the split inputs, targets, weights and normalized weights are returned.
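A hedged usage sketch (how the run-config module and the InputHandler instance are obtained is assumed here, not prescribed by this module):

import ray
import OPTIMA.core.inputs

run_config = ...      # the imported run-config module (e.g. loaded with importlib)
input_handler = ...   # an OPTIMA.builtin.inputs.InputHandler instance, created as in a normal OPTIMA run

ray.init(ignore_reinit_error=True)  # the object store must be available for ray.put

(
    inputs_refs,
    targets_refs,
    weights_refs,
    normalized_weights_refs,
) = OPTIMA.core.inputs.get_experiment_inputs(
    run_config,
    input_handler,
    output_dir="output",               # plots go to output/inputs if produce_inputs_plots is True
    inputs_for_crossvalidation=False,  # simple split; set True to get one list entry per fold
)

# each entry is a Ray ObjectRef (or a list of ObjectRefs per fold for cross-validation inputs)
inputs_train = ray.get(inputs_refs[0])
inputs_val = ray.get(inputs_refs[1])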