Module OPTIMA.builtin.evaluation
Provides a collection of classes and functions to evaluate the performance of classifiers.
Expand source code
# -*- coding: utf-8 -*-
"""Provides a collection of classes and functions to evaluate the performance of classifiers."""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import ray
import OPTIMA.core.model
import OPTIMA.core.evaluation
def evaluate(
run_config,
model_path,
inputs_split,
targets_split,
weights_split,
normalized_weights_split,
fig_dir,
native_metrics=None,
weighted_native_metrics=None,
custom_FoMs=None,
class_labels=None,
cpus=1,
results_dir=None,
N_bins=40,
print_results=True,
return_unfilled=False,
ratio=True,
):
"""_summary_.
Parameters
----------
run_config : _type_
_description_
model_path : _type_
_description_
inputs_split : _type_
_description_
targets_split : _type_
_description_
weights_split : _type_
_description_
normalized_weights_split : _type_
_description_
fig_dir : _type_
_description_
native_metrics : _type_
_description_ (Default value = [])
weighted_native_metrics : _type_
_description_ (Default value = [])
custom_FoMs : _type_
_description_ (Default value = [])
class_labels : _type_
_description_ (Default value = None)
cpus : _type_
_description_ (Default value = 1)
results_dir : _type_
_description_ (Default value = None)
N_bins : _type_
_description_ (Default value = 40)
print_results : _type_
_description_ (Default value = True)
return_unfilled : _type_
_description_ (Default value = False)
ratio : _type_
_description_ (Default value = True)
Returns
-------
_type_
_description_
"""
if custom_FoMs is None:
custom_FoMs = []
if weighted_native_metrics is None:
weighted_native_metrics = []
if native_metrics is None:
native_metrics = []
# fetch the inputs from the object store
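# each *_split argument is a tuple / list of Ray object references holding the arrays of the
# (training, validation) or (training, validation, testing) splits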
if len(inputs_split) == 2:
explicit_testing_dataset = False
inputs_train, inputs_val = ray.get(inputs_split)
targets_train, targets_val = ray.get(targets_split)
weights_train, weights_val = ray.get(weights_split)
normalized_weights_train, normalized_weights_val = ray.get(normalized_weights_split)
print(
"testing model using {} training and {} validation events".format(
inputs_train.shape[0], inputs_val.shape[0]
)
)
else:
explicit_testing_dataset = True
inputs_train, inputs_val, inputs_test = ray.get(inputs_split)
targets_train, targets_val, targets_test = ray.get(targets_split)
weights_train, weights_val, weights_test = ray.get(weights_split)
normalized_weights_train, normalized_weights_val, normalized_weights_test = ray.get(normalized_weights_split)
print(
"testing model using {} training, {} validation and {} testing events".format(
inputs_train.shape[0], inputs_val.shape[0], inputs_test.shape[0]
)
)
# create the output folders if they do not exist
if not os.path.exists(fig_dir):
os.makedirs(fig_dir, exist_ok=True)
if results_dir is not None:
if not os.path.exists(results_dir):
os.makedirs(results_dir, exist_ok=True)
# load the model and get the model predictions
model = OPTIMA.core.model.load_model(run_config, model_path, cpus)
pred_train = model.predict(inputs_train, verbose=0)
pred_val = model.predict(inputs_val, verbose=0)
if explicit_testing_dataset:
pred_test = model.predict(inputs_test, verbose=0)
num_outputs = pred_train.shape[1]
# check if we have binary or multiclass classification
if targets_train.shape[1] == 1:
binary_classification = True
num_classes = 2
else:
binary_classification = False
num_classes = targets_train.shape[1]
# for each class, collect the training, validation and test weights, compute scaled weights (each split's weights
# scaled such that their sum equals the total weight of that class across all splits), and split the model
# predictions into the different classes
weights_train_classes = []
scaled_weights_train_classes = []
pred_train_classes = []
weights_val_classes = []
scaled_weights_val_classes = []
pred_val_classes = []
if explicit_testing_dataset:
weights_test_classes = []
scaled_weights_test_classes = []
pred_test_classes = []
total_weight_classes = []
for i in range(num_classes if not binary_classification else 1):
for t in range(
1 if not binary_classification else 0, 2
): # for binary classification, both 0 and 1 are important target values
# get class weights and total weight for this class
weights_train_classes.append(weights_train[targets_train[:, i] == t])
weights_val_classes.append(weights_val[targets_val[:, i] == t])
if explicit_testing_dataset:
weights_test_classes.append(weights_test[targets_test[:, i] == t])
total_weight_classes.append(
np.sum(weights_train_classes[-1])
+ np.sum(weights_val_classes[-1])
+ np.sum(weights_test_classes[-1])
)
else:
total_weight_classes.append(np.sum(weights_train_classes[-1]) + np.sum(weights_val_classes[-1]))
# calculate scaled weights
scaled_weights_train_classes.append(
weights_train_classes[-1] / np.sum(weights_train_classes[-1]) * total_weight_classes[-1]
)
scaled_weights_val_classes.append(
weights_val_classes[-1] / np.sum(weights_val_classes[-1]) * total_weight_classes[-1]
)
if explicit_testing_dataset:
scaled_weights_test_classes.append(
weights_test_classes[-1] / np.sum(weights_test_classes[-1]) * total_weight_classes[-1]
)
# get the model predictions for this class
pred_train_classes.append(pred_train[targets_train[:, i] == t])
pred_val_classes.append(pred_val[targets_val[:, i] == t])
if explicit_testing_dataset:
pred_test_classes.append(pred_test[targets_test[:, i] == t])
# get the ROC curves in the One-vs.-Rest scheme; this only makes sense for binary classification (one output,
# two classes) or multiclass classification (multiple outputs, same number of classes), but not for anything in
# between (e.g. one output, multiple classes)
do_roc = binary_classification or num_classes == num_outputs
if do_roc:
auc_train_classes = []
auc_val_classes = []
if explicit_testing_dataset:
auc_test_classes = []
for i in range(num_classes if not binary_classification else 1):
fig, ax = plt.subplots(figsize=[6, 4.5], layout="constrained")
# ignore negative sample weights
fpr_train, tpr_train, _ = roc_curve(
y_true=targets_train[:, i].ravel()[weights_train > 0],
y_score=pred_train[:, i].ravel()[weights_train > 0],
sample_weight=weights_train[weights_train > 0],
)
fpr_val, tpr_val, _ = roc_curve(
y_true=targets_val[:, i].ravel()[weights_val > 0],
y_score=pred_val[:, i].ravel()[weights_val > 0],
sample_weight=weights_val[weights_val > 0],
)
auc_train = auc(fpr_train, tpr_train)
auc_val = auc(fpr_val, tpr_val)
ax.plot(fpr_train, tpr_train, label=f"training (AUC = {auc_train:.4f})")
ax.plot(fpr_val, tpr_val, label=f"validation (AUC = {auc_val:.4f})")
if explicit_testing_dataset:
fpr_test, tpr_test, _ = roc_curve(
y_true=targets_test[:, i].ravel()[weights_test > 0],
y_score=pred_test[:, i].ravel()[weights_test > 0],
sample_weight=weights_test[weights_test > 0],
)
auc_test = auc(fpr_test, tpr_test)
ax.plot(fpr_test, tpr_test, label=f"testing (AUC = {auc_test:.4f})")
ax.set_xlim((0.0, 1.0))
ax.set_ylim((0.0, 1.0))
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
class_name = class_labels[i] if class_labels is not None else f"Class {i}"
ax.set_title(
"Receiver Operating Characteristic" + (f" ({class_name} vs. Rest)" if not binary_classification else "")
)
ax.legend()
fig.savefig(
os.path.join(fig_dir, f"ROC_{class_name}_vs_rest.pdf" if not binary_classification else "ROC.pdf")
)
auc_train_classes.append(auc_train)
auc_val_classes.append(auc_val)
if explicit_testing_dataset:
auc_test_classes.append(auc_test)
def _weighted_hists_with_uncertainty(arrays_tuple, weights_tuple, N_bins, hist_range=None, stacked=False):
"""_summary_.
Parameters
----------
arrays_tuple : _type_
_description_
weights_tuple : _type_
_description_
N_bins : _type_
_description_
hist_range : _type_
_description_ (Default value = None)
stacked : _type_
_description_ (Default value = False)
Returns
-------
_type_
_description_
"""
bin_contents, bin_edges, _ = plt.hist(arrays_tuple, bins=N_bins, range=hist_range, weights=weights_tuple)
plt.clf()
uncertainties = []
if not stacked:
for values, weights, _ in zip(arrays_tuple, weights_tuple, bin_contents):
# get the index of the bin each value belongs in; then expand the array of bin indices along a new axis and
# subtract the new-axis index from the values so that each bin index is zero in the corresponding subarray.
# Convert to a boolean array by checking which entries are zero.
conditions_array = (
pd.cut(values, bins=bin_edges, labels=False)
- np.linspace(0, N_bins, N_bins, endpoint=False, dtype=int).reshape((N_bins, 1))
) == 0
# tile the weight array to get the same shape as the conditions array; assumes 1D weights
weights_tiled = np.tile(weights, reps=N_bins).reshape((N_bins, weights.shape[0]))
# calculate uncertainties by replacing all values in weights_tiled with weights_tiled^2 where conditions_array
# is True, else replace with zero. then calculate sum along the values axis to get the uncertainty for this bin
delta = np.sqrt(
np.sum(np.where(conditions_array, np.square(weights_tiled), np.zeros_like(weights_tiled)), axis=1)
)
uncertainties.append(delta)
else:
# only the final entry in bin_contents should have errors, which then come from all entries together
uncertainties = [np.zeros_like(b) for b in bin_contents[:-1]]
values = np.concatenate(arrays_tuple)
weights = np.concatenate(weights_tuple)
conditions_array = (
pd.cut(values, bins=bin_edges, labels=False)
- np.linspace(0, N_bins, N_bins, endpoint=False, dtype=int).reshape((N_bins, 1))
) == 0
weights_tiled = np.tile(weights, reps=N_bins).reshape((N_bins, weights.shape[0]))
delta = np.sqrt(
np.sum(np.where(conditions_array, np.square(weights_tiled), np.zeros_like(weights_tiled)), axis=1)
)
uncertainties.append(delta)
return bin_contents, bin_edges, uncertainties
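# Note: for each contribution, the per-bin uncertainty computed above is the usual weighted Poisson error,
# delta_bin = sqrt(sum_i w_i^2) over the entries i that fall into the bin. An equivalent (and usually cheaper)
# formulation would be a second histogram filled with the squared weights, e.g.
# np.sqrt(np.histogram(values, bins=bin_edges, weights=np.square(weights))[0]).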
def _draw_hist_with_ratios(
bin_edges,
bin_contents_with_type,
uncertainties,
ratio_refs,
ratio_refs_uncertainties,
text_boxes,
colors,
colors_errors,
title,
x_label,
y_labels,
legend_labels,
x_range=None,
figpath=None,
stacked=False,
):
"""_summary_.
Parameters
----------
bin_edges : _type_
_description_
bin_contents_with_type : _type_
_description_
uncertainties : _type_
_description_
ratio_refs : _type_
_description_
ratio_refs_uncertainties : _type_
_description_
text_boxes : _type_
_description_
colors : _type_
_description_
colors_errors : _type_
_description_
title : _type_
_description_
x_label : _type_
_description_
y_labels : _type_
_description_
legend_labels : _type_
_description_
x_range : _type_
_description_ (Default value = None)
figpath : _type_
_description_ (Default value = None)
stacked : _type_
_description_ (Default value = False)
Returns
-------
_type_
_description_
"""
fig, axs = plt.subplots(
1 + len(ratio_refs),
1,
gridspec_kw={"height_ratios": [4] + [1] * len(ratio_refs)},
sharex="col",
layout="constrained",
)
if not isinstance(axs, np.ndarray):
axs = np.array([axs])
fig.set_figheight(4.5 + 0.6 * len(ratio_refs))
fig.set_figwidth(5.5)
cumsum = np.append(np.zeros_like(uncertainties[0]), 0.0)
plot_objects = []
error_objects = []
hatch_linewidth_before = plt.rcParams["hatch.linewidth"]
plt.rcParams["hatch.linewidth"] = 0.6
for i, (bin_content_with_type, delta) in enumerate(zip(bin_contents_with_type, uncertainties)):
content_type, ratio_ref_index, bin_content = bin_content_with_type
bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2
bin_content_extended = np.append(bin_content, bin_content[-1])
delta_extended = np.append(delta, delta[-1])
if content_type == "scatter":
plot_objects.append(
axs[0].scatter(bin_centers, bin_content, color=colors[i], s=10, label=legend_labels[i])
)
if (delta > 0).any():
error_objects.append(
axs[0].errorbar(
bin_centers,
bin_content,
xerr=1 / (2 * N_bins),
yerr=delta,
color=colors_errors[i],
ls="none",
linewidth=0.8,
)
)
if len(ratio_refs) > 0:
ratio_ref = ratio_refs[ratio_ref_index]
ratio_ref_uncertainty = ratio_refs_uncertainties[ratio_ref_index]
axs[1 + ratio_ref_index].scatter(
bin_centers[ratio_ref != 0],
bin_content[ratio_ref != 0] / ratio_ref[ratio_ref != 0],
color=colors[i],
s=10,
)
if (delta > 0).any():
axs[1 + ratio_ref_index].errorbar(
bin_centers[ratio_ref != 0],
bin_content[ratio_ref != 0] / ratio_ref[ratio_ref != 0],
xerr=1 / (2 * N_bins),
yerr=np.sqrt(
(delta[ratio_ref != 0] / ratio_ref[ratio_ref != 0]) ** 2
+ (
bin_content[ratio_ref != 0]
/ ratio_ref[ratio_ref != 0] ** 2
* ratio_ref_uncertainty[ratio_ref != 0]
)
** 2
), # assume the two hists are independent
color=colors_errors[i],
ls="none",
linewidth=0.8,
)
else:
if not stacked:
this_value = bin_content_extended
plot_objects.append(
axs[0].step(
bin_edges, this_value, where="post", color=colors[i], linewidth=0.8, label=legend_labels[i]
)
)
else:
this_value = cumsum + bin_content_extended
plot_objects.append(
axs[0].fill_between(
bin_edges,
cumsum,
this_value,
step="post",
alpha=1.0,
facecolor=colors[i],
linewidth=0.8,
label=legend_labels[i],
)
)
cumsum = this_value
if (delta_extended > 0).any():
error_objects.append(
axs[0].fill_between(
bin_edges,
this_value - delta_extended,
this_value + delta_extended,
step="post",
alpha=1.0,
hatch="///////",
facecolor="none",
edgecolor=colors_errors[i],
linewidth=0.0,
)
)
if len(ratio_refs) > 0:
ratio_ref_extended = np.append(ratio_refs[ratio_ref_index], ratio_refs[ratio_ref_index][-1])
ratio_ref_uncertainties_extended = np.append(
ratio_refs_uncertainties[ratio_ref_index], ratio_refs_uncertainties[ratio_ref_index][-1]
)
axs[1 + ratio_ref_index].step(
bin_edges[ratio_ref_extended != 0],
bin_content_extended[ratio_ref_extended != 0] / ratio_ref_extended[ratio_ref_extended != 0],
where="post",
color=colors[i],
linewidth=0.8,
)
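# propagate the uncertainties to the ratio assuming independent numerator and denominator:
# delta(a / b) = sqrt((delta_a / b)^2 + (a * delta_b / b^2)^2)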
delta_ratio = np.sqrt(
(delta_extended[ratio_ref_extended != 0] / ratio_ref_extended[ratio_ref_extended != 0]) ** 2
+ (
bin_content_extended[ratio_ref_extended != 0]
/ ratio_ref_extended[ratio_ref_extended != 0] ** 2
* ratio_ref_uncertainties_extended[ratio_ref_extended != 0]
)
** 2
)
if (delta_extended > 0).any():
axs[1 + ratio_ref_index].fill_between(
bin_edges[ratio_ref_extended != 0],
bin_content_extended[ratio_ref_extended != 0] / ratio_ref_extended[ratio_ref_extended != 0]
- delta_ratio,
bin_content_extended[ratio_ref_extended != 0] / ratio_ref_extended[ratio_ref_extended != 0]
+ delta_ratio,
step="post",
alpha=1.0,
hatch="///////",
facecolor="none",
edgecolor=colors_errors[i],
linewidth=0.0,
)
# error bar only for legend
axs[0].fill_between(
[],
[],
[],
step="post",
alpha=1.0,
hatch="///////",
facecolor="none",
edgecolor="black",
linewidth=0.0,
label="Stat. Unc.",
)
# text boxes on axes
for i, text in enumerate(text_boxes):
if i > len(ratio_refs):
break
# check if empty string
if not text:
continue
props = dict(alpha=0.0)
# place a text box in upper left
axs[i].text(
0.01, 1.01, text, transform=axs[i].transAxes, fontsize=8, verticalalignment="bottom", bbox=props
)
if x_range is not None:
axs[0].set_xlim(x_range)
axs[0].set_ylim(bottom=0)
for i in range(1, len(ratio_refs) + 1):
axs[i].set_ylabel(y_labels[i])
axs[i].set_ylim((0.5, 1.6))
# legend in two columns if more than three labels
handles, labels = axs[0].get_legend_handles_labels()
n_labels = len(handles)
if n_labels > 3:
handles = np.concatenate((handles[::2], handles[1::2]), axis=0)
labels = np.concatenate((labels[::2], labels[1::2]), axis=0)
axs[0].legend(handles, labels, loc=1, ncol=2)
else:
axs[0].legend(loc=1)
# set y-limit to fit the legend
ax_ylim = axs[0].get_ylim()[1]
scale_ylim = 0.1
scale_ylim *= n_labels if n_labels <= 3 else (n_labels / 2 + n_labels % 2)
axs[0].set_ylim(top=ax_ylim * (1 + scale_ylim))
axs[0].set_ylabel(y_labels[0])
axs[-1].set_xlabel(x_label)
fig.suptitle(title)
if figpath is not None:
plt.savefig(figpath, dpi=600)
else:
fig.set_dpi(300)
plt.show()
plt.rcParams["hatch.linewidth"] = hatch_linewidth_before
# create stacked histograms for each of the DNN outputs with all classes
# first define tuples of predictions and corresponding weights for both the stacked and the normalized histogram.
# For the stacked histogram, the order in the tuple defines the order of the contributions in the stack, from the
# bottom upwards.
for i in range(num_outputs):
# get the predictions of output i for all classes
if not explicit_testing_dataset:
pred_i = [p[:, i] for p in pred_val_classes]
else:
pred_i = [p[:, i] for p in pred_test_classes]
# create the stacked histogram with poisson uncertainties for each bin
bin_contents, bin_edges, uncertainties = _weighted_hists_with_uncertainty(
pred_i,
weights_tuple=scaled_weights_val_classes if not explicit_testing_dataset else scaled_weights_test_classes,
N_bins=N_bins,
# range=(0, 1),
stacked=True,
)
# draw the histogram
# for each contribution to the histogram, we need to provide the type of contribution (step or scatter), the
# index of the reference to use for the ratio subplot, and the bin content itself.
bin_contents_with_type = zip(["step"] * num_classes, [None] * num_classes, bin_contents)
if (num_classes <= 10 and not explicit_testing_dataset) or num_classes <= 5:
colors = sns.color_palette()
else:
colors = sns.color_palette("husl", num_classes if not explicit_testing_dataset else 2 * num_classes)
colors_errors = [None] * (num_classes - 1) + ["0.4"] # only the uppermost contribution should have error bars
if class_labels is not None:
legend_labels = class_labels if not binary_classification else class_labels[::-1]
else:
legend_labels = (
[f"Class {k}" for k in range(num_classes)] if not binary_classification else ["Background", "Signal"]
)
_draw_hist_with_ratios(
bin_edges,
bin_contents_with_type,
uncertainties,
ratio_refs=[],
ratio_refs_uncertainties=[],
text_boxes=[],
colors=colors,
colors_errors=colors_errors,
legend_labels=legend_labels,
title=f"Neural Network Output {i} (scaled)" if num_outputs > 1 else "Neural Network Output (scaled)",
x_label="DNN Output",
y_labels=["Events"],
x_range=(bin_edges[0], bin_edges[-1]),
stacked=True,
figpath=os.path.join(fig_dir, f"DNN_output_{i}.pdf" if num_outputs > 1 else "DNN_output.pdf"),
)
plt.clf()
# create normalized histograms for each of the DNN outputs with all classes
# get the normalized event weights
weights_normalized = [w / np.sum(w) for w in weights_train_classes + weights_val_classes]
if explicit_testing_dataset:
weights_normalized += [w / np.sum(w) for w in weights_test_classes]
for i in range(num_outputs):
# get the predictions of output i for all classes and the normalized weights
pred_i = [p[:, i] for p in pred_train_classes + pred_val_classes]
if explicit_testing_dataset:
pred_i += [p[:, i] for p in pred_test_classes]
# get the histogram for each class with Poisson uncertainties
bin_contents_normal, bin_edges_normal, uncertainties_normal = _weighted_hists_with_uncertainty(
pred_i,
weights_tuple=weights_normalized,
N_bins=N_bins,
# range=(0, 1)
)
# construct the bin contents with type. We want the training predictions to be drawn as scatter points and the
# validation and test predictions as steps. For all contributions, the training prediction of the same class is
# used as the reference for the ratio.
bin_contents_normal_with_type = [("scatter", i, bin_contents_normal[i]) for i in range(num_classes)]
bin_contents_normal_with_type += [("step", i, bin_contents_normal[num_classes + i]) for i in range(num_classes)]
if explicit_testing_dataset:
bin_contents_normal_with_type += [
("step", i, bin_contents_normal[int(2 * num_classes) + i]) for i in range(num_classes)
]
# we can choose the training and validation components of each class to have the same color and choose the testing
# colors to be different
colors_normal = 2 * [colors[i] for i in range(num_classes)]
if explicit_testing_dataset:
colors_normal += [colors[num_classes + i] for i in range(num_classes)]
colors_errors_normal = colors_normal
# build the labels for each contribution. Again, we need to distinguish binary from multiclass classification.
# The order of the labels needs to match the order of contributions in bin_contents_normal_with_type.
legend_labels = []
for phase in ["training", "validation", "testing"] if explicit_testing_dataset else ["training", "validation"]:
for k in range(num_classes):
if not binary_classification:
if class_labels is not None:
legend_labels.append(f"{class_labels[k]} ({phase})")
else:
legend_labels.append(f"Class {k} ({phase})")
else:
if class_labels is not None:
legend_labels.append(f"{class_labels[-(k+1)]} ({phase})")
else:
c = ["Background", "Signal"][k]
legend_labels.append(f"{c} ({phase})")
# build the titles for the ratio plots. We need to provide a title for the main plot as well, but we can leave
# that blank
if class_labels is not None:
ratio_titles = [""] + class_labels
else:
ratio_titles = [""] + (
[f"Class {k}" for k in range(num_classes)] if not binary_classification else ["Background", "Signal"]
)
_draw_hist_with_ratios(
bin_edges_normal,
bin_contents_normal_with_type,
uncertainties_normal,
ratio_refs=bin_contents_normal[:num_classes] if ratio else [], # training for ratio
ratio_refs_uncertainties=uncertainties_normal[:num_classes] if ratio else [],
text_boxes=ratio_titles,
colors=colors_normal,
colors_errors=colors_errors_normal,
legend_labels=legend_labels,
title=f"Neural Network Output {i}" if not binary_classification else "Neural Network Output",
x_label="DNN Output",
y_labels=["Normalized Prediction"] + ["Pred. / Train"] * num_classes,
x_range=(bin_edges_normal[0], bin_edges_normal[-1]),
figpath=os.path.join(
fig_dir, f"DNN_output_{i}_normalized.pdf" if not binary_classification else "DNN_output_normalized.pdf"
),
)
# write AUC values to results string
results_string = ""
results_string_args = []
if do_roc:
for i in range(num_classes if not binary_classification else 1):
if binary_classification:
results_string += " AUC (training): {:.4f}\n"
results_string += " AUC (validation): {:.4f}\n"
if explicit_testing_dataset:
results_string += " AUC (testing): {:.4f}\n"
else:
results_string += (
f" AUC ({class_labels[i]} vs. rest):\n"
if class_labels is not None
else f" AUC (class {i} vs. rest):\n"
)
results_string += "\ttraining: {:.4f}\n"
results_string += "\tvalidation: {:.4f}\n"
if explicit_testing_dataset:
results_string += "\ttesting: {:.4f}\n"
results_string_args += [auc_train_classes[i], auc_val_classes[i]]
if explicit_testing_dataset:
results_string_args.append(auc_test_classes[i])
# loss
results_string += " Loss:\n"
train_loss = model.loss(
inputs=inputs_train, y_true=targets_train, sample_weight=normalized_weights_train, y_pred=pred_train
)
val_loss = model.loss(inputs=inputs_val, y_true=targets_val, sample_weight=normalized_weights_val, y_pred=pred_val)
if explicit_testing_dataset:
test_loss = model.loss(
inputs=inputs_test, y_true=targets_test, sample_weight=normalized_weights_test, y_pred=pred_test
)
results_string += "\ttraining: {}\n".format("{:.3f}")
results_string += "\tvalidation: {}\n".format("{:.3f}")
if explicit_testing_dataset:
results_string += "\ttesting: {}\n".format("{:.3f}")
results_string_args += [train_loss, val_loss, test_loss] if explicit_testing_dataset else [train_loss, val_loss]
if native_metrics != []:
# instantiate native metrics
native_metrics = [(name, metric(**kwargs)) for name, (metric, kwargs) in native_metrics]
results_string += " Native metrics:\n"
for metric_name, metric in native_metrics:
metric_value_train = OPTIMA.core.evaluation.calc_native_metric(
run_config, metric, targets_train, pred_train
)
metric_value_val = OPTIMA.core.evaluation.calc_native_metric(run_config, metric, targets_val, pred_val)
if explicit_testing_dataset:
metric_value_test = OPTIMA.core.evaluation.calc_native_metric(
run_config, metric, targets_test, pred_test
)
results_string += "\t{} (training): {}\n".format(metric_name, "{:.3f}")
results_string += "\t{} (validation): {}\n".format(metric_name, "{:.3f}")
if explicit_testing_dataset:
results_string += "\t{} (testing): {}\n".format(metric_name, "{:.3f}")
results_string_args += (
[metric_value_train, metric_value_val, metric_value_test]
if explicit_testing_dataset
else [metric_value_train, metric_value_val]
)
if weighted_native_metrics != []:
# instantiate weighted native metrics
weighted_native_metrics = [(name, metric(**kwargs)) for name, (metric, kwargs) in weighted_native_metrics]
results_string += " Weighted native metrics:\n"
for metric_name, metric in weighted_native_metrics:
metric_value_train = OPTIMA.core.evaluation.calc_native_metric(
run_config, metric, targets_train, pred_train, sample_weight=normalized_weights_train
)
metric_value_val = OPTIMA.core.evaluation.calc_native_metric(
run_config, metric, targets_val, pred_val, sample_weight=normalized_weights_val
)
if explicit_testing_dataset:
metric_value_test = OPTIMA.core.evaluation.calc_native_metric(
run_config, metric, targets_test, pred_test, sample_weight=normalized_weights_test
)
results_string += "\t{} (training): {}\n".format(metric_name, "{:.3f}")
results_string += "\t{} (validation): {}\n".format(metric_name, "{:.3f}")
if explicit_testing_dataset:
results_string += "\t{} (testing): {}\n".format(metric_name, "{:.3f}")
results_string_args += (
[metric_value_train, metric_value_val, metric_value_test]
if explicit_testing_dataset
else [metric_value_train, metric_value_val]
)
if custom_FoMs != []:
results_string += " Custom metrics:\n"
for FoM_name, FoM_func in custom_FoMs:
FoM_value_train = FoM_func(targets_train, pred_train, sample_weight=normalized_weights_train)
FoM_value_val = FoM_func(targets_val, pred_val, sample_weight=normalized_weights_val)
if explicit_testing_dataset:
FoM_value_test = FoM_func(targets_test, pred_test, sample_weight=normalized_weights_test)
results_string += "\t{} (training): {}\n".format(FoM_name, "{:.3f}")
results_string += "\t{} (validation): {}\n".format(FoM_name, "{:.3f}")
if explicit_testing_dataset:
results_string += "\t{} (testing): {}\n".format(FoM_name, "{:.3f}")
results_string_args += (
[FoM_value_train, FoM_value_val, FoM_value_test]
if explicit_testing_dataset
else [FoM_value_train, FoM_value_val]
)
if print_results:
print(results_string.format(*results_string_args))
if results_dir is not None:
with open(os.path.join(results_dir, "results_eval.txt"), "w") as results_file:
results_file.write(results_string.format(*results_string_args))
if not return_unfilled:
return results_string.format(*results_string_args)
else:
results_string = results_string.replace("{:.3f}", "{}").replace("{:.4f}", "{}")
return results_string, results_string_args
Functions
def evaluate(run_config, model_path, inputs_split, targets_split, weights_split, normalized_weights_split, fig_dir, native_metrics=None, weighted_native_metrics=None, custom_FoMs=None, class_labels=None, cpus=1, results_dir=None, N_bins=40, print_results=True, return_unfilled=False, ratio=True)
Evaluate a trained classifier: produces ROC curves, output histograms and a summary of the metric values.
Parameters
run_config : run configuration, passed through to OPTIMA.core.model.load_model and OPTIMA.core.evaluation.calc_native_metric
model_path : path of the saved model to load and evaluate
inputs_split : tuple of Ray object references to the input features of the (training, validation) or (training, validation, testing) splits
targets_split : tuple of Ray object references to the corresponding target arrays
weights_split : tuple of Ray object references to the corresponding event weights
normalized_weights_split : tuple of Ray object references to the corresponding normalized event weights
fig_dir : directory the figures (ROC curves and output histograms) are saved to
native_metrics : list of (name, (metric_class, kwargs)) tuples of unweighted native metrics (Default value = None)
weighted_native_metrics : list of (name, (metric_class, kwargs)) tuples of native metrics evaluated with the normalized event weights as sample weights (Default value = None)
custom_FoMs : list of (name, callable) tuples of custom figures of merit, evaluated as callable(targets, predictions, sample_weight=normalized_weights) (Default value = None)
class_labels : optional list of class names used in plot titles, legends and the results summary (Default value = None)
cpus : number of CPUs made available when loading the model (Default value = 1)
results_dir : if not None, the results summary is additionally written to results_eval.txt in this directory (Default value = None)
N_bins : number of bins of the output histograms (Default value = 40)
print_results : whether to print the results summary (Default value = True)
return_unfilled : if True, return the unformatted results string together with the list of values to fill in (Default value = False)
ratio : whether to add ratio subplots (prediction / training) to the normalized output histograms (Default value = True)
Returns
The formatted results summary as a string, or a tuple of the unfilled results string and the list of values if return_unfilled is True.
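A minimal usage sketch with toy data is shown below (hypothetical run-config module, model path and data shapes; the saved model must be loadable by OPTIMA.core.model.load_model and expose the predict and loss interface used in evaluate):
import numpy as np
import ray

import my_run_config as run_config  # hypothetical run configuration module
from OPTIMA.builtin.evaluation import evaluate

ray.init(ignore_reinit_error=True)

# toy binary-classification data: (training, validation) splits, one target column
rng = np.random.default_rng(0)
inputs = [rng.normal(size=(n, 5)).astype(np.float32) for n in (800, 200)]
targets = [rng.integers(0, 2, size=(n, 1)).astype(np.float32) for n in (800, 200)]
weights = [np.ones(n, dtype=np.float32) for n in (800, 200)]
normalized_weights = [w / w.sum() for w in weights]

results = evaluate(
    run_config,
    model_path="path/to/saved_model",  # hypothetical path of a previously trained model
    inputs_split=[ray.put(x) for x in inputs],
    targets_split=[ray.put(y) for y in targets],
    weights_split=[ray.put(w) for w in weights],
    normalized_weights_split=[ray.put(w) for w in normalized_weights],
    fig_dir="figures",
    class_labels=["Signal", "Background"],  # hypothetical labels
    results_dir="results",
)
print(results)
The ROC curve and output histograms are written to fig_dir; since results_dir is given, the metric summary is additionally saved there as results_eval.txt. Native metrics, weighted native metrics and custom figures of merit would be passed in the formats described in the parameter list above.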