Benchmark multi-model/multi-view models.
Source code for mmbench.workflow.predict
# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2022 - 2023
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################
"""
Define the prediction workflows.
"""
# Imports
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from mmbench.color_utils import (
print_title, print_subtitle, print_text, print_result,
print_error)
from mmbench.plotting import plot_bar
def benchmark_pred_exp(dataset, datasetdir, datadir, outdir):
    """ Compare the learned latent space of different models using
    prediction analysis.

    For each metadata column and each latent space, a linear predictor
    (see 'get_predictor') is cross-validated on the train set, refitted on
    the whole train set and scored on the test set.

    Parameters
    ----------
    dataset: str
        the dataset name: euaims or hbn.
    datasetdir: str
        the path to the dataset associated data (not used by this function).
    datadir: str
        the path containing the embedding data.
    outdir: str
        the destination folder, created if it does not exist.

    Notes
    -----
    - The samples are generated with the 'bench-latent' sub-command and are
      read from 'datadir' in two files named
      'latent_vecs_test_<dataset>.npz' and 'latent_vecs_train_<dataset>.npz'
      for the test and train sets, respectively.
    - The associated metadata are read from two other files named
      'latent_meta_test_<dataset>.tsv' and 'latent_meta_train_<dataset>.tsv'.
    - The samples shape is (n_samples, n_subjects, latent_dim). All samples
      must have the same number of samples and subjects, but possibly
      different latent dimensions.
    - The metadata columns must be the same.
    - The following files are written in 'outdir': 'predict.tsv',
      'predict_cv.tsv', 'predict_<dataset>.png' and, when pairwise
      statistics are returned by 'plot_bar', 'predict_pairwise_stats.tsv'.
    """
    print_title("COMPARE MODELS USING REGRESSIONS "
                f"OR CLASSIFICATION WITH ML ANALYSIS: {dataset}")
    # 'makedirs' also creates missing parent folders and tolerates an
    # already existing destination.
    os.makedirs(outdir, exist_ok=True)
    print_text(f"Benchmark directory: {outdir}")

    print_subtitle("Loading data...")
    latent_data_test = _load_latent_vecs(
        os.path.join(datadir, f"latent_vecs_test_{dataset}.npz"))
    latent_data_train = _load_latent_vecs(
        os.path.join(datadir, f"latent_vecs_train_{dataset}.npz"))
    assert (sorted(latent_data_test.keys()) ==
            sorted(latent_data_train.keys())), (
        "latent data must have the same keys")
    assert len(latent_data_test) > 0, "latent data must not be empty."
    meta_df = pd.read_csv(
        os.path.join(datadir, f"latent_meta_test_{dataset}.tsv"), sep="\t")
    meta_df_tr = pd.read_csv(
        os.path.join(datadir, f"latent_meta_train_{dataset}.tsv"), sep="\t")
    assert (sorted(meta_df.columns) == sorted(meta_df_tr.columns)), (
        "metadata must have the same columns.")
    clinical_scores = meta_df_tr.columns

    # Check the inputs once: the training loop below indexes every train and
    # test array with the same range(n_samples).
    n_samples = None
    for latent_key in latent_data_test:
        samples_train = latent_data_train[latent_key]
        samples_test = latent_data_test[latent_key]
        assert samples_train.shape[-1] == samples_test.shape[-1], (
            "The train and test data must be generated by the same model.")
        n_train_samples, _, _ = samples_train.shape
        assert len(samples_test) == n_train_samples, (
            "The train and test data must have the same number of samples.")
        if n_samples is None:
            n_samples = n_train_samples
        assert n_train_samples == n_samples, (
            "All latent data must have the same number of samples.")

    print_subtitle("Train model...")
    predict_results = dict()
    res_cv_list, score_names = [], dict()
    for qname in clinical_scores:
        y_train = meta_df_tr[qname]
        y_test = meta_df[qname]
        # The scorer and its name only depend on the target values.
        _, scorer, score_names[qname] = get_predictor(y_train)
        for latent_key in latent_data_test:
            print_text(f"- {qname} - {latent_key}...")
            res, res_cv = [], []
            samples_train = latent_data_train[latent_key]
            samples_test = latent_data_test[latent_key]
            for idx in tqdm(range(n_samples)):
                # A new unfitted predictor is requested for each sample.
                clf, _, _ = get_predictor(y_train)
                scores = cross_val_score(
                    clf, samples_train[idx], y_train, cv=5, scoring=scorer,
                    n_jobs=-1)
                clf.fit(samples_train[idx], y_train)
                res_cv.append(f"{scores.mean():.2f} +/- {scores.std():.2f}")
                res.append(scorer(clf, samples_test[idx], y_test))
            res_cv_df = pd.DataFrame.from_dict(
                {"model": range(n_samples), "score": res_cv})
            res_cv_df["qname"] = qname
            res_cv_df["latent"] = latent_key
            print(res_cv_df)
            res_cv_list.append(res_cv_df)
            predict_results.setdefault(qname, {})[latent_key] = (
                np.asarray(res))

    # One row per score in 'predict_results', one column per latent space:
    # each cell holds an array of test scores which is exploded in rows.
    predict_df = pd.DataFrame.from_dict(predict_results, orient="index")
    predict_df = pd.concat([predict_df[col].explode() for col in predict_df],
                           axis="columns")
    predict_df.to_csv(os.path.join(outdir, "predict.tsv"), sep="\t",
                      index=False)
    _df = pd.concat(res_cv_list)
    _df.to_csv(os.path.join(outdir, "predict_cv.tsv"), sep="\t",
               index=False)

    print_subtitle("Display statistics...")
    ncols = 3
    nrows = int(np.ceil(len(clinical_scores) / ncols))
    plt.figure(figsize=np.array((ncols, nrows)) * 4)
    pairwise_stats = []
    for idx, qname in enumerate(clinical_scores):
        ax = plt.subplot(nrows, ncols, idx + 1)
        pairwise_stat_df = plot_bar(
            qname, predict_results, ax=ax, figsize=None, dpi=300, fontsize=7,
            fontsize_star=12, fontweight="bold", line_width=2.5,
            marker_size=3, title=qname.upper(),
            do_one_sample_stars=False, palette="Set2",
            yname=score_names[qname])
        if pairwise_stat_df is not None:
            pairwise_stats.append(pairwise_stat_df)
    if pairwise_stats:
        pairwise_stat_df = pd.concat(pairwise_stats)
        pairwise_stat_df.to_csv(
            os.path.join(outdir, "predict_pairwise_stats.tsv"), sep="\t",
            index=False)
    plt.subplots_adjust(
        left=None, bottom=None, right=None, top=None, wspace=.5, hspace=.5)
    plt.suptitle(f"{dataset.upper()} PREDICT RESULTS", fontsize=20, y=.95)
    filename = os.path.join(outdir, f"predict_{dataset}.png")
    plt.savefig(filename)
    print_result(f"PREDICT: {filename}")


def _load_latent_vecs(path):
    """ Load in memory all the arrays stored in a '.npz' archive.

    Parameters
    ----------
    path: str
        the path to the '.npz' file.

    Returns
    -------
    arrays: dict
        the loaded arrays indexed by their archive key, in archive order.
    """
    # The archive is closed on exit and each array is read only once.
    with np.load(path) as archive:
        return {key: archive[key] for key in archive.files}
def get_predictor(data):
    """ Return a classifier and a BAcc metric if the data is of type int or
    str, otherwise a regressor and a MAE metric.

    The decision is made from the values: the data is considered as
    categorical if its first element is a string, or if all its values are
    equal to their integer cast (integer-valued floats included).

    Parameters
    ----------
    data: list
        list of values that will be submitted to a predictor. It must not
        be empty since the first element is inspected.

    Returns
    -------
    predictor: linear_model
        a new unfitted RidgeClassifier or Ridge regressor.
    scorer: callable
        a scorer callable object/function called as
        'scorer(estimator, X, y)' which returns a single value.
    name: str
        the name of the scorer: 'BAcc' or 'MAE'.
    """
    data = np.array(data)
    is_str = isinstance(data[0], str)
    # The integer cast is only attempted on non-string data.
    is_int = (not is_str) and (data - data.astype(int) == 0).all()
    if is_str or is_int:
        predictor = linear_model.RidgeClassifier()
        scorer = metrics.get_scorer("balanced_accuracy")
        name = "BAcc"
    else:
        predictor = linear_model.Ridge(alpha=.5)
        # Build the scorer from the public metric function rather than from
        # a private scorer attribute. With 'greater_is_better=True' the sign
        # is not flipped, so the reported MAE is positive.
        scorer = metrics.make_scorer(metrics.mean_absolute_error,
                                     greater_is_better=True)
        name = "MAE"
    return predictor, scorer, name
Follow us