Benchmark multi-model/multi-view models.

Source code for mmbench.workflow.predict

# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2022 - 2023
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################

"""
Define the prediction workflows.
"""
# Imports
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from mmbench.color_utils import (
    print_title, print_subtitle, print_text, print_result,
    print_error)
from mmbench.plotting import plot_bar


def benchmark_pred_exp(dataset, datasetdir, datadir, outdir):
    """ Compare the learned latent space of different models using
    prediction analysis.

    Parameters
    ----------
    dataset: str
        the dataset name: euaims or hbn.
    datasetdir: str
        the path to the dataset associated data (not used by this
        function).
    datadir: str
        the path containing the embedding data.
    outdir: str
        the destination folder, created if it does not exist.

    Raises
    ------
    ValueError
        if no latent data is available, or if the latent arrays do not all
        share the same number of samples.

    Notes
    -----
    - The samples are generated with the 'bench-latent' sub-command and are
      read from 'datadir' in two files named
      'latent_vecs_test_<dataset>.npz' and 'latent_vecs_train_<dataset>.npz'
      for the test and train sets, respectively.
    - The associated metadata are read from two others files named
      'latent_meta_test_<dataset>.tsv' and
      'latent_meta_train_<dataset>.tsv'.
    - The samples shape is (n_samples, n_subjects, latent_dim). All samples
      must have the same number of samples and subjects, but possibly
      different latent dimensions.
    - The metadata columns must be the same.
    - The results are written in 'outdir': 'predict.tsv', 'predict_cv.tsv',
      'predict_pairwise_stats.tsv' (only when pairwise statistics are
      returned by the plotting function) and 'predict_<dataset>.png'.
    """
    print_title("COMPARE MODELS USING REGRESSIONS "
                f"OR CLASSIFICATION WITH ML ANALYSIS: {dataset}")
    # 'makedirs' also creates missing parent folders and tolerates an
    # already existing destination.
    os.makedirs(outdir, exist_ok=True)
    print_text(f"Benchmark directory: {outdir}")

    print_subtitle("Loading data...")
    # Load every array once: indexing an opened npz archive reads the array
    # from disk at each access, and the archive keeps a file handle opened
    # until it is closed.
    with np.load(os.path.join(
            datadir, f"latent_vecs_test_{dataset}.npz")) as archive:
        latent_data_test = {key: archive[key] for key in archive.files}
    with np.load(os.path.join(
            datadir, f"latent_vecs_train_{dataset}.npz")) as archive:
        latent_data_train = {key: archive[key] for key in archive.files}
    assert (sorted(latent_data_test.keys()) ==
            sorted(latent_data_train.keys())), (
                "latent data must have the same keys")
    if len(latent_data_test) == 0:
        raise ValueError("No latent data found in the input archives.")
    meta_df = pd.read_csv(
        os.path.join(datadir, f"latent_meta_test_{dataset}.tsv"), sep="\t")
    meta_df_tr = pd.read_csv(
        os.path.join(datadir, f"latent_meta_train_{dataset}.tsv"), sep="\t")
    assert (sorted(meta_df.columns) == sorted(meta_df_tr.columns)), (
        "metadata must have the same columns.")
    clinical_scores = meta_df_tr.columns
    predict_results = dict()
    n_samples = None
    for latent_key, samples_test in latent_data_test.items():
        samples = latent_data_train[latent_key]
        assert samples.shape[-1] == samples_test.shape[-1], (
            "The train and test data must be generated by the same model.")
        n_models, _, _ = samples.shape
        if n_samples is None:
            n_samples = n_models
        # The training loop below indexes every train and test array with
        # the same range: enforce the documented shape requirement here
        # rather than failing (or silently truncating) later.
        if n_models != n_samples or len(samples_test) != n_samples:
            raise ValueError(
                "All latent data must have the same number of samples: "
                f"got a mismatch for '{latent_key}'.")

    print_subtitle("Train model...")
    res_cv_list, sname = [], []
    for qname in clinical_scores:
        y_train = meta_df_tr[qname]
        y_test = meta_df[qname]
        # The metric name only depends on the score values: resolve it once
        # per score instead of relying on a variable set in the inner loop.
        _, _, name = get_predictor(y_train)
        sname.append(name)
        for latent_key in latent_data_test:
            print_text(f"- {qname} - {latent_key}...")
            res, res_cv = [], []
            samples_train = latent_data_train[latent_key]
            samples_test = latent_data_test[latent_key]
            for idx in tqdm(range(n_samples)):
                # A fresh estimator is created for each sample.
                clf, scorer, _ = get_predictor(y_train)
                scores = cross_val_score(
                    clf, samples_train[idx], y_train, cv=5, scoring=scorer,
                    n_jobs=-1)
                clf.fit(samples_train[idx], y_train)
                res_cv.append(f"{scores.mean():.2f} +/- {scores.std():.2f}")
                res.append(scorer(clf, samples_test[idx], y_test))
            res_cv_df = pd.DataFrame.from_dict(
                {"model": range(n_samples), "score": res_cv})
            res_cv_df["qname"] = qname
            res_cv_df["latent"] = latent_key
            print(res_cv_df)
            res_cv_list.append(res_cv_df)
            predict_results.setdefault(qname, {})[latent_key] = np.asarray(res)
    # One row per score and one column per latent key, each cell holding the
    # array of test scores; 'explode' then yields one row per (score, sample).
    predict_df = pd.DataFrame.from_dict(predict_results, orient="index")
    predict_df = pd.concat([predict_df[col].explode() for col in predict_df],
                           axis="columns")
    # NOTE(review): 'index=False' drops the score names (the frame index)
    # from 'predict.tsv' - confirm that downstream readers do not need them.
    predict_df.to_csv(os.path.join(outdir, "predict.tsv"), sep="\t",
                      index=False)
    _df = pd.concat(res_cv_list)
    _df.to_csv(os.path.join(outdir, "predict_cv.tsv"), sep="\t",
               index=False)

    print_subtitle("Display statistics...")
    ncols = 3
    nrows = int(np.ceil(len(clinical_scores) / ncols))
    plt.figure(figsize=np.array((ncols, nrows)) * 4)
    pairwise_stats = []
    for idx, qname in enumerate(clinical_scores):
        ax = plt.subplot(nrows, ncols, idx + 1)
        pairwise_stat_df = plot_bar(
            qname, predict_results, ax=ax, figsize=None, dpi=300, fontsize=7,
            fontsize_star=12, fontweight="bold", line_width=2.5,
            marker_size=3, title=qname.upper(),
            do_one_sample_stars=False, palette="Set2", yname=sname[idx])
        if pairwise_stat_df is not None:
            pairwise_stats.append(pairwise_stat_df)
    if len(pairwise_stats) > 0:
        pairwise_stat_df = pd.concat(pairwise_stats)
        pairwise_stat_df.to_csv(
            os.path.join(outdir, "predict_pairwise_stats.tsv"), sep="\t",
            index=False)
    plt.subplots_adjust(
        left=None, bottom=None, right=None, top=None, wspace=.5, hspace=.5)
    plt.suptitle(f"{dataset.upper()} PREDICT RESULTS", fontsize=20, y=.95)
    filename = os.path.join(outdir, f"predict_{dataset}.png")
    plt.savefig(filename)
    print_result(f"PREDICT: {filename}")


def get_predictor(data):
    """ Return a classifier and a BAcc metric if the data is of type bool,
    int or str, otherwise a regressor and a MAE metric.

    Floating point data holding only finite whole numbers are considered as
    integers.

    Parameters
    ----------
    data: list
        list of value that will be submitted to a predictor.

    Returns
    -------
    predictor: linear_model
        A classifier or a regressor.
    scorer: callable
        a scorer callable object/function with signature which returns a
        single value.
    name: str
        the name of the scorer.

    Raises
    ------
    ValueError
        if 'data' is empty.
    """
    data = np.asarray(data)
    if data.size == 0:
        raise ValueError("Unable to select a predictor from empty data.")
    if isinstance(data.flat[0], str) or data.dtype.kind in "biu":
        # Strings, booleans and (un)signed integers are class labels.
        # Booleans are handled here because numpy does not support the '-'
        # operator between boolean arrays.
        is_categorical = True
    else:
        values = data.astype(float)
        # Non-finite values (NaN, inf) cannot be labels: checking them first
        # avoids an undefined float to integer cast.
        is_categorical = bool(
            np.isfinite(values).all() and
            (values == np.floor(values)).all())
    if is_categorical:
        predictor = linear_model.RidgeClassifier()
        scorer = metrics.get_scorer("balanced_accuracy")
        name = "BAcc"
    else:
        predictor = linear_model.Ridge(alpha=.5)
        # 'greater_is_better=True' keeps the sign of the metric, so the
        # scorer returns the plain (positive) MAE rather than its negated
        # value.
        scorer = metrics.make_scorer(metrics.mean_absolute_error,
                                     greater_is_better=True)
        name = "MAE"
    return predictor, scorer, name