%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the participant tables of both splits and tag each row with the split
# it came from, so that the two can be compared once stacked together.
participants_train = pd.read_csv(os.path.join("data", "train_participants.csv"))
participants_train["set"] = 'train'
participants_test = pd.read_csv(os.path.join("data", "test_participants.csv"))
participants_test["set"] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows the same way (original indices are kept).
participants = pd.concat([participants_train, participants_test])

# Age distribution per split, first as a plot and then as summary statistics.
sns.violinplot(x="set", y="age", data=participants)
participants[["age", "set"]].groupby("set").describe()


# Read the training ROI table, then preview its first 5 rows and 10 columns.
globvol_rois_train = pd.read_csv(
    filepath_or_buffer=os.path.join("data", "train_rois.csv"),
)
print(globvol_rois_train.head(5).iloc[:, :10])

   participant_id     CSF_Vol      GM_Vol      WM_Vol  l3thVen_GM_Vol  \
0             651  334.031002  603.083565  560.465707        0.046239   
1             431  300.991328  603.101354  594.425495        0.052599   
2             398  252.410342  641.965684  604.320324        0.063052   
3             419  402.533442  604.978032  490.709738        0.043962   
4             627  409.460047  579.029937  509.311974        0.042226   

   r3thVen_GM_Vol  l4thVen_GM_Vol  r4thVen_GM_Vol  lAcc_GM_Vol  rAcc_GM_Vol  
0        0.052240        0.063827        0.068097     0.419533     0.427702  
1        0.047901        0.088186        0.105241     0.373451     0.377817  
2        0.051575        0.083113        0.076834     0.447264     0.465415  
3        0.041684        0.089264        0.083791     0.391540     0.388656  
4        0.036072        0.073829        0.081725     0.339043     0.358513


# The three global volume columns (CSF, GM and WM).
vols_train = globvol_rois_train[['CSF_Vol', 'GM_Vol', 'WM_Vol']]
# Every column from 'l3thVen_GM_Vol' (inclusive) up to the last one.
rois_train = globvol_rois_train.loc[:, 'l3thVen_GM_Vol':]


from sklearn.decomposition import PCA

# Project the ROI columns onto their first two principal components and
# report the share of variance carried by each component.
pca_rois = PCA(n_components=2)
PCs_rois = pca_rois.fit_transform(rois_train)
print(pca_rois.explained_variance_ratio_)

# Put age next to the two component scores.
# NOTE(review): this assumes participants_train and rois_train list the
# participants in the same row order - confirm against participant_id.
df = pd.DataFrame({
    "age": participants_train['age'],
    "PC1_ROIs": PCs_rois[:, 0],
    "PC2_ROIs": PCs_rois[:, 1],
})

[0.54771351 0.10711543]


# Pairwise scatter plots of age and the two component scores, followed by
# their Pearson correlation matrix.
sns.pairplot(data=df)
print(df.corr(method="pearson"))

               age      PC1_ROIs      PC2_ROIs
age       1.000000  8.177015e-01 -1.108805e-01
PC1_ROIs  0.817702  1.000000e+00 -1.133611e-17
PC2_ROIs -0.110881 -1.133611e-17  1.000000e+00


# Express each global volume as a fraction of the per-row total of the three
# volumes; the total is computed once and shared by all three ratios.
total_vol_train = vols_train.sum(axis=1)
ratio_sources = {
    "GM_ratio": "GM_Vol",
    "WM_ratio": "WM_Vol",
    "CSF_ratio": "CSF_Vol",
}
for ratio_name, vol_name in ratio_sources.items():
    df[ratio_name] = vols_train[vol_name] / total_vol_train

# Compare the first component with age and the three ratios.
ratio_view = df[["PC1_ROIs", "age", "GM_ratio", "WM_ratio", "CSF_ratio"]]
sns.pairplot(ratio_view)
print(ratio_view.corr())

           PC1_ROIs       age  GM_ratio  WM_ratio  CSF_ratio
PC1_ROIs   1.000000  0.817702 -0.944958 -0.467386   0.960228
age        0.817702  1.000000 -0.818380 -0.345080   0.803764
GM_ratio  -0.944958 -0.818380  1.000000  0.225909  -0.890849
WM_ratio  -0.467386 -0.345080  0.225909  1.000000  -0.643805
CSF_ratio  0.960228  0.803764 -0.890849 -0.643805   1.000000


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
import sklearn.metrics as metrics
import problem

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# Load the train and test splits through the challenge's `problem` module.
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

# Sanity check on the feature count: the first 284 columns are the ones kept
# by ROIsFeatureExtractor, the remaining 331695 the ones kept by
# VBMFeatureExtractor.
assert X_train.shape[1] == 284 + 331695


def cv_train_test_scores(rmse_cv_test, rmse_cv_train, r2_cv_test, r2_cv_train,
                         y_train, y_pred_train, y_test, y_pred_test):
    """Summarise CV, train and test scores of a model in a one-row table.

    Parameters
    ----------
    rmse_cv_test : array
        Test rmse across CV folds.
    rmse_cv_train : array
        Train rmse across CV folds.

    r2_cv_test : array
        Test R2 across CV folds.
    r2_cv_train : array
        Train R2 across CV folds.

    y_train : array
        True train values.
    y_pred_train : array
        Predicted train values.
    y_test : array
        True test values.
    y_pred_test : array
        Predicted test values.

    Returns
    -------
    scores : pandas.DataFrame
        Single-row DataFrame with the columns, in this order:
        rmse_cv_test_mean, rmse_cv_test_sd, rmse_cv_train_mean,
        rmse_cv_train_sd, r2_cv_test_mean, r2_cv_test_sd, r2_cv_train_mean,
        r2_cv_train_sd, rmse_test, r2_test, rmse_train, r2_train.
    """
    scores = {
        # CV scores: mean and standard deviation across folds.
        'rmse_cv_test_mean': np.mean(rmse_cv_test),
        'rmse_cv_test_sd': np.std(rmse_cv_test),
        'rmse_cv_train_mean': np.mean(rmse_cv_train),
        'rmse_cv_train_sd': np.std(rmse_cv_train),
        'r2_cv_test_mean': np.mean(r2_cv_test),
        # This slot used to repeat rmse_cv_test_sd (both the value and the
        # column label); it now reports the spread of the R2 test scores.
        'r2_cv_test_sd': np.std(r2_cv_test),
        'r2_cv_train_mean': np.mean(r2_cv_train),
        'r2_cv_train_sd': np.std(r2_cv_train),
        # Test scores.
        'rmse_test': np.sqrt(metrics.mean_squared_error(y_test, y_pred_test)),
        'r2_test': metrics.r2_score(y_test, y_pred_test),
        # Train scores.
        'rmse_train': np.sqrt(
            metrics.mean_squared_error(y_train, y_pred_train)),
        'r2_train': metrics.r2_score(y_train, y_pred_train),
    }

    # Explicit column list keeps the documented column order.
    return pd.DataFrame([list(scores.values())], columns=list(scores))


class ROIsFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select only the 284 ROIs features (the first 284 columns of X)."""

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself.

        ``y`` is optional, following the scikit-learn transformer
        convention, so that ``fit(X)`` and ``fit_transform(X)`` also work
        without a target.
        """
        return self

    def transform(self, X):
        """Return the first 284 columns of the 2-D array ``X``."""
        return X[:, :284]

class VBMFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select only the VBM (voxel) features.

    These are all the columns of X located after the first 284 columns,
    i.e. everything that is not an ROI feature.
    """

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself.

        ``y`` is optional, following the scikit-learn transformer
        convention, so that ``fit(X)`` and ``fit_transform(X)`` also work
        without a target.
        """
        return self

    def transform(self, X):
        """Return the columns of the 2-D array ``X`` from index 284 onward."""
        return X[:, 284:]


# Print the shape each extractor produces on the training matrix
# (ROI extractor first, then the VBM one).
for extractor_cls in (ROIsFeatureExtractor, VBMFeatureExtractor):
    fe = extractor_cls()
    print(fe.transform(X_train).shape)

(357, 284)
(357, 331695)


# Cross-validation splits provided by the challenge's `problem` module.
cv = problem.get_cv(X_train, y_train)

# ROI columns -> standardisation -> ordinary least-squares regression.
estimator = make_pipeline(
    ROIsFeatureExtractor(),
    StandardScaler(),
    LinearRegression(),
)

cv_results = cross_validate(
    estimator, X_train, y_train,
    scoring=['neg_root_mean_squared_error', 'r2'],
    cv=cv,
    return_train_score=True,
    verbose=1,
    n_jobs=5,
)

# Fit once more on the whole training set, then predict on both splits.
estimator.fit(X_train, y_train)
y_pred_train = estimator.predict(X_train)
y_pred_test = estimator.predict(X_test)

print("Important scores are rmse_cv_test_mean and rmse_test")
# The scorer returns negated RMSE values; flip the sign back before
# summarising them.
cv_train_test_scores(
    rmse_cv_test=np.negative(cv_results['test_neg_root_mean_squared_error']),
    rmse_cv_train=np.negative(cv_results['train_neg_root_mean_squared_error']),
    r2_cv_test=cv_results['test_r2'],
    r2_cv_train=cv_results['train_r2'],
    y_train=y_train,
    y_pred_train=y_pred_train,
    y_test=y_test,
    y_pred_test=y_pred_test,
).T.round(3)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    2.9s remaining:    4.4s

Important scores are rmse_cv_test_mean and rmse_test

[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    3.3s finished


# VBM columns -> standardisation -> ordinary least-squares regression.
estimator = make_pipeline(VBMFeatureExtractor(), StandardScaler(), LinearRegression())

cv = problem.get_cv(X_train, y_train)

cv_results = cross_validate(estimator, X_train, y_train, scoring=['neg_root_mean_squared_error', 'r2'], cv=cv,
                         verbose=1, return_train_score=True, n_jobs=5)

# Refit on all train
estimator.fit(X_train, y_train)
# Apply on train and test. Without this refit/predict step the table below
# would be built from y_pred_train / y_pred_test left over from the
# previously evaluated estimator, not from this VBM pipeline.
y_pred_train = estimator.predict(X_train)
y_pred_test = estimator.predict(X_test)

# The scorer returns negated RMSE values, hence the sign flip.
cv_train_test_scores(rmse_cv_test=-cv_results['test_neg_root_mean_squared_error'],
                     rmse_cv_train=-cv_results['train_neg_root_mean_squared_error'],
                     r2_cv_test=cv_results['test_r2'],
                     r2_cv_train=cv_results['train_r2'],
                     y_train=y_train, y_pred_train=y_pred_train, y_test=y_test, y_pred_test=y_pred_test).T.round(3)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   54.6s remaining:  1.4min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   54.9s finished


import numpy as np

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


class ROIsFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select only the 284 ROIs features (the first 284 columns of X)."""

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself.

        ``y`` is optional, following the scikit-learn transformer
        convention, so that ``fit(X)`` and ``fit_transform(X)`` also work
        without a target.
        """
        return self

    def transform(self, X):
        """Return the first 284 columns of the 2-D array ``X``."""
        return X[:, :284]


def get_estimator():
    """Return the submission pipeline.

    The pipeline chains, in order, the ROI column selector, a standard
    scaler and a linear regression.
    """
    steps = (
        ROIsFeatureExtractor(),
        StandardScaler(),
        LinearRegression(),
    )
    return make_pipeline(*steps)

	age
	count	mean	std	min	25%	50%	75%	max
set
test	90.0	47.848019	17.852717	20.071184	31.819986	44.855578	62.017112	82.187543
train	357.0	49.138846	16.095719	19.980835	35.455168	50.902122	62.061602	86.318960

brain_age¶

Dataset¶

Input data¶

Target¶

Evaluation metrics¶

Links¶

Installation¶

Getting started¶

Descriptive statistics¶

Unsupervised analysis: explore the main sources of variability¶

PCA on ROIs: explore global effect of age¶

Neurobiological effect of age¶

Conclusion¶

Machine learning¶

Utils functions¶

Feature extractor of ROIs or voxels within the brain (VBM)¶

Design of predictors and their evaluation using CV and test set¶

Submission¶

	0
rmse_cv_test_mean	47.793
rmse_cv_test_sd	16.807
rmse_cv_train_mean	0.639
rmse_cv_train_sd	0.288
r2_cv_test_mean	-8.540
rmse_cv_test_sd	16.807
r2_cv_train_mean	0.998
r2_cv_train_sd	0.002
rmse_test	14.350
r2_test	0.347
rmse_train	2.659
r2_train	0.973

	0
rmse_cv_test_mean	6.968
rmse_cv_test_sd	0.623
rmse_cv_train_mean	0.000
rmse_cv_train_sd	0.000
r2_cv_test_mean	0.808
rmse_cv_test_sd	0.623
r2_cv_train_mean	1.000
r2_cv_train_sd	0.000
rmse_test	14.350
r2_test	0.347
rmse_train	2.659
r2_train	0.973