Biologically, it is known that while cells carry (almost) the same genomic information, they express only a fraction of their genes, leading to specialization into specific types with different biological functions. The study and classification of cell types is therefore of primary interest for many biological and medical applications. In the past decade, measuring gene expression levels at the scale of a single cell has become possible with the rise of high-throughput technologies called single-cell RNA-seq (scRNA-seq).
The goal of this data challenge is the supervised classification of cell types using the scMARK benchmark dataset from Mendonca et al. The authors compiled the expression profiles of 100,000 cells from 10 different studies to serve as a comparison point for different machine learning approaches, in analogy with the MNIST benchmark dataset for computer vision.
This data challenge uses a small extract of scMARK containing only 4 cell types (the labels to predict):
1. Cancer_cells
2. NK_cells
3. T_cells_CD4+
4. T_cells_CD8+
The public dataset contains 1500 points split into 1000 training points and 500 test points. It will serve as your local benchmark for developing your submissions. On the server side, your submission will use all 1500 public points as the training set, and a private, unavailable test dataset containing 1500 additional test points will be used to rank participants. The label distribution is the same in the public and private training and test datasets.
If marked as code, the two following cells will install the required dependencies. They are disabled by default since you only have to run these commands once (in your dedicated Python environment). You can examine the file requirements.txt, included in the repo, to view the list of dependencies.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Raw data are stored in the h5ad format, which can be read via the scanpy.read_h5ad function, returning an AnnData object.
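For illustration, here is a minimal sketch of how such a file could be loaded directly with scanpy; the file path is a placeholder, not the actual name used by the challenge (the helper functions below take care of loading for you).

import scanpy as sc

# Hypothetical path: replace with the actual .h5ad file location in your setup
adata = sc.read_h5ad("data/some_file.h5ad")

print(adata)           # AnnData summary: n_obs x n_vars plus obs/var annotations
counts = adata.X       # the count matrix (often stored as a scipy sparse matrix)
metadata = adata.obs   # per-cell annotations as a pandas DataFrame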
The problem.py file contains the definition of the data challenge according to the RAMP framework. In addition, it contains helper functions to import the data.
from problem import get_train_data, get_test_data
X_train, y_train = get_train_data()
X_test, y_test = get_test_data()
A first inspection of the labels indicates that the classes are imbalanced.
Note: the same analysis may be conducted for y_test.
lab_df = pd.DataFrame({'label': y_train})
lab_df.value_counts(normalize=True)
label
T_cells_CD8+    0.342
T_cells_CD4+    0.336
Cancer_cells    0.237
NK_cells        0.085
dtype: float64
lab_df.label.hist();
Secondly, let's look at the features.
print(X_train.shape)
print(type(X_train))
(1000, 13551)
<class 'scipy.sparse._csr.csr_matrix'>
We see that we have a fairly high-dimensional problem, with 1000 data points (individual cells) described by 13551 variables (genes). Since we measure expression levels, the data is quite sparse, with many unexpressed genes in each cell. Thus, the get_*_data() functions return $X$ as a scipy sparse matrix stored in compressed sparse row (CSR) format. This is useful for two reasons:
1. It saves memory (see the small sketch below).
2. Some algorithms can work directly with scipy's sparse CSR matrices.
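As a rough illustration of the memory argument, here is a small sketch on a synthetic matrix of similar shape; the density used is an assumption, not a property of the challenge data.

import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
# Synthetic, mostly-zero count matrix of the same shape as X_train (density is made up)
dense = rng.poisson(0.05, size=(1000, 13551)).astype(np.float32)
X_sparse = sparse.csr_matrix(dense)

dense_mb = dense.nbytes / 1e6
csr_mb = (X_sparse.data.nbytes + X_sparse.indices.nbytes + X_sparse.indptr.nbytes) / 1e6
print(f"dense: {dense_mb:.1f} MB, CSR: {csr_mb:.1f} MB")  # CSR is much smaller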
Of course, many existing algorithms, e.g. in scikit-learn, may throw an error when given such an object, requiring an np.array instead. Thankfully, the .toarray() method straightforwardly converts it to a dense NumPy array.
X_train.toarray()
array([[1., 0., 0., ..., 3., 2., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
A particularity of RNA-seq data is that total counts may vary widely between cells and/or genes.
total_genes_counts = X_train.toarray().sum(axis=0)
total_cell_counts = X_train.toarray().sum(axis=1)
# plt.hist(np.log10(total_genes_counts), bins = np.arange(6))
plt.hist(total_genes_counts, bins = 10**np.arange(6))
plt.xscale("log")
plt.title("Histogram of total gene (i.e. column) counts in log-scale.")
plt.xlabel('Total genes count (log-scale)')
plt.show()
plt.hist(np.log10(total_cell_counts), bins = np.arange(1,6))
plt.title("Histogram of log-total cell (i.e. row) counts.")
plt.xlabel('log(cell_count)')
plt.show()
This calls for some normalization of the counts. Many normalizations are possible for RNA-seq data, and one of the goals of this challenge is to experiment with different pre-processing strategies. For simplicity, here we choose to normalize each row (cell) by its total count.
def preprocess_X(X):
X = X.toarray()
return X / X.sum(axis=1)[:, np.newaxis]
X_train_norm = preprocess_X(X_train)
# sanity check
np.allclose(X_train_norm.sum(axis=1), np.ones(X_train_norm.shape[0]))
True
This challenge scores your submissions and ranks participants with a balanced accuracy score, computed via scikit-learn's (unadjusted) balanced_accuracy_score function.
Balanced accuracy is computed as the average of the recall scores of each class (see the implementation for more details). It lies between 0 and 1; the higher, the better.
from sklearn.metrics import balanced_accuracy_score
# this custom class is used by the challenge and calls
# balanced_accuracy_score(y_true, y_pred, adjusted=False)
# under the hood
from problem import BalancedAccuracy
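As a quick sanity check on a toy example (made-up labels, not challenge data), balanced accuracy indeed matches the unweighted mean of the per-class recalls:

import numpy as np
from sklearn.metrics import balanced_accuracy_score, recall_score

# Toy labels for illustration only
y_true = np.array(["A", "A", "A", "B", "B", "C"])
y_pred = np.array(["A", "B", "A", "B", "B", "A"])

per_class_recall = recall_score(y_true, y_pred, average=None)  # one recall per class
print(per_class_recall.mean())                  # 0.555...
print(balanced_accuracy_score(y_true, y_pred))  # same value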
We now show a first naive attempt at the challenge and proceed in two steps:
1. First, we construct a classifier step by step.
2. Then, we show how to implement this classifier as a proper RAMP submission.
Given the high-dimensional nature of the problem, we build the following classifier: standardize the data, apply a PCA keeping only the first 50 components, and finally fit a random forest classifier on those components.
This can be easily implemented as a scikit-learn Pipeline.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline(
[
("Scaler", StandardScaler(with_mean=True, with_std=True)),
("PCA with 50 components", PCA(n_components=50)),
(
"Random Forest Classifier",
RandomForestClassifier(
max_depth=5, n_estimators=100, max_features=3
),
),
]
)
pipe
Pipeline(steps=[('Scaler', StandardScaler()), ('PCA with 50 components', PCA(n_components=50)), ('Random Forest Classifier', RandomForestClassifier(max_depth=5, max_features=3))])
# fit on train
pipe.fit(X_train_norm, y_train)
y_tr_pred = pipe.predict(X_train_norm)
# predict on test
X_test_norm = preprocess_X(X_test)
y_te_pred = pipe.predict(X_test_norm)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
# compute balanced accuracy and confusion matrix
print(f"Train balanced accuracy : {balanced_accuracy_score(y_train, y_tr_pred):.3f}")
print(f"Test balanced accuracy : {balanced_accuracy_score(y_test, y_te_pred):.3f}")
cm = confusion_matrix(y_test, y_te_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipe.classes_, )
disp.plot()
plt.title("Confusion matrix on test set");
Train balanced accuracy : 0.688
Test balanced accuracy : 0.576
This naive classifier does a better job than a dummy random classifier, which would yield an average balanced accuracy of 1/4. However, it never predicts the NK_cells type and seems to confuse the two T-cell types. There is clearly room for improvement. Good news: that is your job! :)
Next, let's see how to implement this exact same classifier as a valid RAMP submission.
The RAMP challenge is automated, and a submission requires the specific structure described below.
A submission is stored in ./submissions/<submission_foldername>/ and must contain a Python file named classifier.py.
This Python script must itself implement (at least) a custom Classifier class with:
1. a fit(X, y) method,
2. a predict_proba(X) method.
Warning: the X argument must be understood as the sparse CSR count data matrix obtained by get_train_data(). Thus any pre-processing of the count matrix must be done inside these methods.
We illustrate this below with the naive classifier already implemented.
Note: The following class is also implemented in ./submissions/starting_kit/classifier.py.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
def _preprocess_X(X_sparse):
# cast to a dense array
X = X_sparse.toarray()
# normalize each row
return X / X.sum(axis=1)[:, np.newaxis]
class Classifier(object):
def __init__(self):
# Use scikit-learn's pipeline
self.pipe = make_pipeline(
StandardScaler(with_mean=True, with_std=True),
PCA(n_components=50),
RandomForestClassifier(
max_depth=5, n_estimators=100,
max_features=3
),
)
def fit(self, X_sparse, y):
X = _preprocess_X(X_sparse)
self.pipe.fit(X, y)
self.classes_ = self.pipe.classes_
def predict_proba(self, X_sparse):
X = _preprocess_X(X_sparse)
# here we use RandomForest.predict_proba()
return self.pipe.predict_proba(X)
Below is a simplified version of what RAMP does with your submission.
clf = Classifier()
clf.fit(X_train, y_train)
# predict_proba
y_tr_pred_proba = clf.predict_proba(X_train)
y_te_pred_proba = clf.predict_proba(X_test)
# convert to hard classification with argmax
y_tr_pred = clf.classes_[np.argmax(y_tr_pred_proba, axis=1)]
y_te_pred = clf.classes_[np.argmax(y_te_pred_proba, axis=1)]
print('Train balanced accuracy:', balanced_accuracy_score(y_train, y_tr_pred))
print('Test balanced accuracy:', balanced_accuracy_score(y_test, y_te_pred))
Train balanced accuracy: 0.6930744874565415
Test balanced accuracy: 0.5820983426079323
In reality, things are a bit more sophisticated. Locally, the RAMP platform averages your classifier's performance over a 5-fold cross-validation scheme implemented for you in the get_cv function. The good news is that RAMP automates everything for you thanks to ramp-test. The public train, validation and test performances are shown to you for information.
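If you want to reproduce this kind of evaluation by hand, here is a rough sketch of a local cross-validation loop; note that it simply assumes a stratified 5-fold split, whereas the official folds come from get_cv in problem.py.

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

# Approximate local CV (assumption: stratified 5-fold, not the official get_cv folds)
y = np.asarray(y_train)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for train_idx, valid_idx in cv.split(X_train, y):
    clf = Classifier()  # the submission class defined above
    clf.fit(X_train[train_idx], y[train_idx])
    proba = clf.predict_proba(X_train[valid_idx])
    y_pred = clf.classes_[np.argmax(proba, axis=1)]
    scores.append(balanced_accuracy_score(y[valid_idx], y_pred))
print(f"valid bal_acc: {np.mean(scores):.2f} ± {np.std(scores):.2f}")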
Before submitting to RAMP, you can test your solution locally to ensure that trivial errors (e.g. typos, path issues, etc.) are resolved. We can test a given submission using the ramp-test
command that was installed in the virtual environment.
We'll use the following command:
!ramp-test --submission <subm_folder> --quick-test
1. The ! signals that the command should be run on the command line instead of in this notebook.
2. ramp-test is the command to be executed. It signals ramp to perform a local test.
3. --submission <subm_folder> specifies which submission to run. You can have multiple potential submissions in the submissions/ directory; this option prevents ramp from running all of them (by default it runs starting_kit).
!ramp-test --submission starting_kit
Testing Single-cell RNA-seq cell types classification
Reading train and test files from ./data/ ...
Reading cv ...
Training submissions/starting_kit ...
CV fold 0
        score  bal_acc      time
        train     0.71  2.156162
        valid     0.57  0.140732
        test      0.56  0.097918
CV fold 1
        score  bal_acc      time
        train     0.74  1.413743
        valid     0.58  0.146134
        test      0.59  0.096986
CV fold 2
        score  bal_acc      time
        train     0.71  2.122261
        valid     0.60  0.146318
        test      0.57  0.096218
CV fold 3
        score  bal_acc      time
        train     0.73  2.167345
        valid     0.59  0.154218
        test      0.57  0.099463
CV fold 4
        score  bal_acc      time
        train     0.70  1.898126
        valid     0.58  0.144049
        test      0.57  0.085239
----------------------------
Mean CV scores
----------------------------
        score       bal_acc        time
        train  0.72 ± 0.016  2.0 ± 0.29
        valid   0.58 ± 0.01   0.1 ± 0.0
        test   0.57 ± 0.009  0.1 ± 0.01
----------------------------
Bagged scores
----------------------------
        score  bal_acc
        valid     0.58
        test      0.58
We see that the mean CV scores are consistent with the previous result. If you use a classifier with more variance, you would see more variation across CV folds.
On the server, participants are ranked according to the balanced accuracy score on the private test dataset. However, only the ranking will be available, not the score itself, in order to avoid overfitting of this private test set. The score on the public test set will be available as a proxy, but your submission could very well obtain a better ranking on the private set with a worse balanced accuracy on the public one.
You can find more information in the README of the ramp-workflow library.