Thomas Moreau (Inria), François Caud (DATAIA - Université Paris-Saclay)
The continuous monitoring of blood pressure is a major challenge in general anesthesia. Indeed, monitoring blood pressure is essential to ensure that the patient remains stable during the operation. However, current methods to measure blood pressure are either non-invasive but non-continuous, or invasive and expensive, with a risk of complications, making them unsuitable in many situations.
A potential solution to this problem is to use non-invasive monitoring signals that are routinely collected, such as the electrocardiogram (ECG) and photoplethysmogram (PPG) signals, to estimate the mean arterial pressure (MAP) using AI. The PPG signal is a non-invasive signal that can be easily acquired with a pulse oximeter. The MAP is a measure of the average blood pressure in an individual's arteries during one cardiac cycle.
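For background only, the MAP is often approximated from the systolic (SBP) and diastolic (DBP) pressures with the standard clinical rule of thumb:
$$ MAP \approx DBP + \frac{1}{3}\,(SBP - DBP) = \frac{SBP + 2\,DBP}{3} $$
In this challenge, however, the MAP labels come from an invasive measurement, not from this approximation.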
The goal of this challenge is to estimate the MAP from these non-invasive signals. The dataset consists of 10-second windows of PPG and ECG signals, together with the related patient information (age, gender, weight, height, etc.) and the MAP measured with an invasive method.
A particular aspect of this challenge is that we would like a method that generalizes well to new patients in other hospitals. This is a major issue in the medical field, where the data distribution can change and collecting new data is very costly.
For this reason, the challenge is a domain adaptation challenge. For training, you have access to a dataset composed of data from 2 different domains:

- 'v', which corresponds to data collected in a first hospital. For this domain, you have access to all the data and the labels (i.e. the MAP).
- 'm', composed of data collected in a second hospital. For this domain, you have access to the data, but not the labels. To make it possible to check that everything is working properly, we give you the labels for 100 segments of the target domain in the test set.

The goal is therefore to generalize from the source domain ('v') to the target domain ('m').
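As a minimal sketch of how the two domains can be separated in code (assuming X_df and y are loaded with problem.get_train_data() as shown later in this notebook, and using the convention, visible in the starting kit below, that unlabeled target segments have their MAP set to -1):

# Minimal sketch: split source / target using the `domain` column.
source_mask = (X_df["domain"] == "v").to_numpy()
X_source, y_source = X_df[source_mask], y[source_mask]
X_target = X_df[~source_mask]  # the corresponding entries of y are -1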
For this challenge, you can take a look at the following resources:

- skada: a library for domain adaptation (DA) with a scikit-learn and PyTorch/skorch compatible API. This library implements many existing DA methods and can be used to implement your own (a sketch is given near the end of this notebook).

This notebook provides a simple exploratory data analysis of the dataset.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
The data is composed of 10-second windows of PPG and ECG signals, and the goal is to predict the MAP from these signals.
For this challenge, the data is large (>20 GB), so it cannot be loaded directly when not enough RAM is available. In this case, the data can be loaded in chunks (using the start/stop arguments of get_train_data) or without the high-frequency waveforms (load_waveform=False).
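For instance, a chunked pass over the training data could look like the following sketch (the chunk size is illustrative and should be adjusted to the available RAM):

import problem

chunk_size = 10_000     # illustrative value
n_segments = 300_750    # total number of training segments (printed below)
for start in range(0, n_segments, chunk_size):
    X_chunk, y_chunk = problem.get_train_data(
        start=start, stop=min(start + chunk_size, n_segments)
    )
    # ... extract features from the chunk, then free the memory ...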
In this notebook, we first load all the data without the waveforms to illustrate the data distribution.
import problem
X_df, y = problem.get_train_data(load_waveform=False)
Let's first look at the data available.
print(f"There are {X_df.shape[0]} segment in this dataset.")
X_df.head()
There are 300750 segments in this dataset.
|   | n_seg | subject | age | gender | domain | height | weight | bmi | id | chunk |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | NaN |
| 1 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | NaN |
| 2 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | NaN |
| 3 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | NaN |
| 4 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | NaN |
The data has two types of features: demographic information about the patient (age, gender, height, weight, bmi), and metadata about the recording (subject, n_seg, domain, id, chunk).
We start by looking at some statistics of the data:
print(
    f"There are {X_df['subject'].nunique()} unique subjects in the train set."
)

# Display the number of subjects/segments for each domain
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
X_df.groupby("domain")["subject"].nunique().plot.bar(
    ax=axes[0], color=["C1", "C0"], title="Number of patients per domain"
)
X_df.groupby("domain").size().plot.bar(
    ax=axes[1], color=["C1", "C0"], title="Number of segments per domain"
)
There are 1203 unique subjects in the train set.
We now look at the demographic information of the patients:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
bins = np.linspace(0, 100, 21)
for i, domain in enumerate(["v", "m"]):
    X_ = X_df.query("domain == @domain")
    X_["age"].hist(
        ax=axes[0], bins=bins, density=True, alpha=0.5,
        label=domain, color=f"C{i}"
    )
    (X_["gender"].value_counts() / len(X_)).plot(
        kind="bar", ax=axes[1], label=domain, alpha=0.5, color=f"C{i}"
    )
plt.legend()
# Now let's check the distribution of the target
plt.hist(y[y != -1], bins=40, density=True, alpha=0.5, label="v")
plt.legend()
We see here that the distribution is skewed. Indeed, international guidelines for anesthesiologists recommend keeping the MAP between 70 and 105 mmHg and avoiding hypotension during the procedure. As this data is observational, there are very few data points with a MAP below 70 mmHg, which might make it harder for the model to learn the full distribution of the MAP.
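A quick way to quantify this skew on the labeled segments (recall that y is -1 for the unlabeled target segments):

labeled = y[y != -1]
print(f"MAP quartiles: {np.percentile(labeled, [25, 50, 75]).round(1)} mmHg")
print(f"Fraction of segments with MAP < 70 mmHg: {(labeled < 70).mean():.1%}")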
Now, we turn to the high-frequency signals contained in the data. To avoid loading the entire dataset, we only load a small number of segments: rows from the beginning of the dataset (domain v) and rows from the end (domain m).
import os

# Quick-test mode: problem.py loads a reduced version of the dataset
os.environ["RAMP_TEST_MODE"] = "1"

X_v, y_v = problem.get_train_data(start=0, stop=100)    # start of the data: domain "v"
X_m, y_m = problem.get_train_data(start=-101, stop=-1)  # end of the data: domain "m"
X_df = pd.concat([X_v, X_m], axis=0)
y = np.concatenate([y_v, y_m])
X_df
|   | n_seg | subject | age | gender | domain | height | weight | bmi | id | ecg | ppg | chunk |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | [0.3688673837817671, 0.3620627421921535, 0.365... | [0.40960445864963035, 0.36931866813001185, 0.3... | NaN |
| 1 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | [0.25735809742643956, 0.23910011930527947, 0.2... | [0.36706728559046503, 0.4034043141571599, 0.35... | NaN |
| 2 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | [0.13428866369506492, 0.14334834365416743, 0.1... | [0.7494333513716098, 0.8147798178151058, 0.799... | NaN |
| 3 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | [0.17744878960204316, 0.18312116528072353, 0.2... | [0.2668839934294831, 0.26939094339014175, 0.25... | NaN |
| 4 | 688 | p000750 | 78.0 | M | v | 162.0 | 50.6 | 19.3 | 1.0 | [0.6703874816047055, 0.5075806829705767, 0.444... | [0.27162392568169147, 0.2647447495713346, 0.24... | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2095 | 470 | p090269 | 76.0 | M | m | NaN | NaN | NaN | 186.0 | [0.2785793562708102, 0.28412874583795783, 0.25... | [0.19241192411924118, 0.1603432700993677, 0.13... | val |
| 2096 | 470 | p090269 | 76.0 | M | m | NaN | NaN | NaN | 186.0 | [0.2987085906793936, 0.3161145423919146, 0.212... | [0.11162361623616242, 0.11531365313653143, 0.1... | val |
| 2097 | 470 | p090269 | 76.0 | M | m | NaN | NaN | NaN | 186.0 | [0.20290697674418606, 0.1430232558139535, 0.10... | [0.04311482622085346, 0.04311482622085346, 0.0... | val |
| 2098 | 470 | p090269 | 76.0 | M | m | NaN | NaN | NaN | 186.0 | [0.19429778247096094, 0.18373812038014786, 0.1... | [0.20263870094722602, 0.18775372124492556, 0.1... | val |
| 2099 | 470 | p090269 | 76.0 | M | m | NaN | NaN | NaN | 186.0 | [0.28121098626716606, 0.27153558052434457, 0.2... | [0.7211621856027752, 0.7506504770164786, 0.775... | val |

4200 rows × 12 columns
The waveforms are stored as dedicated columns containing numpy arrays. They contain, for each 10-second segment, the raw ECG signal (ecg column) and the raw PPG signal (ppg column). We can plot them to see what they look like, and display the MAP value for each segment. Note that in this context, we do not have access to the MAP value for the target domain m.
# Plot random ECG segments from both domains, with their MAP value
fig, axes = plt.subplots(4, 2, figsize=(10, 12))
plt.subplots_adjust(hspace=0.5)
idx = np.random.choice(min(100, len(X_m), len(X_v)), size=8, replace=False)
for i, ax in enumerate(axes.ravel()):
    d = "v" if i % 2 == 0 else "m"
    X_, y_ = (X_v, y_v) if i % 2 == 0 else (X_m, y_m)
    ax.plot(X_.iloc[idx[i]]["ecg"], c=f"C{i % 2}")
    ax.set_title(f"Seg {idx[i]} (domain = {d}) - MAP: {y_[idx[i]]:.1f}")
# Same plot for the PPG signals
fig, axes = plt.subplots(4, 2, figsize=(10, 12))
plt.subplots_adjust(hspace=0.5)
idx = np.random.choice(min(100, len(X_m), len(X_v)), size=8, replace=False)
for i, ax in enumerate(axes.ravel()):
    d = "v" if i % 2 == 0 else "m"
    X_, y_ = (X_v, y_v) if i % 2 == 0 else (X_m, y_m)
    ax.plot(X_.iloc[idx[i]]["ppg"], c=f"C{i % 2}")
    ax.set_title(f"Seg {idx[i]} (domain = {d}) - MAP: {y_[idx[i]]:.1f}")
For this challenge, the metric used to evaluate the performance of the model is the mean absolute error (MAE) between the predicted MAP and the true MAP, defined as:

$$ MAE = \frac{1}{n} \sum_{i=1}^{n} |y_{pred}^i - y_{true}^i| $$

Here, we take the MAE over the test set, which is composed of signals coming from domain m.
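Equivalently, using numpy (the baseline below uses sklearn.metrics.mean_absolute_error, which computes the same quantity):

def mae(y_true, y_pred):
    # Mean absolute error between true and predicted MAP values
    return np.mean(np.abs(np.asarray(y_pred) - np.asarray(y_true)))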
We can start by building a simple model without the waveforms to see how well we can predict the MAP from the demographic information and the low-frequency signals.
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn import set_config
set_config(transform_output="pandas")
from sklearn.metrics import mean_absolute_error
X_df, y = problem.get_train_data(load_waveform=False)
X_df_test, y_test = problem.get_test_data(load_waveform=False)
class IgnoreAdapter(RandomForestRegressor):
    # Variant of the random forest that drops the unlabeled samples
    # before fitting (a target of -1 marks the target domain 'm').
    def fit(self, X, y):
        mask = y != -1
        return super().fit(X[mask], y[mask])
clf = make_pipeline(
    make_column_transformer(
        ("passthrough", ["age"]),
        (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         ["gender"]),
    ),
    RandomForestRegressor(n_estimators=50)
    # Ridge(alpha=1)
)
clf.fit(X_df, y)
print(f"Score on train: {mean_absolute_error(clf.predict(X_df), y)}")
print(f"Score on test: {mean_absolute_error(clf.predict(X_df_test), y_test)}")
Score on train: 4.27551420182459
Score on test: 6.535600633606684
A submission consists of an estimator.py file defining a get_estimator() function that returns a scikit-learn compatible estimator, as in the starting kit below. This is the format participants should follow to submit their predictions on the RAMP platform. This section also shows how to use the ramp-workflow library to test the submission locally.
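For instance, assuming your submission code lives in submissions/my_submission/estimator.py (the directory name is illustrative), it can be tested locally with the ramp-test command provided by ramp-workflow:

!ramp-test --submission my_submission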
The input data are stored in a dataframe. To go from a dataframe to a numpy array, we use a scikit-learn column transformer. This first example simply consists in selecting a subset of columns we want to work with.
# %load submissions/starting_kit/estimator.py
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

set_config(transform_output="pandas")


class IgnoreDomain(RandomForestRegressor):
    def fit(self, X, y):
        # Ignore the samples with missing target
        X = X[y != -1]
        y = y[y != -1]
        return super().fit(X, y)


def get_estimator():
    return make_pipeline(
        make_column_transformer(
            ("passthrough", ["age"]),
            (OrdinalEncoder(
                handle_unknown='use_encoded_value', unknown_value=-1
            ), ["gender"]),
        ),
        IgnoreDomain(n_estimators=50)
    )
import problem
from sklearn.model_selection import cross_val_score
X_df, y = problem.get_train_data()
scores = cross_val_score(get_estimator(), X_df, y, cv=5, scoring='neg_mean_absolute_error')
print(scores)
[-11.3400143 -16.69525706 -63.09855314 -88.11120541 -69.41988315]
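This baseline simply drops the unlabeled target data. As a possible next step, a domain adaptation method from skada (mentioned at the top of this notebook) could be plugged in. The following is only a rough sketch under assumptions about skada's conventions: make_da_pipeline and CORALAdapter are documented skada functions, sample_domain is assumed to be positive for source samples and negative for target samples, and unlabeled regression targets are assumed to be encoded as NaN. Check the skada documentation before relying on any of this.

import numpy as np
from sklearn.linear_model import Ridge
from skada import CORALAdapter, make_da_pipeline

# Domain indicator: positive = source ('v'), negative = target ('m')
sample_domain = np.where(X_df["domain"] == "v", 1, -2)
# Illustrative numeric features: only `age` is available in both domains
X_num = X_df[["age"]].fillna(X_df["age"].median()).to_numpy()
# Assumption: skada marks unlabeled regression targets with NaN
y_da = np.where(y == -1, np.nan, y)

# Align source/target feature covariances, then fit a ridge regression
da_pipe = make_da_pipeline(CORALAdapter(), Ridge())
da_pipe.fit(X_num, y_da, sample_domain=sample_domain)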
To submit your code, you can refer to the online documentation.