The goal of this RAMP is to detect Interplanetary Coronal Mass Ejections (ICMEs) in the data measured by in-situ spacecraft.
ICMEs are the interplanetary counterpart of Coronal Mass Ejections (CMEs), the expulsion of large quantities of plasma and magnetic field resulting from magnetic instabilities in the Sun's atmosphere (Kilpua et al. (2017) and references therein). They travel at several hundred to a few thousand kilometers per second and, if Earth lies in their trajectory, can reach it in 2-4 days.
ICMEs interact with the planetary environment and may trigger intense internal activity such as strong particle acceleration, so-called geomagnetic storms, and geomagnetically induced currents. These effects have serious consequences for space- and ground-based technologies, and understanding them is part of the so-called space weather discipline. ICME signatures measured by in-situ spacecraft appear as patterns in time series of the magnetic field, particle density, bulk velocity, temperature, etc. Although clearly visible to expert eyes, these patterns have quite variable characteristics, which makes naive automation of their detection difficult. To overcome this problem, Lepping et al. (2005) proposed an automatic detection method based on manually set thresholds on a set of physical parameters. However, the method detected only 60% of the ICMEs, with a high percentage (60%) of false positives. Moreover, because of the subjectivity induced by the manually set thresholds, the method struggled to produce a reproducible and consistent ICME catalog.
This challenge proposes to design the best possible algorithm to detect ICMEs, starting from the most complete ICME catalog, which contains 657 events. We give participants a subset of this large dataset to test and calibrate their algorithms. We provide in-situ measurements from the WIND spacecraft between 1997 and 2016, resampled to a 10-minute resolution, for which we computed three additional features that have proved useful in the visual identification of ICMEs. Using an appropriate metric, we will compare the true solution with the estimation. The goal is to produce an ICME catalog containing less than 10% false positives while recovering as many existing events as possible.
Formally, each instance consists of measurements of various physical parameters in the interplanetary medium. The training set contains measurements from 1997 to 2007, together with the beginning and ending dates, $t_{start}$ and $t_{end}$, of the 438 ICMEs measured in this period.
To download and run this notebook: download the full starting kit, with all the necessary files.
This starting kit requires the following dependencies:
numpy
pandas
pyarrow
scikit-learn
matplotlib
jupyter
imbalanced-learn
We recommend installing these using conda (e.g. with the Anaconda distribution).
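For example (assuming all packages are available on your configured channels; imbalanced-learn may require the conda-forge channel):

conda install numpy pandas pyarrow scikit-learn matplotlib jupyter imbalanced-learn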
In addition, ramp-workflow
is needed. This can be installed from the master branch on GitHub:
python -m pip install https://api.github.com/repos/paris-saclay-cds/ramp-workflow/zipball/master
The public train and test data can be downloaded by running from the root of the starting kit:
python download_data.py
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
We start with inspecting the training data:
from problem import get_train_data
data_train, labels_train = get_train_data()
data_train.head()
B | Bx | Bx_rms | By | By_rms | Bz | Bz_rms | Na_nl | Np | Np_nl | ... | Range F 8 | Range F 9 | V | Vth | Vx | Vy | Vz | Beta | Pdyn | RmsBob | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1997-10-01 00:00:00 | 6.584763 | 3.753262 | 2.303108 | 0.966140 | 2.602693 | -5.179685 | 2.668414 | 2.290824 | 23.045732 | 24.352797 | ... | 2.757919e+09 | 2.472087e+09 | 378.313934 | 80.613098 | -351.598389 | -138.521454 | 6.956387 | 7.641340 | 5.487331e-15 | 0.668473 |
1997-10-01 00:10:00 | 6.036456 | 0.693559 | 1.810752 | -0.904843 | 2.165570 | -1.944006 | 2.372931 | 2.119593 | 23.000492 | 20.993362 | ... | 3.365612e+09 | 3.087122e+09 | 350.421021 | 69.919327 | -331.012146 | -110.970787 | -21.269474 | 9.149856 | 4.783776e-15 | 0.753848 |
1997-10-01 00:20:00 | 5.653682 | -4.684786 | 0.893058 | -2.668830 | 0.768677 | 1.479302 | 1.069266 | 2.876815 | 20.676191 | 17.496399 | ... | 1.675611e+09 | 1.558640e+09 | 328.324493 | 92.194435 | -306.114899 | -117.035202 | -13.018987 | 11.924199 | 3.719768e-15 | 0.282667 |
1997-10-01 00:30:00 | 5.461768 | -4.672382 | 1.081638 | -2.425630 | 0.765681 | 1.203713 | 0.934445 | 2.851195 | 20.730188 | 16.747108 | ... | 1.589037e+09 | 1.439569e+09 | 319.436859 | 94.230705 | -298.460938 | -110.403969 | -20.350492 | 16.032987 | 3.525211e-15 | 0.304713 |
1997-10-01 00:40:00 | 6.177846 | -5.230110 | 1.046126 | -2.872561 | 0.635256 | 1.505010 | 0.850657 | 3.317076 | 20.675701 | 17.524536 | ... | 1.812308e+09 | 1.529260e+09 | 327.545929 | 89.292595 | -307.303070 | -111.865845 | -12.313167 | 10.253789 | 3.694283e-15 | 0.244203 |
5 rows × 33 columns
The data consist of 30 primary input variables: the bulk velocity and its components $V, V_{x}, V_{y}, V_{z}$, the thermal velocity $V_{th}$, the magnetic field, its components and their RMS: $B, B_{x}, B_{y}, B_{z}, \sigma_{B_x}, \sigma_{B_y}, \sigma_{B_z}$, the densities of protons and $\alpha$ particles obtained from both moment and non-linear analysis: $N_{p}, N_{p,nl}$ and $N_{a,nl}$, as well as 15 channels of proton flux between 0.3 and 10 keV.
The data are resampled to a 10 minute resolution.
In addition to the 30 input variables, we computed 3 extra features that also serve as input variables: the plasma parameter $\beta$, defined as the ratio between the thermal and the magnetic pressure; the dynamic pressure $P_{dyn} = N_{p}V^{2}$; and the normalized magnetic fluctuations $\sigma_{B} = \sqrt{\sigma_{B_x}^{2}+\sigma_{B_y}^{2}+\sigma_{B_z}^{2}}/B$.
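As an illustration, here is a minimal sketch of how these derived quantities can be recomputed from the primary variables. Physical constants and unit conversions are deliberately omitted (an assumption of this sketch), so the results match the provided `Beta`, `Pdyn` and `RmsBob` columns only up to constant factors; the helper name `add_derived_features` is ours:

import numpy as np

def add_derived_features(df):
    # normalized magnetic fluctuations: quadratic sum of the component
    # RMS values, divided by the field magnitude
    df["RmsBob_check"] = (
        np.sqrt(df["Bx_rms"] ** 2 + df["By_rms"] ** 2 + df["Bz_rms"] ** 2) / df["B"]
    )
    # dynamic pressure ~ Np * V^2 (proton mass and unit factors omitted)
    df["Pdyn_check"] = df["Np"] * df["V"] ** 2
    # plasma beta ~ thermal pressure / magnetic pressure,
    # i.e. ~ Np * Vth^2 / B^2 (constants omitted)
    df["Beta_check"] = df["Np"] * df["Vth"] ** 2 / df["B"] ** 2
    return df

add_derived_features(data_train.copy()).filter(like="_check").head()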
data_train.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 509834 entries, 1997-10-01 00:00:00 to 2007-12-31 23:50:00
Data columns (total 33 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   B           509834 non-null  float32
 1   Bx          509834 non-null  float32
 2   Bx_rms      509834 non-null  float32
 3   By          509834 non-null  float32
 4   By_rms      509834 non-null  float32
 5   Bz          509834 non-null  float32
 6   Bz_rms      509834 non-null  float32
 7   Na_nl       509834 non-null  float32
 8   Np          509834 non-null  float32
 9   Np_nl       509834 non-null  float32
 10  Range F 0   509834 non-null  float32
 11  Range F 1   509834 non-null  float32
 12  Range F 10  509834 non-null  float32
 13  Range F 11  509834 non-null  float32
 14  Range F 12  509834 non-null  float32
 15  Range F 13  509834 non-null  float32
 16  Range F 14  509834 non-null  float32
 17  Range F 2   509834 non-null  float32
 18  Range F 3   509834 non-null  float32
 19  Range F 4   509834 non-null  float32
 20  Range F 5   509834 non-null  float32
 21  Range F 6   509834 non-null  float32
 22  Range F 7   509834 non-null  float32
 23  Range F 8   509834 non-null  float32
 24  Range F 9   509834 non-null  float32
 25  V           509834 non-null  float32
 26  Vth         509834 non-null  float32
 27  Vx          509834 non-null  float32
 28  Vy          509834 non-null  float32
 29  Vz          509834 non-null  float32
 30  Beta        509834 non-null  float64
 31  Pdyn        509834 non-null  float64
 32  RmsBob      509834 non-null  float32
dtypes: float32(31), float64(2)
memory usage: 72.0 MB
The target labels consist of an indicator for each time step (0 for background solar wind, 1 for solar storm, the event to detect):
labels_train.head()
1997-10-01 00:00:00    0
1997-10-01 00:10:00    0
1997-10-01 00:20:00    0
1997-10-01 00:30:00    0
1997-10-01 00:40:00    0
Name: label, dtype: int64
As described above, ICME signatures measured by in-situ spacecraft appear as patterns in time series of the magnetic field, particle density, bulk velocity, temperature, etc.
Let's visualize a typical event to inspect the patterns.
def plot_event(start, end, data, delta=36):
start = pd.to_datetime(start)
end = pd.to_datetime(end)
subset = data[
(start - pd.Timedelta(hours=delta)) : (end + pd.Timedelta(hours=delta))
]
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(10, 15), sharex=True)
# plot 1
axes[0].plot(subset.index, subset["B"], color="gray", linewidth=2.5)
axes[0].plot(subset.index, subset["Bx"])
axes[0].plot(subset.index, subset["By"])
axes[0].plot(subset.index, subset["Bz"])
    axes[0].legend(
        ["B", "Bx", "By", "Bz"], loc="center left", bbox_to_anchor=(1, 0.5)
    )
axes[0].set_ylabel("Magnetic Field (nT)")
# plot 2
axes[1].plot(subset.index, subset["Beta"], color="gray")
axes[1].set_ylim(-0.05, 1.7)
axes[1].set_ylabel("Beta")
# plot 3
axes[2].plot(subset.index, subset["V"], color="gray")
axes[2].set_ylabel("V(km/s)")
# axes[2].set_ylim(250, 500)
# plot 4
axes[3].plot(subset.index, subset["Vth"], color="gray")
axes[3].set_ylabel("$V_{th}$(km/s)")
# axes[3].set_ylim(5, 60)
# add vertical lines
for ax in axes:
ax.axvline(start, color="k")
ax.axvline(end, color="k")
ax.xaxis.grid(True, which="minor")
return fig, axes
plot_event(
pd.Timestamp("2001-10-31 22:00:00"), pd.Timestamp("2001-11-02 05:30:00"), data_train
);
Not all events are "text-book" examples; they don't always exhibit all of the typical characteristics.
Visualizing a few more, randomly drawn events:
from problem import turn_prediction_to_event_list
events = turn_prediction_to_event_list(labels_train)
rng = np.random.RandomState(1234)
for i in rng.randint(0, len(events), 3):
plot_event(events[i].begin, events[i].end, data_train)
ICMEs can last from several hours to several days.
duration = [ev.end - ev.begin for ev in events]
duration = pd.Series(duration).dt.total_seconds() / 60 / 60
The average duration in hours:
duration.mean()
23.512135922330096
And the distribution of the durations of all events in the training dataset:
fig, ax = plt.subplots()
ax.hist(duration, bins=np.arange(0, 60, 2))
ax.set_xlabel("Duration of an ICME event (hours)")
Text(0.5, 0, 'Duration of an ICME event (hours)')
event_starts = pd.Series([ev.begin for ev in events])
event_starts.groupby(event_starts.dt.year).size().plot()
<AxesSubplot: >
Looking at the raw labels, the background solar wind is more common than the solar storm:
labels_train[:10000].plot()
<AxesSubplot: >
labels_train.value_counts()
0    451269
1     58565
Name: label, dtype: int64
labels_train.value_counts()[1] / len(labels_train)
0.11487072262736499
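About 11% of the time steps are labeled as storm. If you want to compensate for this imbalance, one option (a suggestion, not part of the reference solution) is to give the classifier class weights; the imbalanced-learn package listed in the dependencies offers resampling as an alternative:

from sklearn.linear_model import LogisticRegression

# weight classes inversely proportional to their frequency
clf_balanced = LogisticRegression(max_iter=1000, class_weight="balanced")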
In order to evaluate the performance of the submissions, the dataset has been split into three parts: the period from 1997 to 2007 is the public training data, and the period from 2008 to 2011 is the public testing data. In addition, the data from 2012 to 2015 are not provided and will be used as the hidden test data.
The testing data can be loaded in the same way:
from problem import get_test_data
data_test, labels_test = get_test_data()
data_test.head()
B | Bx | Bx_rms | By | By_rms | Bz | Bz_rms | Na_nl | Np | Np_nl | ... | Range F 8 | Range F 9 | V | Vth | Vx | Vy | Vz | Beta | Pdyn | RmsBob | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2008-01-01 00:00:00 | 4.191322 | -3.683284 | 0.174691 | 1.798880 | 0.273878 | -0.362722 | 0.428125 | 0.132609 | 6.028121 | 4.624486 | ... | 9.860334e+08 | 1.558658e+09 | 346.071014 | 36.012177 | -345.823730 | 2.977509 | -6.587379 | 0.934875 | 1.208364e-15 | 0.129070 |
2008-01-01 00:10:00 | 4.257490 | -2.976951 | 0.144501 | 2.953034 | 0.125648 | 0.410667 | 0.274116 | 0.123604 | 6.149254 | 5.863568 | ... | 1.126279e+09 | 1.762056e+09 | 348.415741 | 34.976662 | -348.193298 | -5.476861 | -10.663119 | 0.872927 | 1.248913e-15 | 0.079137 |
2008-01-01 00:20:00 | 4.190869 | -3.058623 | 0.286867 | 2.442341 | 0.311452 | -0.292565 | 0.326747 | 0.101965 | 6.070359 | 5.217190 | ... | 1.144053e+09 | 1.718165e+09 | 348.164673 | 34.677750 | -347.992798 | -1.428905 | -8.036335 | 0.879172 | 1.231391e-15 | 0.132966 |
2008-01-01 00:30:00 | 4.261395 | -1.951039 | 0.148014 | 3.703976 | 0.095857 | -0.347672 | 0.228227 | 0.079186 | 6.000411 | 5.828165 | ... | 1.098246e+09 | 1.878379e+09 | 356.162506 | 32.947617 | -355.946167 | -10.620467 | -5.134477 | 0.764977 | 1.272731e-15 | 0.069234 |
2008-01-01 00:40:00 | 4.267907 | -1.729106 | 0.101710 | 3.840123 | 0.068386 | 0.392823 | 0.159227 | 0.073780 | 5.841330 | 5.708538 | ... | 1.202470e+09 | 1.859657e+09 | 357.871826 | 31.242083 | -357.522736 | -9.289846 | -12.252926 | 0.657990 | 1.251328e-15 | 0.047655 |
5 rows × 33 columns
The submission consists of two files: `feature_extractor.py`, which defines a `FeatureExtractor` class, and `classifier.py`, which defines a `Classifier` class. `FeatureExtractor` can (optionally) hold code to calculate and add additional features. `Classifier` fits the model and predicts on (new) data, as output by the `FeatureExtractor`. The prediction should be in the form of an (n_samples, 2) array with the probabilities of the two classes.

An example estimator, adding an additional feature based on a moving window to include some time-aware information, plus a logistic regression classifier:
from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
def compute_rolling_std(X_df, feature, time_window, center=False):
    """
    For a given dataframe, compute the standard deviation over
    a defined period of time (time_window) of a defined feature.

    Parameters
    ----------
    X_df : pandas.DataFrame
        dataframe containing the feature
    feature : str
        feature in the dataframe from which to compute the rolling std
    time_window : str
        string that defines the length of the time window passed to `rolling`
    center : bool
        boolean to indicate if the point of the dataframe considered is
        the center or the end of the window
    """
    name = "_".join([feature, time_window, "std"])
    X_df[name] = X_df[feature].rolling(time_window, center=center).std()
    # fill the NaN values created at the edges of the time window
    X_df[name] = X_df[name].ffill().bfill()
    X_df[name] = X_df[name].astype(X_df[feature].dtype)
    return X_df
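For instance (an illustrative choice of feature and time window), this adds a `B_2h_std` column to a copy of the training data:

df_tmp = compute_rolling_std(data_train.copy(), "B", "2h")
df_tmp["B_2h_std"].head()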
class FeatureExtractor(BaseEstimator):
def fit(self, X, y):
return self
def transform(self, X):
return compute_rolling_std(X, "Beta", "2h")
def get_estimator():
feature_extractor = FeatureExtractor()
classifier = LogisticRegression(max_iter=1000)
pipe = make_pipeline(feature_extractor, StandardScaler(), classifier)
return pipe
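Any scikit-learn compatible classifier exposing `predict_proba` can be plugged in instead of the logistic regression. For example (a hypothetical variant, not part of the starting kit), a gradient boosting model, which does not require feature scaling:

from sklearn.ensemble import HistGradientBoostingClassifier

def get_estimator_hgb():
    # tree-based model, so no StandardScaler step is needed
    return make_pipeline(FeatureExtractor(), HistGradientBoostingClassifier())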
Using this model interactively in the notebook to fit on the training data and predict for the testing data:
model = get_estimator()
model.fit(data_train, labels_train)
Pipeline(steps=[('featureextractor', FeatureExtractor()), ('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression(max_iter=1000))])
y_pred = model.predict_proba(data_test)
The predictions are a 2D array:
y_pred.shape
(205574, 2)
y_pred
array([[0.99209222, 0.00790778],
       [0.98448509, 0.01551491],
       [0.99119094, 0.00880906],
       ...,
       [0.91488259, 0.08511741],
       [0.95712919, 0.04287081],
       [0.96905694, 0.03094306]])
Evaluating on the individual points, we use the log loss (`pw_ll`) and the precision (`pw_prec`) and recall (`pw_rec`) for the positive class (the 1's in the labels, i.e. the storms):
from sklearn.metrics import log_loss, classification_report
log_loss(labels_test, y_pred)
0.1642886559586532
# using argmax here to convert the probabilities to binary 0/1
print(classification_report(labels_test, y_pred.argmax(axis=1)))
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    191755
           1       0.87      0.17      0.28     13819

    accuracy                           0.94    205574
   macro avg       0.91      0.58      0.62    205574
weighted avg       0.94      0.94      0.92    205574
Next to this "point-wise" evaluation as shown above, we also evaluate on the ICME event level.
Currently, the predicted events are determined from the y_pred
probabilities in a simple way:
For those predicted events, we then calculate the precison (ev_prec
) and recall (ev_rec
).
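A conceptual sketch of this event determination (the helper name and the fixed 0.5 threshold are assumptions here; the actual logic lives in `problem.turn_prediction_to_event_list`):

def prediction_to_events_sketch(y_prob, threshold=0.5):
    # mark the time steps above the threshold, then group consecutive
    # marked steps into (begin, end) pairs
    mask = y_prob > threshold
    group_ids = (mask != mask.shift()).cumsum()
    return [
        (grp.index[0], grp.index[-1])
        for _, grp in y_prob[mask].groupby(group_ids[mask])
    ]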
Wrapping the predicted probabilities in a pandas Series with the original datetime-index:
s_y_pred = pd.Series(y_pred[:, 1], index=labels_test.index)
Taking a small part of the test data and predictions as an example:
start, stop = "2008-09-01", "2008-09-30"
ax = s_y_pred[start:stop].plot(figsize=(15, 5))
labels_test[start:stop].plot(ax=ax)
ax.axhline(0.5, color="r", linestyle="--")
<matplotlib.lines.Line2D at 0x7fab9c552620>
Zooming into the first event, and highlighting which area has been determined as "predicted event":
start, stop = "2008-09-02 12:00", "2008-09-05 09:00"
ax = s_y_pred[start:stop].plot(figsize=(10, 5))
labels_test[start:stop].plot(ax=ax)
ax.axhline(0.5, color="r", linestyle="--")
events = turn_prediction_to_event_list(s_y_pred[start:stop])
for evt in events:
ax.axvspan(evt.begin, evt.end, ymax=0.5, color="g", alpha=0.3)
events
[Event(2008-09-03 21:10:00 ---> 2008-09-03 23:50:00)]
This shows that you might want to post-process the raw probabilities to improve this determination of the events. For example, using a simple smoothing:
s_y_pred_smoothed = s_y_pred.rolling(12, min_periods=0, center=True).quantile(0.90)
start, stop = "2008-09-02 12:00", "2008-09-05 09:00"
ax = s_y_pred[start:stop].plot(figsize=(10, 5))
labels_test[start:stop].plot(ax=ax)
s_y_pred_smoothed[start:stop].plot(ax=ax)
ax.axhline(0.5, color="r", linestyle="--")
events = turn_prediction_to_event_list(s_y_pred_smoothed[start:stop])
for evt in events:
ax.axvspan(evt.begin, evt.end, ymax=0.5, color="g", alpha=0.3)
Based on those events, we calculate precision and recall:
from problem import turn_prediction_to_event_list, overlap_with_list, find
def precision(y_true, y_pred):
event_true = turn_prediction_to_event_list(y_true)
event_pred = turn_prediction_to_event_list(y_pred)
FP = [
x
for x in event_pred
if max(overlap_with_list(x, event_true, percent=True)) < 0.5
]
if len(event_pred):
score = 1 - len(FP) / len(event_pred)
else:
# no predictions -> precision not defined, but setting to 0
score = 0
return score
def recall(y_true, y_pred):
event_true = turn_prediction_to_event_list(y_true)
event_pred = turn_prediction_to_event_list(y_pred)
if not event_pred:
return 0.0
FN = 0
for event in event_true:
corresponding = find(event, event_pred, 0.5, "best")
if corresponding is None:
FN += 1
score = 1 - FN / len(event_true)
return score
precision(labels_test, s_y_pred)
0.2195121951219512
recall(labels_test, s_y_pred)
0.08411214953271029
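The same functions can be used to check whether the smoothing above improves the event-level scores:

precision(labels_test, s_y_pred_smoothed)
recall(labels_test, s_y_pred_smoothed)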
The metrics explained above are actually calculated using a cross-validation approach (5-fold cross-validation):
from sklearn.model_selection import cross_validate
from problem import get_cv
def evaluation(X, y):
    # reuse the full pipeline defined above
    pipe = get_estimator()
cv = get_cv(X, y)
results = cross_validate(
pipe,
X,
y,
scoring=["neg_log_loss"],
cv=cv,
verbose=1,
return_train_score=True,
n_jobs=1,
)
return results
results = evaluation(data_train, labels_train)
print(
"Training score Log Loss: {:.3f} +- {:.3f}".format(
-np.mean(results["train_neg_log_loss"]), np.std(results["train_neg_log_loss"])
)
)
print(
"Testing score Log Loss: {:.3f} +- {:.3f} \n".format(
-np.mean(results["test_neg_log_loss"]), np.std(results["test_neg_log_loss"])
)
)
Training score Log Loss: nan +- nan
Testing score Log Loss: nan +- nan
Once you have found a good model, you can submit it to ramp.studio to enter the online challenge. First, if it is your first time using the RAMP platform, sign up; otherwise log in. Then sign up for the event solar_wind. Both signups are controlled by RAMP administrators, so there can be a delay between asking to sign up and being able to submit.
Once your signup request is accepted, you can go to your sandbox and copy-paste your solution there. You can also create a new starting kit in the `submissions` folder containing `estimator.py` and upload this file directly. You can check the starting kit (`estimator.py`) for an example. The submission is trained and tested on our backend in the same way as `ramp-test` does it locally. While your submission is waiting in the queue and being trained, you can find it in the "New submissions (pending training)" table in my submissions. Once it is trained, you get a mail, and your submission shows up on the public leaderboard.
If there is an error (despite having tested your submission locally with `ramp-test`), it will show up in the "Failed submissions" table in my submissions. You can click on the error to see part of the trace.
After submission, do not forget to give credits to the previous submissions you reused or integrated into your submission.
The data set we use at the backend is usually different from what you find in the starting kit, so the score may be different.
The usual way to work with RAMP is to explore solutions locally: add feature transformations, select models, perhaps do some AutoML/hyperopt, etc., and check them with `ramp-test`. The script prints the mean cross-validation scores.

The official score in this RAMP (the first score column on the leaderboard) is the mixed log-loss/F1 score (`mixed`). When the score is good enough, you can submit it at the RAMP.
!ramp-test --submission starting_kit # --quick-test to select only a small part of the data
You can find more information in the README of the ramp-workflow library.
Questions related to the starting kit should be asked on the issue tracker. The RAMP site administrators can be pinged at the RAMP slack team.