RAMP on variable star type prediction

Balázs Kégl (LAL/CNRS), Marc Moniez (LAL/CNRS), Alex Gramfort (Inria), Djalel Benbouzid (UPMC), Mehdi Cherti (LAL/CNRS)

Introduction

Variable stars

Most stars emit light steadily in time, but a small fraction of them have a variable light curve: light emission versus time. We call them variable stars. The light curves are usually periodic and highly regular. There are essentially two reasons why light emission can vary. First, the star itself can be oscillating, so its light emission varies in time. Second, the star that appears as a single point from Earth (because of our large distance) is actually a binary system: two stars orbiting their common center of gravity. When the orbital plane is parallel to our line of sight, the stars eclipse each other periodically, creating a light curve with a characteristic signature. Identifying, classifying, and analyzing variable stars are hugely important for calibrating distances, and making these analyses automatic will be crucial in upcoming sky survey projects such as LSST.

The EROS1 database, a catalog of light curves

Our data come from the EROS1 project, which took data between 1990 and 1994. 400 photographic plates of the Large Magellanic Cloud were taken with the ESO-Schmidt 1m telescope. Each plate covers the same $5^\circ \times 5^\circ$ field, centered at $\alpha = 5h18m43s$, $\delta = -69d42m17s$ in the celestial coordinate system. Photos were taken in two frequency bands (red: 630 nm; blue: 385 nm), digitized at the Observatoire de Paris, and analyzed at the IN2P3 Computing Center.

Selecting variable stars

The full catalog contains 8 million objects. We estimated the probability of a star being stable using an in-house algorithm and selected the $\simeq 1\%$ least stable stars. The light curve of each selected star was then visually inspected, and the star was either declared stable or assigned a variability type and a quality index. The variability types are eclipsing binary, Cepheid, RR-Lyrae, Mira, and other (the variability is clearly established, but the type is unclear). The quality index was visually estimated from 1 (lowest signal-to-background ratio) to 3 (best signal-to-background ratio). A total of 22802 variable objects were found, including 9046 RR-Lyrae, 2758 Cepheids, 1596 eclipsing binaries, 890 Miras, and 8512 unclassified objects. About 15% of the data was lost in the data archeology step, which included converting the measurements from a native PAW format to csv, giving us a total of 19429 stars.

Selecting data

For the RAMP, we decided to drop all instances in the "unclassified object" category since the interpretation of this type was unclear. We then randomly selected 30% of the data for training, giving us 3641 training instances.

The data consist of two files: train.csv contains "static" features in a classical row-wise csv table, and train_varlength_features.csv.gz is a table that contains all the time series. The contents are obtained with the get_train_data function below.
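
For reference, here is a rough sketch of what such a loader might look like, assuming (hypothetically) that the variable-length columns are serialized as bracketed lists of numbers and that the files sit in a data/ directory; in practice, problem.get_train_data does the parsing for you.

In [ ]:
import ast
import pandas as pd

def load_train_data(static_path='data/train.csv',
                    series_path='data/train_varlength_features.csv.gz'):
    # Static features: one row per star.
    static_df = pd.read_csv(static_path)
    # Variable-length series, assumed stored as strings like "[290.3, 291.35, ...]".
    series_df = pd.read_csv(series_path, compression='gzip')
    for col in series_df.columns:
        series_df[col] = series_df[col].apply(ast.literal_eval)
    return pd.concat([static_df, series_df], axis=1)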

Exploratory data analysis

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain
pd.set_option('display.max_columns', None)

Get access to the training data

In [2]:
import problem

X_df, y = problem.get_train_data()

The static features

Red variables are ids, labels, or other human-annotated features, so they should not be used as inputs to the classification. Light blue variables are legal inputs but unlikely to contribute information to the classification. Black variables are definitely discriminative.

  • patch_id: The catalogue is organized into tiles corresponding to $1$ cm$^2$ patches on the plates; this is their id.
  • star_id_b, star_id_r: The id of the star within the patch. patch_id and star_id_b or star_id_r identify the stars uniquely, and we use [patch_id]_[star_id_b] for indexing the stars in the pandas table.
  • magnitude_b, magnitude_r: The average apparent luminosity of the star (in two frequency bands). Magnitude is a logarithmic measure, and the higher it is, the lower the apparent luminosity is.
  • asc_d, asc_m, asc_s: Celestial right ascension (coordinate) of the star, measured in hours, minutes, and seconds, respectively (consistent with the field center $\alpha = 5h18m43s$).
  • dec_d, dec_m, dec_s: Celestial declination (coordinate) of the star, measured in degrees, arcminutes, and arcseconds, respectively (see the conversion sketch after this list).
  • period: The estimated period of the light curve (in days). For the correct period, it should be divided by div_period.
  • frequency: $1/$period, so it is redundant.
  • num_points_good_b, num_points_good_r: Number of good light curve measurements (some measurements can be corrupted).
  • asym_b, asym_r: Unknown semantics.
  • log_p_not_variable: Logarithm of the estimated probability that the star is stable.
  • sigma_flux_b, sigma_flux_r: The square root of the total variance of the light measurements (indicating the amplitude of the variability).
  • type: The label to predict.
  • quality: Human-annotated measure of the quality of the time curve. The higher the better.
  • div_period: The algorithm that estimates the period sometimes finds a multiple of the period. These cases were human-detected, and the divisor was recorded. In principle this variable is not available automatically as an observable, but we are confident that it could be obtained automatically, so we allow it as an input.
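
As an illustration, the sexagesimal coordinates above can be converted into directly usable decimal degrees. This is a minimal sketch, assuming asc_d is in hours as the values suggest (one hour of right ascension corresponds to 15 degrees); the derived columns are not used elsewhere in this notebook.

In [ ]:
# Right ascension in decimal degrees (1 hour = 15 degrees).
X_df['asc_deg'] = 15 * (X_df['asc_d'] + X_df['asc_m'] / 60 + X_df['asc_s'] / 3600)
# Declination in decimal degrees; propagate the sign of the degree component.
sign = np.sign(X_df['dec_d'])
X_df['dec_deg'] = sign * (abs(X_df['dec_d']) + X_df['dec_m'] / 60 + X_df['dec_s'] / 3600)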
In [3]:
X_df.head()
Out[3]:
patch_id star_id_b star_id_r magnitude_b magnitude_r asc_d asc_m asc_s dec_d dec_m dec_s period frequency num_points_good_b num_points_good_r asym_b asym_r log_p_not_variable sigma_flux_b sigma_flux_r quality div_period time_points_b time_points_r light_points_b light_points_r error_points_b error_points_r bkg_points_b bkg_points_r polltn_points_b polltn_points_r
0 135.0 9613.0 10062.0 19.1458 18.8044 5.0 40.0 51.37 -70.0 13.0 58.78 2.35495 0.424637 123.0 124.0 1.277780 0.653333 -2.80984 36.1471 46.4916 1.0 7.0 [290.3, 291.35, 322.25, 326.24, 345.18, 347.23... [290.34, 291.31, 322.29, 326.28, 345.23, 347.1... [19.15, 18.96, 18.86, 19.17, 18.96, 19.03, 19.... [18.75, 18.66, 18.5, 18.68, 18.77, 18.69, 18.7... [0.13, 0.12, 0.12, 0.13, 0.13, 0.13, 0.13, 0.1... [0.16, 0.25, 0.14, 0.17, 0.18, 0.23, 0.14, 0.1... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.01, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.13, 0.13, 0.11, 0.11, 0.14, 0.19, 0.08, 0.1...
1 271.0 4304.0 4708.0 17.6799 15.3077 5.0 28.0 51.55 -69.0 23.0 19.17 280.00000 0.003571 123.0 125.0 0.921875 1.777780 -14.51460 136.1020 1269.7400 2.0 2.0 [290.3, 291.35, 322.25, 326.24, 345.18, 347.23... [290.34, 291.31, 322.29, 326.28, 345.23, 347.1... [17.7, 17.97, 17.83, 17.71, 17.6, 17.4, 17.46,... [15.43, 15.66, 15.66, 15.5, 15.28, 15.28, 15.3... [0.12, 0.1, 0.08, 0.09, 0.11, 0.09, 0.11, 0.12... [0.1, 0.11, 0.09, 0.09, 0.06, 0.07, 0.06, 0.07... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.02, 0.02, 0.0, 0.0, 0.02, 0.02, 0.0, 0... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2 295.0 8200.0 8739.0 19.3872 18.8606 5.0 27.0 7.89 -68.0 43.0 14.47 1.42264 0.702919 122.0 124.0 2.388890 1.883720 -6.50514 32.1775 48.2365 2.0 3.0 [290.3, 291.35, 322.25, 326.24, 345.18, 347.23... [290.34, 291.31, 322.29, 326.28, 345.23, 347.1... [20.16, 19.37, 19.28, 19.53, 19.53, 19.95, 19.... [19.19, 18.81, 18.86, 19.06, 19.05, 18.89, 18.... [0.25, 0.14, 0.13, 0.16, 0.22, 0.26, 0.19, 0.2... [0.27, 0.25, 0.17, 0.21, 0.24, 0.25, 0.15, 0.2... [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.0, 0.0, 0.0,... [0.01, 0.01, 0.0, 0.0, 0.0, 0.01, 0.0, 0.0, 0.... [0.03, 0.03, 0.02, 0.02, 0.06, 0.11, 0.05, 0.0... [0.06, 0.03, 0.03, 0.03, 0.05, 0.06, 0.02, 0.0...
3 223.0 530.0 557.0 16.4751 15.5033 5.0 35.0 41.69 -70.0 56.0 13.25 193.54800 0.005167 124.0 124.0 0.252525 1.431370 -6.12045 368.7470 1026.6800 0.0 3.0 [290.3, 291.35, 322.25, 326.24, 345.18, 347.23... [290.34, 291.31, 322.29, 326.28, 345.23, 347.1... [16.73, 16.3, 16.25, 16.38, 16.22, 16.07, 16.0... [15.24, 15.55, 15.74, 15.69, 15.55, 15.37, 15.... [0.14, 0.12, 0.11, 0.18, 0.23, 0.16, 0.2, 0.16... [0.25, 0.21, 0.13, 0.13, 0.12, 0.13, 0.12, 0.1... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4 354.0 10866.0 12022.0 17.8301 16.7286 5.0 21.0 50.48 -69.0 17.0 32.07 1.81622 0.550594 123.0 116.0 1.510200 0.633803 -2.07771 121.7490 318.9770 1.0 5.0 [290.3, 291.35, 322.25, 326.24, 345.18, 347.23... [290.34, 291.31, 322.29, 326.28, 345.23, 347.1... [17.67, 17.51, 17.78, 17.73, 17.6, 17.87, 18.0... [16.97, 16.88, 16.72, 16.67, 16.47, 17.12, 16.... [0.13, 0.1, 0.1, 0.1, 0.12, 0.1, 0.12, 0.14, 0... [0.13, 0.14, 0.1, 0.1, 0.08, 0.18, 0.08, 0.08,... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.02, 0.02, 0.02, 0.0, 0.... [0.19, 0.22, 0.13, 0.11, 0.11, 0.35, 0.05, 0.0...

The labels

In [4]:
label_names = {1: 'binary', 2: 'cepheid', 3: 'rr_lyrae', 4: 'mira'}
labels = list(label_names.keys())
y_series = pd.Series(y).replace(label_names)
y_series.head()
Out[4]:
0    rr_lyrae
1        mira
2    rr_lyrae
3      binary
4    rr_lyrae
dtype: object
In [5]:
_ = y_series.value_counts().plot(kind="bar")

Some classwise histograms and scatterplots

In [6]:
colors = ['r', 'b', 'g', 'm']
def plot_classwise_normalized(feature, bins=None):
    if bins is None:
        bins = np.linspace(X_df[feature].min(), X_df[feature].max(), 15)
    for label, color in zip(labels, colors):
        plt.hist(X_df[y == label][feature].values, density=True, bins=bins, 
                 alpha=0.5, color=color)
In [7]:
plot_classwise_normalized('period')
In [8]:
plot_classwise_normalized('period', bins=np.linspace(0, 50, 15))

Observe the aliasing below: for many stars the estimated period is an integer multiple of the true period, which is why it has to be divided by div_period (done next).

In [9]:
plot_classwise_normalized('period', bins=np.linspace(0, 5, 15))
In [10]:
X_df['real_period'] = X_df['period'] / X_df['div_period']
In [11]:
plot_classwise_normalized('real_period')
In [12]:
plot_classwise_normalized('real_period', bins=np.linspace(0, 50, 15))
In [13]:
plot_classwise_normalized('real_period', bins=np.linspace(0, 5, 15))
In [14]:
plot_classwise_normalized('magnitude_b')
In [15]:
plot_classwise_normalized('magnitude_r')
In [16]:
plot_classwise_normalized('asym_b')
In [17]:
plot_classwise_normalized('asym_r')
In [18]:
plot_classwise_normalized('sigma_flux_b')
In [19]:
plot_classwise_normalized('sigma_flux_b', bins=np.linspace(0, 1000, 15))
In [20]:
plot_classwise_normalized('sigma_flux_b', bins=np.linspace(0, 100, 15))
In [21]:
plot_classwise_normalized('sigma_flux_r')
In [22]:
plot_classwise_normalized('sigma_flux_r', bins=np.linspace(0, 1000, 15))
In [23]:
plot_classwise_normalized('sigma_flux_r', bins=np.linspace(0, 100, 15))
In [24]:
plot_classwise_normalized('log_p_not_variable')
In [25]:
colors = ['r', 'b', 'g', 'm']
def plot_classwise_scatter(feature1, feature2, range1=None, range2=None):
    if range1 is None:
        range1 = [X_df[feature1].min(), X_df[feature1].max()]
    if range2 is None:
        range2 = [X_df[feature2].min(), X_df[feature2].max()]
    for label, color in zip(labels, colors):
        plt.xlim(range1[0], range1[1])
        plt.ylim(range2[0], range2[1])
        plt.scatter(X_df[y == label][feature1],
                    X_df[y == label][feature2],
                    alpha=0.3, s=80, c=color, marker='.');
In [26]:
plot_classwise_scatter('magnitude_b', 'magnitude_r')
In [27]:
plot_classwise_scatter('magnitude_b', 'real_period', range1=None, range2=[0,10])

The time series

Each column contains a list of floating point numbers (a short sketch that reduces such lists to scalar summaries follows the list).

  • time_points_b, time_points_r: The times (in units of days) at which the photos were taken. Note that the filters had to be changed, so the time points of the blue and red frequency bands are slightly different.
  • light_points_b, light_points_r: The light points measured at the time points.
  • error_points_b, error_points_r: Uncertainties (error bars) on the light measurements.
  • bkg_points_b, bkg_points_r: Background noise measured at the time points.
  • polltn_points_b, polltn_points_r: Pollution noise measured at the time points.
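
Before building feature extractors, each variable-length series can be reduced to a few scalar summaries. A minimal sketch (the peak-to-peak amplitude and median uncertainty are illustrative choices, not features used later in this notebook):

In [ ]:
def series_summary(x, band):
    # Reduce one star's variable-length series to scalar features.
    light = np.array(x['light_points_' + band])
    errors = np.array(x['error_points_' + band])
    return pd.Series({
        'amplitude_' + band: light.max() - light.min(),  # peak-to-peak variation
        'median_error_' + band: np.median(errors),       # typical uncertainty
    })

X_df.apply(lambda x: series_summary(x, 'b'), axis=1).head()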
In [28]:
X_df.hist('num_points_good_r')
print(min(X_df['num_points_good_b']))
print(max(X_df['num_points_good_b']))
print(min(X_df['num_points_good_r']))
print(max(X_df['num_points_good_r']))
33.0
124.0
11.0
125.0

Plotting time curves

Set the patch id and star id below.

In [29]:
patch_id = 98
star_id_b = 477

def star_key(patch_id, star_id_b):
    return str(patch_id) + '_' + str(star_id_b)
In [30]:
X_df = X_df.set_index(X_df.apply(lambda row: star_key(int(row['patch_id']), int(row['star_id_b'])), axis=1))
In [31]:
time_points = X_df.loc[star_key(patch_id, star_id_b)]['time_points_b']
light_points = X_df.loc[star_key(patch_id, star_id_b)]['light_points_b']
error_points = X_df.loc[star_key(patch_id, star_id_b)]['error_points_b']
plt.errorbar(time_points, light_points, yerr=error_points, fmt='o');

The raw measurements seem rather messy. The scatter of the points is visibly larger than the measurement uncertainty (which makes it, by definition, a variable star), but there is no visible periodicity. We can use the estimated period to overplot several periods of the curve ("fold" the time series) using the following function.

In [32]:
def fold_time_series(time_point, period, div_period):
    real_period = period / div_period
    return time_point % real_period  # modulo real_period
In [33]:
period = X_df.loc[star_key(patch_id, star_id_b)]['period']
div_period = X_df.loc[star_key(patch_id, star_id_b)]['div_period']
print(period, div_period)
1.41875 1.0
In [34]:
time_points_folded = [fold_time_series(time_point, period, div_period) 
                      for time_point in time_points]

The resulting curve has a characteristic signature.

In [35]:
plt.gca().invert_yaxis()
plt.errorbar(time_points_folded, light_points, yerr=error_points, fmt='o');

Submission

The goal of the RAMP is to classify the stars into one of the four types. In your code you will have access both to the static features and the time series. The submission site will have several examples that you can start from.

The feature extractor

The input data are stored in a dataframe. To go from a dataframe to a numpy array, we will use a scikit-learn column transformer. The first example simply selects the subset of columns we want to work with.

In [36]:
cols = [
    'magnitude_b', 
    'magnitude_r',
    'period',
    'asym_b', 
    'asym_r', 
    'log_p_not_variable', 
    'sigma_flux_b', 
    'sigma_flux_r', 
    'quality', 
    'div_period',
]

from sklearn.compose import make_column_transformer

transformer = make_column_transformer(
    ('passthrough', cols)
)

X_array = transformer.fit_transform(X_df)
In [37]:
X_array
Out[37]:
array([[1.91458e+01, 1.88044e+01, 2.35495e+00, ..., 4.64916e+01,
        1.00000e+00, 7.00000e+00],
       [1.76799e+01, 1.53077e+01, 2.80000e+02, ..., 1.26974e+03,
        2.00000e+00, 2.00000e+00],
       [1.93872e+01, 1.88606e+01, 1.42264e+00, ..., 4.82365e+01,
        2.00000e+00, 3.00000e+00],
       ...,
       [1.59894e+01, 1.48990e+01, 7.37556e-01, ..., 1.68217e+03,
        0.00000e+00, 5.00000e+00],
       [1.80060e+01, 1.84855e+01, 6.94903e+00, ..., 7.63486e+01,
        1.00000e+00, 6.00000e+00],
       [1.79219e+01, 1.79493e+01, 3.31077e+00, ..., 9.65315e+01,
        2.00000e+00, 1.00000e+00]])

Let's look at how to transform the light curves.

The following feature extractor takes the light curve, bins it into num_bins bins, and returns the bin means. It works with one band at a time.

In [38]:
def fold_time_series(time_point, period, div_period):
    # Equivalent to time_point % (period / div_period).
    return (time_point -
            (time_point // (period / div_period)) * period / div_period)


def get_bin_means(X_df, num_bins, band):
    feature_array = np.empty((len(X_df), num_bins))

    for k, (_, x) in enumerate(X_df.iterrows()):
        period = x['period']
        div_period = x['div_period']
        real_period = period / div_period
        bins = [i * real_period / num_bins for i in range(num_bins + 1)]

        time_points = np.array(x['time_points_' + band])
        light_points = np.array(x['light_points_' + band])
        time_points_folded = \
            np.array([fold_time_series(time_point, period, div_period)
                      for time_point in time_points])
        time_points_folded_digitized = \
            np.digitize(time_points_folded, bins) - 1

        for i in range(num_bins):
            this_light_points = light_points[time_points_folded_digitized == i]
            if len(this_light_points) > 0:
                feature_array[k, i] = np.mean(this_light_points)
            else:
                feature_array[k, i] = np.nan  # missing

    return feature_array


get_bin_means(X_df.iloc[:2], 5, 'r')
Out[38]:
array([[18.69068966, 18.86333333, 19.01230769, 18.92055556, 18.73166667],
       [15.60714286, 15.54458333, 15.23852941, 15.15911765, 15.32315789]])

For this we will use a function transformer that is applied to both the red and the blue curves.

In [39]:
from sklearn.preprocessing import FunctionTransformer

transformer_r = FunctionTransformer(
    lambda X_df: get_bin_means(X_df, 5, 'r')
)

transformer_b = FunctionTransformer(
    lambda X_df: get_bin_means(X_df, 5, 'b')
)

transformer = make_column_transformer(
    (transformer_r, ['period', 'div_period', 'time_points_r', 'light_points_r']),
    (transformer_b, ['period', 'div_period', 'time_points_b', 'light_points_b']),
)

X_array = transformer.fit_transform(X_df)
X_array.shape
Out[39]:
(2912, 10)
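
The binned features contain NaNs wherever a bin received no measurement, so they must be imputed before most classifiers. A quick check, using the same SimpleImputer as the full pipeline below:

In [ ]:
from sklearn.impute import SimpleImputer

print(np.isnan(X_array).sum())  # number of missing bin means
X_imputed = SimpleImputer(strategy='most_frequent').fit_transform(X_array)
print(np.isnan(X_imputed).sum())  # 0 after imputation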

Combined with some static features and plugged into a random forest, it reads:

In [40]:
import numpy as np

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


def fold_time_series(time_point, period, div_period):
    # Equivalent to time_point % (period / div_period).
    return (time_point -
            (time_point // (period / div_period)) * period / div_period)


def get_bin_means(X_df, num_bins, band):
    feature_array = np.empty((len(X_df), num_bins))

    for k, (_, x) in enumerate(X_df.iterrows()):
        period = x['period']
        div_period = x['div_period']
        real_period = period / div_period
        bins = [i * real_period / num_bins for i in range(num_bins + 1)]

        time_points = np.array(x['time_points_' + band])
        light_points = np.array(x['light_points_' + band])
        time_points_folded = \
            np.array([fold_time_series(time_point, period, div_period)
                      for time_point in time_points])
        time_points_folded_digitized = \
            np.digitize(time_points_folded, bins) - 1

        for i in range(num_bins):
            this_light_points = light_points[time_points_folded_digitized == i]
            if len(this_light_points) > 0:
                feature_array[k, i] = np.mean(this_light_points)
            else:
                feature_array[k, i] = np.nan  # missing

    return feature_array


transformer_r = FunctionTransformer(
    lambda X_df: get_bin_means(X_df, 5, 'r')
)

transformer_b = FunctionTransformer(
    lambda X_df: get_bin_means(X_df, 5, 'b')
)

cols = [
    'magnitude_b',
    'magnitude_r',
    'period',
    'asym_b',
    'asym_r',
    'log_p_not_variable',
    'sigma_flux_b',
    'sigma_flux_r',
    'quality',
    'div_period',
]

common = ['period', 'div_period']
transformer = make_column_transformer(
    (transformer_r, common + ['time_points_r', 'light_points_r']),
    (transformer_b, common + ['time_points_b', 'light_points_b']),
    ('passthrough', cols)
)

pipe = make_pipeline(
    transformer,
    SimpleImputer(strategy='most_frequent'),
    RandomForestClassifier(max_depth=5, n_estimators=10)
)


def get_estimator():
    return pipe

Testing using a scikit-learn pipeline

In [41]:
import problem
from sklearn.model_selection import cross_val_score

X_df, y = problem.get_train_data()

scores = cross_val_score(get_estimator(), X_df, y, cv=2, scoring='accuracy')
print(scores)
[0.89903846 0.89903846]
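
Since the classes are imbalanced (RR-Lyrae dominate while Miras are rare), plain accuracy can be flattering. As a sanity check, you can also look at balanced accuracy (an illustrative alternative, not the official RAMP score):

In [ ]:
scores = cross_val_score(get_estimator(), X_df, y, cv=2, scoring='balanced_accuracy')
print(scores)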

Submission

To submit your code, you can refer to the online documentation.