This challenge consists of predicting the developmental state of bovine embryos observed at 192 hours after fertilisation (HAF). There are 8 different classes (denoted "A" to "H" in this challenge), corresponding to biological states ranging from alive ("A") to dead ("H").
The known labels are the developmental states of the embryos at 192 HAF. The goal of this challenge is to predict the future embryo state at 192 HAF using only information available between 27 and 94 HAF (at the latest). The dataset consists of 277 videos from INRAE's database. Each video is a timelapse of an embryo's development made of 300 frames taken every fifteen minutes. We want to predict the 192 HAF label at 10 different times, namely 27, 32, 37, 40, 44, 48, 53, 58, 63 and 94 hours after fertilisation.
The biological labels at 192 HAF are expert-based and rely both on the final state of the embryo (i.e. the final frame) and on the whole video dynamics. Therefore, it might be important to take the succession of frames into account to reach good classification performance.
Note about the prediction times: Due to experimental conditions, the first frame is recorded at a time $t_0$ that may vary between videos. The timestamp of each frame in each video is available and accessible. The model will predict at specific times (and not at frame numbers).
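To make this concrete, here is a minimal, hypothetical helper (not part of the challenge code) that maps a nominal prediction time to the closest recorded timestamp of a video; the per-frame timestamps are exposed through the VideoReader class described further below.
import numpy as np

def closest_recorded_time(frame_times, target_time):
    """Return the recorded timestamp (in HAF) closest to target_time.
    Illustrative only: since t_0 varies between videos, a nominal
    prediction time may fall between two recorded frames."""
    frame_times = np.asarray(frame_times)
    return float(frame_times[np.argmin(np.abs(frame_times - target_time))])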
Note about the dissemination of the data: any communication based on this challenge must be authorized by the owners of the database before publication.
import numpy as np
NOTE: Due to the structure of the challenge, libraries not included in
requirements.txt
will need to be added via a pull request to the GitHub repo.
Install the ramp-workflow
package from PyPI using the following command in your dedicated Python environment.
pip install ramp-workflow
This file contains the definition of the data challenge according to the RAMP framework. In addition, it contains some useful methods and classes.
from problem import get_train_data, get_test_data, WeightedClassificationError
The public data are stored in a public OSF repository; you only need to run the following script, which creates the data/
directory locally.
Note that, in order to register for the RAMP Studio event, participants must fill in the following form, which contains an agreement on the use of the data.
python download_data.py
!python download_data.py
data directory is not empty. Please empty it or select another destination for LOCAL_DATA if you wish to proceed
First, we load the data using the utility function designed for the challenge in problem.py.
videos_train, labels_train = get_train_data()
labels_train
is simply the set of labels for the training set, stored as a numpy.ndarray
print(labels_train[:10])
print(labels_train.shape)
print('Number of videos in the training set: {}'.format(labels_train.size))
['H' 'H' 'A' 'F' 'H' 'A' 'A' 'F' 'F' 'A']
(177,)
Number of videos in the training set: 177
On the other hand, videos_train is a list of objects from the class VideoReader. This class is fully described below.
print(type(videos_train)) # List
print(type(videos_train[0])) # VideoReader object (see below)
<class 'list'> <class 'problem.VideoReader'>
A video is a timelapse of the embryo's development, with one image every 15 minutes. One may plot a video frame at a specific time (expressed in hours) via the read_frame
helper method (described below).
import matplotlib.pyplot as plt
# Here, we use the read_frame method to extract a specific time
# See below for the description of this method
plt.imshow(videos_train[20].read_frame(frame_time=24), cmap='gray')
plt.axis('off')
plt.show()
Additionally, one can plot the video's dynamics between two given times with the plot_sequence
method. However, in order to better visualize the dynamics, we recommend using dedicated video software such as VLC.
videos_train[20].plot_sequence(27., 31.25)
One may also inspect the class distribution among the 8 possible categories
labs, counts = np.unique(labels_train, return_counts=True)
plt.bar(labs, counts)
plt.title("Barplot of class distribution in the train dataset")
plt.show()
Notice that the classes are imbalanced in this dataset, with few examples per class. Although the starting kit does not use them, data augmentation as well as oversampling might be advisable, as sketched below.
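As an illustration, here is a minimal oversampling sketch (hypothetical, not used by the starting kit): each class is resampled with replacement up to the size of the majority class. Class weights or augmentation of the frames themselves may work better in practice.
import numpy as np

def oversample_indices(labels, seed=0):
    """Return video indices that oversample each class up to the majority count."""
    rng = np.random.default_rng(seed)
    labels = np.asarray(labels)
    classes, counts = np.unique(labels, return_counts=True)
    target = counts.max()
    # resample (with replacement) the indices of each class up to `target`
    idx = [rng.choice(np.flatnonzero(labels == c), size=target, replace=True)
           for c in classes]
    return np.concatenate(idx)

balanced_idx = oversample_indices(labels_train)
balanced_labels = labels_train[balanced_idx]
# the matching videos can be picked with [videos_train[i] for i in balanced_idx]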
The VideoReader
class is meant for manipulating the videos. It is basically a wrapper around some features of the OpenCV library (cv2), and its main aim is to ease the manipulation of videos for the participants. It is implemented in the problem.py
file, but we copy its definition here for the sake of completeness.
# You can import the VideoReader class from problem.py via
from problem import VideoReader
import cv2
class VideoReader:
def __init__(self, video_filename, frame_times, img_size=[250, 250]):
self.video = cv2.VideoCapture(video_filename)
self.nb_frames = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
self.img_size = img_size
self.frame_times = frame_times
def read_frame(self, frame_time):
"""Return the frame of a VideoReader object at the specified `frame_time`
Args:
frame_time (float): the specified time in hours (allowing quarter hours, e.g. 25.75 or 26.50)
Raises:
ValueError: If the specified time does not exist for the selected video
Returns:
np.ndarray: A 2-D array containing the grayscale image.
"""
if frame_time not in self.frame_times:
            raise ValueError('The specified frame time must be within the time '
                             'interval of the video.')
frame_nb = np.where(self.frame_times == frame_time)[0][0]
if frame_nb is not None:
self.video.set(cv2.CAP_PROP_POS_FRAMES, frame_nb)
_, frame = self.video.read()
# always reset video's frame counter to 0 to avoid unexpected behavior
self.video.set(cv2.CAP_PROP_POS_FRAMES, 0)
return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
def read_sequence(self, begin_time=None, end_time=None):
"""Extract the sequence of consecutive frames from begin_time to end_time (included)
Args:
begin_time (float, optional): The time where the extraction begins. Defaults to None.
end_time (float, optional): The time where the extraction ends. Defaults to None.
Returns:
np.ndarray: A 3-D numpy array with first axis corresponding to the frame index and the remaining dimension to image size.
"""
if begin_time is None:
begin_time = self.frame_times[0]
elif begin_time not in self.frame_times:
            raise ValueError('The specified begin_time must be within the time'
                             ' interval of the video.')
if end_time is None:
end_time = self.frame_times[-1]
elif end_time not in self.frame_times:
            raise ValueError('The specified end_time must be within the time '
                             'interval of the video.')
if begin_time > end_time:
raise ValueError("begin_time must be smaller than end_time.")
begin_nb = np.where(self.frame_times == begin_time)[0][0]
end_nb = np.where(self.frame_times == end_time)[0][0]
self.video.set(cv2.CAP_PROP_POS_FRAMES, begin_nb)
my_frames = list(range(begin_nb, end_nb + 1))
video_array = np.empty(
shape=(len(my_frames), self.img_size[0], self.img_size[1])
)
for t, _ in enumerate(my_frames):
_, frame = self.video.read()
video_array[t, :, :] = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
# always reset video's frame counter to 0 to avoid unexpected behavior
self.video.set(cv2.CAP_PROP_POS_FRAMES, 0)
return video_array
def read_samples(self, selected_times=None):
"""Read several frames of the video at once corresponding to the selected times.
Args:
            selected_times (list, optional): The list of desired extraction times, in hours (allowing quarter hours). Defaults to None, in which case all 300 frames are returned.
Returns:
            np.ndarray: A 3-D numpy array of shape (len(selected_times), 250, 250).
"""
if selected_times is None:
selected_times = self.frame_times
res = np.empty([len(selected_times), self.img_size[0], self.img_size[1]])
frame_nbs = np.where([t in selected_times for t in self.frame_times])[0]
for i, f in enumerate(frame_nbs):
self.video.set(cv2.CAP_PROP_POS_FRAMES, f)
_, frame = self.video.read()
res[i, :, :] = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
self.video.set(cv2.CAP_PROP_POS_FRAMES, 0)
return res
def plot_sequence(self, begin_time=None, end_time=None, fig_width=20.0):
"""Plots the result of read_sequence.
Args:
begin_time (float, optional): The time where the extraction begins.
Defaults to None.
end_time (float, optional): The time where the extraction ends.
Defaults to None.
fig_width (float, optional): The total figure width, height is
adapted automatically.
Returns:
None (but displays the matplotlib figure).
"""
vid_arr = self.read_sequence(begin_time, end_time)
n_vids = vid_arr.shape[0]
# Create subplots of 10 columns
num_cols = 10
num_rows = int(n_vids // num_cols)
num_rows = num_rows + 1 if n_vids % num_cols != 0 else num_rows
fig_height = fig_width * num_rows / num_cols
fig = plt.figure(figsize=(fig_width, fig_height))
for i in range(1, num_rows * num_cols + 1):
if i - 1 >= n_vids:
break
img = vid_arr[i - 1]
fig.add_subplot(num_rows, num_cols, i)
plt.imshow(img, cmap="gray")
plt.axis("off")
plt.show()
The VideoReader class has 4 attributes, illustrated in the next cell:
video: the video object, which is a cv2.VideoCapture (see the OpenCV documentation).
nb_frames: the number of frames. This should be 300 for each video.
frame_times: a 1-D numpy.ndarray containing the times at which the frames were recorded. These times are expressed in hours since fertilisation. Consecutive frames are separated by 15 minutes (0.25 hours). However, due to different experimental conditions, videos might start at different times!
img_size: the dimensions of the square images composing the video. This should be normalized to $250 \times 250$.
example_video = videos_train[0]
print("A video is stored as a " + str(type(example_video.video)) + " Python object.")
print("It has " + str(example_video.nb_frames) + " frames of size " + str(example_video.img_size) + ".")
print("The frame are recorded at the following times (expressed as time in hour since the fecondation)")
print(example_video.frame_times[0:30])
A video is stored as a <class 'cv2.VideoCapture'> Python object.
It has 300 frames of size [250, 250].
The frames are recorded at the following times (expressed in hours since fertilisation)
[23.75 24. 24.25 24.5 24.75 25. 25.25 25.5 25.75 26. 26.25 26.5 26.75 27. 27.25 27.5 27.75 28. 28.25 28.5 28.75 29. 29.25 29.5 29.75 30. 30.25 30.5 30.75 31. ]
The VideoReader class has 4 methods, illustrated in the next cells:
read_frame: returns the frame of the video at a specified time. The user must provide the frame_time argument, which must be a time present in the frame_times attribute. This method returns a 2-D numpy.ndarray.
read_sequence: returns a sequence of consecutive frames. The user may provide the begin_time argument (the frame time at which the sequence begins) and the end_time argument (the frame time at which the sequence ends). By default, these two arguments are set to None, which results in starting at the first time and ending at the last time. The method returns a 3-D ($n \times 250 \times 250$) numpy.ndarray, where $n$ is the number of consecutive times between begin_time and end_time (included).
read_samples: transforms the video into a 3-D ($300 \times 250 \times 250$) numpy.ndarray. Optionally, a selected_times argument (provided as a list) may be specified to only select the frames corresponding to these times.
plot_sequence: plots the output of read_sequence, i.e. the consecutive frames, to display the video dynamics. However, videos are best visualized in dedicated software such as VLC. Again, the user may provide the begin_time argument (the frame time at which the sequence begins) and the end_time argument (the frame time at which the sequence ends).
selected_times = example_video.frame_times[[0, 200, 299]]
print("Plot the videos frames corresponding to times: " + str(selected_times) + " hours.")
for frame_time in selected_times:
plt.imshow(example_video.read_frame(frame_time), cmap="gray")
plt.axis('off')
plt.show()
Plot the video frames corresponding to times: [23.75 73.75 98.5 ] hours.
video_array = example_video.read_sequence()
video_array.shape
(300, 250, 250)
video_array = example_video.read_sequence(end_time = 29)
print(video_array.shape)
video_array = example_video.read_sequence(begin_time = 28.75, end_time=29)
print(video_array.shape)
(22, 250, 250)
(2, 250, 250)
video_array = example_video.read_samples(selected_times=[23.75, 62.25, 96])
video_array.shape
(3, 250, 250)
Caution: the VideoReader class does not load all the video frames into memory, which is useful in order not to overload your local machine or the RAMP servers. However, the read_samples or read_sequence methods may quickly lead to memory issues when trying to load every video at once. Your code should take this issue into account and process videos sequentially or, at least, batch-wise, as sketched below.
# DO NOT TRY TO RUN (this would exhaust memory)
# data = []
# for video in videos_train:
#     data.append(video.read_samples())
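For instance, a minimal batch-wise sketch (a hypothetical helper, assuming the requested selected_times exist in every video's frame_times) could look like this:
import numpy as np

def iter_video_batches(videos, selected_times, batch_size=8):
    """Yield stacked frame arrays for small batches of videos, so that only
    batch_size videos are held in memory at any time."""
    for start in range(0, len(videos), batch_size):
        batch = videos[start:start + batch_size]
        yield np.stack([v.read_samples(selected_times=selected_times)
                        for v in batch])

# Example (commented out): process the training videos 8 at a time
# for batch in iter_video_batches(videos_train, selected_times=[27.0, 32.0]):
#     ...  # extract features, train incrementally, etc.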
For the moment, we only consider the overall classification accuracy of your classifiers.
Locally, the RAMP platform uses a 3-fold cross-validation scheme, implemented for you in the get_cv function (a usage sketch is given below). Moreover, the classifier's performance is evaluated on a separate test set, which can be loaded just like the training data.
videos_test, labels_test = get_test_data()
print('Number of subjects in the test set: {}'.format(labels_test.size))
Number of subjects in the test set: 100
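If you want to reproduce the cross-validation split locally, here is a minimal usage sketch of get_cv, assuming it follows the usual RAMP convention of yielding (train indices, validation indices) pairs over the training set; check its definition in problem.py before relying on it.
from problem import get_cv

for fold, (train_idx, valid_idx) in enumerate(get_cv(videos_train, labels_train)):
    # videos_train is a plain list, so it must be indexed explicitly
    fold_videos = [videos_train[i] for i in train_idx]
    fold_labels = labels_train[train_idx]
    print("Fold {}: {} train videos, {} validation videos".format(
        fold, len(train_idx), len(valid_idx)))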
Remember that the challenge's objective is to design classifier(s) for different prediction times between 1 and 4 days (24 to 96 hours). Your submission should account for this constraint with a pred_time
argument for the model's fit and predict methods, ensuring that only frames up to pred_time
are used for model training and testing.
The predefined and fixed prediction times are
pred_times = [27, 32, 37, 40, 44, 48, 53, 58, 63, 94]
where a pred_time of None
defaults to using all 300 frames of each video.
Important: You are free to opt for one model per prediction time, or a single model for all prediction times. However, note that with the second option your model will still be fully retrained at each prediction time, and that it must handle videos with a varying number of frames. A sketch of restricting a video to the frames available at a given pred_time is given below.
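A minimal, hypothetical helper for this (assuming, as above, that the timestamps in frame_times are expressed in HAF); the platform passes videos already cut at pred_time, so this is mainly a guard for local experiments:
def frames_up_to(video, pred_time):
    """Return the frames of `video` recorded no later than pred_time,
    as a 3-D array of shape (n_frames, 250, 250)."""
    times = [t for t in video.frame_times if t <= pred_time]
    return video.read_samples(selected_times=times)

# Example (commented out): frames available to a model predicting at 32 HAF
# early_frames = frames_up_to(videos_train[0], pred_time=32)
# early_frames.shape  # (n, 250, 250), where n depends on the video's t_0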
A submission (usually stored in ./submissions/<submission_foldername>/
) must contain one file named videoclassifier.py
.
This Python script must itself implement at least a VideoClassifier class with:
a fit(videos, y, pred_time) method;
a predict(videos, pred_time) method.
The arguments must be understood as follows:
videos is a list of VideoReader objects, with the videos cut at pred_time;
y is a 1-D numpy array containing the associated training labels;
pred_time is the time of development for which the prediction shall be made.
We illustrate this below with a simple example.
This classifier does not use (nor even load into memory) the videos or the prediction time, and just predicts random labels. Still, it is a valid (albeit useless) submission with regard to the RAMP workflow.
It is implemented in the ./submissions/starting_kit/
folder and we copy its code here for illustration.
import numpy as np
class VideoClassifier(object):
def __init__(self):
pass
def fit(self, videos: list, y, pred_time: float):
classes = ["A", "B", "C", "D", "E", "F", "G", "H"]
self.n_classes = len(classes)
pass
def predict(self, videos: list, pred_time: float):
proba = np.random.rand(len(videos), self.n_classes)
proba /= proba.sum(axis=1)[:, np.newaxis]
return proba
my_pred_time = 94
my_model = VideoClassifier()
my_model.fit(videos_train, labels_train, pred_time=my_pred_time)
y_pred = my_model.predict(videos_test, pred_time=my_pred_time)
For a given prediction time (in HAF), your prediction score will be a classification error: the LOWER, the better. Moreover, not all mistakes cost the same for this problem, as some classes are more "distinct" than others. Thus, we use a weight matrix $W$, whose weights were determined by an expert; the matrix $W$ is given below.
The formula of the weighted classification error for a hard label $y_i \in \{1, ..., K\}$ and a hard-prediction $\hat{y}_i \in \{ 1, ..., K \}$ is:
$$ WCE(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^n W_{y_i,\hat{y}_i} $$
In practice, your model may output a probability for each label for observation $i$: $\hat{p}_i = (\hat{p}_{ik})_{k=1, \ldots, K}$. The prediction $\hat{y}_i$ is then obtained as $\arg\max_k \hat{p}_{ik}$.
Note: The weight matrix $W$ could be used to implement your own objective function; an illustrative computation of the metric is sketched after the definition of $W$ below.
W = np.array(
[
[0, 1, 6, 10, 10, 10, 10, 10],
[1, 0, 3, 10, 10, 10, 10, 10],
[6, 3, 0, 2, 9, 10, 10, 10],
[10, 10, 2, 0, 9, 9, 10, 10],
[10, 10, 9, 9, 0, 8, 8, 8],
[10, 10, 10, 9, 8, 0, 9, 8],
[10, 10, 10, 10, 8, 9, 0, 9],
[10, 10, 10, 10, 8, 8, 9, 0],
]
)
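To make the metric concrete, here is an illustrative re-implementation of the weighted error (not the official scorer, which is WeightedClassificationError in problem.py). The example scores reported further below suggest the weights are rescaled to [0, 1], e.g. divided by the maximum weight of 10; treat that rescaling as an assumption.
def weighted_classification_error(y_true_onehot, y_pred_proba, W):
    """Average misclassification weight between true and predicted classes.
    Expects dense (n, K) arrays; the hard prediction is the argmax of the
    predicted probabilities. Dividing by W.max() is an assumption made here
    to keep the score in [0, 1]."""
    y_true_idx = np.asarray(y_true_onehot).argmax(axis=1)
    y_pred_idx = np.asarray(y_pred_proba).argmax(axis=1)
    return W[y_true_idx, y_pred_idx].mean() / W.max()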
labels = ["A", "B", "C", "D", "E", "F", "G", "H"]
fig, ax = plt.subplots()
im = ax.imshow(W, cmap="GnBu")
# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(labels)), labels=labels)
ax.set_yticks(np.arange(len(labels)), labels=labels)
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
for i in range(len(labels)):
for j in range(len(labels)):
text = ax.text(j, i, W[i, j],
ha="center", va="center", color="w")
# space between tiles
ax.set_xticks(np.arange(8+1)-.5, minor=True)
ax.set_yticks(np.arange(8+1)-.5, minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
ax.tick_params(which="minor", bottom=False, left=False)
ax.set_title("Missclassification weight between each class")
fig.tight_layout()
plt.show()
from sklearn.preprocessing import OneHotEncoder
# we need to convert labels (str) to 1-hot encoding (n, 8)
labels_test = labels_test.reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(labels_test)
y_true = enc.transform(labels_test)
# Example of 1-hot encoding for the 10 first test points
print(np.hstack([labels_test[:10], y_true.todense()[:10, :]]))
[['A' 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0] ['A' 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0] ['G' 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0] ['A' 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0] ['F' 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0] ['C' 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0] ['H' 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0] ['C' 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0] ['C' 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0] ['H' 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0]]
wce = WeightedClassificationError(time_idx=my_pred_time)
wce.compute(y_true=y_true, y_pred=y_pred)
0.797
err = np.zeros((len(pred_times),))
all_preds = []
for time_idx, my_pred_time in enumerate(pred_times):
my_model = VideoClassifier()
my_model.fit(videos_train, labels_train, pred_time=my_pred_time)
y_pred = my_model.predict(videos_test, pred_time=my_pred_time)
all_preds.append(y_pred)
wce = WeightedClassificationError(time_idx=time_idx)
err[time_idx] = wce.compute(y_true=y_true, y_pred=y_pred)
print(wce.name, ' at time', my_pred_time, 'is:', err[time_idx])
all_preds = np.concatenate(all_preds, axis=1)
WeightedClassificationError[1] at time 27 is: 0.7219999999999999
WeightedClassificationError[2] at time 32 is: 0.767
WeightedClassificationError[3] at time 37 is: 0.74
WeightedClassificationError[4] at time 40 is: 0.746
WeightedClassificationError[5] at time 44 is: 0.669
WeightedClassificationError[6] at time 48 is: 0.816
WeightedClassificationError[7] at time 53 is: 0.7829999999999999
WeightedClassificationError[8] at time 58 is: 0.6920000000000001
WeightedClassificationError[9] at time 63 is: 0.722
WeightedClassificationError[10] at time 94 is: 0.7490000000000001
The error is computed for a given time $t$ and a given prediction $\hat{p}_i^{(t)}$. A submission will then have $10$ different scores that need to be aggregated in order to rank participants. The chosen method is the Area Under the Curve (AUC) of the loss evolution over time.
plt.figure()
plt.plot(pred_times, err)
plt.ylim(0, 1)
plt.title("Evolution of the error with the prediction time (in HAF)")
Text(0.5, 1.0, 'Evolution of the error with the prediction time (in HAF)')
from problem import AreaUnderCurveError
auc = AreaUnderCurveError(score_func_name="classification", prediction_times=pred_times)
auc.compute(y_true=y_true, y_pred=all_preds)
0.7393955223880597
The prediction times are normalized in the AUC computation, so that the aggregated loss stays between 0 and 1.
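A sketch of this aggregation, assuming the AUC is a trapezoidal integral of the error curve over prediction times rescaled to [0, 1] (check AreaUnderCurveError in problem.py for the authoritative definition):
# Rescale the prediction times to [0, 1] and integrate the error curve
norm_times = (np.asarray(pred_times, dtype=float) - min(pred_times)) / (max(pred_times) - min(pred_times))
auc_manual = np.trapz(err, x=norm_times)
print(auc_manual)  # should reproduce the value above if the assumption holds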
For comparison, here is the performance of dummy classifiers, each always returning a single one of the 8 classes:
n_test = y_true.shape[0]
for k in range(8):
y_k = np.zeros(y_true.shape)
y_k[range(n_test), k] = 1
print(
'Classifying all videos in class', k,
'gives a score of:', wce.compute(y_true=y_true, y_pred=y_k))
Classifying all videos in class 0 gives a score of: 0.78
Classifying all videos in class 1 gives a score of: 0.746
Classifying all videos in class 2 gives a score of: 0.677
Classifying all videos in class 3 gives a score of: 0.768
Classifying all videos in class 4 gives a score of: 0.7070000000000002
Classifying all videos in class 5 gives a score of: 0.831
Classifying all videos in class 6 gives a score of: 0.778
Classifying all videos in class 7 gives a score of: 0.762
y_true.shape
(100, 8)
Before submitting to RAMP, you can test your solution locally to ensure that trivial errors (e.g. typos, path issues, etc.) are resolved. We can test a given submission using the ramp-test
command that was installed in the virtual environment.
We'll use the following command:
!ramp-test --submission <subm_folder> --quick-test
The ! signals that the command is run as a shell command rather than as Python code in the notebook.
ramp-test is the command to be executed; it tells ramp to perform a local test.
--submission <subm_folder> specifies which submission to run. You can have multiple potential submissions in the submissions/ directory; this option prevents ramp from running all of them.
!ramp-test --quick-test --submission starting_kit
Testing Bovine embryos survival prediction Reading train and test files from ./data/ ... Reading cv ... Training submissions/starting_kit ... CV fold 0 score AUC[classification] WeightedClassifErr[1] WeightedClassifErr[2] WeightedClassifErr[3] WeightedClassifErr[4] WeightedClassifErr[5] WeightedClassifErr[6] WeightedClassifErr[7] WeightedClassifErr[8] WeightedClassifErr[9] WeightedClassifErr[10] time train 0.74 0.70 0.75 0.72 0.73 0.80 0.73 0.74 0.73 0.77 0.69 0.001729 valid 0.74 0.72 0.85 0.82 0.77 0.62 0.83 0.87 0.65 0.48 0.97 0.001750 test 0.78 0.78 0.83 0.79 0.70 0.77 0.69 0.78 0.78 0.78 0.80 0.001498 CV fold 1 score AUC[classification] WeightedClassifErr[1] WeightedClassifErr[2] WeightedClassifErr[3] WeightedClassifErr[4] WeightedClassifErr[5] WeightedClassifErr[6] WeightedClassifErr[7] WeightedClassifErr[8] WeightedClassifErr[9] WeightedClassifErr[10] time train 0.75 0.75 0.90 0.74 0.72 0.65 0.76 0.83 0.77 0.65 0.80 0.001336 valid 0.83 0.93 0.97 0.75 0.80 0.87 0.95 1.00 0.97 0.82 0.65 0.002065 test 0.77 0.82 0.77 0.72 0.79 0.73 0.71 0.80 0.78 0.83 0.72 0.001446 ---------------------------- Mean CV scores ---------------------------- score AUC[classification] WeightedClassifErr[1] WeightedClassifErr[2] WeightedClassifErr[3] WeightedClassifErr[4] WeightedClassifErr[5] WeightedClassifErr[6] WeightedClassifErr[7] WeightedClassifErr[8] WeightedClassifErr[9] WeightedClassifErr[10] time train 0.74 ± 0.005 0.72 ± 0.023 0.83 ± 0.075 0.73 ± 0.01 0.72 ± 0.004 0.72 ± 0.073 0.75 ± 0.015 0.78 ± 0.046 0.75 ± 0.019 0.71 ± 0.06 0.75 ± 0.054 0.0 ± 0.0 valid 0.78 ± 0.044 0.82 ± 0.108 0.91 ± 0.058 0.78 ± 0.033 0.78 ± 0.017 0.74 ± 0.125 0.89 ± 0.058 0.93 ± 0.067 0.81 ± 0.158 0.65 ± 0.167 0.81 ± 0.158 0.0 ± 0.0 test 0.77 ± 0.003 0.8 ± 0.02 0.8 ± 0.028 0.75 ± 0.035 0.75 ± 0.047 0.75 ± 0.017 0.7 ± 0.013 0.79 ± 0.008 0.78 ± 0.002 0.8 ± 0.022 0.76 ± 0.038 0.0 ± 0.0 ---------------------------- Bagged scores ---------------------------- score AUC[classification] WeightedClassifErr[1] WeightedClassifErr[2] WeightedClassifErr[3] WeightedClassifErr[4] WeightedClassifErr[5] WeightedClassifErr[6] WeightedClassifErr[7] WeightedClassifErr[8] WeightedClassifErr[9] WeightedClassifErr[10] valid 0.78 0.83 0.91 0.78 0.78 0.74 0.89 0.93 0.81 0.65 0.81 test 0.73 0.78 0.82 0.75 0.68 0.71 0.72 0.75 0.79 0.69 0.73
We can see that the results are not very good, but that is expected: our estimator in starting_kit completely ignores the data and returns a random prediction!
RAMP will automatically perform 3-fold cross-validation and report the WeightedClassificationError for each fold and each prediction time, along with the mean across folds. Bagging of the results has been disabled; that part of the output can be ignored.
You can find more information in the README of the ramp-workflow library.