Classification of patterns in the atmospheres of extrasolar hot Jupiters¶

Martial Mancip (Maison de la Simulation, Saclay), François Caud, Thomas Moreau (DATAIA, Univ. Paris-Saclay)

Introduction¶

In order to interpret recent observations of exoplanets, physics-based models have become increasingly complex, which increases both their computational cost and the size of their outputs. We intend to explore whether AI image recognition can alleviate this burden. DYNAMICO was used to run a series of HD 209458-like simulations with different orbital radii. The simulations fall into two regimes: those with shorter orbital radii exhibit significant global mixing that shapes the dynamics of the entire atmosphere, whereas those with longer orbital radii exhibit negligible mixing except at mid-pressures. We believe that image classification can play an important role in future computational atmospheric studies. However, special care must be paid to the data fed into the model, from the choice of colourmap to training the CNN on features with enough breadth and complexity that it can learn to detect them. With preliminary studies and prior models, this should be achievable for future exascale calculations, allowing for a significant reduction in workloads and computational resources.

Patterns detected and labeled in simulated temperature maps are:

  • day-side hot-spots in which the zonal winds have caused significant horizontal thermal advection (and whose shape is typically referred to as a butterfly in the hot Jupiter community - e.g. Figure b/c) -> butterfly
  • longitudinally homogenised and latitudinally symmetric thermal bands (e.g. Figure d) -> banded
  • day-side hot-spots in which radiative effects dominate over advective dynamics (i.e. an irradiative hot-spot which has not been significantly advected by horizontal winds - e.g. Figure e and Figure a to a lesser extent) -> locked
  • latitudinally asymmetric thermal structures (see, e.g. Figure h) -> asymetric

The simulation data set comprises examples with orbital radii between 0.012 au and 0.334 au.

The goal of this challenge is to find ML models capable of classifying those patterns on images from physics-based simulations.

patterns.png
Examples of patterns in simulations of the atmosphere of an extrasolar hot Jupiter (from [this article](https://arxiv.org/abs/2309.10640); see it for more details)

Requirements¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms, models
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import LeaveOneGroupOut

Data exploration¶

The data consist of simulated grayscale images of temperature maps at the surface of hot Jupiter exoplanets. There are five target labels, corresponding to the four pattern classes plus a negative class (no pattern):

In [2]:
CLASSES = ["asymetric", "banded", "locked", "butterfly", "no_pattern"]

Here is the mapping between integers and categories:

In [3]:
# Mapping int to categories
int_to_cat = {
    0: "asymetric",
    1: "banded",
    2: "locked",
    3: "butterfly",
    4: "no_pattern",
}
In [4]:
list(int_to_cat)
Out[4]:
[0, 1, 2, 3, 4]

Train set¶

The training set has 620 examples:

In [5]:
X_train = np.load("./data/X_train.npy")
X_train.shape
Out[5]:
(620, 90, 180)

Images are 90 × 180 pixels.

In [6]:
y_train_df = pd.read_csv("./data/y_train.csv")
y_train_df
Out[6]:
simulation category cat_num
0 Hot_0036_Locked banded 1
1 Hot_0036_Locked banded 1
2 Hot_0021_match banded 1
3 Hot_0021_Locked banded 1
4 Hot_0012_Locked banded 1
... ... ... ...
615 Cool_0334_Locked no_pattern 4
616 Hot_0021_Locked no_pattern 4
617 Hot_0036_Locked no_pattern 4
618 Hot_0012_Locked banded 1
619 Hot_0021_Locked no_pattern 4

620 rows × 3 columns

Columns in y_train_df:

  • 'simulation': the simulation the example comes from; simulations differ in physical parameters (orbital radius, ...)
  • 'category': the target class, one of the 5 categories listed above
  • 'cat_num': the class encoded as an integer (0 to 4)

Let's plot some of the data with their label:

In [7]:
plt.imshow(X_train[200], cmap='gray');
print(f"class: {y_train_df.category.iloc[200]}")
class: asymetric
In [8]:
plt.imshow(X_train[617], cmap='gray');
print(f"class: {y_train_df.category.iloc[617]}")
class: no_pattern
In [9]:
plt.imshow(X_train[72], cmap='gray');
print(f"class: {y_train_df.category.iloc[72]}")
class: banded
In [10]:
plt.imshow(X_train[349], cmap='gray');
print(f"class: {y_train_df.category.iloc[349]}")
class: butterfly
In [11]:
plt.imshow(X_train[222], cmap='gray');
print(f"class: {y_train_df.category.iloc[222]}")
class: locked

Distribution of classes in the train set:¶

In [12]:
class_counts = y_train_df['category'].value_counts()

plt.figure(figsize=(10, 6))
class_counts.plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Number of Examples')
plt.xticks(rotation=0)
plt.show()

The classes are imbalanced, with no_pattern dominating.
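
One way to mitigate this imbalance (not used in the baseline below; just a hedged sketch using the labels loaded above) is to weight the loss by inverse class frequency when training a network:

import torch
import torch.nn as nn

# Per-class weights inversely proportional to class frequency,
# computed from the training labels (y_train_df.cat_num).
counts = y_train_df["cat_num"].value_counts().sort_index()
class_weights = torch.tensor(
    (counts.sum() / (len(counts) * counts)).to_numpy(), dtype=torch.float32
)

# This weighted criterion could replace the plain nn.CrossEntropyLoss()
# used further down (move the weights to the same device as the model).
weighted_criterion = nn.CrossEntropyLoss(weight=class_weights)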

Pixel intensity distribution¶

Pixel intensity across all images in the train set:

In [13]:
all_pixels = X_train.flatten()

plt.figure(figsize=(10, 6))
plt.hist(all_pixels, bins=50, color='gray')
plt.title('Pixel Intensity Distribution Across All Images')
plt.xlabel('Pixel Intensity')
plt.ylabel('Frequency')
plt.show()

Pixel intensity distribution across images of each class:

In [14]:
unique_categories = y_train_df['category'].unique()
num_categories = len(unique_categories)
fig, axes = plt.subplots(num_categories, 1, figsize=(8, 6 * num_categories))

for i, category in enumerate(unique_categories):
    # Get the indices for the current category
    category_indices = y_train_df[y_train_df['category'] == category].index
    # Select the images belonging to the current category
    category_images = X_train[category_indices]
    # Flatten the images to a single array of pixel values
    category_pixels = category_images.flatten()

    axes[i].hist(category_pixels, bins=50, color='gray')
    axes[i].set_title(f'Pixel Intensity Distribution for Category: {category}')
    axes[i].set_xlabel('Pixel Intensity')
    axes[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

Plot of the average image for each class:¶

In [15]:
unique_categories = y_train_df['category'].unique()
num_categories = len(unique_categories)
fig, axes = plt.subplots(1, num_categories, figsize=(5 * num_categories, 5))

for i, category in enumerate(unique_categories):
    # Get the indices for the current category
    category_indices = y_train_df[y_train_df['category'] == category].index
    # Select the images belonging to the current category
    category_images = X_train[category_indices]
    # Calculate the average image
    average_image = np.mean(category_images, axis=0)

    ax = axes[i]
    ax.imshow(average_image, cmap='gray')
    ax.set_title(f'Average Image for Category: {category}')
    ax.axis('off')

plt.tight_layout()
plt.show()

Test set¶

The test set has 68 examples:

In [16]:
X_test = np.load("./data/X_test.npy")
X_test.shape
Out[16]:
(68, 90, 180)
In [17]:
y_test_df = pd.read_csv("./data/y_test.csv")
y_test_df
Out[17]:
simulation category cat_num
0 Hot_0036_match banded 1
1 Cool_0192_Locked asymetric 0
2 Cool_0192_Locked asymetric 0
3 Cool_0192_Locked no_pattern 4
4 Cool_0192_Locked no_pattern 4
... ... ... ...
63 Cool_0192_Locked no_pattern 4
64 Hot_0036_match banded 1
65 Hot_0036_match no_pattern 4
66 Cool_0192_Locked no_pattern 4
67 Cool_0192_Locked no_pattern 4

68 rows × 3 columns

Distribution of classes in test set:¶

In [18]:
class_counts = y_test_df['category'].value_counts()

plt.figure(figsize=(10, 6))
class_counts.plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Number of Examples')
plt.xticks(rotation=0)
plt.show()

Base model¶

We will use a pretrained CNN (MobileNetV2 trained on the ImageNet dataset). The model is loaded with all its weights frozen, and only the last fully-connected layer is replaced and trained.

In [19]:
def create_modified_mobilenet(num_classes):
    # Load a pretrained MobileNet model
    model = models.mobilenet_v2(weights='IMAGENET1K_V1')

    # Freeze all layers in the network
    for param in model.parameters():
        param.requires_grad = False

    # Replace the last fully connected layer
    # Parameters of newly constructed modules have requires_grad=True by default
    num_features = model.classifier[1].in_features
    model.classifier[1] = nn.Linear(num_features, num_classes)

    return model
In [20]:
# labels
y_train = y_train_df.cat_num.values
print(y_train.shape)
y_test = y_test_df.cat_num.values
print(y_test.shape)
(620,)
(68,)
In [21]:
# Reshape data to add channel dimension
X_train = X_train.reshape(-1, 1, 90, 180)
X_test = X_test.reshape(-1, 1, 90, 180)
print(X_train.shape)
print(X_test.shape)
(620, 1, 90, 180)
(68, 1, 90, 180)
In [22]:
# Conversion from numpy arrays to Pytorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.int64)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.int64)

We need to convert the images from grayscale to RGB format (1 to 3 channels) to match MobileNet's expected input. We simply repeat the grayscale channel three times.

In [23]:
class GrayscaleToRgb:
    def __call__(self, tensor):
        # tensor is a 1-channel grayscale image
        return tensor.repeat(3, 1, 1)

Furthermore, images have to be resized to (224, 224) and normalized:

In [24]:
transform = transforms.Compose([
    GrayscaleToRgb(),
    transforms.Resize((224, 224), antialias=True),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
In [25]:
X_train_transformed = torch.stack([transform(x) for x in X_train_tensor])
X_train_transformed.shape
Out[25]:
torch.Size([620, 3, 224, 224])

Here is a function to visualize transformed images:

In [26]:
def custom_imshow(img):
    # Unnormalize the image
    img = img.numpy().transpose((1, 2, 0))  # Convert from Tensor image
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = std * img + mean
    img = np.clip(img, 0, 1)  # Clip to ensure it's between 0 and 1

    plt.imshow(img)
    plt.axis('off')
    plt.show()
In [27]:
custom_imshow(X_train_transformed[0])
In [28]:
custom_imshow(X_train_transformed[349])

On image 349 for example, we recognize the 'butterfly' pattern.

In [29]:
# Transform on test images:
X_test_transformed = torch.stack([transform(x) for x in X_test_tensor])
X_test_transformed.shape
Out[29]:
torch.Size([68, 3, 224, 224])
In [30]:
# Labels:
print(y_train_tensor.shape)
print(y_test_tensor.shape)
torch.Size([620])
torch.Size([68])

We create DataLoader objects in order to train and predict on batches of images:

In [31]:
train_dataset = TensorDataset(X_train_transformed, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
In [32]:
test_dataset = TensorDataset(X_test_transformed, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32)
In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = create_modified_mobilenet(num_classes=5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
In [34]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
Epoch [1/10], Loss: 0.6839
Epoch [2/10], Loss: 0.7206
Epoch [3/10], Loss: 0.6356
Epoch [4/10], Loss: 0.4827
Epoch [5/10], Loss: 0.4240
Epoch [6/10], Loss: 0.3566
Epoch [7/10], Loss: 0.4369
Epoch [8/10], Loss: 0.4515
Epoch [9/10], Loss: 0.4223
Epoch [10/10], Loss: 0.8125
In [35]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on Test Data: {accuracy:.3f}')
Accuracy on Test Data: 0.735

Let's predict on the whole test dataset with the fitted model (we can do this in a single forward pass because there are only 68 examples in the test set).

In [36]:
X_test_to_predict = X_test_transformed.to(device)

# Model in evaluation mode
model.eval()

with torch.no_grad():
    # Forward pass
    outputs = model(X_test_to_predict)
    _, y_pred = torch.max(outputs, 1)

# Convert predictions to a numpy array
y_pred = y_pred.cpu().numpy()

Accuracy score:

In [37]:
accuracy_score(y_test_tensor, y_pred)
Out[37]:
0.7352941176470589

Balanced accuracy score:

In [38]:
balanced_accuracy_score(y_test_tensor, y_pred)
Out[38]:
0.5771428571428572
In [39]:
print(classification_report(y_test_tensor, y_pred))
              precision    recall  f1-score   support

           0       1.00      0.10      0.18        10
           1       1.00      0.83      0.91         6
           2       0.67      1.00      0.80         4
           3       0.00      0.00      0.00         6
           4       0.71      0.95      0.82        42

    accuracy                           0.74        68
   macro avg       0.68      0.58      0.54        68
weighted avg       0.72      0.74      0.66        68

/home/frcaud/anaconda3/envs/ramp-hotjupiter/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

Confusion matrix:

In [40]:
disp = ConfusionMatrixDisplay.from_predictions(y_test_tensor, y_pred)
fig = disp.figure_
fig.set_figwidth(7)
fig.set_figheight(7)  

The model has difficulty predicting the 'asymetric' and 'butterfly' classes and instead tags them as 'no_pattern'. The 'banded' and 'locked' classes seem to be well predicted (on this small test dataset).

To evaluate the model more reliably, RAMP uses cross-validation. Because the data are grouped by simulation and each simulation contributes a large chunk of examples, we designed a CV strategy based on meta-groups: each meta-group gathers several simulations (group splitting) while keeping a reasonable spread of examples from all classes (stratified splitting). We will use a LeaveOneGroupOut strategy on those meta-groups.

Cross-validation evaluation:¶

In [41]:
y_train_df
Out[41]:
simulation category cat_num
0 Hot_0036_Locked banded 1
1 Hot_0036_Locked banded 1
2 Hot_0021_match banded 1
3 Hot_0021_Locked banded 1
4 Hot_0012_Locked banded 1
... ... ... ...
615 Cool_0334_Locked no_pattern 4
616 Hot_0021_Locked no_pattern 4
617 Hot_0036_Locked no_pattern 4
618 Hot_0012_Locked banded 1
619 Hot_0021_Locked no_pattern 4

620 rows × 3 columns

Groups for cross-val:¶

In [42]:
# Create meta groups as a new column in y_train_df
# Simulations to groups dictionary
sim_to_group = {
    "Cool_0060_Locked": 0,
    "Cool_0110_match": 0,
    "Hot_0012_match": 0,
    "Hot_0036_Locked": 0,
    "Cool_0334_Locked": 1,
    "Hot_0012_Locked": 1,
    "Cool_0060_match": 1,
    "Cool_0110_Locked": 2,
    "Cool_0192_match": 2,
    "Cool_0334_match": 2,
    "Hot_0021_Locked": 2,
    "Hot_0021_match": 2,
}
# Function to apply to each row in the 'simulation' column
def get_group(simulation):
    return sim_to_group.get(simulation, None)
# Create a new column 'group' based on the 'simulation' column
y_train_df["group"] = y_train_df["simulation"].apply(get_group)
# Global variable groups
global groups
groups = y_train_df.group.to_numpy()
In [43]:
print(groups.shape)
groups
(620,)
Out[43]:
array([0, 0, 2, 2, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 2, 2,
       2, 2, 0, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 0, 2, 1, 0, 0, 0, 0, 0,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1,
       0, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 0, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 1, 1, 1, 1, 1, 0, 1, 2, 2, 2, 1, 1,
       2, 0, 1, 0, 0, 2, 0, 1, 1, 2, 2, 0, 0, 1, 1, 1, 1, 0, 2, 2, 1, 1,
       2, 1, 0, 2, 1, 0, 2, 2, 0, 0, 2, 1, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 2, 2, 2, 1, 0, 1,
       1, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2,
       0, 1, 0, 2, 2, 0, 1, 0, 1, 1, 1, 1, 0, 2, 2, 2, 1, 2, 0, 1, 2, 2,
       2, 0, 2, 0, 2, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 1, 1, 1,
       1, 0, 2, 1, 1, 1, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 1, 2, 1, 1,
       2, 0, 1, 1, 1, 1, 0, 0, 2, 0, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 0, 2,
       2, 1, 1, 1, 0, 2, 1, 0, 0, 1, 0, 0, 1, 1, 1, 2, 2, 2, 1, 1, 0, 2,
       2, 2, 0, 0, 2, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 0, 0, 1, 2, 0, 2, 2,
       0, 2, 0, 0, 2, 1, 2, 0, 1, 1, 0, 1, 2, 2, 1, 0, 1, 0, 2, 1, 2, 1,
       0, 2, 0, 2, 2, 1, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 1, 1, 0, 2, 0, 2,
       0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 0, 2, 1, 2, 1, 0, 0, 2, 2, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 1, 0, 1, 1, 0, 0, 2, 0, 0, 2, 0, 2, 1, 2, 0, 0, 0, 2, 1, 0,
       0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 1, 1, 0, 0, 1, 0, 1, 2,
       0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 0, 2, 2, 1, 0, 2, 0, 0, 1, 1, 1,
       2, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 2, 0, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 0, 0, 1,
       2, 0, 1, 2])
In [44]:
y_train_df
Out[44]:
simulation category cat_num group
0 Hot_0036_Locked banded 1 0
1 Hot_0036_Locked banded 1 0
2 Hot_0021_match banded 1 2
3 Hot_0021_Locked banded 1 2
4 Hot_0012_Locked banded 1 1
... ... ... ... ...
615 Cool_0334_Locked no_pattern 4 1
616 Hot_0021_Locked no_pattern 4 2
617 Hot_0036_Locked no_pattern 4 0
618 Hot_0012_Locked banded 1 1
619 Hot_0021_Locked no_pattern 4 2

620 rows × 4 columns
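
As a quick sanity check (a small sketch using y_train_df and groups defined above), we can verify that each simulation is assigned to exactly one meta-group, so LeaveOneGroupOut will never split a simulation between the training and validation folds:

# Each simulation should map to a single meta-group (no simulation leaks across folds)
sims_per_group = y_train_df.groupby("simulation")["group"].nunique()
assert (sims_per_group == 1).all(), "a simulation is split across meta-groups"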

In [45]:
# Class count by group:
group_class_counts = y_train_df.groupby('group')['cat_num'].value_counts().unstack(fill_value=0)
group_class_counts
Out[45]:
cat_num 0 1 2 3 4
group
0 41 28 16 9 109
1 31 29 18 36 73
2 23 26 22 9 150

Here we see that the class 3 ('butterfly') examples could not be spread more evenly, because 36 of them come from a single simulation:

In [46]:
y_train_df.groupby('simulation')['cat_num'].value_counts().unstack(fill_value=0)
Out[46]:
cat_num 0 1 2 3 4
simulation
Cool_0060_Locked 41 2 1 0 27
Cool_0060_match 0 1 8 0 14
Cool_0110_Locked 8 1 17 0 58
Cool_0110_match 0 0 5 0 18
Cool_0192_match 5 0 1 0 16
Cool_0334_Locked 31 0 9 0 39
Cool_0334_match 7 0 2 0 12
Hot_0012_Locked 0 28 1 36 20
Hot_0012_match 0 4 1 9 9
Hot_0021_Locked 3 18 1 0 58
Hot_0021_match 0 7 1 9 6
Hot_0036_Locked 0 22 9 0 55
In [47]:
# labels
y = y_train
print(y.shape)
(620,)
In [48]:
# Data
X = X_train
print(X.shape)
(620, 1, 90, 180)
In [49]:
# Initialize cross-validation
logo = LeaveOneGroupOut()
In [50]:
for train, test in logo.split(X, y, groups=groups):
    print("%s %s" % (len(train), len(test)))
417 203
433 187
390 230
In [51]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.int64)
In [52]:
X_transformed = torch.stack([transform(x) for x in X_tensor])
In [53]:
# Placeholder for cross-validation results
accuracy_scores = []
balanced_accuracy_scores = []

# Iterate over each train/test split
for i, (train_idx, test_idx) in enumerate(logo.split(X_transformed, y_tensor, groups)):
    X_train, X_test = X_transformed[train_idx], X_transformed[test_idx]
    y_train, y_test = y_tensor[train_idx], y_tensor[test_idx]

    # DataLoader for training and testing
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

    # Initialize and train the model
    model = create_modified_mobilenet(num_classes=5).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for epoch in range(10):  # Adjust the number of epochs if needed
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    balanced_acc = balanced_accuracy_score(all_labels, all_preds)
    print(f"Fold {i}:")
    disp = ConfusionMatrixDisplay.from_predictions(all_labels, all_preds)
    plt.title(f'Confusion Matrix for Fold {i}')
    plt.show()
    #fig = disp.figure_
    #fig.set_figwidth(7)
    #fig.set_figheight(7)

    # Store results
    accuracy_scores.append(accuracy)
    balanced_accuracy_scores.append(balanced_acc)

# Calculate and print average metrics
average_accuracy = np.mean(accuracy_scores)
average_balanced_accuracy = np.mean(balanced_accuracy_scores)
print(f'Average Accuracy: {average_accuracy:.3f}')
print(f'Average Balanced Accuracy: {average_balanced_accuracy:.3f}')
Fold 0:
Fold 1:
Fold 2:
Average Accuracy: 0.701
Average Balanced Accuracy: 0.594

These are the scores you will have to (and will most certainly!) beat during this challenge.

Submitting to the online challenge: ramp.studio ¶

Once you have found a good model, you can submit it to ramp.studio to enter the online challenge. First, if it is your first time using the RAMP platform, sign up; otherwise log in. Then sign up for the event hotjupiter. Both sign-ups are controlled by RAMP administrators, so there can be a delay between requesting sign-up and being able to submit.

Once your sign-up request is accepted, you can go to your sandbox and write the code for your classifier directly in the browser. You can also create a new folder my_submission in the submissions folder containing classifier.py and upload this file directly. You can check the starting kit (classifier.py) for an example. The submission is trained and tested on our backend in a similar way to how ramp-test does it locally. While your submission is waiting in the queue and being trained, you can find it in the "New submissions (pending training)" table in my submissions. Once it is trained, your submission shows up on the public leaderboard. If there is an error (despite having tested your submission locally with ramp-test), it will show up in the "Failed submissions" table in my submissions. You can click on the error to see part of the trace.

The data set we use at the backend is usually different from what you find in the starting kit, so the score may be different.

The usual way to work with RAMP is to explore solutions locally (adding feature transformations, selecting models, etc.) and to check them with ramp-test. The script prints the mean cross-validation scores.

The official score in this RAMP (the first score column on the leaderboard) is the balanced accuracy score (bal_acc). When the score is good enough, you can submit it to the RAMP.
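
As a reminder, the balanced accuracy is the average of the per-class recalls, so the dominant no_pattern class cannot mask poor performance on the rarer classes. A minimal illustration with made-up labels (not challenge data):

from sklearn.metrics import balanced_accuracy_score, recall_score

# Toy example: class 1 is rare and never predicted.
y_true = [0, 0, 0, 0, 0, 0, 1, 1]
y_hat = [0, 0, 0, 0, 0, 0, 0, 0]

# Plain accuracy would be 6/8 = 0.75, but balanced accuracy averages the
# per-class recalls: (1.0 + 0.0) / 2 = 0.5.
print(balanced_accuracy_score(y_true, y_hat))        # 0.5
print(recall_score(y_true, y_hat, average="macro"))  # same value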

Here is the script proposed as the starting_kit:

In [54]:
import numpy as np
import random
from sklearn.base import BaseEstimator


class Classifier(BaseEstimator):
    # Dummy baseline: it ignores the data and predicts a random class
    # for each example.
    def __init__(self):
        return

    def fit(self, X, y):
        # No training needed for this random baseline.
        return

    def predict(self, X):
        return

    def predict_proba(self, X):
        # Create an array of zeros
        y_pred = np.zeros((X.shape[0], 5), dtype=int)
        # Set one random index per row to 1 (a random one-hot "probability" vector)
        for i in range(X.shape[0]):
            random_index = random.randint(0, 4)
            y_pred[i, random_index] = 1
        return np.array(y_pred)

You can test your solution locally by running the ramp-test command with the --submission option. Here is an example with the starting_kit submission:

In [55]:
!ramp-test --submission starting_kit
Testing Hot Jupiter atmospheric pattern classification
Reading train and test files from ./data/ ...
Reading cv ...
Training submissions/starting_kit ...
CV fold 0
	score  bal_acc    acc      time
	train    0.214  0.209  0.007614
	valid    0.257  0.251  0.000699
	test     0.221  0.221  0.000097
CV fold 1
	score  bal_acc    acc      time
	train    0.213  0.176  0.006831
	valid    0.211  0.182  0.000787
	test     0.215  0.206  0.000118
CV fold 2
	score  bal_acc    acc      time
	train    0.206  0.213  0.006035
	valid    0.194  0.217  0.000530
	test     0.212  0.162  0.000059
----------------------------
Mean CV scores
----------------------------
	score         bal_acc             acc       time
	train  0.211 ± 0.0034  0.199 ± 0.0167  0.0 ± 0.0
	valid   0.22 ± 0.0265  0.217 ± 0.0283  0.0 ± 0.0
	test   0.216 ± 0.0036   0.196 ± 0.025  0.0 ± 0.0
----------------------------
Bagged scores
----------------------------
	score  bal_acc    acc
	valid    0.213  0.218
	test     0.179  0.147

More information¶

See the online documentation for more details.

Questions¶

Questions related to the starting kit should be asked on the issue tracker.