Authors: Roman Yurchak (Symerio); also partially inspired by the air_passengers starting kit.
The dataset was collected with cyclist counters installed by the Paris city council at multiple locations. It contains hourly counts of cyclist traffic, together with the features shown below.
Available features are quite scarce. However, we can also use any external data that may help us predict the target variable.
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
First, download the data files and put them into the data folder.
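As a quick sanity check before loading anything, the sketch below (an assumption about the local layout, not part of the starting kit) verifies that the expected file is in place:
from pathlib import Path

# Hypothetical check: train.parquet is the file loaded below;
# adjust the file name if your local layout differs.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
assert (data_dir / "train.parquet").exists(), "put train.parquet into ./data first"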
Data is stored in Parquet format, an efficient columnar data format. We can load the train set with pandas,
data = pd.read_parquet(Path("data") / "train.parquet")
data.head()
|  | counter_id | counter_name | site_id | site_name | bike_count | date | counter_installation_date | counter_technical_id | latitude | longitude | log_bike_count |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 48321 | 100007049-102007049 | 28 boulevard Diderot E-O | 100007049 | 28 boulevard Diderot | 0.0 | 2020-09-01 02:00:00 | 2013-01-18 | Y2H15027244 | 48.846028 | 2.375429 | 0.000000 |
| 48324 | 100007049-102007049 | 28 boulevard Diderot E-O | 100007049 | 28 boulevard Diderot | 1.0 | 2020-09-01 03:00:00 | 2013-01-18 | Y2H15027244 | 48.846028 | 2.375429 | 0.693147 |
| 48327 | 100007049-102007049 | 28 boulevard Diderot E-O | 100007049 | 28 boulevard Diderot | 0.0 | 2020-09-01 04:00:00 | 2013-01-18 | Y2H15027244 | 48.846028 | 2.375429 | 0.000000 |
| 48330 | 100007049-102007049 | 28 boulevard Diderot E-O | 100007049 | 28 boulevard Diderot | 4.0 | 2020-09-01 15:00:00 | 2013-01-18 | Y2H15027244 | 48.846028 | 2.375429 | 1.609438 |
| 48333 | 100007049-102007049 | 28 boulevard Diderot E-O | 100007049 | 28 boulevard Diderot | 9.0 | 2020-09-01 18:00:00 | 2013-01-18 | Y2H15027244 | 48.846028 | 2.375429 | 2.302585 |
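As noted above, external data can also be used. Below is a minimal sketch of how such data could be joined on the date column, assuming a hypothetical external_data.csv with hourly observations (e.g. weather); the file name and columns are assumptions for illustration only.
weather = pd.read_csv("external_data.csv", parse_dates=["date"])  # hypothetical file
# merge_asof matches each row with the most recent external observation
data_ext = pd.merge_asof(
    data.sort_values("date"), weather.sort_values("date"), on="date"
)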
We can check general information about different columns,
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 455163 entries, 48321 to 928462
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype
---  ------                     --------------   -----
 0   counter_id                 455163 non-null  category
 1   counter_name               455163 non-null  category
 2   site_id                    455163 non-null  int64
 3   site_name                  455163 non-null  category
 4   bike_count                 455163 non-null  float64
 5   date                       455163 non-null  datetime64[ns]
 6   counter_installation_date  455163 non-null  datetime64[ns]
 7   counter_technical_id       455163 non-null  category
 8   latitude                   455163 non-null  float64
 9   longitude                  455163 non-null  float64
 10  log_bike_count             455163 non-null  float64
dtypes: category(4), datetime64[ns](2), float64(4), int64(1)
memory usage: 29.5 MB
and in particular the number of unique entries in each column,
data.nunique(axis=0)
counter_id                     56
counter_name                   56
site_id                        30
site_name                      30
bike_count                    977
date                         8230
counter_installation_date      22
counter_technical_id           30
latitude                       30
longitude                      30
log_bike_count                977
dtype: int64
We have 30 counting sites, and some locations have several counters installed. Let's look at the most frequented stations,
data.groupby(["site_name", "counter_name"])["bike_count"].sum().sort_values(
ascending=False
).head(10).to_frame()
| site_name | counter_name | bike_count |
| --- | --- | --- |
| Totem 73 boulevard de Sébastopol | Totem 73 boulevard de Sébastopol S-N | 1809231.0 |
| Totem 64 Rue de Rivoli | Totem 64 Rue de Rivoli O-E | 1406900.0 |
| Totem 73 boulevard de Sébastopol | Totem 73 boulevard de Sébastopol N-S | 1357868.0 |
| 67 boulevard Voltaire SE-NO | 67 boulevard Voltaire SE-NO | 1036575.0 |
| Totem 64 Rue de Rivoli | Totem 64 Rue de Rivoli E-O | 914089.0 |
| 27 quai de la Tournelle | 27 quai de la Tournelle SE-NO | 888717.0 |
| Quai d'Orsay | Quai d'Orsay E-O | 849724.0 |
| Totem Cours la Reine | Totem Cours la Reine O-E | 806149.0 |
| Face au 48 quai de la marne | Face au 48 quai de la marne SO-NE | 806071.0 |
| Face au 48 quai de la marne | Face au 48 quai de la marne NE-SO | 759194.0 |
Let's visualize the data, starting with the spatial distribution of the counters on a map,
import folium
m = folium.Map(location=data[["latitude", "longitude"]].mean(axis=0), zoom_start=13)
for _, row in (
data[["counter_name", "latitude", "longitude"]]
.drop_duplicates("counter_name")
.iterrows()
):
folium.Marker(
row[["latitude", "longitude"]].values.tolist(), popup=row["counter_name"]
).add_to(m)
m
Note that in this RAMP problem we consider only the 30 most frequented counting sites, to limit data size.
Next we will look into the temporal distribution of the most frequented bike counter. If we plot the hourly counts directly we will not see much, because there are too many data points,
mask = data["counter_name"] == "Totem 73 boulevard de Sébastopol S-N"
data[mask].plot(x="date", y="bike_count")
<AxesSubplot: xlabel='date'>
Instead, we aggregate the data, for instance by week, to get a clearer overall picture,
mask = data["counter_name"] == "Totem 73 boulevard de Sébastopol S-N"
data[mask].groupby(pd.Grouper(freq="1w", key="date"))[["bike_count"]].sum().plot()
<AxesSubplot: xlabel='date'>
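Other aggregation frequencies work the same way. As a sketch, the same series aggregated by day, using resample on the date index instead of pd.Grouper,
mask = data["counter_name"] == "Totem 73 boulevard de Sébastopol S-N"
# daily totals for the same counter
data[mask].set_index("date")["bike_count"].resample("1D").sum().plot()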
At the same time, we can zoom in on a particular week for a more short-term visualization,
fig, ax = plt.subplots(figsize=(10, 4))
mask = (
(data["counter_name"] == "Totem 73 boulevard de Sébastopol S-N")
& (data["date"] > pd.to_datetime("2021/03/01"))
& (data["date"] < pd.to_datetime("2021/03/08"))
)
data[mask].plot(x="date", y="bike_count", ax=ax)
<AxesSubplot: xlabel='date'>
The hourly pattern shows a clear difference between workdays and the weekend (6 and 7 March 2021).
If we look at the distribution of the target variable, it is skewed and non-normal,
import seaborn as sns
ax = sns.histplot(data, x="bike_count", kde=True, bins=50)
A least squares loss would not be appropriate to model it, since it is designed for normal error distributions. One way to proceed is to transform the variable with a logarithmic transformation,
data['log_bike_count'] = np.log(1 + data['bike_count'])
ax = sns.histplot(data, x="log_bike_count", kde=True, bins=50)
which has a more pronounced central mode, but is still not symmetric. In the following, we use log_bike_count as the target variable, since otherwise bike_count ranges over three orders of magnitude and the least squares loss would be dominated by the few large values.
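Note that np.log1p computes the same log(1 + x) transform; as a quick check (a sketch, not part of the starting kit), the precomputed log_bike_count column agrees with it,
# log1p(x) == log(1 + x), so the provided column matches the transformation above
assert np.allclose(np.log1p(data["bike_count"]), data["log_bike_count"])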
To account for the temporal aspects of the data, we cannot feed the date field directly into the model. Instead, we extract features at different time scales from the date field,
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the "date" column
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    # Finally we can drop the original column from the dataframe
    return X.drop(columns=["date"])
data["date"].head()
48321   2020-09-01 02:00:00
48324   2020-09-01 03:00:00
48327   2020-09-01 04:00:00
48330   2020-09-01 15:00:00
48333   2020-09-01 18:00:00
Name: date, dtype: datetime64[ns]
_encode_dates(data[["date"]].head())
|  | year | month | day | weekday | hour |
| --- | --- | --- | --- | --- | --- |
| 48321 | 2020 | 9 | 1 | 1 | 2 |
| 48324 | 2020 | 9 | 1 | 1 | 3 |
| 48327 | 2020 | 9 | 1 | 1 | 4 |
| 48330 | 2020 | 9 | 1 | 1 | 15 |
| 48333 | 2020 | 9 | 1 | 1 | 18 |
To use this function with scikit-learn estimators we wrap it with FunctionTransformer,
from sklearn.preprocessing import FunctionTransformer
date_encoder = FunctionTransformer(_encode_dates, validate=False)
date_encoder.fit_transform(data[["date"]]).head()
|  | year | month | day | weekday | hour |
| --- | --- | --- | --- | --- | --- |
| 48321 | 2020 | 9 | 1 | 1 | 2 |
| 48324 | 2020 | 9 | 1 | 1 | 3 |
| 48327 | 2020 | 9 | 1 | 1 | 4 |
| 48330 | 2020 | 9 | 1 | 1 | 15 |
| 48333 | 2020 | 9 | 1 | 1 | 18 |
Since it is unlikely that, for instance, hour is linearly correlated with the target variable, we additionally need to encode such features categorically for linear models. This is classically done with OneHotEncoder, though other encoding strategies exist.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)
enc.fit_transform(_encode_dates(data[["date"]])[["hour"]].head())
array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])
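As an example of one such alternative strategy (a sketch only, not used in the pipeline below), a periodic feature like the hour can be encoded with sine/cosine components instead of one-hot columns,
hours = _encode_dates(data[["date"]])["hour"]
# cyclic encoding: hour 23 ends up close to hour 0
hour_encoded = pd.DataFrame(
    {
        "hour_sin": np.sin(2 * np.pi * hours / 24),
        "hour_cos": np.cos(2 * np.pi * hours / 24),
    }
)
hour_encoded.head()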
Let's now construct our first linear model with Ridge. We use a few helper functions defined in problem.py
of the starting kit to load the public train and test data:
import problem
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()
X_train.head(2)
|  | counter_id | counter_name | site_id | site_name | date | counter_installation_date | counter_technical_id | latitude | longitude |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 400125 | 100049407-353255860 | 152 boulevard du Montparnasse E-O | 100049407 | 152 boulevard du Montparnasse | 2020-09-01 01:00:00 | 2018-12-07 | Y2H19070373 | 48.840801 | 2.333233 |
| 408305 | 100049407-353255859 | 152 boulevard du Montparnasse O-E | 100049407 | 152 boulevard du Montparnasse | 2020-09-01 01:00:00 | 2018-12-07 | Y2H19070373 | 48.840801 | 2.333233 |
and
y_train
array([1.60943791, 1.38629436, 0. , ..., 2.48490665, 1.60943791, 1.38629436])
where y_train contains the log_bike_count variable.
The test set is in the future as compared to the train set,
print(
f'Train: n_samples={X_train.shape[0]}, {X_train["date"].min()} to {X_train["date"].max()}'
)
print(
f'Test: n_samples={X_test.shape[0]}, {X_test["date"].min()} to {X_test["date"].max()}'
)
Train: n_samples=455163, 2020-09-01 01:00:00 to 2021-08-09 23:00:00
Test: n_samples=41608, 2021-08-10 01:00:00 to 2021-09-09 23:00:00
_encode_dates(X_train[["date"]]).columns.tolist()
['year', 'month', 'day', 'weekday', 'hour']
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]
preprocessor = ColumnTransformer(
[
("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
("cat", categorical_encoder, categorical_cols),
]
)
regressor = Ridge()
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)
Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function _encode_dates at 0x7f5e41c58550>)),
                ('columntransformer',
                 ColumnTransformer(transformers=[('date',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['year', 'month', 'day',
                                                   'weekday', 'hour']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['counter_name',
                                                   'site_name'])])),
                ('ridge', Ridge())])
We then evaluate this model with the RMSE metric,
from sklearn.metrics import mean_squared_error
print(
f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)
print(
f"Test set, RMSE={mean_squared_error(y_test, pipe.predict(X_test), squared=False):.2f}"
)
Train set, RMSE=0.80
Test set, RMSE=0.73
The model does not have enough capacity to fit the train set well, since we have lots of data and relatively few parameters; this underfitting also explains why it happens to score somewhat better on the test set. We can compare these results with a baseline that predicts the mean value,
print("Baseline mean prediction.")
print(
f"Train set, RMSE={mean_squared_error(y_train, np.full(y_train.shape, y_train.mean()), squared=False):.2f}"
)
print(
f"Test set, RMSE={mean_squared_error(y_test, np.full(y_test.shape, y_test.mean()), squared=False):.2f}"
)
Baseline mean prediction.
Train set, RMSE=1.68
Test set, RMSE=1.44
which confirms that the model performs noticeably better than this naive baseline.
Let's visualize the predictions for one of the stations,
mask = (
(X_test["counter_name"] == "Totem 73 boulevard de Sébastopol S-N")
& (X_test["date"] > pd.to_datetime("2021/09/01"))
& (X_test["date"] < pd.to_datetime("2021/09/08"))
)
df_viz = X_test.loc[mask].copy()
df_viz["bike_count"] = np.exp(y_test[mask.values]) - 1
df_viz["bike_count (predicted)"] = np.exp(pipe.predict(X_test[mask])) - 1
fig, ax = plt.subplots(figsize=(12, 4))
df_viz.plot(x="date", y="bike_count", ax=ax)
df_viz.plot(x="date", y="bike_count (predicted)", ax=ax, ls="--")
ax.set_title("Predictions with Ridge")
ax.set_ylabel("bike_count")
Text(0, 0.5, 'bike_count')
We start to see the daily trend, and some of the weekday differences are accounted for; however, we still miss the finer details, and the evening spikes are underestimated.
A useful way to visualize the error is to plot y_pred as a function of y_true,
fig, ax = plt.subplots()
df_viz = pd.DataFrame({"y_true": y_test, "y_pred": pipe.predict(X_test)}).sample(
10000, random_state=0
)
df_viz.plot.scatter(x="y_true", y="y_pred", s=8, alpha=0.1, ax=ax)
<AxesSubplot: xlabel='y_true', ylabel='y_pred'>
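A perfect model would place all points on the diagonal. As a small variant of the plot above (a sketch reusing df_viz from the previous cell), we can add that reference line,
fig, ax = plt.subplots()
df_viz.plot.scatter(x="y_true", y="y_pred", s=8, alpha=0.1, ax=ax)
# identity line y_pred = y_true for reference
ax.plot([0, df_viz["y_true"].max()], [0, df_viz["y_true"].max()], "k--", lw=1)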
It is recommended to use cross-validation, either for hyper-parameter tuning with GridSearchCV or for more reliable model evaluation with cross_val_score. In this case, because we want the test data to always be in the future compared to the train data, we can use TimeSeriesSplit,
The disadvantage is that either the training set size differs between folds (the default TimeSeriesSplit behaviour), which is not ideal for hyper-parameter tuning, or we use a constant-size but small training set, which is also not ideal given the periodicity of the data. This explains why cross-validation scores are generally worse than the test score,
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
cv = TimeSeriesSplit(n_splits=6)
# scikit-learn scorers follow the "greater is better" convention, so error
# metrics are negated (hence the minus sign below to recover the RMSE).
scores = cross_val_score(
pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores)
print(f"RMSE (all folds): {-scores.mean():.3} ± {(-scores).std():.3}")
RMSE:  [-0.96320864 -0.87311411 -0.85383867 -0.87000261 -1.0607788  -0.97297134]
RMSE (all folds): 0.932 ± 0.0738
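The same cv object can be reused for hyper-parameter tuning. Below is a minimal sketch, assuming we only tune the Ridge regularization strength over an arbitrarily chosen grid (not part of the starting kit),
from sklearn.model_selection import GridSearchCV

# the step name "ridge" comes from make_pipeline's automatic naming
param_grid = {"ridge__alpha": np.logspace(-2, 2, 5)}
search = GridSearchCV(
    pipe, param_grid, cv=cv, scoring="neg_root_mean_squared_error"
)
search.fit(X_train, y_train)
print(search.best_params_, f"RMSE={-search.best_score_:.3}")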