Notebooks

The following notebooks provide working examples that you can copy-paste into Allonia Platform's JupyterLab. We recommend exploring them all in the order they appear below.

House pricing regression

This simple example shows the basic concept of learning with AleiaModel.

Imports and functions

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline
from aleiamodel import AleiaModel
def get_metrics(predicted_y, real_y):
    """Return a dict of regression metrics (MSE and MAE) for the predictions."""
    return {
        "mse": mean_squared_error(real_y, predicted_y),
        "mae": mean_absolute_error(real_y, predicted_y),
    }

Data

# Toy house-pricing dataset: each row is (surface, price).
data = np.array([
    [100, 200000], [150, 300000], [120, 250000], [180, 350000],
    [300, 450000], [200, 400000], [400, 600000], [500, 700000],
    [550, 670000], [600, 800000], [250, 490000], [230, 470000],
])

Model

# Load the model named 'housemodel' from storage, or create it if it does not exist.
model = AleiaModel('housemodel')
if model.new:
    # First run only: column 0 (surface) predicts column 1 (price).
    model.set_variables(predictive_variables=(0,), target_variables=(1,))
    model.compute_metrics_function = get_metrics
    model.model = LinearRegression()
    # No validation set; 20% of the raw set goes to the test set.
    model.set_set_sizes(0, 0.2)
model.raw_set = data
# LinearRegression needs 2-D arrays for X and Y, but AleiaModel squeezes arrays with only one column by default.
# Those two arguments force the shape of X and y to be 2-D.
model.learn(reshape_x=(-1, 1), reshape_y=(-1, 1))
model.save()
model.close()

Visualization

# Predict on the full dataset and plot predicted prices against real ones.
model.observations_set = data
predicted = model.apply(reshape_x=(-1, 1))
df = pd.DataFrame(model.raw_set, columns=["Superficie", "prix"]).set_index("Superficie")
df["Predit"] = predicted
df = df.sort_index()
df.plot()
# Display the history of learnings done on this model.
model.learnings_summary

Spam classification

This example shows you how to use data that is not in simple dataframes or arrays.

Imports

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, coverage_error, f1_score, mean_absolute_error, recall_score
from aleiamodel import AleiaModel, SpecialDataFormat

Functions

def get_metrics(predicted, real, model):
    """Return accuracy plus a labelled confusion matrix for the spam/ham classifier."""
    labels = ["ham", "spam"]
    matrix = pd.DataFrame(
        data=confusion_matrix(real, predicted, labels=model.classes),
        index=labels,
        columns=labels,
    )
    return {"accuracy": accuracy_score(real, predicted), "cm": matrix}

def feat_eng(emails):
    """Vectorize raw (text, label) pairs into a SpecialDataFormat for AleiaModel."""
    texts = [mail[0] for mail in emails]
    labels = np.array([mail[1] for mail in emails])
    # CountVectorizer produces a sparse matrix, not a basic np.ndarray, so
    # AleiaModel can not use it natively. Wrapping it in the SpecialDataFormat
    # class lets AleiaModel handle it.
    bag_of_words = CountVectorizer().fit_transform(texts)
    return SpecialDataFormat(x=bag_of_words, y=labels)

def custom_train_test_split(data, test_size, random_state):
    """Split a SpecialDataFormat dataset into SpecialDataFormat train/test parts.

    When the derived data is in SpecialDataFormat, the train-test split must be
    customized: x and y are split explicitly and re-wrapped.
    """
    x_tr, x_te, y_tr, y_te = train_test_split(
        data.x, data.y, test_size=test_size, random_state=random_state
    )
    return SpecialDataFormat(x=x_tr, y=y_tr), SpecialDataFormat(x=x_te, y=y_te)

Data

# Tiny labelled corpus: (email text, "spam"/"ham") pairs.
emails = [
    ("Offre spéciale ! Gagnez gros !", "spam"),
    ("Réunion demain à 14h.", "ham"),
    ("Vente exclusive en cours.", "spam"),
    ("Rappel : Facture à régler.", "ham"),
]

AleiaModel

model = AleiaModel("emailsmodel")
if model.new:
    # Configure the pipeline only on first creation; a reloaded model keeps these.
    model.compute_metrics_function = get_metrics
    model.feature_engineering_function = feat_eng
    model.train_val_test_split_function = custom_train_test_split
    # No validation set; 20% of the data goes to the test set.
    model.set_set_sizes(0, 0.2)
    model.model = SVC(kernel='linear')
model.raw_set = np.array(emails)
# get_metrics needs the AleiaModel itself (it reads model.classes), so it is
# passed through metrics_kwargs.
model.learn(metrics_kwargs={"model": model})
model.save()
model.close()
model.learnings_summary

Iris classification

This notebook will show you how to do hyperparameter optimization using AleiaModel, by varying the number of neighbors and trying two different weighting schemes with a KNeighborsClassifier.

Imports

import pandas as pd

from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

from aleiamodel import AleiaModel

Global variables

# Fraction of the raw set reserved for the test set.
test_size_ = 0.2
# Hyperparameter grid for the KNeighborsClassifier (weights x n_neighbors).
weights = ["uniform", "distance"]
neighbors = [1, 5, 10]

Functions

# AleiaModel calls the train-test split function with the raw data plus the
# "test_size" and "random_state" keyword arguments, so a custom split function
# must accept all three; they are passed by AleiaModel automatically.
def train_test_split(raw, test_size, random_state):
    """Stratified train/test split of *raw* on its "target" column."""
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    train_part = None
    test_part = None
    # n_splits=1, so this loop runs at most once.
    for train_idx, test_idx in splitter.split(raw, raw["target"]):
        train_part = raw.loc[train_idx]
        test_part = raw.loc[test_idx]
    if train_part is None:
        raise ValueError("Could not make train set.")
    return train_part, test_part


# Feature engineering function for AleiaModel.
# AleiaModel passes the current dataset as the first argument and expects a
# new dataset to be returned. The extra 'names' argument must be supplied by
# the user through the kwargs given to 'learn', 'feature_engineering', or
# 'apply'.
def add_target_names(data, names):
    """Adds a column to the dataframe with the target names"""
    if "target" not in data:
        return data
    return data.assign(target_names=names[data["target"]])

# The function to compute metrics must accept at least x, possibly y.
def get_metrics(pred_y, real_y):
    """Return the accuracy of the predictions against the ground truth.

    accuracy_score expects (y_true, y_pred); accuracy itself is symmetric so
    the value is unchanged, but using the conventional argument order keeps
    this consistent with the other get_metrics functions in this document.
    """
    return {"accuracy": accuracy_score(real_y, pred_y)}

Import data

# Load the iris dataset as dataframes and join features with the target column.
iris = datasets.load_iris(as_frame=True)
target_names = iris["target_names"]
df_iris = pd.concat([iris["data"], iris["target"]], axis=1)

Model

Declaration

# List previous versions
print(AleiaModel.list_versions("iris_knn"))
# If we want to start fresh, we can delete previous versions like this.
# AleiaModel.delete("iris_knn")
# Without 'read_only=True', the model will create a lockfile when instantiated,
# released only upon model deletion and garbage collection, or program end (which does not always work).
# While the lockfile exists, no other model with the same name can be created,
# unless created with read_only=True (can not be saved) or override_lock=True (not recommended).
model = AleiaModel("iris_knn")  #, revision=..., read_only=True/False)
model.raw_set = df_iris
if model.new:
    # No validation set, and test is 20% of raw set
    model.set_set_sizes(0, test_size_)
    model.set_variables(
        predictive_variables=df_iris.drop(columns=["target"]).columns,
        target_variables=("target",)
    )
    # Do not instantiate the model class yet, so that we can pass it
    # 'n_neighbors' and 'weights' as hyperparameters to do hyperparameter optimisation.
    model.model_class = KNeighborsClassifier
    model.compute_metrics_function = get_metrics
    model.train_val_test_split_function = train_test_split
    model.feature_engineering_function = add_target_names
# Tiny sample used by 'health_check' to dry-run the whole pipeline.
model.health_check_set = df_iris[:5]

Health check

# Good habit to define a very small sample of the dataset as a
# 'health check set'. That way, we can make the entire pipeline run on this
# small sample, just to check that everything is fine. No result from this
# learning will be saved in the object. You must make sure that it is not too
# small though, some models might require a minimum amount of data depending on
# your train, validation and test sets sizes.
healthy = model.health_check(
    model_kwargs={"n_neighbors": neighbors[0], "weights": weights[0]},
    feature_engineering_kwargs={"names": target_names},
)

Train and evaluation

# Grid search: one learning per (weights, n_neighbors) combination.
if healthy:
    for w in weights:
        for n in neighbors:
            model.learn(
               model_kwargs={"n_neighbors": n, "weights": w},
               feature_engineering_kwargs={"names": target_names},
            )
            # Will keep in memory the history of every learning that was done, but only
            # the last learned model is kept. To retain every model learned, we can do
            # model.save() inside the loop. That way, each revision of the model on S3
            # will match a different model. This is time-consuming, though.
            model.save()
model.close()
model.learnings_summary

Classification with XGBoost with AleiaModel

In this notebook, you will see how to use XGBoost classifier with AleiaModel.

Imports

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from aleiamodel import AleiaModel
import aleialib

Functions

def get_metrics(predicted, real):
    """Return a dict with the accuracy of the predictions."""
    score = accuracy_score(real, predicted)
    return {"accuracy": score}

Global variables

# Fetch the CSV from GitHub and store it on S3; 'path' is the resulting S3 path.
path = aleialib.s3.from_url_to_s3(
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
    s3_folder="notebooks/dataset",
    filename="pima-indians-diabetes.csv",
    force=False,  # do not re-download if the file is already on S3 — TODO confirm semantics
    from_s3_root=True,
    object_type="dataset",
)

Model

Declaration

model = AleiaModel('indian_diabetes_xgboost')
if model.new:
    # The raw set is given as an S3 path, so it is saved with the model.
    model.raw_set = path
    # Kwargs forwarded to the dataset loader (presumably pandas.read_csv — the
    # CSV has no header row; TODO confirm against AleiaModel docs).
    model._raw_set.load_kwargs = {'index_col': None, 'header': None}
    model.compute_metrics_function = get_metrics
    # No validation set; one third of the data goes to the test set.
    model.set_set_sizes(0, 0.33)
    # Columns 0-7 are the predictors, column 8 the target.
    model.set_variables(predictive_variables=tuple(range(8)), target_variables=(8,))
    model.model = XGBClassifier()
# Since health check set is not given as a path but as raw data, it is not saved in the model and must be redefined each time.
model.health_check_set = model.raw_set[:10]

Health check

# Dry-run the whole pipeline on the small health check set defined above.
healthy = model.health_check()

Train and evaluation

# Only train for real if the dry run succeeded.
if healthy:
    model.fit()
model.save()
# Display the history of fits done on this model.
model.fits_summary

Model deployment

import aleiamlops

# Deploy the saved model behind a Seldon endpoint and print the result.
ret = aleiamlops.seldon.deploy_model('indian_diabetes_xgboost')
print(ret)

TensorFlow with AleiaModel

In this notebook, you will see that you can also decide to specify directly a train and test set, which will skip the feature engineering and the train-test split steps of the pipeline.

Imports

import tensorflow as tf
import numpy as np
from aleiamodel import AleiaModel, SpecialDataFormat
from aleialib import s3
print("TensorFlow version:", tf.__version__)

Functions

def make_dataset(force=False):
    """Save the MNIST train and test sets as pickles on S3.

    In a real case scenario, the user is expected to give one dataset, not
    train and test already split. This function is here only to mimic that
    behavior with the train and test sets given by TF. Set ``force=True`` to
    rebuild the files even if they already exist on S3.
    """
    # Skip the work if the train pickle already exists (unless forced).
    if not force and s3.file_exists("dataset/mnist_train.pkl"):
        return
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    s3.save_file(
        "mnist_train.pkl",
        SpecialDataFormat(x=x_train, y=y_train),
        object_type="dataset"
    )
    s3.save_file(
        "mnist_test.pkl",
        SpecialDataFormat(x=x_test, y=y_test),
        object_type="dataset"
    )

def get_metrics(x, real, model):
    """Evaluate ``model`` on the given data and report its loss and accuracy."""
    evaluation_loss, evaluation_accuracy = model.evaluate(x, real)
    return {"loss": evaluation_loss, "accuracy": evaluation_accuracy}
# Needs to be executed at least once so the train/test pickles exist on S3.
make_dataset()

Build machine learning model

Build a tf.keras.Sequential model:

# from_logits=True: the last Dense layer outputs raw scores, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Simple dense classifier over flattened 28x28 images, 10 output classes.
tf_model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10)
    ]
)
# Required for TF models
tf_model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy']
)
model = AleiaModel("mnist")
if model.new:
    # Giving train and test sets directly (as S3 paths) skips the feature
    # engineering and train-test split steps of the pipeline.
    model.train_set = "notebooks/dataset/mnist_train.pkl"
    model.test_set = "notebooks/dataset/mnist_test.pkl"
    model.model = tf_model
    model.compute_metrics_function = get_metrics

Train and evaluation

# fit_kwargs are presumably forwarded to the underlying Keras fit (TODO confirm);
# get_metrics needs the trained Keras model, so it is passed through metrics_kwargs.
model.learn(fit_kwargs={"epochs": 5}, metrics_kwargs={"model": model.model})
model.save()
model.close()
model.learnings_summary

Titanic classification

In this notebook, you will see how and why to use a custom model, and how to use classes and functions imported from an external .py file.

Note that to test this notebook on Allonia's platform, you will also need to copy the file titanic_functions.py. Put it alongside your notebook by right-clicking on the left panel of the Jupyter interface and clicking “New File”. Name it titanic_functions.py and paste the file's code inside it.

Imports

from aleiamodel import AleiaModel
import aleialib
import matplotlib.pyplot as plt
%matplotlib inline
# Fetch the helper module from S3 so it can be imported below.
# NOTE(review): other snippets in this document use the 'notebooks/' prefix
# (e.g. 'notebooks/dataset') — confirm whether 'notebook/' here is intentional.
aleialib.s3.import_from_s3("notebook/titanic_functions.py")
from titanic_functions import AdaptedRandomForest, getmetrics, TrainTestSplit, clean_data_titanic

Global variables

# Feature columns used for prediction. 'Title', 'IsAlone' and 'Age*Class' are
# presumably engineered by clean_data_titanic — confirm against that module.
predictive_variables = ('Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'IsAlone', 'Age*Class')

Model

Declaration

model = AleiaModel('titanicmodel')
if model.new:
    # 'new' means that AleiaModel did not find a model with this name in S3, and thus did not load an existing model.
    # In that case, one needs to define all the important model's attributes. If the model is not new however, those are
    # already defined.
    # You could choose to redefine some of those attributes even if the model is not new, to test some new metrics,
    # or some new feature engineering.
    # The 'model' attribute however can not be changed: it would make no sense to have a model trained with a
    # LinearRegression in revision 0 and RandomForest in revision 1. Those are two very different models that should
    # use two different AleiaModel objects.
    # You could however choose to vary 'n_estimators' at the RandomForest creation. To do that, do not
    # set 'model.model = AdaptedRandomForest(n_estimators=100)', but set 'model.model_class = AdaptedRandomForest'
    # instead, and pass 'model_kwargs={"n_estimators": <some value>}' to 'model.learn'.
    model.raw_set = 'notebooks/dataset/Titanic.csv'
    model.compute_metrics_function = getmetrics
    model.train_val_test_split_function = TrainTestSplit
    # No validation set; 20% of the raw set goes to the test set.
    model.set_set_sizes(0,0.2)
    model.set_variables(predictive_variables, ("Survived",))
    model.model = AdaptedRandomForest(n_estimators=100)
    model.feature_engineering_function = clean_data_titanic
    # People listed here validate the model (name, email, role).
    model.add_validators(
        [
            ("Bob", "bob@company.com", "Project Manager"),
            ("Alice", "alice@company.com", "CTO")
        ]
    )
# Since health check set is not given as a path but as raw data, it is not saved in the model and must be redefined each time.
model.health_check_set = model.raw_set[:100]
# Show raw set, optional
model.raw_set

Health check

# Dry-run the pipeline; {"which": "proba"} presumably asks the custom model for
# probability predictions when computing metrics — confirm in titanic_functions.py.
healthy = model.health_check(predict_for_metrics_kwargs={"which": "proba"}, metrics_kwargs={"model": model})

Train and evaluation

# Train only if the dry run succeeded, then plot the ROC curves found in the
# test results (presumably stored there by 'getmetrics' — see titanic_functions.py).
if healthy:
    res = model.learn(predict_for_metrics_kwargs={"which": "proba"}, metrics_kwargs={"model": model})
    model.save()
    print(res)
    print(model.learnings_summary)
    print(model.classes)

    no_skill = model.tests_summary["results"].iloc[0]['ROC No skill']
    logistic = model.tests_summary["results"].iloc[0]['ROC Logistic']

    plt.plot(*no_skill, linestyle='--', label='No Skill')
    plt.plot(*logistic, marker='.', label='Logistic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
model.close()