Notebooks
The following notebooks provide working examples that you can copy-paste into Allonia Platform’s Jupyter Lab. We recommend exploring them all in the order they appear below.
House pricing regression
This simple example shows the basic concept of learning with AleiaModel.
Imports and functions
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline
from aleiamodel import AleiaModel
def get_metrics(predicted_y, real_y):
    """Return the regression metrics (MSE and MAE) comparing predictions to the truth."""
    return {
        "mse": mean_squared_error(real_y, predicted_y),
        "mae": mean_absolute_error(real_y, predicted_y),
    }
Data
# Toy dataset: one row per house. Column 0 is the predictive variable
# (house size; units not stated — presumably square meters, TODO confirm),
# column 1 is the target sale price.
data = np.array(
    [
        [ 100, 200000],
        [ 150, 300000],
        [ 120, 250000],
        [ 180, 350000],
        [ 300, 450000],
        [ 200, 400000],
        [ 400, 600000],
        [ 500, 700000],
        [ 550, 670000],
        [ 600, 800000],
        [ 250, 490000],
        [ 230, 470000]
    ]
)
Model
# Instantiate the model (or reload it, if one named 'housemodel' already exists).
model = AleiaModel('housemodel')
if model.new:
    # Column 0 is the predictor, column 1 the target.
    model.set_variables(predictive_variables=(0,), target_variables=(1,))
    model.compute_metrics_function = get_metrics
    model.model = LinearRegression()
    # No validation set; 20% of the raw set is held out for testing.
    model.set_set_sizes(0, 0.2)
    model.raw_set = data
# LinearRegression needs 2-D arrays for X and Y, but AleiaModel squeezes arrays with only one column by default.
# Those two arguments force the shape of X and y to be 2-D.
model.learn(reshape_x=(-1, 1), reshape_y=(-1, 1))
model.save()
model.close()
Spam classification
This example shows you how to use data that is not in simple dataframes or arrays.
Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, coverage_error, f1_score, mean_absolute_error, recall_score
from aleiamodel import AleiaModel, SpecialDataFormat
Functions
def get_metrics(predicted, real, model):
    """Return the accuracy and a labelled confusion matrix for the ham/spam model."""
    labels = ["ham", "spam"]
    matrix = confusion_matrix(real, predicted, labels=model.classes)
    cm = pd.DataFrame(data=matrix, index=labels, columns=labels)
    return {"accuracy": accuracy_score(real, predicted), "cm": cm}
def feat_eng(emails):
    """Turn raw (text, label) pairs into a bag-of-words matrix plus a label array."""
    texts = [entry[0] for entry in emails]
    labels = np.array([entry[1] for entry in emails])
    # CountVectorizer yields a sparse matrix, which is not a basic np.ndarray,
    # so AleiaModel cannot use it natively. Wrapping it in SpecialDataFormat
    # lets the pipeline handle it.
    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(texts)
    return SpecialDataFormat(x=bag_of_words, y=labels)
def custom_train_test_split(data, test_size, random_state):
    """Split a SpecialDataFormat dataset into train/test SpecialDataFormat parts.

    When the derived data lives in a SpecialDataFormat, the default
    train-test split cannot handle it, so this custom split is used instead.
    """
    x_tr, x_te, y_tr, y_te = train_test_split(
        data.x, data.y, test_size=test_size, random_state=random_state
    )
    train = SpecialDataFormat(x=x_tr, y=y_tr)
    test = SpecialDataFormat(x=x_te, y=y_te)
    return train, test
Data
# Tiny toy corpus of (text, label) pairs; the texts are in French and each
# label is either "spam" or "ham".
emails = [
    ("Offre spéciale ! Gagnez gros !", "spam"),
    ("Réunion demain à 14h.", "ham"),
    ("Vente exclusive en cours.", "spam"),
    ("Rappel : Facture à régler.", "ham")
]
AleiaModel
model = AleiaModel("emailsmodel")
if model.new:
    model.compute_metrics_function = get_metrics
    model.feature_engineering_function = feat_eng
    model.train_val_test_split_function = custom_train_test_split
    # No validation set; 20% of the data is held out for testing.
    model.set_set_sizes(0, 0.2)
    model.model = SVC(kernel='linear')
    model.raw_set = np.array(emails)
# get_metrics needs the model object itself (it reads model.classes), so it
# is forwarded through metrics_kwargs.
model.learn(metrics_kwargs={"model": model})
model.save()
model.close()
# Show the history of all learnings performed on this model.
model.learnings_summary
Iris classification
This notebook will show you how one can do hyperparameter optimization using AleiaModel, by using a varying number of neighbors and two different weights with a KNeighborsClassifier.
Imports
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from aleiamodel import AleiaModel
Functions
# For AleiaModel, the train-test split function must accept the "test_size" and
# "random_state" arguments. The three arguments are passed by AleiaModel automatically.
def train_test_split(raw, test_size, random_state):
    """Stratified train/test split of *raw* on its 'target' column.

    Returns (train_set, test_set) as two dataframes and raises ValueError
    when the split produced nothing.
    """
    split = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    strat_train_set = None
    strat_test_set = None
    for train_index, test_index in split.split(raw, raw["target"]):
        strat_train_set = raw.loc[train_index]
        strat_test_set = raw.loc[test_index]
    # FIX: previously only the train set was checked for None. With n_splits=1
    # both are assigned together, but check both for robustness.
    if strat_train_set is None or strat_test_set is None:
        raise ValueError("Could not make train set.")
    return strat_train_set, strat_test_set
# We use that as a feature engineering function.
# The first argument must be the dataset to modify, that is passed to the
# function by AleiaModel. A new dataset must be returned.
# The other argument, 'names', must be given by the user when calling the
# function, through kwargs given to 'learn', 'feature_engineering', or 'apply'.
def add_target_names(data, names):
    """Return a copy of *data* with a 'target_names' column added.

    Each integer in the 'target' column is mapped to its label via *names*.
    Dataframes without a 'target' column are returned unchanged.
    """
    if "target" not in data:
        return data
    labels = names[data["target"]]
    return data.assign(target_names=labels)
# The function to compute metrics must accept at least x, possibly y.
def get_metrics(pred_y, real_y):
    """Return the classification accuracy of the predictions."""
    score = accuracy_score(pred_y, real_y)
    return {"accuracy": score}
Import data
# Load the iris dataset as pandas objects, keep the human-readable class
# names, and merge features and target into a single dataframe for AleiaModel.
iris = datasets.load_iris(as_frame=True)
target_names = iris['target_names']
df_iris = pd.concat(([iris["data"], iris["target"]]), axis=1)
Model
Declaration
# List previous versions
print(AleiaModel.list_versions("iris_knn"))
# If we want to start fresh, we can delete previous versions like this.
# AleiaModel.delete("iris_knn")

# Without 'read_only=True', the model will create a lockfile when instantiated,
# released only upon model deletion and garbage collection, or program end (which does not always work).
# While the lockfile exists, no other model with the same name can be created,
# unless created with read_only=True (can not be saved) or override_lock=True (not recommended).
model = AleiaModel("iris_knn")  # , revision=..., read_only=True/False)
model.raw_set = df_iris
if model.new:
    # No validation set, and test is 20% of raw set.
    # FIX: this previously passed the undefined name 'test_size_', which
    # raised a NameError; use the intended 20% directly.
    model.set_set_sizes(0, 0.2)
    model.set_variables(
        predictive_variables=df_iris.drop(columns=["target"]).columns,
        target_variables=("target",)
    )
    # Do not instantiate the model class yet, so that we can pass it
    # 'n_neighbors' and 'weights' as hyperparameters to do hyperparameter optimisation.
    model.model_class = KNeighborsClassifier
    model.compute_metrics_function = get_metrics
    model.train_val_test_split_function = train_test_split
    model.feature_engineering_function = add_target_names
# The health check set is raw data (not a path), so it is not persisted with
# the model; define it on every run.
model.health_check_set = df_iris[:5]
Health check
# Good habit to define a very small sample of the dataset as a
# 'health check set'. That way, we can make the entire pipeline run on this
# small sample, just to check that everything is fine. No result from this
# learning will be saved in the object. You must make sure that it is not too
# small though, some models might require a minimum amount of data depending on
# your train, validation and test sets sizes.

# FIX: hyperparameter grids for the optimisation below. These names were used
# here and in the training cell but never defined, which raised a NameError.
# Two different weighting schemes and a range of neighbor counts, matching the
# notebook's description.
weights = ["uniform", "distance"]
neighbors = list(range(1, 11))

healthy = model.health_check(
    model_kwargs={"n_neighbors": neighbors[0], "weights": weights[0]},
    feature_engineering_kwargs={"names": target_names},
)
Train and evaluation
if healthy:
    # Grid search: one learning per (weights, n_neighbors) combination.
    for w in weights:
        for n in neighbors:
            model.learn(
                model_kwargs={"n_neighbors": n, "weights": w},
                feature_engineering_kwargs={"names": target_names},
            )
    # Will keep in memory the history of every learning that was done, but only
    # the last learned model is kept. To retain every model learned, we can do
    # model.save() inside the loop. That way, each revision of the model on S3
    # will match a different model. This is time-consuming, though.
    model.save()
    model.close()
# Show the history of every learning performed above.
model.learnings_summary
Classification with XGBoost with AleiaModel
In this notebook, you will see how to use XGBoost classifier with AleiaModel.
Imports
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from aleiamodel import AleiaModel
import aleialib
Global variables
# Fetch the CSV from GitHub and store it on S3 under notebooks/dataset/.
# Presumably returns the S3 path of the stored file (it is later assigned to
# model.raw_set) — confirm against aleialib's documentation.
path = aleialib.s3.from_url_to_s3(
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
    s3_folder="notebooks/dataset",
    filename="pima-indians-diabetes.csv",
    force=False,  # do not re-download if the file already exists
    from_s3_root=True,
    object_type="dataset",
)
Model
Declaration
# FIX: 'get_metrics' was referenced below but never defined in this notebook;
# define it here using the accuracy_score imported above.
def get_metrics(predicted_y, real_y):
    """Return the classification accuracy for the diabetes model."""
    return {"accuracy": accuracy_score(real_y, predicted_y)}

model = AleiaModel('indian_diabetes_xgboost')
if model.new:
    model.raw_set = path
    # Keyword arguments forwarded to the dataset loader: the CSV has no
    # header row and no index column.
    model._raw_set.load_kwargs = {'index_col': None, 'header': None}
    model.compute_metrics_function = get_metrics
    model.set_set_sizes(0, 0.33)
    # Columns 0-7 are the predictors, column 8 the target.
    model.set_variables(predictive_variables=tuple(range(8)), target_variables=(8,))
    model.model = XGBClassifier()
# Since health check set is not given as a path but as raw data, it is not saved in the model and must be redefined each time.
model.health_check_set = model.raw_set[:10]
Health check
# Run the whole pipeline on the 10-row health check set defined above.
healthy = model.health_check()
Train and evaluation
if healthy:
    model.fit()
    model.save()
# NOTE(review): the other notebooks use 'learn'/'learnings_summary' while this
# one uses 'fit'/'fits_summary' — confirm both spellings exist in the API.
model.fits_summary
Model deployment
# Deploy the trained model as a Seldon endpoint on the platform and print
# the deployment result.
import aleiamlops

ret = aleiamlops.seldon.deploy_model('indian_diabetes_xgboost')
print(ret)
TensorFlow with AleiaModel
In this notebook, you will see that you can also decide to specify directly a train and test set, which will skip the feature engineering and the train-test split steps of the pipeline.
Imports
import tensorflow as tf
import numpy as np
from aleiamodel import AleiaModel, SpecialDataFormat
from aleialib import s3

# Sanity check: show which TensorFlow version the platform provides.
print("TensorFlow version:", tf.__version__)
Functions
def make_dataset(force=False):
    """Download MNIST once and store it on S3 as train/test pickles.

    In a real case scenario, the user is expected to give one dataset,
    not train and test already split. This function is here only to mimic
    this behavior by concatenating the train and test sets given by TF.
    """
    # Skip the download when the data is already on S3 (unless forced).
    if not force and s3.file_exists("dataset/mnist_train.pkl"):
        return
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    s3.save_file(
        "mnist_train.pkl",
        SpecialDataFormat(x=x_train, y=y_train),
        object_type="dataset"
    )
    s3.save_file(
        "mnist_test.pkl",
        SpecialDataFormat(x=x_test, y=y_test),
        object_type="dataset"
    )
def get_metrics(x, real, model):
    """Evaluate a compiled TF model on (x, real) and report its loss and accuracy."""
    results = model.evaluate(x, real)
    return {"loss": results[0], "accuracy": results[1]}
# Needs to be executed at least once, so that the pickles exist on S3 before
# the train/test sets below are read from them.
make_dataset()
Build machine learning model
Build a tf.keras.Sequential model:
# Loss for integer class labels with raw (unnormalized) logits as model output.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Simple dense classifier for 28x28 MNIST images: flatten -> 128 ReLU units
# -> dropout -> 10 output logits (one per digit class).
tf_model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10)
    ]
)
# Required for TF models
tf_model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy']
)
model = AleiaModel("mnist")
if model.new:
    # Giving the train and test sets directly (as S3 paths) skips the feature
    # engineering and train-test split steps of the pipeline.
    model.train_set = "notebooks/dataset/mnist_train.pkl"
    model.test_set = "notebooks/dataset/mnist_test.pkl"
    model.model = tf_model
    model.compute_metrics_function = get_metrics
    # NOTE(review): this cell only configures the model; no learn/save call
    # is shown here — confirm the training step lives in a later cell.
Titanic classification
In this notebook, you will see how and why to use a custom model, and how to use classes and functions imported from an external .py file.
Note that to test this notebook on Allonia’s platform, you will also need to copy the file titanic_functions.py. Put it alongside your notebook by right-clicking on the left panel of the Jupyter interface, and click “New File”. Name it titanic_functions.py and paste the file’s code inside it.
Imports
from aleiamodel import AleiaModel
import aleialib
import matplotlib.pyplot as plt
%matplotlib inline

# Fetch the helper module from S3 so it can be imported locally.
# NOTE(review): the folder here is 'notebook/' (singular) while every other
# path in this document uses 'notebooks/' — confirm which one is correct.
aleialib.s3.import_from_s3("notebook/titanic_functions.py")
from titanic_functions import AdaptedRandomForest, getmetrics, TrainTestSplit, clean_data_titanic
Global variables
# Feature columns used by the model. 'Title', 'IsAlone' and 'Age*Class' are
# not raw Titanic columns — presumably engineered by clean_data_titanic;
# verify in titanic_functions.py.
predictive_variables = ('Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'IsAlone', 'Age*Class')
Model
Declaration
model = AleiaModel('titanicmodel')
if model.new:
    # 'new' means that AleiaModel did not find a model with this name in S3, and thus did not load an existing model.
    # In that case, one needs to define all the important model's attributes. If the model is not new however, those are
    # already defined.
    # You could choose to redefine some of those attributes even if the model is not new, to test some new metrics,
    # or some new feature engineering.
    # The 'model' attribute however can not be changed : it would make no sense to have a model trained with a
    # LinearRegression in revision 0 and RandomForest in revision 1. Those are two very different models that should
    # use two different AleiaModel objects.
    # You could however choose to variate 'n_estimators' at the RandomForest creation though. To do that, do not
    # set 'model.model = AdaptedRandomForest(n_estimators=100)', but set 'model.model_class = AdaptedRandomForest'
    # instead, and pass 'model_kwargs={"n_estimators": <some value>}' to 'model.learn'.
    model.raw_set = 'notebooks/dataset/Titanic.csv'
    model.compute_metrics_function = getmetrics
    model.train_val_test_split_function = TrainTestSplit
    model.set_set_sizes(0,0.2)
    model.set_variables(predictive_variables, ("Survived",))
    model.model = AdaptedRandomForest(n_estimators=100)
    model.feature_engineering_function = clean_data_titanic
    # (name, email, role) tuples — presumably the people who must validate
    # this model before deployment; confirm against AleiaModel's docs.
    model.add_validators(
        [
            ("Bob", "bob@company.com", "Project Manager"),
            ("Alice", "alice@company.com", "CTO")
        ]
    )
# Since health check set is not given as a path but as raw data, it is not saved in the model and must be redefined each time.
model.health_check_set = model.raw_set[:100]
# Show raw set, optional
model.raw_set
Health check
# Run the whole pipeline on the 100-row health check set; the extra kwargs are
# forwarded to the prediction ('which': 'proba') and metrics steps.
healthy = model.health_check(predict_for_metrics_kwargs={"which": "proba"}, metrics_kwargs={"model": model})
Train and evaluation
if healthy:
    # Train for real, computing metrics from probability predictions.
    res = model.learn(predict_for_metrics_kwargs={"which": "proba"}, metrics_kwargs={"model": model})
    model.save()
    print(res)
    print(model.learnings_summary)
    print(model.classes)
    # ROC curves read from the test results — presumably stored there by
    # getmetrics; verify in titanic_functions.py.
    no_skill = model.tests_summary["results"].iloc[0]['ROC No skill']
    logistic = model.tests_summary["results"].iloc[0]['ROC Logistic']
    plt.plot(*no_skill, linestyle='--', label='No Skill')
    plt.plot(*logistic, marker='.', label='Logistic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
model.close()