https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction
import dalex as dx
import pandas as pd
import numpy as np
import sklearn
import tensorflow as tf
import autokeras as ak
import kerastuner as kt
import h2o
import catboost
import lightgbm
import warnings
warnings.filterwarnings('ignore')
# session info: a one-row table with the version of every library imported above
# (the module's own __name__ is used as the column label instead of parsing its repr)
pkg_dict = {
    pkg.__name__: pkg.__version__
    for pkg in [dx, pd, np, sklearn, tf, ak, kt, h2o, catboost, lightgbm]
}
pd.DataFrame(pkg_dict, index=["version"])
Data (`train.csv`) from: https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction?select=train.csv
# Load the training file; the relative path means it must sit in the working directory.
data = pd.read_csv("train.csv")
# Summary of the columns (dtypes, non-null counts).
data.info()
# Preview of the first rows.
data.head()
We have 380k observations, 11 variables and a binary target. For the purpose of this comparison, let's use 10% of the data.
Clean data:
- remove `id`, `Policy_Sales_Channel`, `Region_Code`
- convert `Vehicle_Age` to integer
- convert `Gender`, `Vehicle_Damage` to binary
- investigate `Driving_License`, `Annual_Premium`
from sklearn.model_selection import train_test_split

# Keep a 10% sample of the rows, stratified on Response so the class ratio is preserved.
data, _ = train_test_split(data, train_size=0.1, random_state=1, stratify=data.Response)

# drop columns
data.drop(["id", "Region_Code", "Policy_Sales_Channel"], axis=1, inplace=True)

# convert three columns
print(data.Vehicle_Age.unique())
# Every category is mapped to its code explicitly. Pairing the codes with
# data.Vehicle_Age.unique() would make the encoding depend on which category happens
# to appear first in the sample, silently permuting the ordinal scale.
data.replace(
    {
        "Gender": {"Male": 1, "Female": 0},
        "Vehicle_Damage": {"Yes": 1, "No": 0},
        "Vehicle_Age": {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2},
    },
    inplace=True,
)
# How does holding a driving licence relate to buying the vehicle insurance?
# share of customers with a licence
print(data["Driving_License"].mean())
# response rate among customers without a licence (the author observed about 5%)
print(data.loc[data["Driving_License"] == 0, "Response"].mean())
# the variable is removed for clarity;
# in the model we could assign: IF Driving_License == 0 THEN Response = 0
data.drop(columns="Driving_License", inplace=True)
# How is Annual_Premium distributed? (log-scaled counts)
import matplotlib.pyplot as plt

_ = plt.hist(data["Annual_Premium"], bins="auto", log=True)
plt.show()

# where is the peak? -> look at the smallest value ...
print(data["Annual_Premium"].min())
# ... and count how many rows share exactly that value
print(data["Annual_Premium"].eq(data["Annual_Premium"].min()).sum())
# fraction of very big values (the author observed 0.2% above 100k)
print(data["Annual_Premium"].gt(100000).sum() / len(data))
# let's make a variable indicating the baseline, and move the annual premium:
#   Annual_Premium_Baseline = 1 when the customer pays exactly the minimum premium
#   Annual_Premium          = amount paid above that minimum
# The minimum is computed once, before the column is overwritten, and reused below.
premium_min = data.Annual_Premium.min()
data = data.assign(
    Annual_Premium_Baseline=(data.Annual_Premium == premium_min).astype(int),
    Annual_Premium=data.Annual_Premium - premium_min,
)
# for the sake of this comparison, let's remove heavy outliers as well:
# keep rows whose original premium was below 100k. The column is already shifted,
# so the threshold is shifted by the same computed minimum rather than a literal.
data = data[data.Annual_Premium < 100000 - premium_min]
_ = plt.hist(data.Annual_Premium, bins='auto')
plt.show()
data.shape
data.Response.mean()  # uneven target
# Separate the target column from the predictors.
y = data["Response"]
X = data.drop(columns="Response")
# Hold out 25% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)
X_train.head()
# Baseline model: LightGBM classifier using DART boosting with 1000 estimators;
# is_unbalance=True is passed because the target is uneven.
model_baseline = lightgbm.LGBMClassifier(
    boosting_type="dart",
    n_estimators=1000,
    is_unbalance=True,
)
model_baseline.fit(X_train, y_train)

# Wrap the fitted model together with the test split for the later comparisons.
exp_baseline = dx.Explainer(
    model=model_baseline,
    data=X_test,
    y=y_test,
    label="lgbm_dart",
    verbose=False,
)
exp_baseline.model_performance()
# Start (or connect to) a local H2O instance and silence its progress bars.
_ = h2o.init(nthreads=-1, max_mem_size=16)
h2o.no_progress()

# H2O trains on its own frame type: predictors and target are uploaded together.
df = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
# Store the categorical target back with item assignment, the documented way to
# replace an H2OFrame column, so the model is trained as a classifier.
df["Response"] = df["Response"].asfactor()

model_h2o = h2o.estimators.H2ORandomForestEstimator(
    ntrees=1000,
    nfolds=3,
    balance_classes=True,
    seed=1,
)
model_h2o.train(
    x=X_train.columns.to_list(),
    y="Response",
    training_frame=df,
)

# The explainer receives the test predictors as an H2OFrame as well.
exp_h2o = dx.Explainer(
    model_h2o,
    h2o.H2OFrame(X_test),
    y_test,
    label="h2o_rf",
    model_type='classification',
    verbose=False,
)
exp_h2o.model_performance()
from sklearn.utils import class_weight

# Balanced class weights as a {class label: weight} dict.
# compute_class_weight returns the weights in the order of `classes`, so the labels
# are taken sorted and each weight is paired with its own label. Enumerating over
# y_train.unique() would key the weights by order of appearance and could swap them.
class_labels = np.unique(y_train)
weights = dict(
    zip(
        class_labels.tolist(),
        class_weight.compute_class_weight(
            class_weight='balanced',
            classes=class_labels,
            y=y_train,
        ),
    )
)
weights
import logging

# Only let TensorFlow's logger emit messages of level ERROR and above.
tf.get_logger().setLevel(logging.ERROR)

# Metrics tracked for every candidate network.
search_metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]

# Random search over at most 5 candidate architectures, maximising "auc".
# NOTE(review): the objective names the training metric "auc"; since fit() below uses a
# validation split, "val_auc" may have been intended - confirm.
# NOTE(review): from_logits=True assumes the network outputs raw logits - confirm against
# the output activation of the AutoKeras classification head.
model_autokeras = ak.StructuredDataClassifier(
    max_trials=5,
    metrics=search_metrics,
    num_classes= 2,
    objective=kt.Objective("auc", direction="max"),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits = True),
    tuner="random",
    seed=1,
    overwrite=True,
)
model_autokeras.fit(
    X_train,
    y_train,
    validation_split=0.25,
    class_weight=weights,
    epochs=20,
    verbose=0,
)

# Export the best network found as a plain Keras model and wrap it for comparison.
model_keras = model_autokeras.export_model()
model_keras.summary()
exp_keras = dx.Explainer(
    model=model_keras,
    data=X_test,
    y=y_test,
    label="autokeras",
    model_type="classification",
    verbose=False,
)
exp_keras.model_performance()
from sklearn.utils import class_weight

# Balanced class weights as a {class label: weight} dict. The labels are taken sorted
# and zipped with the weights, because compute_class_weight answers in the order of
# `classes`; keying by order of appearance in y_train could swap the two weights.
class_labels = np.unique(y_train)
weights = dict(
    zip(
        class_labels.tolist(),
        class_weight.compute_class_weight(
            class_weight='balanced',
            classes=class_labels,
            y=y_train,
        ),
    )
)
# One weight per training row: each label is replaced by the weight of its class.
y_weights = y_train.replace(weights)
y_weights

pool_train = catboost.Pool(X_train, y_train, weight=y_weights)
model_catboost = catboost.CatBoostClassifier(iterations=2000)
model_catboost.fit(pool_train, verbose=False)

exp_catboost = dx.Explainer(model_catboost, X_test, y_test, verbose=False, label="catboost")
exp_catboost.model_performance()
# prepare list of Explainer objects
# (the baseline comes first: the comparison plots call .plot() on exp_list[0]
# and pass the results for exp_list[1:] as the list to draw alongside it)
exp_list = [exp_baseline, exp_h2o, exp_keras, exp_catboost]
In this cross-selling task, the most important measures are precision & recall.
# Performance measures of all models stacked into one table.
pd.concat([exp.model_performance().result for exp in exp_list])

# Every comparison plot is drawn from the first explainer's result, with the
# results of the remaining explainers passed in as a list.
first_exp, other_exps = exp_list[0], exp_list[1:]

first_exp.model_performance().plot([exp.model_performance() for exp in other_exps])

first_exp.model_parts().plot([exp.model_parts() for exp in other_exps])

# Partial dependence on quantile grids.
first_exp.model_profile(variable_splits_type="quantiles").plot(
    [exp.model_profile(variable_splits_type="quantiles") for exp in other_exps],
    variables=['Age', 'Annual_Premium', 'Vintage'],
    title="Partial Dependence",
)

# Partial dependence drawn as bars, evaluated exactly at the values present in the data.
variables = ['Previously_Insured', 'Vehicle_Damage', 'Vehicle_Age']
variable_splits = {var: data[var].unique() for var in variables}
first_exp.model_profile(variable_splits=variable_splits).plot(
    [exp.model_profile(variable_splits=variable_splits) for exp in other_exps],
    variables=variables,
    title="Partial Dependence",
    geom="bars",
)

# Accumulated local effects on quantile grids.
first_exp.model_profile(type='ale', variable_splits_type="quantiles").plot(
    [exp.model_profile(type='ale', variable_splits_type="quantiles") for exp in other_exps],
    variables=['Age', 'Annual_Premium', 'Vintage'],
    title="Accumulated Local Effects",
)

# Diagnostics: absolute residuals plotted against Age.
first_exp.model_diagnostics().plot(
    [exp.model_diagnostics() for exp in other_exps],
    variable="Age",
    yvariable="abs_residuals",
    N=5000,
    marker_size=5,
    line_width=4,
)
# Predictions of every model for the first ten rows of the baseline explainer's data,
# one column per model label.
new_observations = exp_baseline.data.iloc[:10]
pd.DataFrame({exp.label: exp.predict(new_observations) for exp in exp_list})

# A single row (position 9), kept as a one-row frame, is explained by each model.
new_observation = exp_baseline.data.iloc[[9]]

# predict_parts results for every model, on a shared 0-1 range.
exp_list[0].predict_parts(new_observation, label=exp_list[0].label).plot(
    [exp.predict_parts(new_observation, label=exp.label) for exp in exp_list[1:]],
    min_max=[0, 1],
)

# predict_profile results for the three selected variables, on quantile grids.
exp_list[0].predict_profile(
    new_observation,
    variable_splits_type="quantiles",
    label=exp_list[0].label,
).plot(
    [
        exp.predict_profile(new_observation, variable_splits_type="quantiles", label=exp.label)
        for exp in exp_list[1:]
    ],
    variables=['Age', 'Annual_Premium', 'Vintage'],
)

# One predict_surrogate plot per model.
for exp in exp_list:
    exp.predict_surrogate(new_observation).plot()
This package uses plotly to render the plots:
- `plotly` in JupyterLab: Getting Started, Troubleshooting
- `show=False` parameter in the `plot` method to return a plotly `Figure` object
- `dalex` package: Titanic: tutorial and examples