Compare various models with dalex ft. h2o, autokeras, catboost, lightgbm

In [1]:
import dalex as dx
import pandas as pd
import numpy as np
import sklearn
import tensorflow as tf
import autokeras as ak
import kerastuner as kt
import h2o
import catboost
import lightgbm

import warnings
warnings.filterwarnings('ignore')
In [2]:
# session info
pkg_dict = {}
for pkg in [dx, pd, np, sklearn, tf, ak, kt, h2o, catboost, lightgbm]:
    pkg_dict[str.split(str(pkg))[1].replace("'", "")] = pkg.__version__
pd.DataFrame(pkg_dict, index=["version"])
Out[2]:
dalex pandas numpy sklearn tensorflow autokeras kerastuner h2o catboost lightgbm
version 1.0.0 1.1.4 1.19.3 0.23.2 2.3.0 1.0.12 1.0.2 3.30.1.3 0.24.1 2.3.1

data

In [3]:
data = pd.read_csv("train.csv")
data.info()
data.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB
Out[3]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28.0 0 > 2 Years Yes 40454.0 26.0 217 1
1 2 Male 76 1 3.0 0 1-2 Year No 33536.0 26.0 183 0
2 3 Male 47 1 28.0 0 > 2 Years Yes 38294.0 26.0 27 1
3 4 Male 21 1 11.0 1 < 1 Year No 28619.0 152.0 203 0
4 5 Female 29 1 41.0 1 < 1 Year No 27496.0 152.0 39 0

We have 380k observations, 11 variables and binary target. For the purpose of this comparison, let's use 10% of the data.

Clean data:

  • remove id, Policy_Sales_Channel, Region_Code

  • convert Vehicle_Age to integer

  • convert Gender, Vehicle_Damage to binary

  • investigate Driving_License, Annual_Premium

In [4]:
from sklearn.model_selection import train_test_split
data, _ = train_test_split(data, train_size=0.1, random_state=1, stratify=data.Response)
In [5]:
# drop columns
data.drop(["id", "Region_Code", "Policy_Sales_Channel"], axis=1, inplace=True)
In [6]:
# convert three columns
print(data.Vehicle_Age.unique())
data.replace({'Gender': ["Male", "Female"], 'Vehicle_Damage': ["Yes", "No"], 'Vehicle_Age': data.Vehicle_Age.unique()},
             {'Gender': [1, 0], 'Vehicle_Damage': [1, 0], 'Vehicle_Age': [2, 1, 0]},
            inplace=True)
['1-2 Year' '< 1 Year' '> 2 Years']
In [7]:
# what about Driving License in selling the vehicle insurance?
print(data.Driving_License.mean())

# 5% people bought the vehicle insurance without the Driving License
print(data.Response[data.Driving_License==0].mean())

# let's remove this variable for the clarity
# in the model we could assign: IF Driving_License == 0 THEN Response = 0
data.drop("Driving_License", axis=1, inplace=True)
0.9978220939385988
0.04819277108433735
In [8]:
# what about the distribution of Annual_Premium?
import matplotlib.pyplot as plt
_ = plt.hist(data.Annual_Premium, bins='auto', log=True)
plt.show()
In [9]:
# where is the peek?
print(data.Annual_Premium.min())

# a lot of the same values
print((data.Annual_Premium==data.Annual_Premium.min()).sum())

# some very big values (0.2% above 100k)
print((data.Annual_Premium>100000).sum() / data.shape[0])
2630.0
6511
0.0019679874048806087
In [10]:
# let's make a variable indicating the baseline, and move the annual premium
data = data.assign(
    Annual_Premium_Baseline=lambda x: (x.Annual_Premium==data.Annual_Premium.min()).astype(int),
    Annual_Premium=data.Annual_Premium-data.Annual_Premium.min()
)

# for the sake of this comparison, let's remove heavy outliers as well
data = data[data.Annual_Premium<100000-2630]
_ = plt.hist(data.Annual_Premium, bins='auto')
plt.show()

split

In [11]:
data.shape
Out[11]:
(38035, 9)
In [12]:
data.Response.mean() # uneven target
Out[12]:
0.1225713158932562
In [13]:
X, y = data.drop("Response", axis=1), data.Response
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)
In [14]:
X_train.head()
Out[14]:
Gender Age Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Vintage Annual_Premium_Baseline
340370 1 50 1 2 1 0.0 34 1
47839 0 47 0 2 1 24903.0 277 0
348313 0 36 0 2 1 32059.0 66 0
150735 1 43 0 0 1 36913.0 115 0
354291 0 33 1 1 0 21246.0 235 0

models

baseline

In [15]:
model_baseline = lightgbm.LGBMClassifier(boosting_type="dart", n_estimators=1000, is_unbalance=True)
model_baseline.fit(X_train, y_train)
exp_baseline = dx.Explainer(model_baseline, X_test, y_test, verbose=False, label="lgbm_dart")
exp_baseline.model_performance()
Out[15]:
recall precision f1 accuracy auc
lgbm_dart 0.80789 0.2911 0.427987 0.735198 0.83336

h2o

In [ ]:
_ = h2o.init(nthreads=-1, max_mem_size=16)
h2o.no_progress()
In [17]:
df = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
df.Response = df['Response'].asfactor()
model_h2o = h2o.estimators.H2ORandomForestEstimator(ntrees=1000,
                                                    nfolds=3,
                                                    balance_classes=True,
                                                    seed=1)
model_h2o.train(x=X_train.columns.to_list(),
                y="Response",
                training_frame=df)
In [18]:
exp_h2o = dx.Explainer(model_h2o, h2o.H2OFrame(X_test), y_test,
                       label="h2o_rf", model_type='classification', verbose=False)
exp_h2o.model_performance()
Out[18]:
recall precision f1 accuracy auc
h2o_rf 0.081475 0.305466 0.128639 0.864655 0.824528

autokeras

In [19]:
from sklearn.utils import class_weight
weights = dict(enumerate(class_weight.compute_class_weight(class_weight='balanced',
                                                           classes=y_train.unique(),
                                                           y=y_train)))
weights
Out[19]:
{0: 0.5698361965641231, 1: 4.079805491990847}
In [20]:
import logging
tf.get_logger().setLevel(logging.ERROR)
model_autokeras = ak.StructuredDataClassifier(
    max_trials=5,
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc')
    ],
    num_classes= 2,
    objective=kt.Objective("auc", direction="max"),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits = True),
    tuner="random",
    seed=1, overwrite=True
)

model_autokeras.fit(X_train, y_train, validation_split=0.25, class_weight=weights, epochs=20, verbose=0)
In [21]:
model_keras = model_autokeras.export_model()
model_keras.summary()
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
multi_category_encoding (Mul (None, 8)                 0         
_________________________________________________________________
normalization (Normalization (None, 8)                 17        
_________________________________________________________________
dense (Dense)                (None, 16)                144       
_________________________________________________________________
batch_normalization (BatchNo (None, 16)                64        
_________________________________________________________________
re_lu (ReLU)                 (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
re_lu_2 (ReLU)               (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
_________________________________________________________________
classification_head_1 (Activ (None, 1)                 0         
=================================================================
Total params: 13,986
Trainable params: 13,361
Non-trainable params: 625
_________________________________________________________________
In [22]:
exp_keras = dx.Explainer(model=model_keras, data=X_test, y=y_test,
                         label="autokeras", model_type="classification", verbose=False)
exp_keras.model_performance()
Out[22]:
recall precision f1 accuracy auc
autokeras 0.957118 0.257321 0.405597 0.65601 0.832841

catboost

In [23]:
from sklearn.utils import class_weight
weights = dict(enumerate(class_weight.compute_class_weight(class_weight='balanced',
                                                           classes=y_train.unique(),
                                                           y=y_train)))
y_weights = y_train.replace(list(weights.keys()), list(weights.values()))
y_weights
Out[23]:
340370    0.569836
47839     0.569836
348313    0.569836
150735    0.569836
354291    0.569836
            ...   
298505    0.569836
43816     0.569836
357934    0.569836
25826     0.569836
289619    4.079805
Name: Response, Length: 28526, dtype: float64
In [24]:
pool_train = catboost.Pool(X_train, y_train,
                           weight=y_weights)
model_catboost = catboost.CatBoostClassifier(iterations=2000)
model_catboost.fit(pool_train, verbose=False)
Out[24]:
<catboost.core.CatBoostClassifier at 0x2a08ef41250>
In [25]:
exp_catboost = dx.Explainer(model_catboost, X_test, y_test, verbose=False, label="catboost")
exp_catboost.model_performance()
Out[25]:
recall precision f1 accuracy auc
catboost 0.86964 0.290878 0.435942 0.724051 0.841005

comparison with dalex

In [26]:
# prepare list of Explainer objects
exp_list = [exp_baseline, exp_h2o, exp_keras, exp_catboost]

In this cross-selling task, the most important measures are precision & recall.

In [27]:
pd.concat([exp.model_performance().result for exp in exp_list])
Out[27]:
recall precision f1 accuracy auc
lgbm_dart 0.807890 0.291100 0.427987 0.735198 0.833360
h2o_rf 0.081475 0.305466 0.128639 0.864655 0.824528
autokeras 0.957118 0.257321 0.405597 0.656010 0.832841
catboost 0.869640 0.290878 0.435942 0.724051 0.841005

residual distribution

In [28]:
exp_list[0].model_performance().plot([exp.model_performance() for exp in exp_list[1:]])

In [29]:
exp_list[0].model_parts().plot([exp.model_parts() for exp in exp_list[1:]])

In [30]:
exp_list[0].model_profile(variable_splits_type="quantiles").plot(
    [exp.model_profile(variable_splits_type="quantiles") for exp in exp_list[1:]],
    variables=['Age', 'Annual_Premium', 'Vintage'],
    title="Partial Dependence"
)
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.96it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:34<00:00,  4.36s/it]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.97it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:00<00:00, 69.74it/s]
In [31]:
variables = ['Previously_Insured', 'Vehicle_Damage', 'Vehicle_Age']
variable_splits = {var: data[var].unique() for var in variables}
exp_list[0].model_profile(variable_splits=variable_splits).plot(
    [exp.model_profile(variable_splits=variable_splits) for exp in exp_list[1:]],
    variables=variables,
    title="Partial Dependence",
    geom="bars"
)
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 3/3 [00:00<00:00, 46.14it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.43it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 3/3 [00:00<00:00, 18.60it/s]
Calculating ceteris paribus: 100%|██████████████████████████████████████████████████████| 3/3 [00:00<00:00, 123.92it/s]

In [32]:
exp_list[0].model_profile(type='ale', variable_splits_type="quantiles").plot(
    [exp.model_profile(type='ale', variable_splits_type="quantiles") for exp in exp_list[1:]],
    variables=['Age', 'Annual_Premium', 'Vintage'], title="Accumulated Local Effects"
)
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.87it/s]
Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 12.85it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:33<00:00,  4.13s/it]
Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 12.17it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.23it/s]
Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 8/8 [00:01<00:00,  6.56it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:00<00:00, 41.88it/s]
Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 8/8 [00:01<00:00,  6.72it/s]

In [33]:
exp_list[0].model_diagnostics().plot([exp.model_diagnostics() for exp in exp_list[1:]],
                                     variable="Age", yvariable="abs_residuals", N=5000, marker_size=5, line_width=4)

predict

In [34]:
new_observations = exp_baseline.data.iloc[0:10,]
pd.DataFrame({exp.label: exp.predict(new_observations) for exp in exp_list})
Out[34]:
lgbm_dart h2o_rf autokeras catboost
0 0.784506 0.399221 0.772648 0.803115
1 0.640882 0.156924 0.745807 0.665040
2 0.000287 0.000115 0.000126 0.002320
3 0.000034 0.000020 0.000012 0.000057
4 0.000703 0.000023 0.000610 0.001277
5 0.000049 0.000026 0.000152 0.002021
6 0.000222 0.000023 0.000153 0.000947
7 0.755653 0.295621 0.736705 0.727939
8 0.000058 0.000144 0.000029 0.000718
9 0.643059 0.266292 0.608771 0.564415

In [35]:
new_observation = exp_baseline.data.iloc[[9]]
exp_list[0].predict_parts(new_observation, label=exp_list[0].label).plot(
    [exp.predict_parts(new_observation, label=exp.label) for exp in exp_list[1:]],
    min_max=[0, 1]
)

In [36]:
new_observation = exp_baseline.data.iloc[[9]]
exp_list[0].predict_profile(new_observation, variable_splits_type="quantiles", label=exp_list[0].label).plot(
    [exp.predict_profile(new_observation, variable_splits_type="quantiles", label=exp.label) for exp in exp_list[1:]],
    variables=['Age', 'Annual_Premium', 'Vintage']
)
Calculating ceteris paribus: 100%|██████████████████████████████████████████████████████| 8/8 [00:00<00:00, 159.87it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:01<00:00,  6.38it/s]
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 8/8 [00:00<00:00, 24.23it/s]
Calculating ceteris paribus: 100%|██████████████████████████████████████████████████████| 8/8 [00:00<00:00, 110.50it/s]

In [37]:
new_observation = exp_baseline.data.iloc[[9]]
for exp in exp_list:
    exp.predict_surrogate(new_observation).plot()

Plots

This package uses plotly to render the plots:

In [ ]: