import dalex as dx
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import warnings
# Silence every warning for the rest of the session to keep cell output tidy.
warnings.filterwarnings(action='ignore')
# Bare expression: shows the installed dalex version when run as a notebook cell.
dx.__version__
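Load `fifa`, the preprocessed `players_20` dataset. It contains the 5000 best players (ranked by `overall`) and 43 columns, which are shown in the preview below.
Several of these columns (`overall`, `potential`, `value_eur`, `wage_eur`) could serve as the target, so it is advised to leave only one target variable for modeling.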
# Fetch the FIFA players dataset that ships with dalex.
data = dx.datasets.load_fifa()
# Preview the first ten rows (rendered as cell output in a notebook).
data.head(n=10)
Divide the data into the explanatory variables `X` and the target variable `y`. Here we will be predicting the value (`value_eur`) of the best players.
# Features: everything except the player's nationality and the columns that
# could act as targets (including the one actually predicted below).
X = data.drop(
    columns=["nationality", "overall", "potential", "value_eur", "wage_eur"]
)
# Target: the player's value in EUR.
y = data["value_eur"]
The target variable is skewed, so we transform it with the natural logarithm for a better fit.
import matplotlib.pyplot as plt

# Natural logarithm of the target; the models below are trained on this.
ylog = np.log(y)

# Histogram of the transformed target; bins='auto' leaves the bin count
# to matplotlib/numpy.
plt.hist(ylog, bins='auto')
plt.title("ln(value_eur)")
plt.show()
Split the data into training and test sets.
# Split the features, the log-target and the raw target in a single call so
# the three stay row-aligned; 25% of the rows go to the test set and the
# fixed seed makes the split reproducible.
X_train, X_test, ylog_train, ylog_test, y_train, y_test = train_test_split(
    X, ylog, y, test_size=0.25, random_state=4
)
# Baseline model with default hyperparameters, trained on the log-target.
# Logging is silenced through the constructor (`verbose=-1`): the `verbose`
# keyword of `fit()` was deprecated in LightGBM 3.3 and removed in 4.0, where
# passing it raises a TypeError.
gbm_default = LGBMRegressor(verbose=-1)
gbm_default.fit(X_train, ylog_train)
# Bare expression: shows the estimator type tag when run as a notebook cell.
gbm_default._estimator_type
#:# hp tuning
# Candidate values sampled by the randomized search.
param_test = {
    'n_estimators': list(range(201, 1202, 50)),
    'num_leaves': list(range(6, 42, 5)),
    'min_child_weight': [0.001, 0.01, 0.1, 0.15],
    'learning_rate': [0.001, 0.01, 0.1, 0.15],
}

# Base estimator for the search; n_jobs=-1 is passed through to LightGBM.
estimator = LGBMRegressor(n_jobs=-1)

# 100 sampled configurations, each scored with 4-fold cross-validation.
rs = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_test,
    n_iter=100,
    cv=4,
    random_state=1,
)

# The search itself is left disabled; its outcome is hard-coded as
# `best_params` further down.
# rs.fit(X, ylog)
# print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))
#:# best parameters after 100 iterations
# Hard-coded hyperparameters, so the tuned model can be rebuilt without
# re-running the randomized search.
best_params = dict(
    num_leaves=6,
    n_estimators=951,
    min_child_weight=0.1,
    learning_rate=0.15,
)

gbm_tuned = LGBMRegressor(**best_params)
# Like the default model, the tuned one is trained on the log-transformed target.
gbm_tuned.fit(X_train, ylog_train)
We want the explanations to show the real values of the target variable (not their logarithm). Therefore, we need to define a custom `predict_function` that maps the model's log-scale predictions back to the original scale.
def predict_function(model, data):
    """Return the model's predictions mapped back from the log scale.

    The models in this file are fitted on the natural log of the target, so
    their raw predictions are exponentiated to recover values on the
    original scale.

    Parameters
    ----------
    model : object
        Any object exposing a ``predict(data)`` method.
    data : object
        Input passed unchanged to ``model.predict``.

    Returns
    -------
    The result of ``np.exp`` applied to ``model.predict(data)``.
    """
    return np.exp(model.predict(data))
# Wrap each model in a dalex Explainer. The raw (non-log) test target is
# supplied together with `predict_function`, which returns predictions on
# that same scale.
exp_default = dx.Explainer(
    gbm_default,
    data=X_test,
    y=y_test,
    predict_function=predict_function,
    label='default',
)
exp_tuned = dx.Explainer(
    gbm_tuned,
    data=X_test,
    y=y_test,
    predict_function=predict_function,
    label='tuned',
)
The functionalities above are accessible from the `Explainer` object through its methods.
Model-level and predict-level methods return a new unique object that contains the `result` attribute (a `pandas.DataFrame`) and the `plot` method.
# Regression performance of the default model on the test data.
mp_default = exp_default.model_performance(model_type="regression")
mp_default.result

# The same measures for the tuned model.
mp_tuned = exp_tuned.model_performance(model_type="regression")
mp_tuned.result

# Plot both models together for comparison.
mp_default.plot(mp_tuned)