Machine Learning Exam
Elias Fischer, Noah Pokorny, Lucas Intveen, Niklas Jessen
Load Data and Required Packages
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from pydotplus import graph_from_dot_data
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
# Read raw data file
xsell_raw = pd.read_csv('xsell_raw.csv')

# First 5 rows of the dataset
xsell_raw.head()
Exploratory Data Analysis
# Print shape, columns and info
print(xsell_raw.shape)
print(xsell_raw.columns)
print(xsell_raw.info())
(10000, 32)
Index(['xsell', 'age', 'entry_age', 'gender', 'acad', 'marital', 'nprod',
'duration', 'last_acc', 'occupation', 'nr_hh', 'ppower', 'avg_res_dur',
'pop_km', 'car_seg', 'res_move_365', 'giro_mailing', 'total_mailings',
'extra_acc', 'fixed_acc', 'constr_loan', 'cons_loan', 'sec_acc',
'total_savings', 'total_loans', 'transactions_year', 'logins',
'mobile_logins', 'calls', 'referrals', 'complaints', 'pref_device'],
dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 xsell 10000 non-null int64
1 age 10000 non-null int64
2 entry_age 9999 non-null float64
3 gender 9999 non-null object
4 acad 10000 non-null int64
5 marital 10000 non-null object
6 nprod 10000 non-null int64
7 duration 10000 non-null int64
8 last_acc 10000 non-null int64
9 occupation 4149 non-null object
10 nr_hh 9933 non-null float64
11 ppower 9865 non-null float64
12 avg_res_dur 9071 non-null float64
13 pop_km 9922 non-null float64
14 car_seg 8904 non-null float64
15 res_move_365 10000 non-null int64
16 giro_mailing 10000 non-null int64
17 total_mailings 10000 non-null int64
18 extra_acc 10000 non-null int64
19 fixed_acc 10000 non-null int64
20 constr_loan 10000 non-null int64
21 cons_loan 10000 non-null int64
22 sec_acc 10000 non-null int64
23 total_savings 10000 non-null float64
24 total_loans 10000 non-null float64
25 transactions_year 9832 non-null float64
26 logins 10000 non-null int64
27 mobile_logins 10000 non-null int64
28 calls 10000 non-null int64
29 referrals 10000 non-null int64
30 complaints 10000 non-null int64
31 pref_device 4968 non-null object
dtypes: float64(9), int64(19), object(4)
memory usage: 2.4+ MB
# Number of missing values per column
display(xsell_raw.isnull().sum())

# Statistics for numerical columns
xsell_raw.describe()
# Correlation matrix
corr_matrix = xsell_raw.corr()

# Visualize the correlation matrix as a heatmap
fig, ax = plt.subplots(figsize=(8.5, 8.5))
sn.heatmap(corr_matrix, linewidths=.5, annot=False, yticklabels=True, xticklabels=True,
           cmap=sn.color_palette("coolwarm", 12), ax=ax)
# Create histograms
# Note: total_exposure (= total_savings + total_loans) is generated in the
# Pre-processing section below and must exist before this cell runs.
fig, ((ax0, ax1), (ax2, ax3), (ax4, ax5), (ax6, ax7)) = plt.subplots(nrows=4, ncols=2, figsize=(12, 15))

ax0.hist(xsell_raw["age"], bins=70, color="#6C90F1", range=(18, 70))
ax0.set_title('Master Data: Age Between 18 and 70 Years')

# Transformation of gender column necessary (missing values become the string 'nan')
xsell_raw["gender"] = xsell_raw["gender"].astype(str)
ax1.hist(xsell_raw["gender"], bins=4, color="#6C90F1")
ax1.set_title('Master Data: Gender')

ax2.hist(xsell_raw["nprod"], bins=6, color="#6C90F1", range=(1, 6))
ax2.set_title('Product Data: Products or Accounts Owned')

ax3.hist(xsell_raw["total_savings"], bins=100, color="#6C90F1", range=(0, 100))
ax3.set_title('Account Data: Total Savings in EUR (Max. +100€)')

ax4.hist(xsell_raw["total_loans"], bins=100, range=(-100, 0), color="#6C90F1")
ax4.set_title('Account Data: Total Loans in EUR (Max. -100€)')

ax5.hist(xsell_raw["total_exposure"], bins=100, range=(-100, 0), color="#6C90F1")
ax5.set_title('Account Data: Total Exposure in EUR (Max. -100€)')

ax6.hist(xsell_raw["transactions_year"], bins=40, color="#6C90F1", range=(0, 40))
ax6.set_title('Action Data: Number of Transactions in 1 Year')

ax7.hist(xsell_raw["total_mailings"], bins=9, color="#6C90F1", range=(0, 9))
ax7.set_title('Action Data: Mailings Received in the Last 2 Years')
Pre-processing and Feature Engineering
# Generate variable total_exposure
xsell_raw["total_exposure"] = xsell_raw.total_savings + xsell_raw.total_loans
xsell_raw.columns
Remove Columns With Too Many Missing Values
The MinMaxScaler is used for normalization because the StandardScaler standardizes under the assumption that each variable follows a roughly bell-shaped (normal) distribution, which does not hold for most variables in this dataset.
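To illustrate the difference, here is a minimal sketch on a made-up, heavily skewed column (the values are hypothetical): MinMaxScaler maps everything into [0, 1], while StandardScaler centers to zero mean and unit variance, so a single outlier dominates the scale.

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

skewed = np.array([[0.], [1.], [2.], [3.], [100.]])  # hypothetical skewed values
print(MinMaxScaler().fit_transform(skewed).ravel())    # bounded in [0, 1]
print(StandardScaler().fit_transform(skewed).ravel())  # zero mean, unit variance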
# Create restricted dataset
xsell_res = xsell_raw[:]
xsell_res = xsell_res.drop(columns=["occupation", "pref_device", "car_seg", "avg_res_dur"]).dropna()

# Get dummies
xsell_res = pd.get_dummies(xsell_res)

# Normalize variables (min-max scaler)
min_max_scaler = preprocessing.MinMaxScaler()
xsell_res_scaled = xsell_res[:]
xsell_res_scaled[:] = min_max_scaler.fit_transform(xsell_res_scaled[:])
print('The restricted dataframe has the shape {0} with the following columns: {1}'.format(xsell_res_scaled.shape, xsell_res_scaled.columns))
The restricted dataframe has the shape (9721, 38) with the following columns: Index(['xsell', 'age', 'entry_age', 'acad', 'nprod', 'duration', 'last_acc',
'nr_hh', 'ppower', 'pop_km', 'res_move_365', 'giro_mailing',
'total_mailings', 'extra_acc', 'fixed_acc', 'constr_loan', 'cons_loan',
'sec_acc', 'total_savings', 'total_loans', 'transactions_year',
'logins', 'mobile_logins', 'calls', 'referrals', 'complaints',
'total_exposure', 'gender_F', 'gender_M', 'gender_MF', 'gender_nan',
'marital_cohabiting', 'marital_divorced', 'marital_married',
'marital_separated', 'marital_single', 'marital_unmarried',
'marital_widowed'],
dtype='object')
Impute Missing Values
X = pd.get_dummies(xsell_raw).drop(columns="xsell")
y = xsell_raw.xsell

# Evaluate different k for the KNN imputer on the dataset
results = list()
strategies = [str(i) for i in [1, 3, 5, 7, 9, 15, 18, 21]]
for s in strategies:
    # Create the modeling pipeline: imputation followed by a random forest
    pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', RandomForestClassifier())])
    # Evaluate the model
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=5, n_jobs=-1)
    # Store results
    results.append(scores)
    print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))
>1 0.857 (0.172)
>3 0.859 (0.170)
>5 0.857 (0.172)
>7 0.856 (0.174)
>9 0.858 (0.172)
>15 0.859 (0.171)
>18 0.859 (0.169)
>21 0.855 (0.174)
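As a quick illustration of what the imputer is doing, here is a minimal sketch on a made-up array: KNNImputer replaces a missing entry with the average of that feature over the k rows closest on the non-missing features (here 4.0, the mean of 2.0 and 6.0).

from sklearn.impute import KNNImputer
import numpy as np

toy = np.array([[1., 2.],
                [2., np.nan],  # missing value to be filled
                [3., 6.],
                [8., 8.]])
print(KNNImputer(n_neighbors=2).fit_transform(toy))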
# Plot model accuracy vs k
mean_scores = []
for i in range(0, 8):
    mean_scores.append(np.mean(results[i]))
plt.plot(strategies, mean_scores)
plt.title("Imputation k")
plt.xlabel("Number of Neighbors K")
plt.ylabel("Model Accuracy")
plt.show()
# Calling the imputer with k=3
imputer = KNNImputer(n_neighbors=3, weights="uniform")

# Get dummies
xsell_imp = xsell_raw[:]
xsell_imp = pd.get_dummies(xsell_imp)
xsell_scaled = xsell_imp[:]
xsell_scaled[:] = min_max_scaler.fit_transform(xsell_imp[:])

# Impute xsell_raw
imputed_vals = imputer.fit_transform(xsell_scaled)
xsell_imp_scaled = xsell_imp[:]
xsell_imp_scaled[:] = pd.DataFrame(imputed_vals)
print('The imputed dataframe has the shape {0} with the following columns: \n {1}'.format(xsell_imp_scaled.shape, xsell_imp_scaled.columns))
The imputed dataframe has the shape (10000, 56) with the following columns:
Index(['xsell', 'age', 'entry_age', 'acad', 'nprod', 'duration', 'last_acc',
'nr_hh', 'ppower', 'avg_res_dur', 'pop_km', 'car_seg', 'res_move_365',
'giro_mailing', 'total_mailings', 'extra_acc', 'fixed_acc',
'constr_loan', 'cons_loan', 'sec_acc', 'total_savings', 'total_loans',
'transactions_year', 'logins', 'mobile_logins', 'calls', 'referrals',
'complaints', 'total_exposure', 'gender_F', 'gender_M', 'gender_MF',
'gender_nan', 'marital_cohabiting', 'marital_divorced',
'marital_married', 'marital_separated', 'marital_single',
'marital_unmarried', 'marital_widowed', 'occupation_apprentice',
'occupation_blue-collar worker', 'occupation_freelancer',
'occupation_housewife', 'occupation_pensioner/retiree',
'occupation_private means', 'occupation_public servant',
'occupation_self-employed', 'occupation_soldier', 'occupation_student',
'occupation_unemployed', 'occupation_university student',
'occupation_white-collar worker', 'pref_device_PC/Laptop',
'pref_device_Tablet', 'pref_device_mobile'],
dtype='object')
Split Data into Training/Validation and Test Sets
# Split dataframe into dependent variable (Y) and independent variables (X)
# Restricted dataset
X_res = xsell_res_scaled.drop('xsell', axis=1)
Y_res = xsell_res_scaled[['xsell']]
# Imputed dataset
X_imp = xsell_imp_scaled.drop('xsell', axis=1)
Y_imp = xsell_imp_scaled[['xsell']]

# Create train (0.6), validation (0.2) and test (0.2) sets
# Restricted dataset
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res, Y_res, test_size=0.2, random_state=42)
X_res_train, X_res_val, y_res_train, y_res_val = train_test_split(X_res_train, y_res_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Imputed dataset
X_imp_train, X_imp_test, y_imp_train, y_imp_test = train_test_split(X_imp, Y_imp, test_size=0.2, random_state=42)
X_imp_train, X_imp_val, y_imp_train, y_imp_val = train_test_split(X_imp_train, y_imp_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2
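The splits above are not stratified on the target. Since train_test_split accepts a stratify argument, a variant that preserves the xsell class ratio in every split could look like this sketch (an alternative, not what was run here):

X_tr, X_te, y_tr, y_te = train_test_split(X_res, Y_res, test_size=0.2, random_state=42, stratify=Y_res['xsell'])
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=0.25, random_state=42, stratify=y_tr['xsell'])
print(y_tr['xsell'].mean(), y_va['xsell'].mean(), y_te['xsell'].mean())  # near-identical class rates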
Visualization using PCA and t-SNE
# Visualize using PCA
pca = PCA(n_components=3)
pca_res_3d = pca.fit_transform(X_res)
pca_imp_3d = pca.fit_transform(X_imp)

# Plot restricted dataset
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=pca_res_3d[:, 0], ys=pca_res_3d[:, 1], zs=pca_res_3d[:, 2], c=Y_res["xsell"])
ax.set_title('Restricted Dataset using PCA')
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

# Plot imputed dataset
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=pca_imp_3d[:, 0], ys=pca_imp_3d[:, 1], zs=pca_imp_3d[:, 2], c=Y_imp["xsell"])
ax.set_title('Imputed Dataset using PCA')
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()
# Use t-SNE to reduce the 3D PCA representation to 2D
# Note: n_iter=300 is close to scikit-learn's minimum (250); the default is 1000.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_res_2d = tsne.fit_transform(pca_res_3d)
tsne_imp_2d = tsne.fit_transform(pca_imp_3d)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 9721 samples in 0.007s...
[t-SNE] Computed neighbors for 9721 samples in 0.371s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9721
[t-SNE] Computed conditional probabilities for sample 2000 / 9721
[t-SNE] Computed conditional probabilities for sample 3000 / 9721
[t-SNE] Computed conditional probabilities for sample 4000 / 9721
[t-SNE] Computed conditional probabilities for sample 5000 / 9721
[t-SNE] Computed conditional probabilities for sample 6000 / 9721
[t-SNE] Computed conditional probabilities for sample 7000 / 9721
[t-SNE] Computed conditional probabilities for sample 8000 / 9721
[t-SNE] Computed conditional probabilities for sample 9000 / 9721
[t-SNE] Computed conditional probabilities for sample 9721 / 9721
[t-SNE] Mean sigma: 0.012118
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.338646
[t-SNE] KL divergence after 300 iterations: 1.945432
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.018s...
[t-SNE] Computed neighbors for 10000 samples in 0.407s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.018020
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.045982
[t-SNE] KL divergence after 300 iterations: 1.871857
# Plot 2D representation
fig, ax = plt.subplots()
ax.scatter(x=tsne_res_2d[:, 0], y=tsne_res_2d[:, 1], c=Y_res["xsell"])
ax.set_title('Restricted Dataset using PCA and t-SNE')
ax.set_xlabel('tsne_pca-one')
ax.set_ylabel('tsne_pca-two')
plt.show()

fig, ax = plt.subplots()
ax.scatter(x=tsne_imp_2d[:, 0], y=tsne_imp_2d[:, 1], c=Y_imp["xsell"])
ax.set_title('Imputed Dataset using PCA and t-SNE')
ax.set_xlabel('tsne_pca-one')
ax.set_ylabel('tsne_pca-two')
plt.show()
Create Models
KNN
# Grid search for optimizing hyperparameters
param_grid_knn = {"n_neighbors": [1, 3, 5, 7, 9, 19, 29, 39, 49, 59, 69, 79, 89, 99],
                  "weights": ["uniform", "distance"],
                  "metric": ['euclidean', 'manhattan']}
knn_clf = KNeighborsClassifier()

# Restricted dataset
knn_grid_search_res = GridSearchCV(knn_clf, param_grid_knn, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
knn_gs_res_fit = knn_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(knn_grid_search_res.best_score_, knn_grid_search_res.best_params_))

# Imputed dataset
knn_grid_search_imp = GridSearchCV(knn_clf, param_grid_knn, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
knn_gs_imp_fit = knn_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(knn_grid_search_imp.best_score_, knn_grid_search_imp.best_params_))
Fitting 10 folds for each of 56 candidates, totalling 560 fits
The best results (Accuracy: 0.8909357652656622) on the restricted dataset were achieved with the following parameters:
{'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'distance'}
Fitting 10 folds for each of 56 candidates, totalling 560 fits
The best results (Accuracy: 0.898) on the imputed dataset were achieved with the following parameters:
{'metric': 'euclidean', 'n_neighbors': 29, 'weights': 'distance'}
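GridSearchCV keeps the per-candidate scores in its cv_results_ attribute; as an illustrative addition, the mean validation accuracy could be plotted against k for both weightings (shown here for the euclidean metric on the restricted dataset):

cv_res = pd.DataFrame(knn_grid_search_res.cv_results_)
for w in ['uniform', 'distance']:
    sub = cv_res[(cv_res['param_weights'] == w) & (cv_res['param_metric'] == 'euclidean')]
    plt.plot(sub['param_n_neighbors'].astype(int), sub['mean_test_score'], label=w)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Mean CV Accuracy')
plt.legend()
plt.show()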
# Run KNN on the test set with optimized parameters
# Note: scikit-learn's convention is classification_report(y_true, y_pred); here the
# arguments are passed in the opposite order (predictions first), which interchanges
# the precision and recall columns. The same ordering is used in the later model sections.
# Restricted dataset
knn_clf_res = knn_gs_res_fit.best_estimator_
knn_res_test_fit = knn_clf_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_test = knn_res_test_fit.predict(X_res_test)
print('Metrics for the restricted dataset: \n', classification_report(y_res_pred_test, y_res_test['xsell']))

# Imputed dataset
knn_clf_imp = knn_gs_imp_fit.best_estimator_
knn_imp_test_fit = knn_clf_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_test = knn_imp_test_fit.predict(X_imp_test)
print('Metrics for the imputed dataset: \n', classification_report(y_imp_pred_test, y_imp_test['xsell']))
Metrics for the restricted dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 975
1.0 0.91 0.91 0.91 970
accuracy 0.91 1945
macro avg 0.91 0.91 0.91 1945
weighted avg 0.91 0.91 0.91 1945
Metrics for the imputed dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 1001
1.0 0.91 0.91 0.91 999
accuracy 0.91 2000
macro avg 0.91 0.91 0.91 2000
weighted avg 0.91 0.91 0.91 2000
Logistic Regression
# Grid search for optimizing hyperparameters
log_clf = LogisticRegression(max_iter=10000)
param_grid_log = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}  # l1 = lasso, l2 = ridge

# Restricted dataset
log_grid_search_res = GridSearchCV(log_clf, param_grid_log, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
log_gs_res_fit = log_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(log_gs_res_fit.best_score_, log_gs_res_fit.best_params_))

# Imputed dataset
log_grid_search_imp = GridSearchCV(log_clf, param_grid_log, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
log_gs_imp_fit = log_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(log_gs_imp_fit.best_score_, log_gs_imp_fit.best_params_))
Fitting 10 folds for each of 14 candidates, totalling 140 fits
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the test scores are non-finite: [ nan 0.8904203 nan 0.8904203 nan 0.8904203
nan 0.8904203 nan 0.8904203 nan 0.88938937
nan 0.88836373]
category=UserWarning
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the train scores are non-finite: [ nan 0.89043195 nan 0.89043195 nan 0.89043195
nan 0.89043195 nan 0.89117504 nan 0.89271822
nan 0.89294686]
category=UserWarning
The best results (Accuracy: 0.8904203013481364) on the restricted dataset were achieved with the following parameters:
{'C': 0.001, 'penalty': 'l2'}
Fitting 10 folds for each of 14 candidates, totalling 140 fits
The best results (Accuracy: 0.8985) on the imputed dataset were achieved with the following parameters:
{'C': 0.001, 'penalty': 'l2'}
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the test scores are non-finite: [ nan 0.8985 nan 0.8985 nan 0.8985 nan 0.898 nan 0.8985
nan 0.897 nan 0.895 ]
category=UserWarning
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the train scores are non-finite: [ nan 0.8985 nan 0.8985 nan 0.8985
nan 0.89866667 nan 0.89961111 nan 0.89972222
nan 0.89977778]
category=UserWarning
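The non-finite scores in the warnings above come from the l1 candidates: LogisticRegression's default lbfgs solver does not support the l1 penalty, so those fits fail and are scored as NaN. A minimal sketch of how the l1 half of the grid could be made to work (assuming the liblinear solver is acceptable here) would be:

# liblinear (or saga) supports both l1 and l2 penalties
log_clf_l1 = LogisticRegression(solver='liblinear', max_iter=10000)
gs_l1 = GridSearchCV(log_clf_l1, param_grid_log, cv=10, scoring="accuracy", n_jobs=-1)
# gs_l1.fit(X_res_val, y_res_val['xsell'])  # all 14 candidates now yield finite scores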
log_clf_res = log_gs_res_fit.best_estimator_
log_clf_imp = log_gs_imp_fit.best_estimator_

# Restricted dataset
log_res = log_clf_res.fit(X_res_train, y_res_train['xsell'])
log_res_score = log_res.score(X_res_train, y_res_train['xsell'])
print('Restricted Dataset: Logistic Regression score on training set: {0}'.format(log_res_score))
y_res_pred_log = log_res.predict(X_res_test)
print(classification_report(y_res_test, y_res_pred_log))

# Imputed dataset
log_imp = log_clf_imp.fit(X_imp_train, y_imp_train['xsell'])
log_imp_score = log_imp.score(X_imp_train, y_imp_train['xsell'])
print('Imputed Dataset: Logistic Regression score on training set: {0}'.format(log_imp_score))
y_imp_pred_log = log_imp.predict(X_imp_test)
print(classification_report(y_imp_test, y_imp_pred_log))

if log_imp_score > log_res_score:
    print("\nLogistic Regression performs better on the imputed dataset.")
else:
    print("\nLogistic Regression performs better on the restricted dataset.")
Restricted Dataset: Logistic Regression score on training set: 0.9005486968449932
precision recall f1-score support
0.0 0.91 0.91 0.91 968
1.0 0.91 0.91 0.91 977
accuracy 0.91 1945
macro avg 0.91 0.91 0.91 1945
weighted avg 0.91 0.91 0.91 1945
Imputed Dataset: Logistic Regression score on training set: 0.898
precision recall f1-score support
0.0 0.91 0.91 0.91 996
1.0 0.91 0.91 0.91 1004
accuracy 0.91 2000
macro avg 0.91 0.91 0.91 2000
weighted avg 0.91 0.91 0.91 2000
Logistic Regression performs better on the restricted dataset.
# Feature importance, restricted dataset
log_importances = pd.Series(log_res.coef_[0], X_res_train.columns)
log_importances = log_importances.sort_values(ascending=False)
print(log_importances)

fig, ax = plt.subplots(figsize=(8.5, 8.5))
ax.bar(X_res_train.columns, log_res.coef_[0])
print(log_res.coef_)
marital_married 0.080031
age 0.061307
gender_MF 0.042160
entry_age 0.030141
last_acc 0.025576
duration 0.023563
pop_km 0.016969
constr_loan 0.012987
marital_cohabiting 0.008162
acad 0.006113
marital_divorced 0.004660
logins 0.004535
sec_acc 0.004123
calls 0.002944
extra_acc 0.002768
nprod 0.002475
marital_widowed 0.002279
complaints 0.001182
marital_separated 0.001178
nr_hh 0.000730
mobile_logins 0.000271
gender_nan 0.000000
transactions_year -0.000060
total_savings -0.000491
fixed_acc -0.000561
ppower -0.001031
referrals -0.001935
total_exposure -0.004080
res_move_365 -0.004093
total_loans -0.010466
cons_loan -0.015873
gender_F -0.020259
gender_M -0.021901
marital_unmarried -0.037191
total_mailings -0.056862
marital_single -0.059117
giro_mailing -0.849402
dtype: float64
[[ 6.13066771e-02 3.01407003e-02 6.11330406e-03 2.47464439e-03
2.35634195e-02 2.55760153e-02 7.30384784e-04 -1.03094117e-03
1.69691310e-02 -4.09313762e-03 -8.49401927e-01 -5.68619178e-02
2.76784987e-03 -5.60969443e-04 1.29866962e-02 -1.58726780e-02
4.12316366e-03 -4.91285019e-04 -1.04663913e-02 -5.99806312e-05
4.53540079e-03 2.71063492e-04 2.94399396e-03 -1.93498395e-03
1.18158155e-03 -4.08002783e-03 -2.02589927e-02 -2.19011260e-02
4.21600717e-02 0.00000000e+00 8.16154887e-03 4.65980655e-03
8.00305484e-02 1.17799812e-03 -5.91174580e-02 -3.71911041e-02
2.27861324e-03]]
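Because these are logistic-regression coefficients on min-max-scaled features, each one is the change in the log-odds of xsell = 1 as a feature moves over its full [0, 1] range. A short sketch (an illustrative addition, reusing log_res fitted above) converts them to the more readable odds ratios:

# exp(coef) is the multiplicative change in the odds of xsell = 1
odds_ratios = pd.Series(np.exp(log_res.coef_[0]), index=X_res_train.columns)
print(odds_ratios.sort_values())  # e.g. exp(-0.849) ≈ 0.43 for giro_mailing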
# Feature importance, imputed dataset
log_importances = pd.Series(log_imp.coef_[0], X_imp_train.columns)
log_importances = log_importances.sort_values(ascending=False)
print(log_importances)
marital_married 0.091648
age 0.058591
gender_MF 0.052158
last_acc 0.044697
duration 0.036174
occupation_self-employed 0.030339
entry_age 0.019694
constr_loan 0.015286
marital_divorced 0.011058
occupation_freelancer 0.009931
occupation_public servant 0.009856
occupation_white-collar worker 0.009738
avg_res_dur 0.009368
occupation_unemployed 0.008912
extra_acc 0.006635
ppower 0.006374
occupation_housewife 0.006167
sec_acc 0.005832
occupation_pensioner/retiree 0.004857
pop_km 0.004653
logins 0.003816
nprod 0.002890
occupation_university student 0.002768
acad 0.002683
occupation_private means 0.002206
calls 0.001179
mobile_logins 0.001169
complaints 0.000973
referrals 0.000936
marital_cohabiting 0.000502
occupation_soldier 0.000482
gender_nan 0.000393
fixed_acc 0.000279
marital_separated 0.000196
total_savings 0.000083
transactions_year -0.000030
marital_widowed -0.000527
car_seg -0.000582
occupation_apprentice -0.000998
occupation_student -0.001444
nr_hh -0.001462
total_exposure -0.003923
occupation_blue-collar worker -0.004411
res_move_365 -0.004620
gender_F -0.010119
total_loans -0.011050
pref_device_Tablet -0.013789
pref_device_mobile -0.018566
cons_loan -0.022746
gender_M -0.042432
marital_single -0.049188
total_mailings -0.050189
marital_unmarried -0.053691
pref_device_PC/Laptop -0.061794
giro_mailing -0.858249
dtype: float64
Tree-based methods
# Decision Tree
# Grid search for hyperparameters
dtree_clf = DecisionTreeClassifier()
param_grid_tree = {'criterion': ['gini', 'entropy'],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [1, 3, 5],
                   'min_samples_split': [2, 5, 10],
                   'max_depth': [10, 30, 50, 70, 90, None]}

tree_grid_search_res = GridSearchCV(dtree_clf, param_grid_tree, cv=10, scoring="accuracy",
                                    return_train_score=True, verbose=True, n_jobs=-1)
tree_gs_res_fit = tree_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(tree_gs_res_fit.best_score_, tree_gs_res_fit.best_params_))

tree_grid_search_imp = GridSearchCV(dtree_clf, param_grid_tree, cv=10, scoring="accuracy",
                                    return_train_score=True, verbose=True, n_jobs=-1)
tree_gs_imp_fit = tree_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(tree_gs_imp_fit.best_score_, tree_gs_imp_fit.best_params_))
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
The best results (Accuracy: 0.8497805974094634) on the restricted dataset were achieved with the following parameters:
{'criterion': 'entropy', 'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5}
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
The best results (Accuracy: 0.8489999999999999) on the imputed dataset were achieved with the following parameters:
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10}
dtree_res = tree_gs_res_fit.best_estimator_
# Restricted dataset
dtree_res_fit = dtree_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_tree_gs = dtree_res.predict(X_res_test)
print(classification_report(y_res_pred_tree_gs, y_res_test['xsell']))
mat_res = confusion_matrix(y_res_pred_tree_gs, y_res_test['xsell'])
sn.heatmap(mat_res, square=True, annot=True, fmt='d', cbar=False)
precision recall f1-score support
0.0 0.85 0.89 0.87 923
1.0 0.90 0.86 0.88 1022
accuracy 0.88 1945
macro avg 0.87 0.88 0.87 1945
weighted avg 0.88 0.88 0.88 1945
# Most important features
dtree_importances = pd.Series(dtree_res_fit.feature_importances_, X_res_train.columns)
dtree_importances = dtree_importances.sort_values(ascending=False)
print(dtree_importances)

fig, ax = plt.subplots(figsize=(8.5, 8.5))
ax.bar(X_res_train.columns, dtree_res_fit.feature_importances_)
giro_mailing 0.720036
duration 0.073823
last_acc 0.043999
logins 0.035227
entry_age 0.021432
transactions_year 0.017323
total_mailings 0.014349
total_exposure 0.013042
total_savings 0.011623
age 0.008430
nr_hh 0.007799
marital_unmarried 0.003957
total_loans 0.003913
ppower 0.003391
pop_km 0.003119
gender_MF 0.002866
gender_M 0.001821
referrals 0.001724
res_move_365 0.001662
extra_acc 0.001424
nprod 0.001371
marital_married 0.001360
sec_acc 0.001262
calls 0.000948
mobile_logins 0.000851
constr_loan 0.000779
marital_divorced 0.000697
marital_single 0.000696
marital_cohabiting 0.000690
cons_loan 0.000320
gender_F 0.000067
marital_separated 0.000000
acad 0.000000
fixed_acc 0.000000
gender_nan 0.000000
complaints 0.000000
marital_widowed 0.000000
dtype: float64
dtree_imp = tree_gs_imp_fit.best_estimator_
# Imputed dataset
dtree_imp_fit = dtree_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_tree_gs = dtree_imp.predict(X_imp_test)
print(classification_report(y_imp_pred_tree_gs, y_imp_test['xsell']))

# Visualize tree
data_imp = export_graphviz(dtree_imp_fit, out_file=None, feature_names=X_imp_train.columns)
graph_imp = graph_from_dot_data(data_imp)
Image(graph_imp.create_png())

mat_imp = confusion_matrix(y_imp_pred_tree_gs, y_imp_test['xsell'])
sn.heatmap(mat_imp, square=True, annot=True, fmt='d', cbar=False)
precision recall f1-score support
0.0 0.89 0.88 0.89 1010
1.0 0.88 0.89 0.88 990
accuracy 0.89 2000
macro avg 0.89 0.89 0.88 2000
weighted avg 0.89 0.89 0.89 2000
# Most important features
dtree_importances = pd.Series(dtree_imp_fit.feature_importances_, X_imp_train.columns)
dtree_importances = dtree_importances.sort_values(ascending=False)
print(dtree_importances)
giro_mailing 0.555181
duration 0.135891
last_acc 0.050360
age 0.044987
logins 0.031553
transactions_year 0.029876
total_exposure 0.021039
total_savings 0.018081
nr_hh 0.014706
entry_age 0.013577
total_mailings 0.013466
ppower 0.010012
pop_km 0.009118
total_loans 0.006873
cons_loan 0.005123
car_seg 0.004500
nprod 0.004442
calls 0.003268
constr_loan 0.003177
gender_M 0.003037
marital_married 0.003011
gender_MF 0.002694
avg_res_dur 0.002399
marital_single 0.002223
occupation_white-collar worker 0.002182
extra_acc 0.001953
res_move_365 0.001867
pref_device_mobile 0.001312
occupation_housewife 0.001292
mobile_logins 0.001021
occupation_blue-collar worker 0.000837
marital_divorced 0.000540
pref_device_PC/Laptop 0.000396
gender_F 0.000012
occupation_private means 0.000000
pref_device_Tablet 0.000000
fixed_acc 0.000000
occupation_university student 0.000000
occupation_unemployed 0.000000
occupation_student 0.000000
occupation_soldier 0.000000
occupation_self-employed 0.000000
occupation_public servant 0.000000
occupation_freelancer 0.000000
occupation_pensioner/retiree 0.000000
complaints 0.000000
occupation_apprentice 0.000000
marital_widowed 0.000000
marital_unmarried 0.000000
sec_acc 0.000000
marital_separated 0.000000
acad 0.000000
marital_cohabiting 0.000000
referrals 0.000000
gender_nan 0.000000
dtype: float64
# Random Forest
rf_clf = RandomForestClassifier()

# Grid for optimizing hyperparameters
param_grid_rf = {'bootstrap': [True, False],
                 'max_depth': [10, 30, 50, 70, 90, None],
                 'max_features': ['auto', 'sqrt'],
                 'min_samples_leaf': [1, 3, 5],
                 'min_samples_split': [2, 5, 10],
                 'n_estimators': [50, 100, 500, 1000]}

# Random search of parameters, using 3-fold cross-validation with 50 iterations
rf_grid_search_res = RandomizedSearchCV(estimator=rf_clf, param_distributions=param_grid_rf,
                                        n_iter=50, cv=3, scoring="accuracy",
                                        return_train_score=True, verbose=True,
                                        random_state=42, n_jobs=-1)
rf_gs_res_fit = rf_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(rf_gs_res_fit.best_score_, rf_gs_res_fit.best_params_))

rf_grid_search_imp = RandomizedSearchCV(estimator=rf_clf, param_distributions=param_grid_rf,
                                        n_iter=50, cv=3, scoring="accuracy",
                                        return_train_score=True, verbose=True,
                                        random_state=42, n_jobs=-1)
rf_gs_imp_fit = rf_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(rf_gs_imp_fit.best_score_, rf_gs_imp_fit.best_params_))
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.8991769547325102) on the restricted dataset were achieved with the following parameters:
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.9035062048555301) on the imputed dataset were achieved with the following parameters:
{'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False}
rf_res = rf_gs_res_fit.best_estimator_
# Restricted dataset
print('Running RF with parameters {0} on the restricted Dataset: \n'.format(rf_gs_res_fit.best_params_))
rf_res_fit = rf_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_rf = rf_res_fit.predict(X_res_test)

# Classification report for this classifier
print(classification_report(y_res_pred_rf, y_res_test['xsell']))

# Confusion matrix (transposed so the rows are predictions, matching the axis labels)
mat_res = confusion_matrix(y_res_test['xsell'], y_res_pred_rf)
sn.heatmap(mat_res.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running RF with parameters {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False} on the restricted Dataset:
precision recall f1-score support
0.0 0.93 0.91 0.92 987
1.0 0.91 0.93 0.92 958
accuracy 0.92 1945
macro avg 0.92 0.92 0.92 1945
weighted avg 0.92 0.92 0.92 1945
# Most important features
forest_importances = pd.Series(rf_res_fit.feature_importances_, X_res_train.columns)
forest_importances = forest_importances.sort_values(ascending=False)
print(forest_importances)
plt.bar(X_res_train.columns, rf_res_fit.feature_importances_)
giro_mailing 0.640577
duration 0.093880
last_acc 0.048013
age 0.026012
logins 0.024710
total_mailings 0.022301
total_exposure 0.020774
entry_age 0.020022
total_savings 0.018770
nr_hh 0.014135
transactions_year 0.013338
ppower 0.008324
pop_km 0.008187
total_loans 0.006213
marital_married 0.004724
nprod 0.004446
marital_single 0.002840
cons_loan 0.002640
constr_loan 0.002506
extra_acc 0.002502
gender_M 0.002269
sec_acc 0.002183
gender_F 0.002117
gender_MF 0.001790
calls 0.001721
mobile_logins 0.001508
res_move_365 0.001325
marital_unmarried 0.000796
marital_divorced 0.000410
marital_cohabiting 0.000367
referrals 0.000186
acad 0.000126
marital_separated 0.000121
fixed_acc 0.000117
marital_widowed 0.000044
complaints 0.000007
gender_nan 0.000000
dtype: float64
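Impurity-based feature_importances_ can overstate correlated or many-valued features. As a cross-check (an addition, not part of the original analysis), scikit-learn's permutation_importance could be run on the held-out test set:

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the resulting drop in accuracy
perm = permutation_importance(rf_res_fit, X_res_test, y_res_test['xsell'], n_repeats=10, random_state=42, n_jobs=-1)
perm_importances = pd.Series(perm.importances_mean, index=X_res_test.columns)
print(perm_importances.sort_values(ascending=False).head(10))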
rf_imp = rf_gs_imp_fit.best_estimator_
# Imputed dataset
print('Running RF with parameters {0} on the imputed Dataset: \n'.format(rf_gs_imp_fit.best_params_))
rf_imp_fit = rf_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_rf = rf_imp_fit.predict(X_imp_test)

# Classification report for this classifier
print(classification_report(y_imp_pred_rf, y_imp_test['xsell']))

# Confusion matrix (transposed so the rows are predictions, matching the axis labels)
mat_imp = confusion_matrix(y_imp_test['xsell'], y_imp_pred_rf)
sn.heatmap(mat_imp.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running RF with parameters {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False} on the imputed Dataset:
precision recall f1-score support
0.0 0.93 0.91 0.92 1019
1.0 0.91 0.93 0.92 981
accuracy 0.92 2000
macro avg 0.92 0.92 0.92 2000
weighted avg 0.92 0.92 0.92 2000
# Most important features
forest_importances = pd.Series(rf_imp_fit.feature_importances_, X_imp_train.columns)
forest_importances = forest_importances.sort_values(ascending=False)
print(forest_importances)
giro_mailing 0.621187
duration 0.094728
last_acc 0.053538
logins 0.023384
age 0.020476
total_mailings 0.019647
total_exposure 0.018496
entry_age 0.018444
total_savings 0.015949
transactions_year 0.015451
nr_hh 0.012833
pop_km 0.008173
ppower 0.008070
car_seg 0.007866
avg_res_dur 0.007473
total_loans 0.006882
marital_married 0.004990
marital_single 0.004545
nprod 0.004008
pref_device_PC/Laptop 0.003895
constr_loan 0.002846
sec_acc 0.002609
gender_M 0.002550
gender_F 0.002443
marital_unmarried 0.002437
gender_MF 0.002319
occupation_white-collar worker 0.002304
occupation_self-employed 0.002279
cons_loan 0.002105
extra_acc 0.001955
calls 0.001440
mobile_logins 0.000780
res_move_365 0.000688
pref_device_Tablet 0.000616
pref_device_mobile 0.000540
marital_divorced 0.000342
occupation_blue-collar worker 0.000311
occupation_freelancer 0.000256
marital_cohabiting 0.000251
occupation_unemployed 0.000202
occupation_public servant 0.000193
acad 0.000126
referrals 0.000107
marital_widowed 0.000072
occupation_pensioner/retiree 0.000059
marital_separated 0.000058
fixed_acc 0.000041
occupation_housewife 0.000025
occupation_student 0.000010
complaints 0.000000
occupation_soldier 0.000000
occupation_private means 0.000000
occupation_university student 0.000000
occupation_apprentice 0.000000
gender_nan 0.000000
dtype: float64
# Gradient Boosting (scikit-learn's GradientBoostingClassifier; the prints below
# label it "XGBoost", but the xgboost library is not used here)
xgb_clf = GradientBoostingClassifier()
param_grid_xgb = {'n_estimators': [50, 100, 500],
                  'learning_rate': [1, 0.6, 0.2, 0.05, 0.01],
                  'max_features': ['auto', 'sqrt'],
                  'min_samples_leaf': [1, 3, 5],
                  'min_samples_split': [2, 5, 10],
                  'max_depth': [10, 30, 50, 70, None]}

# Random search of parameters (a full GridSearchCV over this grid would be far slower)
xgb_grid_search_res = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid_xgb,
                                         n_iter=50, cv=3, scoring="accuracy",
                                         return_train_score=True, verbose=True,
                                         random_state=42, n_jobs=-1)
xgb_gs_res_fit = xgb_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(xgb_gs_res_fit.best_score_, xgb_gs_res_fit.best_params_))
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.9038065843621399) on the restricted dataset were achieved with the following parameters:
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.6}
xgb_grid_search_imp = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid_xgb,
                                         n_iter=50, cv=3, scoring="accuracy",
                                         return_train_score=True, verbose=True,
                                         random_state=42, n_jobs=-1)
xgb_gs_imp_fit = xgb_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(xgb_gs_imp_fit.best_score_, xgb_gs_imp_fit.best_params_))
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.9199987093540317) on the imputed dataset were achieved with the following parameters:
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 30, 'learning_rate': 0.6}
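GradientBoostingClassifier also exposes staged_predict, which yields predictions after every boosting stage. A small sketch (illustrative, using a clone so the estimator used below stays untouched) shows how to check whether fewer than the selected n_estimators would already suffice on the validation set:

from sklearn.base import clone
from sklearn.metrics import accuracy_score

model = clone(xgb_gs_res_fit.best_estimator_).fit(X_res_train, y_res_train['xsell'])
stage_acc = [accuracy_score(y_res_val['xsell'], y_hat) for y_hat in model.staged_predict(X_res_val)]
print('best stage:', int(np.argmax(stage_acc)) + 1, 'accuracy:', max(stage_acc))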
xgb_res = xgb_gs_res_fit.best_estimator_
# Restricted dataset
print('Running XGBoost with parameters {0} on the restricted Dataset: \n'.format(xgb_gs_res_fit.best_params_))
xgb_res_fit = xgb_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_xgb = xgb_res_fit.predict(X_res_test)

# Classification report for this classifier
print(classification_report(y_res_pred_xgb, y_res_test['xsell']))

# Confusion matrix
mat_res = confusion_matrix(y_res_test['xsell'], y_res_pred_xgb)
sn.heatmap(mat_res.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running XGBoost with parameters {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.6} on the restricted Dataset:
precision recall f1-score support
0.0 0.92 0.93 0.93 966
1.0 0.93 0.93 0.93 979
accuracy 0.93 1945
macro avg 0.93 0.93 0.93 1945
weighted avg 0.93 0.93 0.93 1945
# Most important features
xgb_importances = pd.Series(xgb_res_fit.feature_importances_, X_res_train.columns)
xgb_importances = xgb_importances.sort_values(ascending=False)
print(xgb_importances)
giro_mailing 5.816048e-01
duration 9.312843e-02
last_acc 7.137942e-02
total_mailings 2.935098e-02
logins 2.920605e-02
age 2.581673e-02
total_exposure 2.222142e-02
marital_married 2.050267e-02
entry_age 1.890555e-02
total_savings 1.831603e-02
nr_hh 1.636405e-02
total_loans 1.278161e-02
transactions_year 1.204004e-02
ppower 9.491902e-03
pop_km 8.490154e-03
nprod 5.133055e-03
extra_acc 3.863216e-03
calls 3.826234e-03
sec_acc 3.298887e-03
marital_single 2.591253e-03
gender_F 2.454651e-03
constr_loan 2.347785e-03
mobile_logins 1.608224e-03
gender_MF 1.512336e-03
gender_M 1.440673e-03
res_move_365 8.338529e-04
cons_loan 7.860614e-04
marital_unmarried 3.550378e-04
marital_cohabiting 1.343705e-04
marital_divorced 1.176337e-04
acad 8.157319e-05
fixed_acc 9.315930e-06
complaints 3.242547e-06
marital_separated 2.145037e-06
referrals 6.257511e-07
marital_widowed 3.572586e-08
gender_nan 0.000000e+00
dtype: float64
xgb_imp = xgb_gs_imp_fit.best_estimator_
# Imputed dataset
print('Running XGBoost with parameters {0} on the imputed Dataset: \n'.format(xgb_gs_imp_fit.best_params_))
xgb_imp_fit = xgb_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_xgb = xgb_imp_fit.predict(X_imp_test)

# Classification report for this classifier
print(classification_report(y_imp_pred_xgb, y_imp_test['xsell']))

# Confusion matrix
mat_imp = confusion_matrix(y_imp_test['xsell'], y_imp_pred_xgb)
sn.heatmap(mat_imp.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running XGBoost with parameters {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 30, 'learning_rate': 0.6} on the imputed Dataset:
precision recall f1-score support
0.0 0.92 0.92 0.92 998
1.0 0.92 0.92 0.92 1002
accuracy 0.92 2000
macro avg 0.92 0.92 0.92 2000
weighted avg 0.92 0.92 0.92 2000
# Most important features
xgb_importances = pd.Series(xgb_imp_fit.feature_importances_, X_imp_train.columns)
xgb_importances = xgb_importances.sort_values(ascending=False)
print(xgb_importances)
giro_mailing 6.445461e-01
duration 1.573122e-01
last_acc 2.871299e-02
total_savings 2.075701e-02
nr_hh 1.818409e-02
logins 1.664149e-02
age 1.369750e-02
entry_age 1.243769e-02
avg_res_dur 1.050905e-02
total_exposure 9.521362e-03
ppower 8.948275e-03
transactions_year 8.636020e-03
pop_km 8.402746e-03
total_mailings 7.671126e-03
car_seg 6.856799e-03
nprod 3.274113e-03
marital_married 2.932066e-03
gender_MF 2.822208e-03
sec_acc 2.579480e-03
gender_F 2.577289e-03
extra_acc 2.069599e-03
gender_M 1.818490e-03
occupation_white-collar worker 1.673652e-03
occupation_blue-collar worker 1.259882e-03
pref_device_Tablet 1.197092e-03
pref_device_PC/Laptop 1.125939e-03
calls 9.080079e-04
marital_single 8.360617e-04
fixed_acc 7.094792e-04
total_loans 4.726436e-04
cons_loan 1.918540e-04
marital_unmarried 1.781453e-04
res_move_365 1.753657e-04
occupation_self-employed 1.188641e-04
referrals 1.015899e-04
acad 5.142036e-05
mobile_logins 3.474689e-05
marital_separated 3.386300e-05
marital_divorced 1.489116e-05
occupation_public servant 6.036330e-06
occupation_pensioner/retiree 2.254548e-06
marital_widowed 4.565261e-07
occupation_university student 6.236806e-08
marital_cohabiting 2.666113e-08
constr_loan 1.303055e-08
pref_device_mobile 2.259301e-09
occupation_soldier 9.720828e-12
occupation_student 2.143495e-12
occupation_freelancer 4.490857e-16
occupation_housewife 2.303686e-35
occupation_unemployed 5.168225e-36
occupation_private means 0.000000e+00
occupation_apprentice 0.000000e+00
gender_nan 0.000000e+00
complaints 0.000000e+00
dtype: float64
SVM
svm_clf = svm.SVC()
param_grid_svm = {'C': [0.1, 1, 10, 100],
                  'gamma': [1, 0.1, 0.01, 0.001],
                  'kernel': ['rbf', 'poly', 'sigmoid']}

svm_grid_search_res = GridSearchCV(svm_clf, param_grid_svm, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
svm_gs_res_fit = svm_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(svm_gs_res_fit.best_score_, svm_gs_res_fit.best_params_))

svm_grid_search_imp = GridSearchCV(svm_clf, param_grid_svm, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
svm_gs_imp_fit = svm_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(svm_gs_imp_fit.best_score_, svm_gs_imp_fit.best_params_))
Fitting 10 folds for each of 48 candidates, totalling 480 fits
The best results (Accuracy: 0.8909357652656622) on the restricted dataset were achieved with the following parameters:
{'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 48 candidates, totalling 480 fits
The best results (Accuracy: 0.8985) on the imputed dataset were achieved with the following parameters:
{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
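SVC does not output class probabilities by default. If cross-sell propensities were needed for ranking customers rather than hard 0/1 predictions, a variant with Platt scaling enabled could look like this sketch (parameters taken from the search above):

svc_prob = svm.SVC(C=1, gamma=1, kernel='rbf', probability=True)
# svc_prob.fit(X_res_train, y_res_train['xsell'])
# propensities = svc_prob.predict_proba(X_res_test)[:, 1]  # P(xsell = 1) per customer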
svm_res = svm_gs_res_fit.best_estimator_  # {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
# Restricted dataset
print('Running SVM with parameters {0} on the restricted Dataset: \n'.format(svm_gs_res_fit.best_params_))
svm_res_fit = svm_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_svm = svm_res_fit.predict(X_res_test)

# Classification report for this classifier
print(classification_report(y_res_pred_svm, y_res_test['xsell']))

# Confusion matrix
mat_res = confusion_matrix(y_res_test['xsell'], y_res_pred_svm)
sn.heatmap(mat_res.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running SVM with parameters {'C': 1, 'gamma': 1, 'kernel': 'rbf'} on the restricted Dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 972
1.0 0.91 0.91 0.91 973
accuracy 0.91 1945
macro avg 0.91 0.91 0.91 1945
weighted avg 0.91 0.91 0.91 1945
svm_imp = svm_gs_imp_fit.best_estimator_  # {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
# Imputed dataset
print('Running SVM with parameters {0} on the imputed Dataset: \n'.format(svm_gs_imp_fit.best_params_))
svm_imp_fit = svm_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_svm = svm_imp_fit.predict(X_imp_test)

# Classification report for this classifier
print(classification_report(y_imp_pred_svm, y_imp_test['xsell']))

# Confusion matrix
mat_imp = confusion_matrix(y_imp_test['xsell'], y_imp_pred_svm)
sn.heatmap(mat_imp.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running SVM with parameters {'C': 1, 'gamma': 1, 'kernel': 'rbf'} on the imputed Dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 1001
1.0 0.91 0.91 0.91 999
accuracy 0.91 2000
macro avg 0.91 0.91 0.91 2000
weighted avg 0.91 0.91 0.91 2000
Neural Network
HIDDEN_NEURONS_res = len(X_res.columns)
HIDDEN_NEURONS_imp = len(X_imp.columns)

def create_res_baseline():
    # Create model: one hidden layer with dropout
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_res, input_dim=HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.2, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_imp_baseline():
    # Create model: one hidden layer with dropout
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_imp, input_dim=HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dropout(0.2, input_shape=(HIDDEN_NEURONS_imp,)))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Dropout after every non-linear activation function
def create_res_model():
    # Create model: three hidden layers, each followed by dropout
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_res, input_dim=HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Dropout after the first two hidden layers (note: unlike create_res_model,
# the third hidden layer here has no dropout after it)
def create_imp_model():
    # Create model
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_imp, input_dim=HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_imp,)))
    model.add(Dense(HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_imp,)))
    model.add(Dense(HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
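Instead of fixing the epoch count at 50 or 25, Keras' EarlyStopping callback can halt training once the validation loss stops improving; a minimal sketch (an illustrative addition, not part of the original setup) would be:

from keras.callbacks import EarlyStopping

# Stop when the validation loss has not improved for 3 epochs; keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model = create_res_baseline()
# model.fit(X_res_train, y_res_train['xsell'], validation_data=(X_res_val, y_res_val['xsell']),
#           epochs=50, batch_size=5, callbacks=[early_stop], verbose=0)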
estimator = KerasClassifier(build_fn=create_res_baseline, epochs=50, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=7, shuffle=True)
results = cross_val_score(estimator, X_res, Y_res['xsell'], cv=kfold)
print("Baseline restricted data: {0} ({1})".format(results.mean() * 100, results.std() * 100))
Baseline: 89.86725636890957 (0.9019227930462493)
estimator = KerasClassifier(build_fn=create_imp_baseline, epochs=25, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X_imp, Y_imp['xsell'], cv=kfold)
print("Baseline imputed data: {0} ({1})".format(results.mean() * 100, results.std() * 100))
Baseline: 89.86999869346619 (0.4019956615884608)
estimator = KerasClassifier(build_fn=create_res_model, epochs=25, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True)