Machine Learning Exam
Elias Fischer, Noah Pokorny, Lucas Intveen, Niklas Jessen
Load Data and Required Packages
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from pydotplus import graph_from_dot_data
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
# Read raw data file
xsell_raw = pd.read_csv('xsell_raw.csv')

# First 5 rows of the dataset
xsell_raw.head()
Exploratory Data Analysis
# Print shape, columns and info
print(xsell_raw.shape)
print(xsell_raw.columns)
print(xsell_raw.info())
(10000, 32)
Index(['xsell', 'age', 'entry_age', 'gender', 'acad', 'marital', 'nprod',
'duration', 'last_acc', 'occupation', 'nr_hh', 'ppower', 'avg_res_dur',
'pop_km', 'car_seg', 'res_move_365', 'giro_mailing', 'total_mailings',
'extra_acc', 'fixed_acc', 'constr_loan', 'cons_loan', 'sec_acc',
'total_savings', 'total_loans', 'transactions_year', 'logins',
'mobile_logins', 'calls', 'referrals', 'complaints', 'pref_device'],
dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 xsell 10000 non-null int64
1 age 10000 non-null int64
2 entry_age 9999 non-null float64
3 gender 9999 non-null object
4 acad 10000 non-null int64
5 marital 10000 non-null object
6 nprod 10000 non-null int64
7 duration 10000 non-null int64
8 last_acc 10000 non-null int64
9 occupation 4149 non-null object
10 nr_hh 9933 non-null float64
11 ppower 9865 non-null float64
12 avg_res_dur 9071 non-null float64
13 pop_km 9922 non-null float64
14 car_seg 8904 non-null float64
15 res_move_365 10000 non-null int64
16 giro_mailing 10000 non-null int64
17 total_mailings 10000 non-null int64
18 extra_acc 10000 non-null int64
19 fixed_acc 10000 non-null int64
20 constr_loan 10000 non-null int64
21 cons_loan 10000 non-null int64
22 sec_acc 10000 non-null int64
23 total_savings 10000 non-null float64
24 total_loans 10000 non-null float64
25 transactions_year 9832 non-null float64
26 logins 10000 non-null int64
27 mobile_logins 10000 non-null int64
28 calls 10000 non-null int64
29 referrals 10000 non-null int64
30 complaints 10000 non-null int64
31 pref_device 4968 non-null object
dtypes: float64(9), int64(19), object(4)
memory usage: 2.4+ MB
# Number of missing values per column
display(xsell_raw.isnull().sum())

# Statistics for numerical columns
xsell_raw.describe()
# Correlation matrix
corr_matrix = xsell_raw.corr()

# Visualize the correlation matrix as a heatmap
fig, ax = plt.subplots(figsize=(8.5, 8.5))
sn.heatmap(corr_matrix, linewidths=.5, annot=False, yticklabels=True, xticklabels=True,
           cmap=sn.color_palette("coolwarm", 12), ax=ax)
# Create histograms
# Note: total_exposure (= total_savings + total_loans) is generated in the
# Pre-processing section below and must exist before this cell runs.
fig, ((ax0, ax1), (ax2, ax3), (ax4, ax5), (ax6, ax7)) = plt.subplots(nrows=4, ncols=2, figsize=(12, 15))

ax0.hist(xsell_raw["age"], bins=70, color="#6C90F1", range=(18, 70))
ax0.set_title('Master Data: Age Between 18 and 70 Years')

# Transformation of gender column necessary (missing values become the string 'nan')
xsell_raw["gender"] = xsell_raw["gender"].astype(str)
ax1.hist(xsell_raw["gender"], bins=4, color="#6C90F1")
ax1.set_title('Master Data: Gender')

ax2.hist(xsell_raw["nprod"], bins=6, color="#6C90F1", range=(1, 6))
ax2.set_title('Product Data: Products or Accounts Owned')

ax3.hist(xsell_raw["total_savings"], bins=100, color="#6C90F1", range=(0, 100))
ax3.set_title('Account Data: Total Savings in EUR (Max. +100€)')

ax4.hist(xsell_raw["total_loans"], bins=100, range=(-100, 0), color="#6C90F1")
ax4.set_title('Account Data: Total Loans in EUR (Max. -100€)')

ax5.hist(xsell_raw["total_exposure"], bins=100, range=(-100, 0), color="#6C90F1")
ax5.set_title('Account Data: Total Exposure in EUR (Max. -100€)')

ax6.hist(xsell_raw["transactions_year"], bins=40, color="#6C90F1", range=(0, 40))
ax6.set_title('Action Data: Number of Transactions in 1 Year')

ax7.hist(xsell_raw["total_mailings"], bins=9, color="#6C90F1", range=(0, 9))
ax7.set_title('Action Data: Mailings Received in the Last 2 Years')
Pre-processing and Feature Engineering
# Generate variable total_exposure
xsell_raw["total_exposure"] = xsell_raw.total_savings + xsell_raw.total_loans
xsell_raw.columns
Remove Columns With Too Many Missing Values
The MinMaxScaler is used for normalization because the StandardScaler standardizes under the assumption that each variable follows a roughly bell-shaped (normal) distribution, which does not hold for most variables in this dataset.
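To illustrate the difference, here is a minimal sketch on a made-up, heavily skewed column (the values are hypothetical): MinMaxScaler maps everything into [0, 1], while StandardScaler centers to zero mean and unit variance, so a single outlier dominates the scale.

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

skewed = np.array([[0.], [1.], [2.], [3.], [100.]])  # hypothetical skewed values
print(MinMaxScaler().fit_transform(skewed).ravel())    # bounded in [0, 1]
print(StandardScaler().fit_transform(skewed).ravel())  # zero mean, unit variance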
# Create restricted dataset
xsell_res = xsell_raw[:]
xsell_res = xsell_res.drop(columns=["occupation", "pref_device", "car_seg", "avg_res_dur"]).dropna()

# Get dummies
xsell_res = pd.get_dummies(xsell_res)

# Normalize variables (min-max scaler)
min_max_scaler = preprocessing.MinMaxScaler()
xsell_res_scaled = xsell_res[:]
xsell_res_scaled[:] = min_max_scaler.fit_transform(xsell_res_scaled[:])
print('The restricted dataframe has the shape {0} with the following columns: {1}'.format(xsell_res_scaled.shape, xsell_res_scaled.columns))
The restricted dataframe has the shape (9721, 38) with the following columns: Index(['xsell', 'age', 'entry_age', 'acad', 'nprod', 'duration', 'last_acc',
'nr_hh', 'ppower', 'pop_km', 'res_move_365', 'giro_mailing',
'total_mailings', 'extra_acc', 'fixed_acc', 'constr_loan', 'cons_loan',
'sec_acc', 'total_savings', 'total_loans', 'transactions_year',
'logins', 'mobile_logins', 'calls', 'referrals', 'complaints',
'total_exposure', 'gender_F', 'gender_M', 'gender_MF', 'gender_nan',
'marital_cohabiting', 'marital_divorced', 'marital_married',
'marital_separated', 'marital_single', 'marital_unmarried',
'marital_widowed'],
dtype='object')
Impute Missing Values
X = pd.get_dummies(xsell_raw).drop(columns="xsell")
y = xsell_raw.xsell

# Evaluate different k for the KNN imputer on the dataset
results = list()
strategies = [str(i) for i in [1, 3, 5, 7, 9, 15, 18, 21]]
for s in strategies:
    # Create the modeling pipeline: imputation followed by a random forest
    pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', RandomForestClassifier())])
    # Evaluate the model
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=5, n_jobs=-1)
    # Store results
    results.append(scores)
    print('>%s %.3f (%.3f)' % (s, np.mean(scores), np.std(scores)))
>1 0.857 (0.172)
>3 0.859 (0.170)
>5 0.857 (0.172)
>7 0.856 (0.174)
>9 0.858 (0.172)
>15 0.859 (0.171)
>18 0.859 (0.169)
>21 0.855 (0.174)
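As a quick illustration of what the imputer is doing, here is a minimal sketch on a made-up array: KNNImputer replaces a missing entry with the average of that feature over the k rows closest on the non-missing features (here 4.0, the mean of 2.0 and 6.0).

from sklearn.impute import KNNImputer
import numpy as np

toy = np.array([[1., 2.],
                [2., np.nan],  # missing value to be filled
                [3., 6.],
                [8., 8.]])
print(KNNImputer(n_neighbors=2).fit_transform(toy))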
# Plot model accuracy vs k
mean_scores = []
for i in range(0, 8):
    mean_scores.append(np.mean(results[i]))
plt.plot(strategies, mean_scores)
plt.title("Imputation k")
plt.xlabel("Number of Neighbors K")
plt.ylabel("Model Accuracy")
plt.show()
# Calling the imputer with k=3
imputer = KNNImputer(n_neighbors=3, weights="uniform")

# Get dummies
xsell_imp = xsell_raw[:]
xsell_imp = pd.get_dummies(xsell_imp)
xsell_scaled = xsell_imp[:]
xsell_scaled[:] = min_max_scaler.fit_transform(xsell_imp[:])

# Impute xsell_raw
imputed_vals = imputer.fit_transform(xsell_scaled)
xsell_imp_scaled = xsell_imp[:]
xsell_imp_scaled[:] = pd.DataFrame(imputed_vals)
print('The imputed dataframe has the shape {0} with the following columns: \n {1}'.format(xsell_imp_scaled.shape, xsell_imp_scaled.columns))
The imputed dataframe has the shape (10000, 56) with the following columns:
Index(['xsell', 'age', 'entry_age', 'acad', 'nprod', 'duration', 'last_acc',
'nr_hh', 'ppower', 'avg_res_dur', 'pop_km', 'car_seg', 'res_move_365',
'giro_mailing', 'total_mailings', 'extra_acc', 'fixed_acc',
'constr_loan', 'cons_loan', 'sec_acc', 'total_savings', 'total_loans',
'transactions_year', 'logins', 'mobile_logins', 'calls', 'referrals',
'complaints', 'total_exposure', 'gender_F', 'gender_M', 'gender_MF',
'gender_nan', 'marital_cohabiting', 'marital_divorced',
'marital_married', 'marital_separated', 'marital_single',
'marital_unmarried', 'marital_widowed', 'occupation_apprentice',
'occupation_blue-collar worker', 'occupation_freelancer',
'occupation_housewife', 'occupation_pensioner/retiree',
'occupation_private means', 'occupation_public servant',
'occupation_self-employed', 'occupation_soldier', 'occupation_student',
'occupation_unemployed', 'occupation_university student',
'occupation_white-collar worker', 'pref_device_PC/Laptop',
'pref_device_Tablet', 'pref_device_mobile'],
dtype='object')
Split Data into Training/Validation and Test Sets
# Split dataframe into dependent variable (Y) and independent variables (X)
# Restricted dataset
X_res = xsell_res_scaled.drop('xsell', axis=1)
Y_res = xsell_res_scaled[['xsell']]
# Imputed dataset
X_imp = xsell_imp_scaled.drop('xsell', axis=1)
Y_imp = xsell_imp_scaled[['xsell']]

# Create train (0.6), validation (0.2) and test (0.2) sets
# Restricted dataset
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res, Y_res, test_size=0.2, random_state=42)
X_res_train, X_res_val, y_res_train, y_res_val = train_test_split(X_res_train, y_res_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Imputed dataset
X_imp_train, X_imp_test, y_imp_train, y_imp_test = train_test_split(X_imp, Y_imp, test_size=0.2, random_state=42)
X_imp_train, X_imp_val, y_imp_train, y_imp_val = train_test_split(X_imp_train, y_imp_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2
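The splits above are not stratified on the target. Since train_test_split accepts a stratify argument, a variant that preserves the xsell class ratio in every split could look like this sketch (an alternative, not what was run here):

X_tr, X_te, y_tr, y_te = train_test_split(X_res, Y_res, test_size=0.2, random_state=42, stratify=Y_res['xsell'])
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=0.25, random_state=42, stratify=y_tr['xsell'])
print(y_tr['xsell'].mean(), y_va['xsell'].mean(), y_te['xsell'].mean())  # near-identical class rates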
Visualization using PCA and t-SNE
# Visualize using PCA
pca = PCA(n_components=3)
pca_res_3d = pca.fit_transform(X_res)
pca_imp_3d = pca.fit_transform(X_imp)

# Plot restricted dataset
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=pca_res_3d[:, 0], ys=pca_res_3d[:, 1], zs=pca_res_3d[:, 2], c=Y_res["xsell"])
ax.set_title('Restricted Dataset using PCA')
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

# Plot imputed dataset
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=pca_imp_3d[:, 0], ys=pca_imp_3d[:, 1], zs=pca_imp_3d[:, 2], c=Y_imp["xsell"])
ax.set_title('Imputed Dataset using PCA')
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()
# Use t-SNE to reduce the 3D PCA representation to 2D
# Note: n_iter=300 is close to scikit-learn's minimum (250); the default is 1000.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_res_2d = tsne.fit_transform(pca_res_3d)
tsne_imp_2d = tsne.fit_transform(pca_imp_3d)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 9721 samples in 0.007s...
[t-SNE] Computed neighbors for 9721 samples in 0.371s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9721
[t-SNE] Computed conditional probabilities for sample 2000 / 9721
[t-SNE] Computed conditional probabilities for sample 3000 / 9721
[t-SNE] Computed conditional probabilities for sample 4000 / 9721
[t-SNE] Computed conditional probabilities for sample 5000 / 9721
[t-SNE] Computed conditional probabilities for sample 6000 / 9721
[t-SNE] Computed conditional probabilities for sample 7000 / 9721
[t-SNE] Computed conditional probabilities for sample 8000 / 9721
[t-SNE] Computed conditional probabilities for sample 9000 / 9721
[t-SNE] Computed conditional probabilities for sample 9721 / 9721
[t-SNE] Mean sigma: 0.012118
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.338646
[t-SNE] KL divergence after 300 iterations: 1.945432
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.018s...
[t-SNE] Computed neighbors for 10000 samples in 0.407s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.018020
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.045982
[t-SNE] KL divergence after 300 iterations: 1.871857
# Plot 2D representation
fig, ax = plt.subplots()
ax.scatter(x=tsne_res_2d[:, 0], y=tsne_res_2d[:, 1], c=Y_res["xsell"])
ax.set_title('Restricted Dataset using PCA and t-SNE')
ax.set_xlabel('tsne_pca-one')
ax.set_ylabel('tsne_pca-two')
plt.show()

fig, ax = plt.subplots()
ax.scatter(x=tsne_imp_2d[:, 0], y=tsne_imp_2d[:, 1], c=Y_imp["xsell"])
ax.set_title('Imputed Dataset using PCA and t-SNE')
ax.set_xlabel('tsne_pca-one')
ax.set_ylabel('tsne_pca-two')
plt.show()
Create Models
KNN
# Grid search for optimizing hyperparameters
param_grid_knn = {"n_neighbors": [1, 3, 5, 7, 9, 19, 29, 39, 49, 59, 69, 79, 89, 99],
                  "weights": ["uniform", "distance"],
                  "metric": ['euclidean', 'manhattan']}
knn_clf = KNeighborsClassifier()

# Restricted dataset
knn_grid_search_res = GridSearchCV(knn_clf, param_grid_knn, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
knn_gs_res_fit = knn_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(knn_grid_search_res.best_score_, knn_grid_search_res.best_params_))

# Imputed dataset
knn_grid_search_imp = GridSearchCV(knn_clf, param_grid_knn, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
knn_gs_imp_fit = knn_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(knn_grid_search_imp.best_score_, knn_grid_search_imp.best_params_))
Fitting 10 folds for each of 56 candidates, totalling 560 fits
The best results (Accuracy: 0.8909357652656622) on the restricted dataset were achieved with the following parameters:
{'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'distance'}
Fitting 10 folds for each of 56 candidates, totalling 560 fits
The best results (Accuracy: 0.898) on the imputed dataset were achieved with the following parameters:
{'metric': 'euclidean', 'n_neighbors': 29, 'weights': 'distance'}
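GridSearchCV keeps the per-candidate scores in its cv_results_ attribute; as an illustrative addition, the mean validation accuracy could be plotted against k for both weightings (shown here for the euclidean metric on the restricted dataset):

cv_res = pd.DataFrame(knn_grid_search_res.cv_results_)
for w in ['uniform', 'distance']:
    sub = cv_res[(cv_res['param_weights'] == w) & (cv_res['param_metric'] == 'euclidean')]
    plt.plot(sub['param_n_neighbors'].astype(int), sub['mean_test_score'], label=w)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Mean CV Accuracy')
plt.legend()
plt.show()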
# Run KNN on the test set with optimized parameters
# Note: scikit-learn's convention is classification_report(y_true, y_pred); here the
# arguments are passed in the opposite order (predictions first), which interchanges
# the precision and recall columns. The same ordering is used in the later model sections.
# Restricted dataset
knn_clf_res = knn_gs_res_fit.best_estimator_
knn_res_test_fit = knn_clf_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_test = knn_res_test_fit.predict(X_res_test)
print('Metrics for the restricted dataset: \n', classification_report(y_res_pred_test, y_res_test['xsell']))

# Imputed dataset
knn_clf_imp = knn_gs_imp_fit.best_estimator_
knn_imp_test_fit = knn_clf_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_test = knn_imp_test_fit.predict(X_imp_test)
print('Metrics for the imputed dataset: \n', classification_report(y_imp_pred_test, y_imp_test['xsell']))
Metrics for the restricted dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 975
1.0 0.91 0.91 0.91 970
accuracy 0.91 1945
macro avg 0.91 0.91 0.91 1945
weighted avg 0.91 0.91 0.91 1945
Metrics for the imputed dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 1001
1.0 0.91 0.91 0.91 999
accuracy 0.91 2000
macro avg 0.91 0.91 0.91 2000
weighted avg 0.91 0.91 0.91 2000
Logistic Regression
# Grid search for optimizing hyperparameters
log_clf = LogisticRegression(max_iter=10000)
param_grid_log = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}  # l1 = lasso, l2 = ridge

# Restricted dataset
log_grid_search_res = GridSearchCV(log_clf, param_grid_log, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
log_gs_res_fit = log_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(log_gs_res_fit.best_score_, log_gs_res_fit.best_params_))

# Imputed dataset
log_grid_search_imp = GridSearchCV(log_clf, param_grid_log, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
log_gs_imp_fit = log_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(log_gs_imp_fit.best_score_, log_gs_imp_fit.best_params_))
Fitting 10 folds for each of 14 candidates, totalling 140 fits
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the test scores are non-finite: [ nan 0.8904203 nan 0.8904203 nan 0.8904203
nan 0.8904203 nan 0.8904203 nan 0.88938937
nan 0.88836373]
category=UserWarning
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the train scores are non-finite: [ nan 0.89043195 nan 0.89043195 nan 0.89043195
nan 0.89043195 nan 0.89117504 nan 0.89271822
nan 0.89294686]
category=UserWarning
The best results (Accuracy: 0.8904203013481364) on the restricted dataset were achieved with the following parameters:
{'C': 0.001, 'penalty': 'l2'}
Fitting 10 folds for each of 14 candidates, totalling 140 fits
The best results (Accuracy: 0.8985) on the imputed dataset were achieved with the following parameters:
{'C': 0.001, 'penalty': 'l2'}
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the test scores are non-finite: [ nan 0.8985 nan 0.8985 nan 0.8985 nan 0.898 nan 0.8985
nan 0.897 nan 0.895 ]
category=UserWarning
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/model_selection/_search.py:925: UserWarning: One or more of the train scores are non-finite: [ nan 0.8985 nan 0.8985 nan 0.8985
nan 0.89866667 nan 0.89961111 nan 0.89972222
nan 0.89977778]
category=UserWarning
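The non-finite scores in the warnings above come from the l1 candidates: LogisticRegression's default lbfgs solver does not support the l1 penalty, so those fits fail and are scored as NaN. A minimal sketch of how the l1 half of the grid could be made to work (assuming the liblinear solver is acceptable here) would be:

# liblinear (or saga) supports both l1 and l2 penalties
log_clf_l1 = LogisticRegression(solver='liblinear', max_iter=10000)
gs_l1 = GridSearchCV(log_clf_l1, param_grid_log, cv=10, scoring="accuracy", n_jobs=-1)
# gs_l1.fit(X_res_val, y_res_val['xsell'])  # all 14 candidates now yield finite scores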
log_clf_res = log_gs_res_fit.best_estimator_
log_clf_imp = log_gs_imp_fit.best_estimator_

# Restricted dataset
log_res = log_clf_res.fit(X_res_train, y_res_train['xsell'])
log_res_score = log_res.score(X_res_train, y_res_train['xsell'])
print('Restricted Dataset: Logistic Regression score on training set: {0}'.format(log_res_score))
y_res_pred_log = log_res.predict(X_res_test)
print(classification_report(y_res_test, y_res_pred_log))

# Imputed dataset
log_imp = log_clf_imp.fit(X_imp_train, y_imp_train['xsell'])
log_imp_score = log_imp.score(X_imp_train, y_imp_train['xsell'])
print('Imputed Dataset: Logistic Regression score on training set: {0}'.format(log_imp_score))
y_imp_pred_log = log_imp.predict(X_imp_test)
print(classification_report(y_imp_test, y_imp_pred_log))

if log_imp_score > log_res_score:
    print("\nLogistic Regression performs better on the imputed dataset.")
else:
    print("\nLogistic Regression performs better on the restricted dataset.")
Restricted Dataset: Logistic Regression score on training set: 0.9005486968449932
precision recall f1-score support
0.0 0.91 0.91 0.91 968
1.0 0.91 0.91 0.91 977
accuracy 0.91 1945
macro avg 0.91 0.91 0.91 1945
weighted avg 0.91 0.91 0.91 1945
Imputed Dataset: Logistic Regression score on training set: 0.898
precision recall f1-score support
0.0 0.91 0.91 0.91 996
1.0 0.91 0.91 0.91 1004
accuracy 0.91 2000
macro avg 0.91 0.91 0.91 2000
weighted avg 0.91 0.91 0.91 2000
Logistic Regression performs better on the restricted dataset.
# Feature importance, restricted dataset
log_importances = pd.Series(log_res.coef_[0], X_res_train.columns)
log_importances = log_importances.sort_values(ascending=False)
print(log_importances)

fig, ax = plt.subplots(figsize=(8.5, 8.5))
ax.bar(X_res_train.columns, log_res.coef_[0])
print(log_res.coef_)
marital_married 0.080031
age 0.061307
gender_MF 0.042160
entry_age 0.030141
last_acc 0.025576
duration 0.023563
pop_km 0.016969
constr_loan 0.012987
marital_cohabiting 0.008162
acad 0.006113
marital_divorced 0.004660
logins 0.004535
sec_acc 0.004123
calls 0.002944
extra_acc 0.002768
nprod 0.002475
marital_widowed 0.002279
complaints 0.001182
marital_separated 0.001178
nr_hh 0.000730
mobile_logins 0.000271
gender_nan 0.000000
transactions_year -0.000060
total_savings -0.000491
fixed_acc -0.000561
ppower -0.001031
referrals -0.001935
total_exposure -0.004080
res_move_365 -0.004093
total_loans -0.010466
cons_loan -0.015873
gender_F -0.020259
gender_M -0.021901
marital_unmarried -0.037191
total_mailings -0.056862
marital_single -0.059117
giro_mailing -0.849402
dtype: float64
[[ 6.13066771e-02 3.01407003e-02 6.11330406e-03 2.47464439e-03
2.35634195e-02 2.55760153e-02 7.30384784e-04 -1.03094117e-03
1.69691310e-02 -4.09313762e-03 -8.49401927e-01 -5.68619178e-02
2.76784987e-03 -5.60969443e-04 1.29866962e-02 -1.58726780e-02
4.12316366e-03 -4.91285019e-04 -1.04663913e-02 -5.99806312e-05
4.53540079e-03 2.71063492e-04 2.94399396e-03 -1.93498395e-03
1.18158155e-03 -4.08002783e-03 -2.02589927e-02 -2.19011260e-02
4.21600717e-02 0.00000000e+00 8.16154887e-03 4.65980655e-03
8.00305484e-02 1.17799812e-03 -5.91174580e-02 -3.71911041e-02
2.27861324e-03]]
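Because these are logistic-regression coefficients on min-max-scaled features, each one is the change in the log-odds of xsell = 1 as a feature moves over its full [0, 1] range. A short sketch (an illustrative addition, reusing log_res fitted above) converts them to the more readable odds ratios:

# exp(coef) is the multiplicative change in the odds of xsell = 1
odds_ratios = pd.Series(np.exp(log_res.coef_[0]), index=X_res_train.columns)
print(odds_ratios.sort_values())  # e.g. exp(-0.849) ≈ 0.43 for giro_mailing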
# Feature importance, imputed dataset
log_importances = pd.Series(log_imp.coef_[0], X_imp_train.columns)
log_importances = log_importances.sort_values(ascending=False)
print(log_importances)
marital_married 0.091648
age 0.058591
gender_MF 0.052158
last_acc 0.044697
duration 0.036174
occupation_self-employed 0.030339
entry_age 0.019694
constr_loan 0.015286
marital_divorced 0.011058
occupation_freelancer 0.009931
occupation_public servant 0.009856
occupation_white-collar worker 0.009738
avg_res_dur 0.009368
occupation_unemployed 0.008912
extra_acc 0.006635
ppower 0.006374
occupation_housewife 0.006167
sec_acc 0.005832
occupation_pensioner/retiree 0.004857
pop_km 0.004653
logins 0.003816
nprod 0.002890
occupation_university student 0.002768
acad 0.002683
occupation_private means 0.002206
calls 0.001179
mobile_logins 0.001169
complaints 0.000973
referrals 0.000936
marital_cohabiting 0.000502
occupation_soldier 0.000482
gender_nan 0.000393
fixed_acc 0.000279
marital_separated 0.000196
total_savings 0.000083
transactions_year -0.000030
marital_widowed -0.000527
car_seg -0.000582
occupation_apprentice -0.000998
occupation_student -0.001444
nr_hh -0.001462
total_exposure -0.003923
occupation_blue-collar worker -0.004411
res_move_365 -0.004620
gender_F -0.010119
total_loans -0.011050
pref_device_Tablet -0.013789
pref_device_mobile -0.018566
cons_loan -0.022746
gender_M -0.042432
marital_single -0.049188
total_mailings -0.050189
marital_unmarried -0.053691
pref_device_PC/Laptop -0.061794
giro_mailing -0.858249
dtype: float64
Tree-based methods
# Decision Tree
# Grid search for hyperparameters
dtree_clf = DecisionTreeClassifier()
param_grid_tree = {'criterion': ['gini', 'entropy'],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [1, 3, 5],
                   'min_samples_split': [2, 5, 10],
                   'max_depth': [10, 30, 50, 70, 90, None]}

tree_grid_search_res = GridSearchCV(dtree_clf, param_grid_tree, cv=10, scoring="accuracy",
                                    return_train_score=True, verbose=True, n_jobs=-1)
tree_gs_res_fit = tree_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(tree_gs_res_fit.best_score_, tree_gs_res_fit.best_params_))

tree_grid_search_imp = GridSearchCV(dtree_clf, param_grid_tree, cv=10, scoring="accuracy",
                                    return_train_score=True, verbose=True, n_jobs=-1)
tree_gs_imp_fit = tree_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(tree_gs_imp_fit.best_score_, tree_gs_imp_fit.best_params_))
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
The best results (Accuracy: 0.8497805974094634) on the restricted dataset were achieved with the following parameters:
{'criterion': 'entropy', 'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5}
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
The best results (Accuracy: 0.8489999999999999) on the imputed dataset were achieved with the following parameters:
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10}
dtree_res = tree_gs_res_fit.best_estimator_
# Restricted dataset
dtree_res_fit = dtree_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_tree_gs = dtree_res.predict(X_res_test)
print(classification_report(y_res_pred_tree_gs, y_res_test['xsell']))
mat_res = confusion_matrix(y_res_pred_tree_gs, y_res_test['xsell'])
sn.heatmap(mat_res, square=True, annot=True, fmt='d', cbar=False)
precision recall f1-score support
0.0 0.85 0.89 0.87 923
1.0 0.90 0.86 0.88 1022
accuracy 0.88 1945
macro avg 0.87 0.88 0.87 1945
weighted avg 0.88 0.88 0.88 1945
# Most important features
dtree_importances = pd.Series(dtree_res_fit.feature_importances_, X_res_train.columns)
dtree_importances = dtree_importances.sort_values(ascending=False)
print(dtree_importances)

fig, ax = plt.subplots(figsize=(8.5, 8.5))
ax.bar(X_res_train.columns, dtree_res_fit.feature_importances_)
giro_mailing 0.720036
duration 0.073823
last_acc 0.043999
logins 0.035227
entry_age 0.021432
transactions_year 0.017323
total_mailings 0.014349
total_exposure 0.013042
total_savings 0.011623
age 0.008430
nr_hh 0.007799
marital_unmarried 0.003957
total_loans 0.003913
ppower 0.003391
pop_km 0.003119
gender_MF 0.002866
gender_M 0.001821
referrals 0.001724
res_move_365 0.001662
extra_acc 0.001424
nprod 0.001371
marital_married 0.001360
sec_acc 0.001262
calls 0.000948
mobile_logins 0.000851
constr_loan 0.000779
marital_divorced 0.000697
marital_single 0.000696
marital_cohabiting 0.000690
cons_loan 0.000320
gender_F 0.000067
marital_separated 0.000000
acad 0.000000
fixed_acc 0.000000
gender_nan 0.000000
complaints 0.000000
marital_widowed 0.000000
dtype: float64
dtree_imp = tree_gs_imp_fit.best_estimator_
# Imputed dataset
dtree_imp_fit = dtree_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_tree_gs = dtree_imp.predict(X_imp_test)
print(classification_report(y_imp_pred_tree_gs, y_imp_test['xsell']))

# Visualize tree
data_imp = export_graphviz(dtree_imp_fit, out_file=None, feature_names=X_imp_train.columns)
graph_imp = graph_from_dot_data(data_imp)
Image(graph_imp.create_png())

mat_imp = confusion_matrix(y_imp_pred_tree_gs, y_imp_test['xsell'])
sn.heatmap(mat_imp, square=True, annot=True, fmt='d', cbar=False)
precision recall f1-score support
0.0 0.89 0.88 0.89 1010
1.0 0.88 0.89 0.88 990
accuracy 0.89 2000
macro avg 0.89 0.89 0.88 2000
weighted avg 0.89 0.89 0.89 2000
# Most important features
dtree_importances = pd.Series(dtree_imp_fit.feature_importances_, X_imp_train.columns)
dtree_importances = dtree_importances.sort_values(ascending=False)
print(dtree_importances)
giro_mailing 0.555181
duration 0.135891
last_acc 0.050360
age 0.044987
logins 0.031553
transactions_year 0.029876
total_exposure 0.021039
total_savings 0.018081
nr_hh 0.014706
entry_age 0.013577
total_mailings 0.013466
ppower 0.010012
pop_km 0.009118
total_loans 0.006873
cons_loan 0.005123
car_seg 0.004500
nprod 0.004442
calls 0.003268
constr_loan 0.003177
gender_M 0.003037
marital_married 0.003011
gender_MF 0.002694
avg_res_dur 0.002399
marital_single 0.002223
occupation_white-collar worker 0.002182
extra_acc 0.001953
res_move_365 0.001867
pref_device_mobile 0.001312
occupation_housewife 0.001292
mobile_logins 0.001021
occupation_blue-collar worker 0.000837
marital_divorced 0.000540
pref_device_PC/Laptop 0.000396
gender_F 0.000012
occupation_private means 0.000000
pref_device_Tablet 0.000000
fixed_acc 0.000000
occupation_university student 0.000000
occupation_unemployed 0.000000
occupation_student 0.000000
occupation_soldier 0.000000
occupation_self-employed 0.000000
occupation_public servant 0.000000
occupation_freelancer 0.000000
occupation_pensioner/retiree 0.000000
complaints 0.000000
occupation_apprentice 0.000000
marital_widowed 0.000000
marital_unmarried 0.000000
sec_acc 0.000000
marital_separated 0.000000
acad 0.000000
marital_cohabiting 0.000000
referrals 0.000000
gender_nan 0.000000
dtype: float64
# Random Forest
rf_clf = RandomForestClassifier()

# Grid for optimizing hyperparameters
param_grid_rf = {'bootstrap': [True, False],
                 'max_depth': [10, 30, 50, 70, 90, None],
                 'max_features': ['auto', 'sqrt'],
                 'min_samples_leaf': [1, 3, 5],
                 'min_samples_split': [2, 5, 10],
                 'n_estimators': [50, 100, 500, 1000]}

# Random search of parameters, using 3-fold cross-validation with 50 iterations
rf_grid_search_res = RandomizedSearchCV(estimator=rf_clf, param_distributions=param_grid_rf,
                                        n_iter=50, cv=3, scoring="accuracy",
                                        return_train_score=True, verbose=True,
                                        random_state=42, n_jobs=-1)
rf_gs_res_fit = rf_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(rf_gs_res_fit.best_score_, rf_gs_res_fit.best_params_))

rf_grid_search_imp = RandomizedSearchCV(estimator=rf_clf, param_distributions=param_grid_rf,
                                        n_iter=50, cv=3, scoring="accuracy",
                                        return_train_score=True, verbose=True,
                                        random_state=42, n_jobs=-1)
rf_gs_imp_fit = rf_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(rf_gs_imp_fit.best_score_, rf_gs_imp_fit.best_params_))
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.8991769547325102) on the restricted dataset were achieved with the following parameters:
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.9035062048555301) on the imputed dataset were achieved with the following parameters:
{'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False}
rf_res = rf_gs_res_fit.best_estimator_
# Restricted dataset
print('Running RF with parameters {0} on the restricted Dataset: \n'.format(rf_gs_res_fit.best_params_))
rf_res_fit = rf_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_rf = rf_res_fit.predict(X_res_test)

# Classification report for this classifier
print(classification_report(y_res_pred_rf, y_res_test['xsell']))

# Confusion matrix (transposed so the rows are predictions, matching the axis labels)
mat_res = confusion_matrix(y_res_test['xsell'], y_res_pred_rf)
sn.heatmap(mat_res.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running RF with parameters {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False} on the restricted Dataset:
precision recall f1-score support
0.0 0.93 0.91 0.92 987
1.0 0.91 0.93 0.92 958
accuracy 0.92 1945
macro avg 0.92 0.92 0.92 1945
weighted avg 0.92 0.92 0.92 1945
# Most important features
forest_importances = pd.Series(rf_res_fit.feature_importances_, X_res_train.columns)
forest_importances = forest_importances.sort_values(ascending=False)
print(forest_importances)
plt.bar(X_res_train.columns, rf_res_fit.feature_importances_)
giro_mailing 0.640577
duration 0.093880
last_acc 0.048013
age 0.026012
logins 0.024710
total_mailings 0.022301
total_exposure 0.020774
entry_age 0.020022
total_savings 0.018770
nr_hh 0.014135
transactions_year 0.013338
ppower 0.008324
pop_km 0.008187
total_loans 0.006213
marital_married 0.004724
nprod 0.004446
marital_single 0.002840
cons_loan 0.002640
constr_loan 0.002506
extra_acc 0.002502
gender_M 0.002269
sec_acc 0.002183
gender_F 0.002117
gender_MF 0.001790
calls 0.001721
mobile_logins 0.001508
res_move_365 0.001325
marital_unmarried 0.000796
marital_divorced 0.000410
marital_cohabiting 0.000367
referrals 0.000186
acad 0.000126
marital_separated 0.000121
fixed_acc 0.000117
marital_widowed 0.000044
complaints 0.000007
gender_nan 0.000000
dtype: float64
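Impurity-based feature_importances_ can overstate correlated or many-valued features. As a cross-check (an addition, not part of the original analysis), scikit-learn's permutation_importance could be run on the held-out test set:

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the resulting drop in accuracy
perm = permutation_importance(rf_res_fit, X_res_test, y_res_test['xsell'], n_repeats=10, random_state=42, n_jobs=-1)
perm_importances = pd.Series(perm.importances_mean, index=X_res_test.columns)
print(perm_importances.sort_values(ascending=False).head(10))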
rf_imp = rf_gs_imp_fit.best_estimator_
# Imputed dataset
print('Running RF with parameters {0} on the imputed Dataset: \n'.format(rf_gs_imp_fit.best_params_))
rf_imp_fit = rf_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_rf = rf_imp_fit.predict(X_imp_test)

# Classification report for this classifier
print(classification_report(y_imp_pred_rf, y_imp_test['xsell']))

# Confusion matrix (transposed so the rows are predictions, matching the axis labels)
mat_imp = confusion_matrix(y_imp_test['xsell'], y_imp_pred_rf)
sn.heatmap(mat_imp.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running RF with parameters {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False} on the imputed Dataset:
precision recall f1-score support
0.0 0.93 0.91 0.92 1019
1.0 0.91 0.93 0.92 981
accuracy 0.92 2000
macro avg 0.92 0.92 0.92 2000
weighted avg 0.92 0.92 0.92 2000
# Most important features
forest_importances = pd.Series(rf_imp_fit.feature_importances_, X_imp_train.columns)
forest_importances = forest_importances.sort_values(ascending=False)
print(forest_importances)
giro_mailing 0.621187
duration 0.094728
last_acc 0.053538
logins 0.023384
age 0.020476
total_mailings 0.019647
total_exposure 0.018496
entry_age 0.018444
total_savings 0.015949
transactions_year 0.015451
nr_hh 0.012833
pop_km 0.008173
ppower 0.008070
car_seg 0.007866
avg_res_dur 0.007473
total_loans 0.006882
marital_married 0.004990
marital_single 0.004545
nprod 0.004008
pref_device_PC/Laptop 0.003895
constr_loan 0.002846
sec_acc 0.002609
gender_M 0.002550
gender_F 0.002443
marital_unmarried 0.002437
gender_MF 0.002319
occupation_white-collar worker 0.002304
occupation_self-employed 0.002279
cons_loan 0.002105
extra_acc 0.001955
calls 0.001440
mobile_logins 0.000780
res_move_365 0.000688
pref_device_Tablet 0.000616
pref_device_mobile 0.000540
marital_divorced 0.000342
occupation_blue-collar worker 0.000311
occupation_freelancer 0.000256
marital_cohabiting 0.000251
occupation_unemployed 0.000202
occupation_public servant 0.000193
acad 0.000126
referrals 0.000107
marital_widowed 0.000072
occupation_pensioner/retiree 0.000059
marital_separated 0.000058
fixed_acc 0.000041
occupation_housewife 0.000025
occupation_student 0.000010
complaints 0.000000
occupation_soldier 0.000000
occupation_private means 0.000000
occupation_university student 0.000000
occupation_apprentice 0.000000
gender_nan 0.000000
dtype: float64
# Gradient Boosting (scikit-learn's GradientBoostingClassifier; the prints below
# label it "XGBoost", but the xgboost library is not used here)
xgb_clf = GradientBoostingClassifier()
param_grid_xgb = {'n_estimators': [50, 100, 500],
                  'learning_rate': [1, 0.6, 0.2, 0.05, 0.01],
                  'max_features': ['auto', 'sqrt'],
                  'min_samples_leaf': [1, 3, 5],
                  'min_samples_split': [2, 5, 10],
                  'max_depth': [10, 30, 50, 70, None]}

# Random search of parameters (a full GridSearchCV over this grid would be far slower)
xgb_grid_search_res = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid_xgb,
                                         n_iter=50, cv=3, scoring="accuracy",
                                         return_train_score=True, verbose=True,
                                         random_state=42, n_jobs=-1)
xgb_gs_res_fit = xgb_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(xgb_gs_res_fit.best_score_, xgb_gs_res_fit.best_params_))
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.9038065843621399) on the restricted dataset were achieved with the following parameters:
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.6}
xgb_grid_search_imp = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid_xgb,
                                         n_iter=50, cv=3, scoring="accuracy",
                                         return_train_score=True, verbose=True,
                                         random_state=42, n_jobs=-1)
xgb_gs_imp_fit = xgb_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(xgb_gs_imp_fit.best_score_, xgb_gs_imp_fit.best_params_))
Fitting 3 folds for each of 50 candidates, totalling 150 fits
The best results (Accuracy: 0.9199987093540317) on the imputed dataset were achieved with the following parameters:
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 30, 'learning_rate': 0.6}
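GradientBoostingClassifier also exposes staged_predict, which yields predictions after every boosting stage. A small sketch (illustrative, using a clone so the estimator used below stays untouched) shows how to check whether fewer than the selected n_estimators would already suffice on the validation set:

from sklearn.base import clone
from sklearn.metrics import accuracy_score

model = clone(xgb_gs_res_fit.best_estimator_).fit(X_res_train, y_res_train['xsell'])
stage_acc = [accuracy_score(y_res_val['xsell'], y_hat) for y_hat in model.staged_predict(X_res_val)]
print('best stage:', int(np.argmax(stage_acc)) + 1, 'accuracy:', max(stage_acc))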
xgb_res = xgb_gs_res_fit.best_estimator_
# Restricted dataset
print('Running XGBoost with parameters {0} on the restricted Dataset: \n'.format(xgb_gs_res_fit.best_params_))
xgb_res_fit = xgb_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_xgb = xgb_res_fit.predict(X_res_test)

# Classification report for this classifier
print(classification_report(y_res_pred_xgb, y_res_test['xsell']))

# Confusion matrix
mat_res = confusion_matrix(y_res_test['xsell'], y_res_pred_xgb)
sn.heatmap(mat_res.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running XGBoost with parameters {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.6} on the restricted Dataset:
precision recall f1-score support
0.0 0.92 0.93 0.93 966
1.0 0.93 0.93 0.93 979
accuracy 0.93 1945
macro avg 0.93 0.93 0.93 1945
weighted avg 0.93 0.93 0.93 1945
# Most important features
xgb_importances = pd.Series(xgb_res_fit.feature_importances_, X_res_train.columns)
xgb_importances = xgb_importances.sort_values(ascending=False)
print(xgb_importances)
giro_mailing 5.816048e-01
duration 9.312843e-02
last_acc 7.137942e-02
total_mailings 2.935098e-02
logins 2.920605e-02
age 2.581673e-02
total_exposure 2.222142e-02
marital_married 2.050267e-02
entry_age 1.890555e-02
total_savings 1.831603e-02
nr_hh 1.636405e-02
total_loans 1.278161e-02
transactions_year 1.204004e-02
ppower 9.491902e-03
pop_km 8.490154e-03
nprod 5.133055e-03
extra_acc 3.863216e-03
calls 3.826234e-03
sec_acc 3.298887e-03
marital_single 2.591253e-03
gender_F 2.454651e-03
constr_loan 2.347785e-03
mobile_logins 1.608224e-03
gender_MF 1.512336e-03
gender_M 1.440673e-03
res_move_365 8.338529e-04
cons_loan 7.860614e-04
marital_unmarried 3.550378e-04
marital_cohabiting 1.343705e-04
marital_divorced 1.176337e-04
acad 8.157319e-05
fixed_acc 9.315930e-06
complaints 3.242547e-06
marital_separated 2.145037e-06
referrals 6.257511e-07
marital_widowed 3.572586e-08
gender_nan 0.000000e+00
dtype: float64
xgb_imp = xgb_gs_imp_fit.best_estimator_
# Imputed dataset
print('Running XGBoost with parameters {0} on the imputed Dataset: \n'.format(xgb_gs_imp_fit.best_params_))
xgb_imp_fit = xgb_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_xgb = xgb_imp_fit.predict(X_imp_test)

# Classification report for this classifier
print(classification_report(y_imp_pred_xgb, y_imp_test['xsell']))

# Confusion matrix
mat_imp = confusion_matrix(y_imp_test['xsell'], y_imp_pred_xgb)
sn.heatmap(mat_imp.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running XGBoost with parameters {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 30, 'learning_rate': 0.6} on the imputed Dataset:
precision recall f1-score support
0.0 0.92 0.92 0.92 998
1.0 0.92 0.92 0.92 1002
accuracy 0.92 2000
macro avg 0.92 0.92 0.92 2000
weighted avg 0.92 0.92 0.92 2000
# Most important features
xgb_importances = pd.Series(xgb_imp_fit.feature_importances_, X_imp_train.columns)
xgb_importances = xgb_importances.sort_values(ascending=False)
print(xgb_importances)
giro_mailing 6.445461e-01
duration 1.573122e-01
last_acc 2.871299e-02
total_savings 2.075701e-02
nr_hh 1.818409e-02
logins 1.664149e-02
age 1.369750e-02
entry_age 1.243769e-02
avg_res_dur 1.050905e-02
total_exposure 9.521362e-03
ppower 8.948275e-03
transactions_year 8.636020e-03
pop_km 8.402746e-03
total_mailings 7.671126e-03
car_seg 6.856799e-03
nprod 3.274113e-03
marital_married 2.932066e-03
gender_MF 2.822208e-03
sec_acc 2.579480e-03
gender_F 2.577289e-03
extra_acc 2.069599e-03
gender_M 1.818490e-03
occupation_white-collar worker 1.673652e-03
occupation_blue-collar worker 1.259882e-03
pref_device_Tablet 1.197092e-03
pref_device_PC/Laptop 1.125939e-03
calls 9.080079e-04
marital_single 8.360617e-04
fixed_acc 7.094792e-04
total_loans 4.726436e-04
cons_loan 1.918540e-04
marital_unmarried 1.781453e-04
res_move_365 1.753657e-04
occupation_self-employed 1.188641e-04
referrals 1.015899e-04
acad 5.142036e-05
mobile_logins 3.474689e-05
marital_separated 3.386300e-05
marital_divorced 1.489116e-05
occupation_public servant 6.036330e-06
occupation_pensioner/retiree 2.254548e-06
marital_widowed 4.565261e-07
occupation_university student 6.236806e-08
marital_cohabiting 2.666113e-08
constr_loan 1.303055e-08
pref_device_mobile 2.259301e-09
occupation_soldier 9.720828e-12
occupation_student 2.143495e-12
occupation_freelancer 4.490857e-16
occupation_housewife 2.303686e-35
occupation_unemployed 5.168225e-36
occupation_private means 0.000000e+00
occupation_apprentice 0.000000e+00
gender_nan 0.000000e+00
complaints 0.000000e+00
dtype: float64
SVM
svm_clf = svm.SVC()
param_grid_svm = {'C': [0.1, 1, 10, 100],
                  'gamma': [1, 0.1, 0.01, 0.001],
                  'kernel': ['rbf', 'poly', 'sigmoid']}

svm_grid_search_res = GridSearchCV(svm_clf, param_grid_svm, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
svm_gs_res_fit = svm_grid_search_res.fit(X_res_val, y_res_val['xsell'])
print('The best results (Accuracy: {0}) on the restricted dataset were achieved with the following parameters: \n {1}'.format(svm_gs_res_fit.best_score_, svm_gs_res_fit.best_params_))

svm_grid_search_imp = GridSearchCV(svm_clf, param_grid_svm, cv=10, scoring="accuracy",
                                   return_train_score=True, verbose=True, n_jobs=-1)
svm_gs_imp_fit = svm_grid_search_imp.fit(X_imp_val, y_imp_val['xsell'])
print('The best results (Accuracy: {0}) on the imputed dataset were achieved with the following parameters: \n {1}'.format(svm_gs_imp_fit.best_score_, svm_gs_imp_fit.best_params_))
Fitting 10 folds for each of 48 candidates, totalling 480 fits
The best results (Accuracy: 0.8909357652656622) on the restricted dataset were achieved with the following parameters:
{'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Fitting 10 folds for each of 48 candidates, totalling 480 fits
The best results (Accuracy: 0.8985) on the imputed dataset were achieved with the following parameters:
{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
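SVC does not output class probabilities by default. If cross-sell propensities were needed for ranking customers rather than hard 0/1 predictions, a variant with Platt scaling enabled could look like this sketch (parameters taken from the search above):

svc_prob = svm.SVC(C=1, gamma=1, kernel='rbf', probability=True)
# svc_prob.fit(X_res_train, y_res_train['xsell'])
# propensities = svc_prob.predict_proba(X_res_test)[:, 1]  # P(xsell = 1) per customer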
svm_res = svm_gs_res_fit.best_estimator_  # {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
# Restricted dataset
print('Running SVM with parameters {0} on the restricted Dataset: \n'.format(svm_gs_res_fit.best_params_))
svm_res_fit = svm_res.fit(X_res_train, y_res_train['xsell'])
y_res_pred_svm = svm_res_fit.predict(X_res_test)

# Classification report for this classifier
print(classification_report(y_res_pred_svm, y_res_test['xsell']))

# Confusion matrix
mat_res = confusion_matrix(y_res_test['xsell'], y_res_pred_svm)
sn.heatmap(mat_res.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running SVM with parameters {'C': 1, 'gamma': 1, 'kernel': 'rbf'} on the restricted Dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 972
1.0 0.91 0.91 0.91 973
accuracy 0.91 1945
macro avg 0.91 0.91 0.91 1945
weighted avg 0.91 0.91 0.91 1945
svm_imp = svm_gs_imp_fit.best_estimator_  # {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
# Imputed dataset
print('Running SVM with parameters {0} on the imputed Dataset: \n'.format(svm_gs_imp_fit.best_params_))
svm_imp_fit = svm_imp.fit(X_imp_train, y_imp_train['xsell'])
y_imp_pred_svm = svm_imp_fit.predict(X_imp_test)

# Classification report for this classifier
print(classification_report(y_imp_pred_svm, y_imp_test['xsell']))

# Confusion matrix
mat_imp = confusion_matrix(y_imp_test['xsell'], y_imp_pred_svm)
sn.heatmap(mat_imp.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('True label')
plt.ylabel('Predicted label')
Running SVM with parameters {'C': 1, 'gamma': 1, 'kernel': 'rbf'} on the imputed Dataset:
precision recall f1-score support
0.0 0.91 0.91 0.91 1001
1.0 0.91 0.91 0.91 999
accuracy 0.91 2000
macro avg 0.91 0.91 0.91 2000
weighted avg 0.91 0.91 0.91 2000
Neural Network
HIDDEN_NEURONS_res = len(X_res.columns)
HIDDEN_NEURONS_imp = len(X_imp.columns)

def create_res_baseline():
    # Create model: one hidden layer with dropout
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_res, input_dim=HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.2, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_imp_baseline():
    # Create model: one hidden layer with dropout
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_imp, input_dim=HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dropout(0.2, input_shape=(HIDDEN_NEURONS_imp,)))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Dropout after every non-linear activation function
def create_res_model():
    # Create model: three hidden layers, each followed by dropout
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_res, input_dim=HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(HIDDEN_NEURONS_res, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_res,)))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Dropout after the first two hidden layers (note: unlike create_res_model,
# the third hidden layer here has no dropout after it)
def create_imp_model():
    # Create model
    model = Sequential()
    model.add(Dense(HIDDEN_NEURONS_imp, input_dim=HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_imp,)))
    model.add(Dense(HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dropout(0.4, input_shape=(HIDDEN_NEURONS_imp,)))
    model.add(Dense(HIDDEN_NEURONS_imp, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
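Instead of fixing the epoch count at 50 or 25, Keras' EarlyStopping callback can halt training once the validation loss stops improving; a minimal sketch (an illustrative addition, not part of the original setup) would be:

from keras.callbacks import EarlyStopping

# Stop when the validation loss has not improved for 3 epochs; keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model = create_res_baseline()
# model.fit(X_res_train, y_res_train['xsell'], validation_data=(X_res_val, y_res_val['xsell']),
#           epochs=50, batch_size=5, callbacks=[early_stop], verbose=0)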
estimator = KerasClassifier(build_fn=create_res_baseline, epochs=50, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=7, shuffle=True)
results = cross_val_score(estimator, X_res, Y_res['xsell'], cv=kfold)
print("Baseline restricted data: {0} ({1})".format(results.mean() * 100, results.std() * 100))
Baseline: 89.86725636890957 (0.9019227930462493)
estimator = KerasClassifier(build_fn=create_imp_baseline, epochs=25, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X_imp, Y_imp['xsell'], cv=kfold)
print("Baseline imputed data: {0} ({1})".format(results.mean() * 100, results.std() * 100))
Baseline: 89.86999869346619 (0.4019956615884608)
estimator = KerasClassifier(build_fn=create_res_model, epochs=25, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True)