The data relates to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required, in order to assess whether the product (a bank term deposit) would be subscribed or not.
Number of Instances: 45211
Number of Attributes: 16 + output attribute.
The classification goal is to predict if the client will subscribe a term deposit (variable y).
Attribute information:
Input variables:
Bank client data:
1 – age (numeric)
2 – job : type of job (categorical: “admin.”,”unknown”,”unemployed”,”management”,”housemaid”,”entrepreneur”,”student”, “blue-collar”,”self-employed”,”retired”,”technician”,”services”)
3 – marital : marital status (categorical: “married”,”divorced”,”single”; note: “divorced” means divorced or widowed)
4 – education (categorical: “unknown”,”secondary”,”primary”,”tertiary”)
5 – default: has credit in default? (binary: “yes”,”no”)
6 – balance: average yearly balance, in euros (numeric)
7 – housing: has housing loan? (binary: “yes”,”no”)
8 – loan: has personal loan? (binary: “yes”,”no”)
Related with the last contact of the current campaign:
9 – contact: contact communication type (categorical: “unknown”,”telephone”,”cellular”)
10 – day: last contact day of the month (numeric)
11 – month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
12 – duration: last contact duration, in seconds (numeric)
Other attributes:
13 – campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14 – pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15 – previous: number of contacts performed before this campaign and for this client (numeric)
16 – poutcome: outcome of the previous marketing campaign (categorical: “unknown”,”other”,”failure”,”success”)
Output variable (desired target):
17 – y – has the client subscribed a term deposit? (binary: “yes”,”no”)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# The dataset lives in different places on Kaggle vs. a local machine.
# Try each candidate path in turn instead of reading the file twice
# (the original read the Kaggle path and then unconditionally overwrote
# the result with the local path, which fails anywhere but that machine).
_CANDIDATE_PATHS = [
    '../input/bank-marketing/bank-additional-full.csv',
    'D:/Dataset/bank-additional/bank-additional-full.csv',
]
for path in _CANDIDATE_PATHS:
    try:
        df = pd.read_csv(path, sep=';')  # the file is semicolon-separated
        break
    except FileNotFoundError:
        continue

df.head()

# Column dtypes, non-null counts, and memory footprint.
df.info()

# Class balance of the target -- heavily skewed toward "no".
df.y.value_counts()

EDA
# --- Numeric features: marginal distribution, class-conditional boxplot, CDF ---
# select_dtypes replaces the deprecated Index `|` union of dtype filters.
int_column = df.select_dtypes(include=['int64', 'float64']).columns

for column in int_column:
    plt.figure(figsize=(16, 4))

    # 1) Marginal distribution of the feature.
    plt.subplot(1, 3, 1)
    # sns.distplot was removed in seaborn 0.14; histplot is the replacement.
    sns.histplot(df[column], kde=True, stat='density')
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.title(f'{column} Distribution')

    # 2) Distribution split by the target, with the mean marked.
    plt.subplot(1, 3, 2)
    sns.boxplot(x='y', y=column, data=df, showmeans=True)
    plt.xlabel('Target')
    plt.ylabel(column)
    plt.title(f'{column} Distribution')

    # 3) Empirical CDF.  NOTE: np.histogram's `normed` kwarg was removed
    # in NumPy 1.24 -- `density=True` is the supported spelling.
    plt.subplot(1, 3, 3)
    counts, bins = np.histogram(df[column], bins=20, density=True)
    cdf = np.cumsum(counts)
    plt.plot(bins[1:], cdf / cdf[-1])
    plt.yticks(np.arange(0, 1.1, .1))
    plt.title(f'{column} cdf')
    plt.show()
    print()










# Five-number summary (min, quartiles, max) for every numeric column.
quantile_labels = [
    (0.00, 'Min:'),
    (0.25, '1º Quartile:'),
    (0.50, '2º Quartile:'),
    (0.75, '3º Quartile:'),
]
for column in int_column:
    print(f'For {column}:')
    for q, label in quantile_labels:
        print(label, df[column].quantile(q=q))
    print('Max:', df[column].quantile(q=1.00), '\n')
For age: Min: 17.0 1º Quartile: 32.0 2º Quartile: 38.0 3º Quartile: 47.0 Max: 98.0 For campaign: Min: 1.0 1º Quartile: 1.0 2º Quartile: 2.0 3º Quartile: 3.0 Max: 56.0 For cons.conf.idx: Min: -50.8 1º Quartile: -42.7 2º Quartile: -41.8 3º Quartile: -36.4 Max: -26.9 For cons.price.idx: Min: 92.201 1º Quartile: 93.075 2º Quartile: 93.749 3º Quartile: 93.994 Max: 94.767 For duration: Min: 0.0 1º Quartile: 102.0 2º Quartile: 180.0 3º Quartile: 319.0 Max: 4918.0 For emp.var.rate: Min: -3.4 1º Quartile: -1.8 2º Quartile: 1.1 3º Quartile: 1.4 Max: 1.4 For euribor3m: Min: 0.634 1º Quartile: 1.344 2º Quartile: 4.857 3º Quartile: 4.961 Max: 5.045 For nr.employed: Min: 4963.6 1º Quartile: 5099.1 2º Quartile: 5191.0 3º Quartile: 5228.1 Max: 5228.1 For pdays: Min: 0.0 1º Quartile: 999.0 2º Quartile: 999.0 3º Quartile: 999.0 Max: 999.0 For previous: Min: 0.0 1º Quartile: 0.0 2º Quartile: 0.0 3º Quartile: 0.0 Max: 7.0
# Trim the extreme tails flagged by the quantile/CDF inspection above, and
# drop `pdays`, which is dominated by its "never contacted" sentinel value.
keep = (df.age <= 60) & (df.campaign <= 10) & (df.duration <= 1000)
df = df.loc[keep].drop(columns='pdays')
EDA for object-type (categorical) columns
dfgrouped = df.groupby('y')


def plot_barh(array, incrementer, bias, text_color='blue', palette_style='darkgrid', palette_color='RdBu'):
    """Draw a horizontal percentage bar chart of *array* (a pandas Series).

    X-axis tick spacing is controlled by *incrementer*, and *bias* pads the
    axis beyond the largest bar.  Each bar is annotated with its percentage.
    Returns the ``plt`` module so calls can be chained.
    """
    sns.set_style(palette_style)
    sns.set_palette(palette_color)
    plt.barh(array.index, width=array.values, height=.5)
    plt.yticks(np.arange(len(array)))
    upper = round(max(array)) + bias
    plt.xticks(range(0, upper, incrementer))
    for pos, pct in enumerate(array.values):
        plt.text(pct + .5, pos, s='{:.1f}%'.format(pct), color=text_color)
    return plt
def feature_perc(feature, groupby='yes'):
    """Percentage of each category of *feature* that falls in target class *groupby*."""
    in_class = dfgrouped.get_group(groupby)[feature].value_counts()
    overall = df[feature].value_counts()[in_class.index]
    return in_class / overall * 100
# Categorical columns (the last one is the target `y` itself, so skip it below).
obj_column = df.dtypes[df.dtypes == 'object'].index
obj_column

for column in obj_column[:-1]:
    subscribed = feature_perc(column, groupby='yes').sort_values()
    declined = feature_perc(column, groupby='no').sort_values()
    plt.figure(figsize=(16, 6))
    plt.subplot(1, 2, 1)
    plt.title(f'Success rate by {column}')
    plot_barh(subscribed, 5, 10)
    plt.subplot(1, 2, 2)
    plt.title(f'Failure rate by {column}')
    plot_barh(declined, 5, 10)
    plt.show()
    print()







Modeling
# --- Balance the classes by upsampling the minority ("yes") class ---
df1 = df.copy()
# Binary-encode the target: "no" -> 0, anything else ("yes") -> 1.
df1['y'] = df1.y.apply(lambda x: 0 if x == 'no' else 1)
df1.y.value_counts()

from sklearn.utils import resample

# Separate majority and minority classes.
df1_majority = df1[df1.y == 0]
df1_minority = df1[df1.y == 1]

# Upsample the minority class with replacement until it matches the
# majority class size.  BUG FIX: the target count was hard-coded (36962),
# which silently drifts out of sync whenever the filtering above changes;
# derive it from the data instead.
df1_minority_upsampled = resample(df1_minority,
                                  replace=True,                 # sample with replacement
                                  n_samples=len(df1_majority),  # match majority class
                                  random_state=42)              # reproducible results

# Combine majority class with upsampled minority class.
df = pd.concat([df1_majority, df1_minority_upsampled])

# Display new class counts -- should now be perfectly balanced.
df.y.value_counts()

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Integer-encode every remaining object column, keeping a side-by-side
# table of encoded vs. original values for later reference.
obj_column = df.dtypes[df.dtypes == 'object'].index
mapingdf = pd.DataFrame()
for column in obj_column:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    mapingdf[column] = df[column]
    mapingdf['_' + column] = encoder.inverse_transform(df[column])
# for reference
mapingdf

# Sanity-check: all columns should now be numeric.
df.head()

# Linear correlation of each feature with the target, weakest to strongest.
df.corr().y.sort_values()

# --- Train/test split, scaling, and model comparison ---
# Stratify so both splits keep the (now balanced) class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('y', axis=1), df['y'],
    test_size=.3, random_state=42, stratify=df['y'])

# Fit the scaler on the training set only, then apply the same transform
# to the test set, so no test-set statistics leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models = [DecisionTreeClassifier(),
          RandomForestClassifier(),
          XGBClassifier()]
names = ['DecisionTreeClassifier',
         'RandomForestClassifier',
         'XGBClassifier']

# Fit each model and report accuracy + confusion matrices on both splits.
for model, name in zip(models, names):
    model.fit(X_train, y_train)  # fit() returns the estimator itself; no need to bind it
    print(name, 'report:')
    print('Train score', model.score(X_train, y_train))
    print('Test score', model.score(X_test, y_test))
    print()
    print("Train confusion matrix:\n", confusion_matrix(y_train, model.predict(X_train)), '\n')
    print("Test confusion matrix:\n", confusion_matrix(y_test, model.predict(X_test)))
    print('*' * 50)
DecisionTreeClassifier report: Train score 1.0 Test score 0.9706128133704736 Train confusion matrix: [[24386 0] [ 0 25873]] Test confusion matrix: [[ 9822 629] [ 4 11085]] ************************************************** RandomForestClassifier report: Train score 1.0 Test score 0.9730733519034355 Train confusion matrix: [[24386 0] [ 0 25873]] Test confusion matrix: [[ 9871 580] [ 0 11089]] ************************************************** XGBClassifier report: Train score 0.8981475954555403 Test score 0.8935468895078923 Train confusion matrix: [[20601 3785] [ 1334 24539]] Test confusion matrix: [[ 8758 1693] [ 600 10489]] **************************************************
# Fit a shallow, interpretable tree for visualization.
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

from sklearn.tree import plot_tree

plt.figure(figsize=(20, 15))
# BUG FIX: the target was encoded "no" -> 0, "yes" -> 1, and plot_tree
# expects class_names ordered by class label -- the original ['yes','no']
# labelled every node with the opposite class.
plot_tree(model,
          feature_names=df1.drop('y', axis=1).columns,
          class_names=['no', 'yes'],
          filled=True)
plt.show()

Read More Python Blogs here.