Note that:
- Questions 1-5 are answered throughout the code in this notebook.
- Detailed answers for Questions 6-8 are provided in the report.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
dataset = pd.read_csv('/Users/rianyan/Desktop/train_ctrUa4K.csv')
dataset.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
1 | LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
2 | LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
3 | LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
4 | LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
dataset.shape
(614, 13)
dataset = dataset.sample(n=550, random_state=72)
dataset.to_csv('RianYan_2424072.csv')
data = pd.read_csv('RianYan_2424072.csv')
data.head()
Unnamed: 0 | Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 206 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | Y |
1 | 576 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N |
2 | 198 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 113.0 | 360.0 | NaN | Semiurban | Y |
3 | 587 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y |
4 | 90 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 131.0 | 360.0 | 1.0 | Semiurban | Y |
data = data.drop('Unnamed: 0', axis = 1)
Q1. Use and explain the following DataFrame functions/properties on your data. describe() size ndim shape Hint: use print() to see what are the outputs.
data.describe()
ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | |
---|---|---|---|---|---|
count | 550.000000 | 550.000000 | 528.000000 | 538.000000 | 503.000000 |
mean | 5482.732727 | 1676.422036 | 147.928030 | 342.156134 | 0.840954 |
std | 6341.658306 | 3059.776181 | 85.214822 | 65.092098 | 0.366083 |
min | 150.000000 | 0.000000 | 9.000000 | 12.000000 | 0.000000 |
25% | 2877.500000 | 0.000000 | 102.000000 | 360.000000 | 1.000000 |
50% | 3812.500000 | 1211.500000 | 128.000000 | 360.000000 | 1.000000 |
75% | 5844.000000 | 2324.000000 | 168.500000 | 360.000000 | 1.000000 |
max | 81000.000000 | 41667.000000 | 700.000000 | 480.000000 | 1.000000 |
data.describe(): The describe() function in pandas generates descriptive statistics of a DataFrame, including measures such as count, mean, standard deviation, minimum, and maximum values, as well as the quartiles (25%, 50%, and 75%) for numerical columns.
print(data.size)
7150
data.size: The size attribute in pandas returns the total number of elements in a DataFrame, calculated as the number of rows multiplied by the number of columns. print(data.size) returns 7150, meaning there are 7150 elements in total (550 rows × 13 columns).
print(data.shape[0], data.shape[1])
550 13
data.shape: The shape attribute in pandas returns a tuple representing the dimensions of the DataFrame: (number of rows, number of columns). In this dataset, there are 550 rows and 13 columns.
print(data.ndim)
2
data.ndim: The ndim attribute in pandas returns the number of dimensions of the DataFrame, which is always 2 for a DataFrame (rows and columns); a single column (a Series) has ndim 1.
print(data.shape)
(550, 13)
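These relationships can be verified on a tiny synthetic DataFrame (a sketch; `df` below is toy data, not the loan dataset):

```python
import pandas as pd

# Toy DataFrame to illustrate how shape, size, and ndim relate.
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})

assert df.shape == (3, 2)                    # (rows, columns)
assert df.size == df.shape[0] * df.shape[1]  # 3 * 2 = 6 elements
assert df.ndim == 2                          # a DataFrame is always 2-D
assert df['a'].ndim == 1                     # a single column (Series) is 1-D
```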
Q2. Is there any difference between dimensions of the original dataset and the new dataset? If yes, what is the difference?
- Answer: Both DataFrames are two-dimensional (ndim is 2 for each), so the number of dimensions is the same. Their shapes differ, however: the original dataset has 614 rows while the sampled dataset has 550 rows, and both have 13 columns.
Q3. What are the possible values 'Education' can take? Write code to display all the possible values of 'Education'.
data.value_counts('Education')
Education Graduate 428 Not Graduate 122 Name: count, dtype: int64
data.Education.unique()
array(['Graduate', 'Not Graduate'], dtype=object)
Possible values for 'Education' are just 'Graduate' and 'Not Graduate'. 'Education' is a binary variable.
Data Analysis
columns = data.columns
print(columns)
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'], dtype='object')
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | Y |
1 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 113.0 | 360.0 | NaN | Semiurban | Y |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 131.0 | 360.0 | 1.0 | Semiurban | Y |
Analysis of 'ApplicantIncome' & 'LoanAmount'
data['ApplicantIncome'].hist(bins=50)
<Axes: >
Q4. Use boxplot and histogram on 'ApplicantIncome' to visualise its distribution.
- 4a. What are the extreme values? Are there any outliers(s) exist in this dataset? Explain with example based on the 'ApplicantIncome'?
data.boxplot(column = 'ApplicantIncome')
<Axes: >
Identify Extreme values
Max_value = data.ApplicantIncome.max()
Min_value = data.ApplicantIncome.min()
print(Max_value, Min_value)
81000 150
The maximum is 81000 and the minimum is 150; these are the extreme values of 'ApplicantIncome'.
Identify outliers
In a boxplot, outliers are the values outside the whiskers, i.e. values more than 1.5 × IQR above the upper quartile or more than 1.5 × IQR below the lower quartile. From the boxplot shown above, the variable 'ApplicantIncome' has numerous outliers, since there are clearly many points above the upper whisker.
q1 = data.ApplicantIncome.quantile(0.25)
q3 = data.ApplicantIncome.quantile(0.75)
iqr = q3 - q1
print(q1, q3, iqr)
2877.5 5844.0 2966.5
lower_threshold = q1 - 1.5 * iqr
upper_threshold = q3 + 1.5 * iqr
print (lower_threshold, upper_threshold)
-1572.25 10293.75
outliers = data[data['ApplicantIncome'] > upper_threshold]
print(outliers)
(output truncated: 46 outlier rows × 13 columns; 'ApplicantIncome' among the outliers ranges from 10408 up to 81000, e.g. LP002317 with 81000)
print(outliers.count())
Loan_ID 46 Gender 43 Married 46 Dependents 45 Education 46 Self_Employed 44 ApplicantIncome 46 CoapplicantIncome 46 LoanAmount 44 Loan_Amount_Term 46 Credit_History 44 Property_Area 46 Loan_Status 46 dtype: int64
There are 46 outliers in 'ApplicantIncome' based on the rule of values greater than the upper threshold, Q3 + 1.5 × IQR = 10293.75 (no values fall below the lower threshold, since it is negative).
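The IQR rule used above can be wrapped in a small helper so the same check applies to any numeric column (a sketch; the function name `iqr_outliers` is ours, not from the notebook):

```python
import pandas as pd

def iqr_outliers(s: pd.Series, k: float = 1.5) -> pd.Series:
    """Return the values of s falling outside [Q1 - k*IQR, Q3 + k*IQR]."""
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return s[(s < q1 - k * iqr) | (s > q3 + k * iqr)]

# Toy example: 100 is far above the rest and is flagged.
s = pd.Series([1, 2, 3, 4, 5, 100])
print(iqr_outliers(s).tolist())  # -> [100]
```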
4b. Are the results of both the plots comparable? Are there any differences in the two plots? What are the key differences?
The results of the boxplot and histogram for detecting extreme values and outliers in 'ApplicantIncome' are comparable, but they offer different perspectives. The boxplot directly shows outliers as points outside the whiskers, making extreme values easy to identify. The histogram highlights the frequency distribution of income values, with outliers appearing as isolated bars with very low frequencies at the extremes. While both plots are useful for detecting extreme values and outliers, the boxplot is more direct for outlier identification, whereas the histogram provides a more detailed view of the overall data distribution.
Categorical Analysis
data['Credit_History'].value_counts()
Credit_History 1.0 423 0.0 80 Name: count, dtype: int64
credit_history = data['Credit_History'].value_counts(ascending=True)
def transform(x):
    return x.map({'Y': 1, 'N': 0}).mean()
loan_probability = data.pivot_table(values='Loan_Status', index=['Credit_History'],
                                    aggfunc=transform)
print('Frequency Table for Credit History:')
print(credit_history)
print('\nProbability of getting loan for each Credit History class:')
print(loan_probability)
Frequency Table for Credit History: Credit_History 0.0 80 1.0 423 Name: count, dtype: int64 Probability of getting loan for each Credit History class: Loan_Status Credit_History 0.0 0.062500 1.0 0.801418
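The pivot_table with a custom aggregator above is equivalent to mapping 'Loan_Status' to 0/1 and taking a per-group mean (a sketch on a toy frame, not the real data):

```python
import pandas as pd

# Toy frame mimicking the two columns used above.
df = pd.DataFrame({
    'Credit_History': [1.0, 1.0, 0.0, 1.0, 0.0],
    'Loan_Status':    ['Y', 'N', 'N', 'Y', 'N'],
})

# Map Y/N to 1/0, then average within each Credit_History group:
# the mean of a 0/1 column is the approval probability.
prob = df['Loan_Status'].map({'Y': 1, 'N': 0}).groupby(df['Credit_History']).mean()
print(prob)  # group 0.0 -> 0.0, group 1.0 -> 2/3
```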
data['Loan_Status'].value_counts()
Loan_Status Y 378 N 172 Name: count, dtype: int64
data.shape
(550, 13)
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | Y |
1 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 113.0 | 360.0 | NaN | Semiurban | Y |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 131.0 | 360.0 | 1.0 | Semiurban | Y |
fig = plt.figure(figsize=(8,4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Credit_History')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by Credit_History")
credit_history.plot(kind='bar', ax=ax1)
ax2 = fig.add_subplot(122)
ax2.set_xlabel('Credit_History')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by credit history")
loan_probability.plot(kind='bar', ax=ax2)
plt.tight_layout()
plt.show()
Data Preprocessing
data.Gender.value_counts()
Gender Male 435 Female 103 Name: count, dtype: int64
Filling missing values by mean
def missing_value(x):
    return sum(x.isnull())
data.apply(missing_value, axis = 0)
Loan_ID 0 Gender 12 Married 3 Dependents 15 Education 0 Self_Employed 29 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 12 Credit_History 47 Property_Area 0 Loan_Status 0 dtype: int64
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | Y |
1 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 113.0 | 360.0 | NaN | Semiurban | Y |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 131.0 | 360.0 | 1.0 | Semiurban | Y |
data.LoanAmount.fillna(data.LoanAmount.mean(), inplace = True)
data.apply(missing_value, axis=0)
Loan_ID 0 Gender 12 Married 3 Dependents 15 Education 0 Self_Employed 29 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 0 Loan_Amount_Term 12 Credit_History 47 Property_Area 0 Loan_Status 0 dtype: int64
data.shape
(550, 13)
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | Y |
1 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 113.0 | 360.0 | NaN | Semiurban | Y |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 131.0 | 360.0 | 1.0 | Semiurban | Y |
data.to_csv('new_train.csv')
It will be interesting to know how much loan amount could be offered to different sorts of applicants based on their 'Education' and 'Self_Employed' status.
data.boxplot(column = 'LoanAmount', by = ['Education', 'Self_Employed'], grid=False, rot=45, fontsize= 10)
<Axes: title={'center': 'LoanAmount'}, xlabel='[Education, Self_Employed]'>
data.Self_Employed.value_counts()
Self_Employed No 445 Yes 76 Name: count, dtype: int64
data.apply(missing_value, axis=0)
Loan_ID 0 Gender 12 Married 3 Dependents 15 Education 0 Self_Employed 29 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 0 Loan_Amount_Term 12 Credit_History 47 Property_Area 0 Loan_Status 0 dtype: int64
data.Self_Employed.fillna('No', inplace=True)
data.apply(missing_value, axis=0)
Loan_ID 0 Gender 12 Married 3 Dependents 15 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 0 Loan_Amount_Term 12 Credit_History 47 Property_Area 0 Loan_Status 0 dtype: int64
data.describe()
ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | |
---|---|---|---|---|---|
count | 550.000000 | 550.000000 | 550.000000 | 538.000000 | 503.000000 |
mean | 5482.732727 | 1676.422036 | 147.928030 | 342.156134 | 0.840954 |
std | 6341.658306 | 3059.776181 | 83.489965 | 65.092098 | 0.366083 |
min | 150.000000 | 0.000000 | 9.000000 | 12.000000 | 0.000000 |
25% | 2877.500000 | 0.000000 | 104.000000 | 360.000000 | 1.000000 |
50% | 3812.500000 | 1211.500000 | 130.000000 | 360.000000 | 1.000000 |
75% | 5844.000000 | 2324.000000 | 165.000000 | 360.000000 | 1.000000 |
max | 81000.000000 | 41667.000000 | 700.000000 | 480.000000 | 1.000000 |
Deal with missing values and outliers for LoanAmount
plt.hist(data['LoanAmount'], 20, facecolor='b')
plt.xlabel('Loan Amount')
plt.ylabel('Count')
plt.title('Histogram for loan amount')
plt.grid(True)
plt.show()
data.boxplot(column='LoanAmount')
<Axes: >
data['LoanAmount_log'] = np.log(data.LoanAmount)
plt.hist(data['LoanAmount_log'], 20, facecolor='b')
plt.xlabel('Loan Amount')
plt.ylabel('Count')
plt.title('Histogram for loan amount')
plt.grid(True)
plt.show()
data.boxplot(column='LoanAmount_log')
<Axes: >
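The log transform is applied because 'LoanAmount' is right-skewed: compressing the long right tail makes the distribution closer to symmetric, as the second histogram and boxplot show. The effect can be sketched on toy right-skewed values (not the loan data) using pandas' skew():

```python
import numpy as np
import pandas as pd

# Toy right-skewed sample: a few large values pull the skewness up;
# np.log compresses the right tail.
s = pd.Series([9, 70, 100, 120, 150, 200, 700], dtype=float)
print(s.skew())           # strongly positive
print(np.log(s).skew())   # much smaller in magnitude
```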
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | Y | 4.382027 |
1 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N | 4.912655 |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 113.0 | 360.0 | NaN | Semiurban | Y | 4.727388 |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y | 4.248495 |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 131.0 | 360.0 | 1.0 | Semiurban | Y | 4.875197 |
data.describe()
ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | LoanAmount_log | |
---|---|---|---|---|---|---|
count | 550.000000 | 550.000000 | 550.000000 | 538.000000 | 503.000000 | 550.000000 |
mean | 5482.732727 | 1676.422036 | 147.928030 | 342.156134 | 0.840954 | 4.875161 |
std | 6341.658306 | 3059.776181 | 83.489965 | 65.092098 | 0.366083 | 0.491616 |
min | 150.000000 | 0.000000 | 9.000000 | 12.000000 | 0.000000 | 2.197225 |
25% | 2877.500000 | 0.000000 | 104.000000 | 360.000000 | 1.000000 | 4.644391 |
50% | 3812.500000 | 1211.500000 | 130.000000 | 360.000000 | 1.000000 | 4.867534 |
75% | 5844.000000 | 2324.000000 | 165.000000 | 360.000000 | 1.000000 | 5.105945 |
max | 81000.000000 | 41667.000000 | 700.000000 | 480.000000 | 1.000000 | 6.551080 |
del data['LoanAmount']
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 360.0 | 1.0 | Urban | Y | 4.382027 |
1 | LP002872 | NaN | Yes | 0 | Graduate | No | 3087 | 2210.0 | 360.0 | 0.0 | Semiurban | N | 4.912655 |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 360.0 | NaN | Semiurban | Y | 4.727388 |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 360.0 | 1.0 | Semiurban | Y | 4.248495 |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 360.0 | 1.0 | Semiurban | Y | 4.875197 |
#check which variables still have missing values
data.apply(missing_value, axis=0)
Loan_ID 0 Gender 12 Married 3 Dependents 15 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 Loan_Amount_Term 12 Credit_History 47 Property_Area 0 Loan_Status 0 LoanAmount_log 0 dtype: int64
#Deal with all the remaining missing values
data['Gender'].fillna(data['Gender'].mode()[0], inplace = True)
data['Married'].fillna(data['Married'].mode()[0], inplace = True)
data['Dependents'].fillna(data['Dependents'].mode()[0], inplace = True)
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0], inplace = True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0], inplace = True)
data.apply(lambda x : sum(x.isnull()), axis=0)
Loan_ID 0 Gender 0 Married 0 Dependents 0 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 Loan_Amount_Term 0 Credit_History 0 Property_Area 0 Loan_Status 0 LoanAmount_log 0 dtype: int64
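The five fillna calls above can also be written as one loop over the affected columns (a sketch on toy data; same behaviour, less repetition):

```python
import numpy as np
import pandas as pd

# Toy frame with gaps: each column is filled with its own
# most frequent value (mode).
df = pd.DataFrame({
    'Gender':  ['Male', 'Male', np.nan, 'Female'],
    'Married': ['Yes', np.nan, 'Yes', 'No'],
})
for col in ['Gender', 'Married']:
    df[col] = df[col].fillna(df[col].mode()[0])

print(df['Gender'].tolist())   # -> ['Male', 'Male', 'Male', 'Female']
```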
Q5. Use LabelEncoder to convert categorical variables into numeric. Hint: You will first need to identify categorical values.
#identify all categorical columns
data.dtypes
Loan_ID object Gender object Married object Dependents object Education object Self_Employed object ApplicantIncome int64 CoapplicantIncome float64 Loan_Amount_Term float64 Credit_History float64 Property_Area object Loan_Status object LoanAmount_log float64 dtype: object
columns = list(data)
print (columns)
['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status', 'LoanAmount_log']
#save the names of the categorical columns into the list c_columns
c_columns = list(data.select_dtypes(exclude=['float64', 'int64']))
print (c_columns)
['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
#remove 'Loan_ID', since it is an identifier rather than a categorical feature
c_columns.pop(0)
print (c_columns)
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
#import LabelEncoder from scikit-learn
from sklearn.preprocessing import LabelEncoder
#import copy to save the transformed dataset to a new dataframe
import copy
#initialize LabelEncoder
le = LabelEncoder()
#save the transformed dataset to new dataframe
data_trans = copy.deepcopy(data)
#transform categorical value to numeric value
for i in c_columns:
    data_trans[i] = le.fit_transform(data[i])
data_trans.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | 0 | 0 | 0 | 0 | 0 | 3244 | 0.0 | 360.0 | 1.0 | 2 | 1 | 4.382027 |
1 | LP002872 | 1 | 1 | 0 | 0 | 0 | 3087 | 2210.0 | 360.0 | 0.0 | 1 | 0 | 4.912655 |
2 | LP001671 | 0 | 1 | 0 | 0 | 0 | 3416 | 2816.0 | 360.0 | 1.0 | 1 | 1 | 4.727388 |
3 | LP002917 | 0 | 0 | 0 | 1 | 0 | 2165 | 0.0 | 360.0 | 1.0 | 1 | 1 | 4.248495 |
4 | LP001316 | 1 | 1 | 0 | 0 | 0 | 2958 | 2900.0 | 360.0 | 1.0 | 1 | 1 | 4.875197 |
data.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | Female | No | 0 | Graduate | No | 3244 | 0.0 | 360.0 | 1.0 | Urban | Y | 4.382027 |
1 | LP002872 | Male | Yes | 0 | Graduate | No | 3087 | 2210.0 | 360.0 | 0.0 | Semiurban | N | 4.912655 |
2 | LP001671 | Female | Yes | 0 | Graduate | No | 3416 | 2816.0 | 360.0 | 1.0 | Semiurban | Y | 4.727388 |
3 | LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 360.0 | 1.0 | Semiurban | Y | 4.248495 |
4 | LP001316 | Male | Yes | 0 | Graduate | No | 2958 | 2900.0 | 360.0 | 1.0 | Semiurban | Y | 4.875197 |
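Note that the loop above reuses a single LabelEncoder, so after the loop `le.classes_` only reflects the last column fitted. A sketch that keeps one encoder per column instead, so every mapping stays recoverable (toy data, not the loan dataset):

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept in a dict keyed by column name.
df = pd.DataFrame({'Gender': ['Female', 'Male', 'Female'],
                   'Loan_Status': ['Y', 'N', 'Y']})
encoders = {}
for col in df.columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# LabelEncoder assigns codes in sorted order of the labels.
print(df['Gender'].tolist())                   # -> [0, 1, 0]
print(list(encoders['Loan_Status'].classes_))  # -> ['N', 'Y']
```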
data_trans.head(25)
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | 0 | 0 | 0 | 0 | 0 | 3244 | 0.0 | 360.0 | 1.0 | 2 | 1 | 4.382027 |
1 | LP002872 | 1 | 1 | 0 | 0 | 0 | 3087 | 2210.0 | 360.0 | 0.0 | 1 | 0 | 4.912655 |
2 | LP001671 | 0 | 1 | 0 | 0 | 0 | 3416 | 2816.0 | 360.0 | 1.0 | 1 | 1 | 4.727388 |
3 | LP002917 | 0 | 0 | 0 | 1 | 0 | 2165 | 0.0 | 360.0 | 1.0 | 1 | 1 | 4.248495 |
4 | LP001316 | 1 | 1 | 0 | 0 | 0 | 2958 | 2900.0 | 360.0 | 1.0 | 1 | 1 | 4.875197 |
5 | LP002250 | 1 | 1 | 0 | 0 | 0 | 5488 | 0.0 | 360.0 | 1.0 | 0 | 1 | 4.828314 |
6 | LP001137 | 0 | 0 | 0 | 0 | 0 | 3410 | 0.0 | 360.0 | 1.0 | 2 | 1 | 4.477337 |
7 | LP001580 | 1 | 1 | 2 | 0 | 0 | 8000 | 0.0 | 360.0 | 1.0 | 1 | 1 | 5.298317 |
8 | LP002209 | 0 | 0 | 0 | 0 | 0 | 2764 | 1459.0 | 360.0 | 1.0 | 2 | 1 | 4.700480 |
9 | LP001030 | 1 | 1 | 2 | 0 | 0 | 1299 | 1086.0 | 120.0 | 1.0 | 2 | 1 | 2.833213 |
10 | LP001426 | 1 | 1 | 0 | 0 | 0 | 5667 | 2667.0 | 360.0 | 1.0 | 0 | 1 | 5.192957 |
11 | LP002100 | 1 | 0 | 0 | 0 | 0 | 2833 | 0.0 | 360.0 | 1.0 | 2 | 1 | 4.262680 |
12 | LP001964 | 1 | 1 | 0 | 1 | 0 | 1800 | 2934.0 | 360.0 | 0.0 | 2 | 0 | 4.532599 |
13 | LP001369 | 1 | 1 | 2 | 0 | 0 | 11417 | 1126.0 | 360.0 | 1.0 | 2 | 1 | 5.416100 |
14 | LP002723 | 1 | 0 | 2 | 0 | 0 | 3588 | 0.0 | 360.0 | 0.0 | 0 | 0 | 4.700480 |
15 | LP002768 | 1 | 0 | 0 | 1 | 0 | 3358 | 0.0 | 36.0 | 1.0 | 1 | 0 | 4.382027 |
16 | LP001953 | 1 | 1 | 1 | 0 | 0 | 6875 | 0.0 | 360.0 | 1.0 | 1 | 1 | 5.298317 |
17 | LP002978 | 0 | 0 | 0 | 0 | 0 | 2900 | 0.0 | 360.0 | 1.0 | 0 | 1 | 4.262680 |
18 | LP002855 | 1 | 1 | 2 | 0 | 0 | 16666 | 0.0 | 360.0 | 1.0 | 2 | 1 | 5.616771 |
19 | LP002201 | 1 | 1 | 2 | 0 | 1 | 9323 | 7873.0 | 300.0 | 1.0 | 0 | 1 | 5.940171 |
20 | LP002602 | 1 | 0 | 0 | 0 | 0 | 6283 | 4416.0 | 360.0 | 0.0 | 0 | 0 | 5.342334 |
21 | LP001998 | 1 | 1 | 2 | 1 | 0 | 7667 | 0.0 | 360.0 | 1.0 | 0 | 1 | 5.220356 |
22 | LP002505 | 1 | 1 | 0 | 0 | 0 | 4333 | 2451.0 | 360.0 | 1.0 | 2 | 0 | 4.700480 |
23 | LP001726 | 1 | 1 | 0 | 0 | 0 | 3727 | 1775.0 | 360.0 | 1.0 | 1 | 1 | 4.875197 |
24 | LP001392 | 0 | 0 | 1 | 0 | 1 | 7451 | 0.0 | 360.0 | 1.0 | 1 | 1 | 4.996726 |
Data Normalization
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
#save the transformed dataset to a new dataframe
data_for_norm = data_trans.copy()
data_for_norm.head()
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001693 | 0 | 0 | 0 | 0 | 0 | 3244 | 0.0 | 360.0 | 1.0 | 2 | 1 | 4.382027 |
1 | LP002872 | 1 | 1 | 0 | 0 | 0 | 3087 | 2210.0 | 360.0 | 0.0 | 1 | 0 | 4.912655 |
2 | LP001671 | 0 | 1 | 0 | 0 | 0 | 3416 | 2816.0 | 360.0 | 1.0 | 1 | 1 | 4.727388 |
3 | LP002917 | 0 | 0 | 0 | 1 | 0 | 2165 | 0.0 | 360.0 | 1.0 | 1 | 1 | 4.248495 |
4 | LP001316 | 1 | 1 | 0 | 0 | 0 | 2958 | 2900.0 | 360.0 | 1.0 | 1 | 1 | 4.875197 |
#drop 'Loan_ID' and 'Loan_Status'
data_for_norm = data_for_norm.drop(['Loan_ID', 'Loan_Status'], axis=1)
data_for_norm.head()
Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 3244 | 0.0 | 360.0 | 1.0 | 2 | 4.382027 |
1 | 1 | 1 | 0 | 0 | 0 | 3087 | 2210.0 | 360.0 | 0.0 | 1 | 4.912655 |
2 | 0 | 1 | 0 | 0 | 0 | 3416 | 2816.0 | 360.0 | 1.0 | 1 | 4.727388 |
3 | 0 | 0 | 0 | 1 | 0 | 2165 | 0.0 | 360.0 | 1.0 | 1 | 4.248495 |
4 | 1 | 1 | 0 | 0 | 0 | 2958 | 2900.0 | 360.0 | 1.0 | 1 | 4.875197 |
#use normalize method to normalize the data
normalized_data = normalize(data_for_norm)
print(normalized_data[0:5])
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.93897542e-01 0.00000000e+00 1.10296891e-01 3.06380253e-04 6.12760507e-04 1.34256643e-03] [2.62221668e-04 2.62221668e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.09478290e-01 5.79509887e-01 9.43998006e-02 0.00000000e+00 2.62221668e-04 1.28820456e-03] [0.00000000e+00 2.25139945e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 7.69078052e-01 6.33994085e-01 8.10503802e-02 2.25139945e-04 2.25139945e-04 1.06432383e-03] [0.00000000e+00 0.00000000e+00 0.00000000e+00 4.55636615e-04 0.00000000e+00 9.86453272e-01 0.00000000e+00 1.64029181e-01 4.55636615e-04 4.55636615e-04 1.93576999e-03] [2.40497265e-04 2.40497265e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 7.11390911e-01 6.97442069e-01 8.65790155e-02 2.40497265e-04 2.40497265e-04 1.17247162e-03]]
normalized_data.shape
(550, 11)
#put normalized data array back to the dataframe
normalized_data = pd.DataFrame(normalized_data, columns=data_for_norm.columns)
normalized_data.head()
Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | LoanAmount_log | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.993898 | 0.000000 | 0.110297 | 0.000306 | 0.000613 | 0.001343 |
1 | 0.000262 | 0.000262 | 0.0 | 0.000000 | 0.0 | 0.809478 | 0.579510 | 0.094400 | 0.000000 | 0.000262 | 0.001288 |
2 | 0.000000 | 0.000225 | 0.0 | 0.000000 | 0.0 | 0.769078 | 0.633994 | 0.081050 | 0.000225 | 0.000225 | 0.001064 |
3 | 0.000000 | 0.000000 | 0.0 | 0.000456 | 0.0 | 0.986453 | 0.000000 | 0.164029 | 0.000456 | 0.000456 | 0.001936 |
4 | 0.000240 | 0.000240 | 0.0 | 0.000000 | 0.0 | 0.711391 | 0.697442 | 0.086579 | 0.000240 | 0.000240 | 0.001172 |
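sklearn's normalize defaults to row-wise L2 normalisation: each row is divided by its Euclidean length, so every transformed row has norm 1 (a quick sketch on a toy matrix, not the loan data):

```python
import numpy as np
from sklearn.preprocessing import normalize

# Default norm='l2' scales each ROW to unit Euclidean length.
X = np.array([[3.0, 4.0],
              [6.0, 8.0]])
Xn = normalize(X)
print(Xn)                          # both rows become [0.6, 0.8]
print(np.linalg.norm(Xn, axis=1))  # -> [1. 1.]
```

This also explains why 'ApplicantIncome' dominates the normalised rows above: row-wise scaling is driven by the largest value in each row, which is why column-wise scaling (e.g. StandardScaler) is often preferred for model features.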
#insert Loan_ID back to the normalized dataframe
normalized_data['Loan_ID'] = data_trans['Loan_ID']
normalized_data.head()
Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | LoanAmount_log | Loan_ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.993898 | 0.000000 | 0.110297 | 0.000306 | 0.000613 | 0.001343 | LP001693 |
1 | 0.000262 | 0.000262 | 0.0 | 0.000000 | 0.0 | 0.809478 | 0.579510 | 0.094400 | 0.000000 | 0.000262 | 0.001288 | LP002872 |
2 | 0.000000 | 0.000225 | 0.0 | 0.000000 | 0.0 | 0.769078 | 0.633994 | 0.081050 | 0.000225 | 0.000225 | 0.001064 | LP001671 |
3 | 0.000000 | 0.000000 | 0.0 | 0.000456 | 0.0 | 0.986453 | 0.000000 | 0.164029 | 0.000456 | 0.000456 | 0.001936 | LP002917 |
4 | 0.000240 | 0.000240 | 0.0 | 0.000000 | 0.0 | 0.711391 | 0.697442 | 0.086579 | 0.000240 | 0.000240 | 0.001172 | LP001316 |
#insert Loan_Status back to the normalized dataframe
normalized_data['Loan_Status'] = data_trans['Loan_Status']
normalized_data.head(10)
Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | LoanAmount_log | Loan_ID | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.993898 | 0.000000 | 0.110297 | 0.000306 | 0.000613 | 0.001343 | LP001693 | 1 |
1 | 0.000262 | 0.000262 | 0.000000 | 0.000000 | 0.0 | 0.809478 | 0.579510 | 0.094400 | 0.000000 | 0.000262 | 0.001288 | LP002872 | 0 |
2 | 0.000000 | 0.000225 | 0.000000 | 0.000000 | 0.0 | 0.769078 | 0.633994 | 0.081050 | 0.000225 | 0.000225 | 0.001064 | LP001671 | 1 |
3 | 0.000000 | 0.000000 | 0.000000 | 0.000456 | 0.0 | 0.986453 | 0.000000 | 0.164029 | 0.000456 | 0.000456 | 0.001936 | LP002917 | 1 |
4 | 0.000240 | 0.000240 | 0.000000 | 0.000000 | 0.0 | 0.711391 | 0.697442 | 0.086579 | 0.000240 | 0.000240 | 0.001172 | LP001316 | 1 |
5 | 0.000182 | 0.000182 | 0.000000 | 0.000000 | 0.0 | 0.997855 | 0.000000 | 0.065457 | 0.000182 | 0.000000 | 0.000878 | LP002250 | 1 |
6 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.994472 | 0.000000 | 0.104988 | 0.000292 | 0.000583 | 0.001306 | LP001137 | 1 |
7 | 0.000125 | 0.000125 | 0.000250 | 0.000000 | 0.0 | 0.998989 | 0.000000 | 0.044954 | 0.000125 | 0.000125 | 0.000662 | LP001580 | 1 |
8 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.878545 | 0.463747 | 0.114427 | 0.000318 | 0.000636 | 0.001494 | LP002209 | 1 |
9 | 0.000589 | 0.000589 | 0.001178 | 0.000000 | 0.0 | 0.765282 | 0.639797 | 0.070696 | 0.000589 | 0.001178 | 0.001669 | LP001030 | 1 |
normalized_data.describe()
Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | LoanAmount_log | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 | 550.000000 |
mean | 0.000179 | 0.000144 | 0.000155 | 0.000060 | 0.000025 | 0.874047 | 0.309025 | 0.078100 | 0.000192 | 0.000245 | 0.001072 | 0.687273 |
std | 0.000125 | 0.000132 | 0.000244 | 0.000121 | 0.000071 | 0.177488 | 0.318756 | 0.039776 | 0.000123 | 0.000237 | 0.000435 | 0.464026 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.009983 | 0.000000 | 0.002207 | 0.000000 | 0.000000 | 0.000073 | 0.000000 |
25% | 0.000088 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.798922 | 0.000000 | 0.048327 | 0.000105 | 0.000000 | 0.000782 | 0.000000 |
50% | 0.000185 | 0.000145 | 0.000000 | 0.000000 | 0.000000 | 0.966331 | 0.249429 | 0.076114 | 0.000200 | 0.000217 | 0.001076 | 1.000000 |
75% | 0.000266 | 0.000248 | 0.000272 | 0.000000 | 0.000000 | 0.997304 | 0.590246 | 0.103856 | 0.000276 | 0.000370 | 0.001344 | 1.000000 |
max | 0.000673 | 0.000589 | 0.001609 | 0.000673 | 0.000455 | 0.999996 | 0.999941 | 0.242218 | 0.000673 | 0.001346 | 0.002679 | 1.000000 |
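The pattern in the summary above (ApplicantIncome close to 1 in almost every row while the encoded categorical columns are tiny) is consistent with row-wise L2 normalization, where each row is divided by its Euclidean norm. A minimal numpy sketch of that transform, using the raw values from the first two rows of the sampled data (ApplicantIncome, CoapplicantIncome, Loan_Amount_Term; the small encoded columns are omitted since they barely affect the norm):

```python
import numpy as np

# Raw values from rows 0 and 1 of the sampled data:
# [ApplicantIncome, CoapplicantIncome, Loan_Amount_Term]
X = np.array([[3244.0, 0.0, 360.0],
              [3087.0, 2210.0, 360.0]])

# Row-wise L2 normalization: divide each row by its Euclidean norm,
# so the largest-magnitude column (income) ends up close to 1.
norms = np.linalg.norm(X, axis=1, keepdims=True)
X_normalized = X / norms

print(X_normalized)  # first column ~0.9939 and ~0.8095, matching the table above
print(np.linalg.norm(X_normalized, axis=1))  # each row now has unit norm
```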
Building a Decision Tree classifier using sklearn
#Importing all necessary libraries from sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz
from sklearn.metrics import ConfusionMatrixDisplay
import pydotplus
#Feature Selection
columns_nd = list(normalized_data.columns)
print(columns_nd)
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'LoanAmount_log', 'Loan_ID', 'Loan_Status']
normalized_data.head()
Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | Loan_Amount_Term | Credit_History | Property_Area | LoanAmount_log | Loan_ID | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.993898 | 0.000000 | 0.110297 | 0.000306 | 0.000613 | 0.001343 | LP001693 | 1 |
1 | 0.000262 | 0.000262 | 0.0 | 0.000000 | 0.0 | 0.809478 | 0.579510 | 0.094400 | 0.000000 | 0.000262 | 0.001288 | LP002872 | 0 |
2 | 0.000000 | 0.000225 | 0.0 | 0.000000 | 0.0 | 0.769078 | 0.633994 | 0.081050 | 0.000225 | 0.000225 | 0.001064 | LP001671 | 1 |
3 | 0.000000 | 0.000000 | 0.0 | 0.000456 | 0.0 | 0.986453 | 0.000000 | 0.164029 | 0.000456 | 0.000456 | 0.001936 | LP002917 | 1 |
4 | 0.000240 | 0.000240 | 0.0 | 0.000000 | 0.0 | 0.711391 | 0.697442 | 0.086579 | 0.000240 | 0.000240 | 0.001172 | LP001316 | 1 |
#select features and drop the Loan_ID and target class 'Loan_Status'
features = normalized_data.drop(['Loan_ID','Loan_Status'], axis = 1)
print ('Features:\n\n', features.head())
Features:

     Gender   Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0  0.000000  0.000000         0.0   0.000000            0.0         0.993898
1  0.000262  0.000262         0.0   0.000000            0.0         0.809478
2  0.000000  0.000225         0.0   0.000000            0.0         0.769078
3  0.000000  0.000000         0.0   0.000456            0.0         0.986453
4  0.000240  0.000240         0.0   0.000000            0.0         0.711391

   CoapplicantIncome  Loan_Amount_Term  Credit_History  Property_Area  \
0           0.000000          0.110297        0.000306       0.000613
1           0.579510          0.094400        0.000000       0.000262
2           0.633994          0.081050        0.000225       0.000225
3           0.000000          0.164029        0.000456       0.000456
4           0.697442          0.086579        0.000240       0.000240

   LoanAmount_log
0        0.001343
1        0.001288
2        0.001064
3        0.001936
4        0.001172
#target class is Loan_Status
classes = pd.DataFrame(normalized_data['Loan_Status'])
print('Target Class:\n', classes.head())
Target Class:
    Loan_Status
0             1
1             0
2             1
3             1
4             1
normalized_data.shape
(550, 13)
from matplotlib import pyplot
x_train, x_test, y_train, y_test = train_test_split(features, classes, test_size= 0.33,
random_state = 4072)
print(x_train.shape, x_test.shape)
(368, 11) (182, 11)
decisionTree = DecisionTreeClassifier(criterion='entropy')
print(decisionTree)
DecisionTreeClassifier(criterion='entropy')
dtc_model = decisionTree.fit(x_train, y_train)
# feature importance
importance = dtc_model.feature_importances_
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
Feature: 0, Score: 0.05017
Feature: 1, Score: 0.10427
Feature: 2, Score: 0.09446
Feature: 3, Score: 0.04110
Feature: 4, Score: 0.00000
Feature: 5, Score: 0.10178
Feature: 6, Score: 0.02270
Feature: 7, Score: 0.03604
Feature: 8, Score: 0.32464
Feature: 9, Score: 0.11210
Feature: 10, Score: 0.11274
# Bar chart of feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()
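The bar chart plots scores against bare feature indices, which forces the reader back to the column list to interpret each bar. Pairing the scores with their column names and sorting makes the ranking explicit; a small sketch using the scores printed above:

```python
# Pair each importance score with its column name and sort descending,
# using the values printed by the feature-importance loop above.
feature_names = ['Gender', 'Married', 'Dependents', 'Education',
                 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
                 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
                 'LoanAmount_log']
importance = [0.05017, 0.10427, 0.09446, 0.04110, 0.00000, 0.10178,
              0.02270, 0.03604, 0.32464, 0.11210, 0.11274]

ranked = sorted(zip(feature_names, importance), key=lambda p: p[1], reverse=True)
for name, score in ranked:
    print(f'{name}: {score:.5f}')
```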
prediction = dtc_model.predict(x_test) #prediction stores the predicted targets/classes
print (prediction)
[0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 0 0]
#convert the encoded 1/0 labels back to Y/N with the fitted LabelEncoder
#ground-truth labels
y_true = le.inverse_transform(y_test["Loan_Status"])
#what the models gives as True
y_pred = le.inverse_transform(prediction)
cm = confusion_matrix(y_true, y_pred)
labels = ['N', 'Y']
ConfusionMatrixDisplay(cm, display_labels=labels).plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x16aaeee90>
print(classification_report(y_true, y_pred))
              precision    recall  f1-score   support

           N       0.47      0.50      0.48        62
           Y       0.73      0.71      0.72       120

    accuracy                           0.64       182
   macro avg       0.60      0.60      0.60       182
weighted avg       0.64      0.64      0.64       182
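The report's figures can be reproduced by hand from the confusion matrix. The counts below are inferred from the support and recall values in the report (they are not printed directly above), so treat them as approximate:

```python
# Confusion matrix inferred from the classification report above
# (rows = true class N, Y; columns = predicted N, Y).
cm = [[31, 31],   # true N: 31 predicted N, 31 predicted Y
      [35, 85]]   # true Y: 35 predicted N, 85 predicted Y

tn, fp = cm[0]
fn, tp = cm[1]

accuracy = (tn + tp) / (tn + fp + fn + tp)
precision_y = tp / (tp + fp)
recall_y = tp / (tp + fn)

print(f'accuracy: {accuracy:.2f}')        # 0.64, as in the report
print(f'precision (Y): {precision_y:.2f}')  # 0.73
print(f'recall (Y): {recall_y:.2f}')        # 0.71
```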
Visualising the decision tree
from graphviz import Source
from sklearn import tree
graph = Source(tree.export_graphviz(dtc_model, out_file=None, feature_names=features.columns))
from cairosvg import svg2png
from IPython.display import Image
svg2png(bytestring=graph.pipe(format='svg'), write_to='output.png')
Image("output.png")
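The graphviz/cairosvg route requires two external system dependencies. sklearn's own `tree.plot_tree` produces a similar rendering with only matplotlib; a minimal sketch on a tiny synthetic dataset (not the loan data):

```python
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so this runs headless
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Tiny synthetic dataset, purely for illustration: the label follows
# the second feature exactly, so the tree only needs one split.
X = [[0, 0], [1, 0], [0, 1], [1, 1]]
y = [0, 0, 1, 1]

model = DecisionTreeClassifier(criterion='entropy').fit(X, y)

fig, ax = plt.subplots(figsize=(6, 4))
annotations = tree.plot_tree(model, feature_names=['f0', 'f1'],
                             class_names=['N', 'Y'], filled=True, ax=ax)
fig.savefig('tree.png')
```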
Q6. Based on the feature importance, select a different set of features to build another decision tree model. You should aim to improve the result of the baseline model.
#review the feature importances from the baseline model
importance = dtc_model.feature_importances_
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
Feature: 0, Score: 0.05017
Feature: 1, Score: 0.10427
Feature: 2, Score: 0.09446
Feature: 3, Score: 0.04110
Feature: 4, Score: 0.00000
Feature: 5, Score: 0.10178
Feature: 6, Score: 0.02270
Feature: 7, Score: 0.03604
Feature: 8, Score: 0.32464
Feature: 9, Score: 0.11210
Feature: 10, Score: 0.11274
In order to improve the model, we select the five features with the highest importance scores, which are:

Feature: 8, Score: 0.32464 -> Credit_History
Feature: 10, Score: 0.11274 -> LoanAmount_log
Feature: 9, Score: 0.11210 -> Property_Area
Feature: 1, Score: 0.10427 -> Married
Feature: 5, Score: 0.10178 -> ApplicantIncome
selected_features = normalized_data[['Credit_History', 'LoanAmount_log', 'ApplicantIncome', 'Property_Area', 'Married']]
print ('Selected Features:\n\n', selected_features.head())
#target class is the same 'Loan_Status'
print('Classes: \n\n' ,classes.head())
Selected Features:

   Credit_History  LoanAmount_log  ApplicantIncome  Property_Area   Married
0        0.000306        0.001343         0.993898       0.000613  0.000000
1        0.000000        0.001288         0.809478       0.000262  0.000262
2        0.000225        0.001064         0.769078       0.000225  0.000225
3        0.000456        0.001936         0.986453       0.000456  0.000000
4        0.000240        0.001172         0.711391       0.000240  0.000240

Classes:

    Loan_Status
0             1
1             0
2             1
3             1
4             1
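Rather than hard-coding the column names, the top-k features can be picked programmatically from the importance array with `np.argsort`. A sketch using the baseline scores printed above (the resulting set matches the five columns selected here, though the exact ranking can vary between runs):

```python
import numpy as np

# Importance scores from the baseline model, paired with their columns
feature_names = np.array(['Gender', 'Married', 'Dependents', 'Education',
                          'Self_Employed', 'ApplicantIncome',
                          'CoapplicantIncome', 'Loan_Amount_Term',
                          'Credit_History', 'Property_Area', 'LoanAmount_log'])
importance = np.array([0.05017, 0.10427, 0.09446, 0.04110, 0.00000, 0.10178,
                       0.02270, 0.03604, 0.32464, 0.11210, 0.11274])

# Indices of the 5 largest scores, highest first
top5 = np.argsort(importance)[::-1][:5]
print(feature_names[top5].tolist())
```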
x_train, x_test, y_train, y_test = train_test_split(selected_features, classes, test_size= .33,
random_state = 4072)
print(x_train.shape, x_test.shape)
(368, 5) (182, 5)
decisionTree = DecisionTreeClassifier(criterion='entropy')
print(decisionTree)
DecisionTreeClassifier(criterion='entropy')
dtc_model = decisionTree.fit(x_train, y_train)
# feature importance
importance = dtc_model.feature_importances_
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
Feature: 0, Score: 0.37062
Feature: 1, Score: 0.12529
Feature: 2, Score: 0.26532
Feature: 3, Score: 0.10372
Feature: 4, Score: 0.13504
selected_features.head()
Credit_History | LoanAmount_log | ApplicantIncome | Property_Area | Married | |
---|---|---|---|---|---|
0 | 0.000306 | 0.001343 | 0.993898 | 0.000613 | 0.000000 |
1 | 0.000000 | 0.001288 | 0.809478 | 0.000262 | 0.000262 |
2 | 0.000225 | 0.001064 | 0.769078 | 0.000225 | 0.000225 |
3 | 0.000456 | 0.001936 | 0.986453 | 0.000456 | 0.000000 |
4 | 0.000240 | 0.001172 | 0.711391 | 0.000240 | 0.000240 |
# Bar chart of feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()
The order of importance of the selected features in this model:

Feature: 0, Score: 0.37062 -> Credit_History
Feature: 2, Score: 0.26532 -> ApplicantIncome
Feature: 4, Score: 0.13504 -> Married
Feature: 1, Score: 0.12529 -> LoanAmount_log
Feature: 3, Score: 0.10372 -> Property_Area
#predict the classes for x_test
prediction = dtc_model.predict(x_test)
y_true = le.inverse_transform(y_test["Loan_Status"])
y_pred = le.inverse_transform(prediction)
cm = confusion_matrix(y_true, y_pred)
labels = ['N', 'Y']
ConfusionMatrixDisplay(cm, display_labels=labels).plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x16ab3e150>
print(classification_report(y_true, y_pred))
              precision    recall  f1-score   support

           N       0.53      0.50      0.51        62
           Y       0.75      0.77      0.76       120

    accuracy                           0.68       182
   macro avg       0.64      0.63      0.63       182
weighted avg       0.67      0.68      0.67       182
#visualise the new tree
from graphviz import Source
from sklearn import tree
graph = Source(tree.export_graphviz(dtc_model, out_file=None, feature_names=selected_features.columns))
from cairosvg import svg2png
from IPython.display import Image
svg2png(bytestring=graph.pipe(format='svg'), write_to='output.png')
Image("output.png")
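A fully grown entropy tree keeps splitting until its leaves are pure, which often overfits a dataset of this size; constraining `max_depth` (or `min_samples_leaf`) is a natural next step beyond feature selection. A sketch on synthetic data (not the loan set), comparing a full tree with a depth-limited one; which variant scores higher depends on the data:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in for the loan data, purely to illustrate the effect
X, y = make_classification(n_samples=550, n_features=5, n_informative=3,
                           random_state=4072)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.33,
                                          random_state=4072)

# Fully grown tree vs. a depth-limited one
full = DecisionTreeClassifier(criterion='entropy',
                              random_state=4072).fit(x_tr, y_tr)
pruned = DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                random_state=4072).fit(x_tr, y_tr)

print('full tree test accuracy:  ', full.score(x_te, y_te))
print('pruned tree test accuracy:', pruned.score(x_te, y_te))
```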