Credit Card Risk Assessment Using Machine Learning

Kishan Menaria · Sept 24, 2020 · 1 min read
In [1]:
import pandas as pd
In [3]:
credit_df=pd.read_csv('Credit_default_dataset.csv')
credit_df.head(5)
Out[3]:
   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  PAY_AMT4  PAY_AMT5  PAY_AMT6  default.payment.next.month
0   1    20000.0    2          2         1   24      2      2     -1     -1  ...        0.0        0.0        0.0       0.0     689.0       0.0       0.0       0.0       0.0                           1
1   2   120000.0    2          2         2   26     -1      2      0      0  ...     3272.0     3455.0     3261.0       0.0    1000.0    1000.0    1000.0       0.0    2000.0                           1
2   3    90000.0    2          2         2   34      0      0      0      0  ...    14331.0    14948.0    15549.0    1518.0    1500.0    1000.0    1000.0    1000.0    5000.0                           0
3   4    50000.0    2          2         1   37      0      0      0      0  ...    28314.0    28959.0    29547.0    2000.0    2019.0    1200.0    1100.0    1069.0    1000.0                           0
4   5    50000.0    1          2         1   57     -1      0     -1      0  ...    20940.0    19146.0    19131.0    2000.0   36681.0   10000.0    9000.0     689.0     679.0                           0

5 rows × 25 columns

In [5]:
# We don't need the ID column, so let's drop it
credit_df = credit_df.drop(["ID"], axis=1)
In [6]:
# Rename PAY_0 to PAY_1 so the payment-status columns are numbered consistently (PAY_1 to PAY_6)
credit_df.rename(columns={'PAY_0':'PAY_1'}, inplace=True)
In [8]:
credit_df.head(5)
Out[8]:
   LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_1  PAY_2  PAY_3  PAY_4  PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  PAY_AMT4  PAY_AMT5  PAY_AMT6  default.payment.next.month
0    20000.0    2          2         1   24      2      2     -1     -1     -2  ...        0.0        0.0        0.0       0.0     689.0       0.0       0.0       0.0       0.0                           1
1   120000.0    2          2         2   26     -1      2      0      0      0  ...     3272.0     3455.0     3261.0       0.0    1000.0    1000.0    1000.0       0.0    2000.0                           1
2    90000.0    2          2         2   34      0      0      0      0      0  ...    14331.0    14948.0    15549.0    1518.0    1500.0    1000.0    1000.0    1000.0    5000.0                           0
3    50000.0    2          2         1   37      0      0      0      0      0  ...    28314.0    28959.0    29547.0    2000.0    2019.0    1200.0    1100.0    1069.0    1000.0                           0
4    50000.0    1          2         1   57     -1      0     -1      0      0  ...    20940.0    19146.0    19131.0    2000.0   36681.0   10000.0    9000.0     689.0     679.0                           0

5 rows × 24 columns

In [10]:
# Check the EDUCATION column for the unwanted categorical levels noted during data exploration
credit_df['EDUCATION'].value_counts()
Out[10]:
2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

Data Preprocessing Steps

In [12]:
credit_df["EDUCATION"]=credit_df["EDUCATION"].map({0:4,1:1,2:2,3:3,4:4,5:4,6:4})
credit_df["MARRIAGE"]=credit_df["MARRIAGE"].map({0:3,1:1,2:2,3:3})
In [20]:
# Separate the features from the target and standardize them (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler()
X = credit_df.drop(['default.payment.next.month'], axis=1)
X = scaling.fit_transform(X)
(sklearn emits a DataConversionWarning here: the int64 columns are converted to float64 by StandardScaler. This is expected behaviour, not an error.)
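If preferred, the warning can be avoided by casting the features to float64 before scaling — a minimal sketch, equivalent in effect to the cell above:

X = credit_df.drop(['default.payment.next.month'], axis=1).astype('float64')
X = scaling.fit_transform(X)  # no dtype conversion left for StandardScaler to warn about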
In [22]:
y=credit_df['default.payment.next.month']
In [30]:
## Hyperparameter optimization: candidate XGBoost parameter values

params = {
    "learning_rate":    [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth":        [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma":            [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}
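For scale: the full grid spans 6 × 8 × 4 × 5 × 4 = 3,840 parameter combinations, of which the randomized search below samples only n_iter. A quick check:

n_combinations = 1
for values in params.values():
    n_combinations *= len(values)
print(n_combinations)  # 3840 candidate settings in the full grid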
In [31]:
## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost
In [32]:
from datetime import datetime

def timer(start_time=None):
    # Called with no argument: return the current time (to be passed back in later)
    if not start_time:
        start_time = datetime.now()
        return start_time
    # Called with a start time: print the elapsed wall-clock time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
In [33]:
classifier=xgboost.XGBClassifier()
In [34]:
random_search = RandomizedSearchCV(classifier, param_distributions=params,
                                   n_iter=5, scoring='roc_auc',
                                   n_jobs=-1, cv=5, verbose=3)
In [36]:
# Here we go (datetime was already imported alongside the timer definition above)
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X,y)
timer(start_time) # timing ends here for "start_time" variable
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   14.6s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   16.1s finished
 Time taken: 0 hours 0 minutes and 17.2 seconds.
In [37]:
random_search.best_estimator_
Out[37]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, gamma=0.1, learning_rate=0.25,
       max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)
In [38]:
random_search.best_params_
Out[38]:
{'min_child_weight': 7,
 'max_depth': 3,
 'learning_rate': 0.25,
 'gamma': 0.1,
 'colsample_bytree': 0.4}
In [39]:
# Retrain a classifier with the best hyperparameters found by the randomized search
classifier = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, gamma=0.1, learning_rate=0.25,
       max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)
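Equivalently, instead of copy-pasting the estimator repr, the tuned values can be unpacked directly from the search object (assuming random_search has been fitted, as above):

classifier = xgboost.XGBClassifier(**random_search.best_params_)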
In [41]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation with the tuned classifier (default scoring: accuracy)
score = cross_val_score(classifier, X, y, cv=10)
In [42]:
score
Out[42]:
array([0.80806398, 0.8083972 , 0.81772742, 0.80473176, 0.81666667,
       0.829     , 0.83761254, 0.82994331, 0.83027676, 0.82660887])
In [43]:
score.mean()
Out[43]:
0.8209028507040204
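Note that cross_val_score defaults to accuracy here, while the search above was tuned on ROC-AUC; re-scoring with the same metric keeps the two results comparable — a minimal sketch:

auc_scores = cross_val_score(classifier, X, y, cv=10, scoring='roc_auc')
print(auc_scores.mean())  # mean cross-validated ROC-AUC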

References:

For the dataset, visit: https://github.com/krishnaik06/Credit_Card-Risk-assessment

For the video lecture, visit:

Thank You :)
