
[Machine Learning] Loan Risk Prediction Modeling (Modeling)

YSY^ 2020. 9. 8. 19:09

Loan Risk Prediction Modeling

Importing Libraries and Splitting the Dataset

import pandas as pd
import numpy as np

np.random.seed(1234)
data = pd.read_csv('data/data-v01.csv')
data.shape

# Candidate models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Splitting and evaluation utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

data.info()
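info() already lists non-null counts per column; for an explicit per-column missing-value count, a quick sketch (the preprocessed data-v01.csv should already be clean):

# Count missing values per column; all zeros means preprocessing is done
print(data.isna().sum())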


# Split the dataset into train, validation, and test sets

X = data.drop('seriousdlqin2yrs', axis=1)  # features
y = data['seriousdlqin2yrs']               # binary target: 1 = serious delinquency

# Split off the test set first, stratifying to preserve the class ratio
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=2)

# Split the remainder into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train)

y_train.shape, y_val.shape, y_test.shape
#==> ((84222,), (28074,), (37432,))
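Because serious delinquency is a rare event, it is worth verifying that stratification kept the positive rate consistent across the three splits. A minimal check:

# The mean of a 0/1 target is the fraction of positive (delinquent) cases
for name, target in [('train', y_train), ('val', y_val), ('test', y_test)]:
    print(name, round(target.mean(), 4))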

Scaling

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then reuse it for the
# validation and test sets so their statistics never leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
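The same pattern can be bundled into a Pipeline so the scaler is refit automatically inside cross-validation. A minimal sketch using the LogisticRegression imported above (not otherwise used in this post):

# Sketch: scaling + model in one estimator, safe for cross-validation
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scores = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=5)
print(scores.mean())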

Defining Baseline Models

knn = KNeighborsClassifier()
grb = GradientBoostingClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()


xgb.fit(X_train_scaled, y_train)
grb.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)
knn.fit(X_train_scaled, y_train)

base_line = [xgb, grb, rf, knn]
model_names = ['XGBoost', 'GradientBoosting', 'RandomForest', 'KNN']

for name, model in zip(model_names, base_line):
    pred_train = model.predict(X_train_scaled)
    pred_val = model.predict(X_val_scaled)

    pred_train_proba = model.predict_proba(X_train_scaled)
    pred_val_proba = model.predict_proba(X_val_scaled)

    acc_train = np.round(accuracy_score(y_train, pred_train), 3)
    acc_val = np.round(accuracy_score(y_val, pred_val), 3)

    # AUC is computed from the predicted probability of the positive class
    auc_train = np.round(roc_auc_score(y_train, pred_train_proba[:, 1]), 3)
    auc_val = np.round(roc_auc_score(y_val, pred_val_proba[:, 1]), 3)

    print(name)
    print(f'train accuracy: {acc_train}, validation accuracy: {acc_val}\t'
          f'train AUC: {auc_train}, validation AUC: {auc_val}')
    print('=' * 50)
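With an imbalanced target, accuracy alone can look deceptively high. For per-class precision and recall, a quick sketch for one of the baselines:

# Sketch: per-class precision/recall for the random forest baseline
from sklearn.metrics import classification_report
print(classification_report(y_val, rf.predict(X_val_scaled)))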


Building the Best Model with RandomizedSearchCV

# Ratio of negative to positive samples, used to reweight the minority
# class in XGBoost. (Computed on the full target here; using y_train
# instead would avoid touching the test labels.)
scale_pos_weight = (len(y) - y.sum()) / y.sum()
param = {
    'learning_rate': [0.01, 0.1, 0.5, 1, 10],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': range(1, 6),
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'scale_pos_weight': [scale_pos_weight + i for i in range(-3, 4)]
}

from sklearn.model_selection import RandomizedSearchCV
r_search = RandomizedSearchCV(XGBClassifier(), param_distributions=param,
                              n_iter=60, scoring='roc_auc', n_jobs=-1)

# The search has to be fit before best_estimator_ is available
r_search.fit(X_train_scaled, y_train)

best_model = r_search.best_estimator_

best_model.fit(X_train_scaled, y_train)  # optional: refit=True already refit the winner
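The winning hyperparameter combination and its cross-validated score can be inspected directly on the fitted search:

# Inspect the best combination found by the randomized search
print(r_search.best_params_)
print(r_search.best_score_)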

pred_proba  = best_model.predict_proba(X_val_scaled)
AUC = roc_auc_score(y_val, pred_proba[:, 1])
AUC

0.8586918404623884
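To see how the tuned model trades false positives for true positives, the ROC curve behind this AUC can be plotted. A minimal sketch, assuming matplotlib is available:

# Sketch: validation ROC curve for the tuned XGBoost model
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

fpr, tpr, _ = roc_curve(y_val, pred_proba[:, 1])
plt.plot(fpr, tpr, label=f'XGBoost (AUC={AUC:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()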


Voting

from sklearn.ensemble import VotingClassifier

param = {
    'n_estimators': [100, 200, 300],
    'max_depth': range(1, 5)
}
# This grid has only 3 * 4 = 12 combinations, so 12 iterations cover it fully
r_search_gb = RandomizedSearchCV(GradientBoostingClassifier(), param_distributions=param,
                                 n_iter=12, cv=5, scoring='roc_auc', n_jobs=-1)
r_search_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param,
                                 n_iter=12, cv=5, scoring='roc_auc', n_jobs=-1)

estimators = [('xgb', best_model), ('gradient boost', r_search_gb), ('random forest', r_search_rf)]
vote = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

# Fitting the ensemble also runs both randomized searches
vote.fit(X_train_scaled, y_train)


pred_proba  = vote.predict_proba(X_val_scaled)
AUC = roc_auc_score(y_val, pred_proba[:, 1])
AUC

0.8591866189717865

Soft voting edges out the single tuned XGBoost model on the validation set (0.8592 vs. 0.8587).

Evaluating on the Test Set

pred_proba_test = vote.predict_proba(X_test_scaled)
AUC_test = roc_auc_score(y_test, pred_proba_test[:, 1])
AUC_test

0.8539451624400998
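Once the test AUC is acceptable, the fitted ensemble can be saved for reuse. A minimal sketch assuming joblib is installed; the filename is illustrative:

# Sketch: persist and reload the fitted voting ensemble
import joblib

joblib.dump(vote, 'loan_risk_voting.joblib')  # hypothetical filename
loaded = joblib.load('loan_risk_voting.joblib')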
