대출위험도 예측모델링
라이브러리 불러오기 및 데이터셋 나누기
# Loan-default risk modelling: library setup and data loading.
# Imports are grouped per PEP 8 (third-party together, at the top) instead of
# being scattered around the data-loading statements as in the notebook.
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Fix the global NumPy seed so estimators left with random_state=None
# (they draw from the global RNG) are reproducible across runs.
np.random.seed(1234)

# Preprocessed dataset produced by the earlier preprocessing post.
# NOTE(review): path is relative — assumes the script runs from the project root.
data = pd.read_csv('data/data-v01.csv')

# Notebook-style inspection cells (no effect when run as a script).
data.shape
data.info()
# Split the dataset into train / validation / test sets.
# stratify=y preserves the class ratio of the heavily imbalanced target
# ('seriousdlqin2yrs') in every split.
X = data.drop('seriousdlqin2yrs', axis=1)
y = data['seriousdlqin2yrs']

# First carve off the test set (default test_size=0.25).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=2)

# Then split the remainder into train / validation (default 75/25).
# FIX: random_state added — the original omitted it here, so this split was
# only reproducible via the global NumPy seed and fragile to cell re-runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, random_state=2)

y_train.shape, y_val.shape, y_test.shape
# ==> ((84222,), (28074,), (37432,))
Scaling
# Standardize the features (zero mean, unit variance).
# The scaler statistics are learned on the training split only, and the same
# transform is then applied to validation and test to avoid data leakage.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
Base-line 모델 정의
# Base-line models: fit four untuned classifiers and compare accuracy and
# ROC-AUC on train vs. validation to gauge baseline skill and overfitting.
knn = KNeighborsClassifier()
grb = GradientBoostingClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()

base_line = [xgb, grb, rf, knn]
model_names = ['XGBoost', 'GradientBoosting', 'RandomForest', 'KNN']

# FIX (idiom): fit in a loop instead of four copy-pasted .fit lines.
# Order matches the original (xgb, grb, rf, knn) to keep the global-RNG
# consumption — and thus the fitted models — identical.
for model in base_line:
    model.fit(X_train_scaled, y_train)

# FIX (idiom): iterate models directly with zip instead of range(len(...)).
for name, model in zip(model_names, base_line):
    pred_train = model.predict(X_train_scaled)
    pred_val = model.predict(X_val_scaled)
    # Column 1 of predict_proba is the positive-class probability for ROC-AUC.
    proba_train = model.predict_proba(X_train_scaled)[:, 1]
    proba_val = model.predict_proba(X_val_scaled)[:, 1]
    acc_train = np.round(accuracy_score(y_train, pred_train), 3)
    acc_val = np.round(accuracy_score(y_val, pred_val), 3)
    auc_train = np.round(roc_auc_score(y_train, proba_train), 3)
    auc_val = np.round(roc_auc_score(y_val, proba_val), 3)
    print(f'{name}')
    print(f'train정확도:{acc_train}, validation정확도:{acc_val}\t train AUC:{auc_train}, validation AUC:{auc_val}')
    print('='*50)
RandomizedSearchCV를 이용해 best 모델 생성
# Hyper-parameter search for XGBoost with RandomizedSearchCV.
# (Despite the section title, randomized search is used — an exhaustive
# GridSearchCV over this grid would be 5*5*5*5*7 = 4375 fits per CV fold.)

# scale_pos_weight counters class imbalance: negatives / positives ratio.
scale_pos_weight = (len(y) - y.sum()) / y.sum()

param = {
    'learning_rate': [0.01, 0.1, 0.5, 1, 10],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': range(1, 6),
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'scale_pos_weight': [scale_pos_weight + i for i in range(-3, 4)],
}

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

r_search = RandomizedSearchCV(XGBClassifier(), param_distributions=param,
                              n_iter=60, scoring='roc_auc', n_jobs=-1)

# BUG FIX: the original read r_search.best_estimator_ without ever calling
# r_search.fit(...), which raises because the search has not been run.
r_search.fit(X_train_scaled, y_train)

# refit=True (the default) already retrains the best estimator on the full
# training data, so best_model is fitted and ready — no extra .fit needed.
best_model = r_search.best_estimator_

pred_proba = best_model.predict_proba(X_val_scaled)
AUC = roc_auc_score(y_val, pred_proba[:, 1])
AUC
0.8586918404623884
Voting
# Soft-voting ensemble: tuned XGBoost plus randomized-searched
# GradientBoosting and RandomForest estimators.
from sklearn.ensemble import VotingClassifier

param = {
    'n_estimators': [100, 200, 300],
    'max_depth': range(1, 5),
}

# FIX: the grid has only 3*4 = 12 combinations; n_iter=60 raises a
# ValueError on older scikit-learn versions, so cap it at the grid size.
r_search_gb = RandomizedSearchCV(GradientBoostingClassifier(),
                                 param_distributions=param, n_iter=12,
                                 cv=5, scoring='roc_auc', n_jobs=-1)
r_search_rf = RandomizedSearchCV(RandomForestClassifier(),
                                 param_distributions=param, n_iter=12,
                                 cv=5, scoring='roc_auc', n_jobs=-1)

estimators = [('xgb', best_model),
              ('gradient boost', r_search_gb),
              ('random forest', r_search_rf)]

# voting='soft' averages predicted probabilities across the estimators.
vote = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

# BUG FIX: the original called predict_proba on an unfitted ensemble.
# fit() also runs the two nested randomized searches on the training data.
vote.fit(X_train_scaled, y_train)

pred_proba = vote.predict_proba(X_val_scaled)
AUC = roc_auc_score(y_val, pred_proba[:, 1])
AUC
0.8591866189717865
Test Set 으로 검증
# Final check: score the voting ensemble once on the untouched test split.
pred_proba_test = vote.predict_proba(X_test_scaled)
# Positive-class probabilities drive the ROC-AUC computation.
positive_scores = pred_proba_test[:, 1]
AUC_test = roc_auc_score(y_test, positive_scores)
AUC_test
728x90
반응형
'Data Analysis & ML > Machine Learning' 카테고리의 다른 글
[프로세스 마이닝] 프로세스 마이닝(PM4PY) (2) | 2021.07.31 |
---|---|
[Machine Learning][머신러닝] Bagging, Boosting 정리 (1) | 2021.03.12 |
[Machine Learning][머신러닝] 대출위험도 예측모델링(데이터전처리) (2) | 2020.09.08 |
[Machine Learning][머신러닝] 군집(Clustering) / K-Means Clustering (0) | 2020.09.07 |
[Machine Learning][머신러닝] 로지스틱 회귀(Logistic Regression) (0) | 2020.09.07 |