데이터 사이언스/데이터 분할

10-fold cross validation

주인장 아저씨 2021. 5. 8. 03:20

ㅇ 모듈 import 

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import validation_curve

 

ㅇ 예시 코드 (이진분류) ##############################

# Build a 10-fold stratified splitter: each fold preserves the class ratio
# of y_train. shuffle=True randomizes row order before splitting (so folds
# are not contiguous slices of the data); random_state makes it reproducible.
# NOTE: original line was a SyntaxError — the inline comment swallowed the
# closing parenthesis of the StratifiedKFold(...) call.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Materialize the (train_idx, valid_idx) pairs so individual folds can be
# revisited later (e.g. to retrain or inspect a single fold by number).
folds = list(skf.split(X_train, y_train))

 

 

# If one fold's validation error is unusually high, try the ensemble both
# with and without that fold's model and compare.
models = {}

for fold in range(10):
    # NOTE(review): `gamma` is an XGBoost parameter; LightGBM's equivalent
    # (minimum loss reduction required to split) is `min_split_gain` —
    # with the original `gamma=1` LightGBM would silently ignore it.
    lgb_model = LGBMClassifier(max_depth=10, min_split_gain=1)

    # Index split for this fold.
    train_idx, valid_idx = folds[fold]
    X_t, y_t = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_v, y_v = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # Fit with early stopping on the validation fold.
    # NOTE: original line was a SyntaxError — the inline comment after
    # `verbose=200` swallowed the comma before `early_stopping_rounds`.
    # `eval_metric` changed from 'rmse' (a regression metric) to
    # 'binary_logloss', the conventional metric for binary classification.
    lgb_model.fit(
        X_t, y_t,
        eval_set=[(X_t, y_t), (X_v, y_v)],
        eval_metric='binary_logloss',
        verbose=200,                 # print evaluation every 200 rounds
        early_stopping_rounds=200,   # stop if no improvement for 200 rounds
    )

    # Keep the trained model for this fold.
    models[fold] = lgb_model

 

# Ensemble: average the predicted positive-class probability over the 10
# fold models. NOTE(review): the original averaged hard labels from
# .predict(), which is misleading for a classifier ensemble — averaging
# predict_proba and thresholding (or ranking) afterwards is the standard
# soft-voting approach for binary classification.
preds = 0
for fold in range(10):
    preds += models[fold].predict_proba(X_test)[:, 1] / 10

 

 

ㅇ 검증데이터를 통한 적당한 폴드 찾기 ########################

예시) 

# Sweep max_depth with 10-fold CV and compare train vs validation score to
# pick a reasonable depth (the classic bias/variance curve).
# Fixes vs original: `degree` was undefined (define the sweep range here);
# modern scikit-learn requires param_name/param_range as keyword arguments;
# scoring="r2" is a regression metric — use a classification score.
param_range = [2, 4, 6, 8, 10, 12]
train_score, val_score = validation_curve(
    LGBMClassifier(), X_train, y_train,
    param_name="max_depth",
    param_range=param_range,
    cv=10, scoring="accuracy")

# Median over the 10 folds (axis=1) at each parameter value.
plt.plot(param_range, np.median(train_score, 1), color='blue', label='training score')
plt.plot(param_range, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)  # original `plt.ylim(01)` is a SyntaxError in Python 3
plt.xlabel('max_depth')  # original label 'degree' did not match the swept parameter
plt.ylabel('score')