데이터 사이언스/데이터 분할

10-fold cross validation

주인장 아저씨 2021. 5. 8. 03:20

ㅇ 모듈 import 

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import validation_curve

 

ㅇ 예시 코드 (이진분류) ##############################

# Build a 10-fold stratified splitter: each fold preserves the class ratio
# of y_train. shuffle=True randomizes row order before splitting (so folds
# are not contiguous slices of the data); random_state makes it reproducible.
# NOTE: original line was a SyntaxError — the inline comment swallowed the
# closing parenthesis of the StratifiedKFold(...) call.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Materialize the (train_idx, valid_idx) pairs so individual folds can be
# revisited later (e.g. to retrain or inspect a single fold by number).
folds = list(skf.split(X_train, y_train))

 

 

# If one fold's validation error is unusually high, try the ensemble both
# with and without that fold's model and compare.
models = {}

for fold in range(10):
    # NOTE(review): `gamma` is an XGBoost parameter; LightGBM's equivalent
    # (minimum loss reduction required to split) is `min_split_gain` —
    # with the original `gamma=1` LightGBM would silently ignore it.
    lgb_model = LGBMClassifier(max_depth=10, min_split_gain=1)

    # Index split for this fold.
    train_idx, valid_idx = folds[fold]
    X_t, y_t = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_v, y_v = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # Fit with early stopping on the validation fold.
    # NOTE: original line was a SyntaxError — the inline comment after
    # `verbose=200` swallowed the comma before `early_stopping_rounds`.
    # `eval_metric` changed from 'rmse' (a regression metric) to
    # 'binary_logloss', the conventional metric for binary classification.
    lgb_model.fit(
        X_t, y_t,
        eval_set=[(X_t, y_t), (X_v, y_v)],
        eval_metric='binary_logloss',
        verbose=200,                 # print evaluation every 200 rounds
        early_stopping_rounds=200,   # stop if no improvement for 200 rounds
    )

    # Keep the trained model for this fold.
    models[fold] = lgb_model

 

# Ensemble: average the predicted positive-class probability over the 10
# fold models. NOTE(review): the original averaged hard labels from
# .predict(), which is misleading for a classifier ensemble — averaging
# predict_proba and thresholding (or ranking) afterwards is the standard
# soft-voting approach for binary classification.
preds = 0
for fold in range(10):
    preds += models[fold].predict_proba(X_test)[:, 1] / 10

 

 

ㅇ 검증데이터를 통한 적당한 폴드 찾기 ########################

예시) 

# Sweep max_depth with 10-fold CV and compare train vs validation score to
# pick a reasonable depth (the classic bias/variance curve).
# Fixes vs original: `degree` was undefined (define the sweep range here);
# modern scikit-learn requires param_name/param_range as keyword arguments;
# scoring="r2" is a regression metric — use a classification score.
param_range = [2, 4, 6, 8, 10, 12]
train_score, val_score = validation_curve(
    LGBMClassifier(), X_train, y_train,
    param_name="max_depth",
    param_range=param_range,
    cv=10, scoring="accuracy")

# Median over the 10 folds (axis=1) at each parameter value.
plt.plot(param_range, np.median(train_score, 1), color='blue', label='training score')
plt.plot(param_range, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)  # original `plt.ylim(01)` is a SyntaxError in Python 3
plt.xlabel('max_depth')  # original label 'degree' did not match the swept parameter
plt.ylabel('score')