10-fold cross-validation
ㅇ Module imports
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve
ㅇ Example code (binary classification) ##############################
# Build 10 stratified folds; shuffle=True randomizes the row order before
# splitting (folds are disjoint by construction either way — shuffling only
# breaks any ordering bias in the data, it is not what prevents overlap).
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Materialize (train_idx, valid_idx) pairs so each fold can be revisited by index.
folds = list(skf.split(X_train, y_train))
# 폴드 하나의 MSE가 높게 나오면 포함해서 앙상블해도 되고 빼고 앙상블해보고 둘 다 해봐
# If one fold's MSE comes out high, try ensembling both with and without it.
models = {}  # fold index -> fitted model
for fold in range(10):
    # NOTE(review): the original passed gamma=1, but `gamma` is an XGBoost
    # parameter — LightGBM does not recognize it (it is ignored with a
    # warning). The LightGBM analogue is `min_split_gain`; confirm intent
    # before re-adding it.
    lgb_model = LGBMClassifier(max_depth=10)
    # Split the data for this fold
    train_idx, valid_idx = folds[fold]
    X_t, y_t = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_v, y_v = X_train.iloc[valid_idx], y_train.iloc[valid_idx]
    # NOTE(review): eval_metric='rmse' on a binary classifier scores raw
    # predictions against 0/1 labels — 'binary_logloss' or 'auc' is the
    # usual choice; confirm intent. Also, `verbose` and
    # `early_stopping_rounds` were removed from fit() in lightgbm >= 4.0
    # (use callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)]).
    lgb_model.fit(X_t, y_t, eval_set=[(X_t, y_t), (X_v, y_v)],
                  eval_metric='rmse', verbose=200,  # log every 200 rounds
                  early_stopping_rounds=200)
    # Keep the fitted model for the ensemble step
    models[fold] = lgb_model
# 폴드 수로 나눠서 앙상블
# Ensemble: average the 10 fold models' predictions.
# NOTE(review): .predict() on a classifier returns hard class labels, so this
# average is a vote share in [0, 1], not a calibrated probability —
# predict_proba(X_test)[:, 1] is likely what was intended; confirm.
preds = sum(models[fold].predict(X_test) for fold in range(10)) / 10
ㅇ Choosing a suitable parameter value via validation data ########################
Example)
# Sweep max_depth and compare training vs. validation score per value.
# `param_range` replaces the undefined `degree` from the original (which was
# copied from a polynomial-degree example).
param_range = np.arange(1, 11)
train_score, val_score = validation_curve(
    LGBMClassifier(), X_train, y_train,
    param_name="max_depth",    # keyword-only in modern scikit-learn
    param_range=param_range,
    cv=10,
    # NOTE(review): scoring="r2" on a classifier scores the hard labels —
    # "accuracy" or "roc_auc" is the usual choice; confirm intent.
    scoring="r2")
# Median over the CV folds (axis=1) gives one curve point per parameter value.
plt.plot(param_range, np.median(train_score, axis=1), color='blue', label='training score')
plt.plot(param_range, np.median(val_score, axis=1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('max_depth')  # was 'degree' — label now matches the swept parameter
plt.ylabel('score')