Model Execution Code

2021. 5. 8. 02:55 · Data Science / Data Modeling

1) KNN #############################################

from sklearn.neighbors import KNeighborsClassifier

 

knn_classification = KNeighborsClassifier(n_neighbors = 3)

knn_model = knn_classification.fit(X_train, y_train)

 

(KNN model tuning)

import numpy as np

from sklearn.model_selection import GridSearchCV

tuned_parameters = {'n_neighbors': np.arange(1, 25, 2),  # candidate k values (odd numbers 1–23)
                    'metric': ['hamming', 'euclidean', 'manhattan', 'chebyshev']}

knn_classification = KNeighborsClassifier()

knn_grid = GridSearchCV(estimator = knn_classification,
                        param_grid = tuned_parameters,
                        cv = 5,
                        scoring = 'accuracy')

knn_grid.fit(X_train, y_train)

print('Best parameters for KNN Classifier: ', knn_grid.best_params_, '\n')
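(Evaluating the tuned model)

A minimal sketch of how the tuned model might be scored, assuming X_test and y_test come from the same train/test split used in the loop below:

best_knn = knn_grid.best_estimator_   # already refit on the full training set with the best parameters
print('Test accuracy of tuned KNN: ', best_knn.score(X_test, y_test))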

 

(Finding the optimal value of K)

error = []

for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

 

import matplotlib.pyplot as plt

plt.figure(figsize = (12, 6))
plt.plot(range(1, 40), error, color = 'red', linestyle = 'dashed', marker = 'o',
         markerfacecolor = 'blue', markersize = 10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

plt.show()
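The same loop can also pick k programmatically; a minimal sketch reading the lowest error off the list built above:

best_k = int(np.argmin(error)) + 1   # k values in the loop start at 1
print('k with the lowest test error: ', best_k)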

(Another approach)

# Use the K-means elbow method to determine k

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

distortions = []
K = range(1, 10)

for k in K:
    kmeanModel = KMeans(n_clusters = k).fit(X)
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis = 1)) / X.shape[0])

 

# Plot the elbow

plt.plot(K, distortions, 'bx-')

plt.xlabel('k')

plt.ylabel('Distortion')

plt.title('The Elbow Method showing the optimal k')

plt.show()

2) Gaussian Naive Bayes ####################################

from sklearn.naive_bayes import GaussianNB

 

gnb = GaussianNB()

gnb_model = gnb.fit(X_train, y_train)

 

 

 

3) Decision Tree ##################################

from sklearn.tree import DecisionTreeClassifier

from sklearn import tree

 

decision_tree_classification = DecisionTreeClassifier(criterion = 'entropy', random_state = 10)

decision_tree = decision_tree_classification.fit(X_train, y_train)
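Since `from sklearn import tree` is imported above, the fitted tree can also be drawn; a minimal sketch (feature and class names are omitted here, substitute your own column labels):

plt.figure(figsize = (16, 8))
tree.plot_tree(decision_tree, filled = True)   # feature_names / class_names can be passed for readable labels
plt.show()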

 

4) Random Forest ###############################

from sklearn.ensemble import RandomForestClassifier

 

rf_classification = RandomForestClassifier(n_estimators = 10, random_state = 10)

rf_model = rf_classification.fit(X_train, y_train)
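A minimal sketch for inspecting which features the forest relies on; it assumes X_train is a pandas DataFrame (otherwise substitute your own feature names):

import pandas as pd

importances = pd.Series(rf_model.feature_importances_, index = X_train.columns)
print(importances.sort_values(ascending = False))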

 

5) AdaBoost ###############################

# puts more weight on samples that are hard to classify

from sklearn.ensemble import AdaBoostClassifier

 

ada_model = AdaBoostClassifier(n_estimators = 40, random_state = 10)  # 40 trees

ada_model.fit(X_train, y_train)

 

6) Gradient Boost #############################

# fits the residuals using a loss function

from sklearn.ensemble import GradientBoostingClassifier

 

gboost_model = GradientBoostingClassifier(n_estimators = 150, max_depth = 10, random_state = 10)

gboost_model.fit(X_train, y_train)

 

 

7) XGBoost ##########################

# level-wise (breadth-first) tree growth

from xgboost import XGBClassifier

 

xgb_model = XGBClassifier(max_depth = 10, gamma = 1)

xgb_model.fit(X_train, y_train)

 

8) LightGBM ###########################

# leaf-wise tree growth

# handles categorical features automatically, without manual encoding

# handles missing (NA) values automatically

 

from lightgbm import LGBMClassifier

import lightgbm as lgb

 

 

model = lgb.LGBMClassifier(max_depth = 10, random_state = 8798)  # 'gamma' and the regression objective 'rmse' are not valid for LGBMClassifier

model.fit(X_train2, y_train2, eval_set = [(X_val2, y_val2), (X_train2, y_train2)], verbose = 10)
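A minimal sketch of the two comments above, on a small toy DataFrame (not data from this post): LightGBM trains directly on a 'category' dtype column and NaN values with no manual encoding or imputation.

import pandas as pd

toy_X = pd.DataFrame({'color': pd.Categorical(['red', 'blue', 'red', 'blue'] * 10),
                      'size': [1.0, None, 3.0, 4.0] * 10})
toy_y = [0, 1, 0, 1] * 10
lgb.LGBMClassifier(min_child_samples = 1).fit(toy_X, toy_y)   # trains despite the NaN values and the category column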

 

 

9) CatBoost #####################

# effective when the data includes categorical features

from catboost import CatBoostClassifier

 

cat_model = CatBoostClassifier(max_depth = 10)

cat_model.fit(X_train, y_train)
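CatBoost is told which columns are categorical through the cat_features argument of fit; a minimal sketch with a placeholder column list (fill in your own categorical column names or indices):

cat_cols = []   # placeholder: names or indices of the categorical columns in X_train
cat_model_cat = CatBoostClassifier(max_depth = 10, verbose = 0)
cat_model_cat.fit(X_train, y_train, cat_features = cat_cols)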

 

 

10) Stacking ##################

from sklearn.ensemble import StackingClassifier

 

base_learners = [('rf_model', RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features = 'sqrt',
                                                     max_leaf_nodes = 8, min_samples_leaf = 5, min_samples_split = 2,
                                                     n_estimators = 50, random_state = 10)),
                 ('KNN_model', KNeighborsClassifier(n_neighbors = 17, metric = 'euclidean')),
                 ('NB_model', GaussianNB()),
                 ('ada_model', AdaBoostClassifier(n_estimators = 40, random_state = 10)),
                 ('gboost_model', GradientBoostingClassifier(n_estimators = 150, max_depth = 10, random_state = 10)),
                 ('xgb_model', XGBClassifier(max_depth = 10, gamma = 1)),
                 ('lgb_model', LGBMClassifier(max_depth = 10)),  # 'gamma' is not a LightGBM parameter
                 ('cat_model', CatBoostClassifier(max_depth = 10))]

stack_model = StackingClassifier(estimators = base_learners, final_estimator = CatBoostClassifier())

stack_model.fit(X_train, y_train)
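A minimal sketch for checking whether stacking actually helps, assuming the same X_test / y_test split as in the earlier sections:

from sklearn.metrics import accuracy_score

stack_pred = stack_model.predict(X_test)
print('Stacking accuracy: ', accuracy_score(y_test, stack_pred))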