1) KNN #############################################
from sklearn.neighbors import KNeighborsClassifier
knn_classification = KNeighborsClassifier(n_neighbors = 3)
knn_model = knn_classification.fit(X_train, y_train)
(KNN model tuning)
import numpy as np
from sklearn.model_selection import GridSearchCV

tuned_parameters = {'n_neighbors': np.arange(1, 25, 2),
                    'metric': ['hamming', 'euclidean', 'manhattan', 'chebyshev']}
knn_classification = KNeighborsClassifier()
knn_grid = GridSearchCV(estimator = knn_classification,
                        param_grid = tuned_parameters,
                        cv = 5,
                        scoring = 'accuracy')
knn_grid.fit(X_train, y_train)
print('Best parameters for KNN Classifier: ', knn_grid.best_params_, '\n')
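A minimal follow-up sketch, assuming the same X_test / y_test split: GridSearchCV refits the best model on the full training set by default, so it can be evaluated directly.
from sklearn.metrics import accuracy_score
best_knn = knn_grid.best_estimator_  # already refit on all of X_train
print('Test accuracy: ', accuracy_score(y_test, best_knn.predict(X_test)))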
(Finding the optimal K value)
import matplotlib.pyplot as plt

error = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))  # mean misclassification rate for this k

plt.figure(figsize=(12,6))
plt.plot(range(1,40), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor = 'blue', markersize = 10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
plt.show()
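To read the best k off the curve programmatically, a small sketch using the error list above:
optimal_k = int(np.argmin(error)) + 1  # range(1, 40) starts at k = 1
print('K with lowest error: ', optimal_k)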
(Another approach)
# Elbow method: determine k for K-means clustering
# (note: this k is the number of clusters, not KNN's n_neighbors)
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters = k).fit(X)
    # average euclidean distance from each point to its nearest cluster center
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'),
                                  axis = 1)) / X.shape[0])
# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
2) Gaussian Naive Bayes ####################################
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
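A minimal evaluation sketch that works for any fitted classifier in this post, assuming X_test / y_test exist:
from sklearn.metrics import accuracy_score, classification_report
y_pred = gnb_model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))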
3) Decision Tree ##################################
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
decision_tree_classification = DecisionTreeClassifier(criterion = 'entropy', random_state = 10)
decision_tree = decision_tree_classification.fit(X_train, y_train)
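Since from sklearn import tree is already imported above, the fitted tree can be drawn with plot_tree (a sketch; pass your own feature_names / class_names for readable labels):
plt.figure(figsize = (12, 8))
tree.plot_tree(decision_tree, filled = True, fontsize = 8)
plt.show()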
4) Random Forest ###############################
from sklearn.ensemble import RandomForestClassifier
rf_classification = RandomForestClassifier(n_estimators = 10, random_state = 10)
rf_model = rf_classification.fit(X_train, y_train)
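Random forests expose impurity-based feature importances; a short sketch, assuming X_train is a pandas DataFrame (otherwise substitute your own feature names):
import pandas as pd
importances = pd.Series(rf_model.feature_importances_, index = X_train.columns)
print(importances.sort_values(ascending = False))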
5) AdaBoost ###############################
# puts extra weight on samples that are hard to classify
from sklearn.ensemble import AdaBoostClassifier
ada_model = AdaBoostClassifier(n_estimators = 40, random_state = 10) # 40 trees
ada_model.fit(X_train, y_train)
6) Gradient Boost #############################
# fits each new tree to the residuals of the previous ones via a loss function
from sklearn.ensemble import GradientBoostingClassifier
gboost_model = GradientBoostingClassifier(n_estimators = 150, max_depth = 10, random_state = 10)
gboost_model.fit(X_train, y_train)
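Because boosting is additive, sklearn's staged_predict reports test accuracy after each boosting stage, which helps judge whether 150 trees is too many (a sketch assuming X_test / y_test):
stage_acc = [np.mean(pred == y_test) for pred in gboost_model.staged_predict(X_test)]
print('Best number of stages: ', int(np.argmax(stage_acc)) + 1)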
7) XGBoost ##########################
# level-wise tree growth (expands an entire depth level at a time)
from xgboost import XGBClassifier
xgb_model = XGBClassifier(max_depth = 10, gamma = 1)
xgb_model.fit(X_train, y_train)
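xgboost ships its own feature-importance plot helper (a minimal sketch):
from xgboost import plot_importance
plot_importance(xgb_model)  # split count per feature by default
plt.show()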
8) LightGBM ###########################
# leaf-wise tree growth
# handles categorical features automatically
# handles NA values automatically
from lightgbm import LGBMClassifier
import lightgbm as lgb

# gamma and objective='rmse' are XGBoost/regression settings, not valid for LGBMClassifier
model = lgb.LGBMClassifier(max_depth = 10, random_state = 8798)
model.fit(X_train2, y_train2,
          eval_set = [(X_val2, y_val2), (X_train2, y_train2)],
          callbacks = [lgb.log_evaluation(10)])  # log metrics every 10 rounds (verbose= was removed from fit in lightgbm 4.x)
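With a validation set already in eval_set, early stopping is one callback away (a sketch; 20 rounds is an arbitrary choice):
model.fit(X_train2, y_train2,
          eval_set = [(X_val2, y_val2)],
          callbacks = [lgb.early_stopping(stopping_rounds = 20),
                       lgb.log_evaluation(10)])
print('Best iteration: ', model.best_iteration_)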
9) CatBoost #####################
# effective when the data contains categorical features
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(max_depth = 10)
cat_model.fit(X_train, y_train)
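CatBoost's categorical handling kicks in when the categorical columns are passed via cat_features (a sketch; the column names here are hypothetical):
cat_cols = ['gender', 'region']  # hypothetical categorical columns — replace with your own
cat_model = CatBoostClassifier(max_depth = 10, verbose = 100)
cat_model.fit(X_train, y_train, cat_features = cat_cols)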
10) Stacking ##################
from sklearn.ensemble import StackingClassifier
base_learners = [('rf_model', RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features = 'sqrt',
                                                     max_leaf_nodes = 8, min_samples_leaf = 5, min_samples_split = 2,
                                                     n_estimators = 50, random_state = 10)),
                 ('KNN_model', KNeighborsClassifier(n_neighbors = 17, metric = 'euclidean')),
                 ('NB_model', GaussianNB()),
                 ('ada_model', AdaBoostClassifier(n_estimators = 40, random_state = 10)),
                 ('gboost_model', GradientBoostingClassifier(n_estimators = 150, max_depth = 10, random_state = 10)),
                 ('xgb_model', XGBClassifier(max_depth = 10, gamma = 1)),
                 ('lgb_model', LGBMClassifier(max_depth = 10)),  # gamma is an XGBoost parameter, not LightGBM's
                 ('cat_model', CatBoostClassifier(max_depth = 10))]
stack_model = StackingClassifier(estimators = base_learners, final_estimator = CatBoostClassifier())
stack_model.fit(X_train, y_train)
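The stacked model behaves like any other sklearn estimator (assuming the same test split):
print('Stacking test accuracy: ', stack_model.score(X_test, y_test))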