1. Decision Tree
import os
import gc
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from collections import Counter
from sklearn import tree

# Load the GitHub-hosted pre-surgery mortality data
# (pd.read_csv is assumed here; the original post omits the loading call)
data = pd.read_csv('https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example')
data.info()  # all columns are int64 except wtkg (float64); no null values
# event, age, wtkg, karnof, cd...: numeric; the remaining columns are categorical
1. Data split (target variable: censor)

X = data.drop(columns=['censor'])
y = data['censor']
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.2, random_state=42)

2. Build and evaluate a decision tree classifier

model1 = DecisionTreeClassifier(max_depth=4, criterion='gini')
model1.fit(X.iloc[train_idx], y.iloc[train_idx])

train_predict = model1.predict(X.iloc[train_idx])
cm_train = confusion_matrix(y.iloc[train_idx], train_predict)
print(cm_train)
print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
print("Train F1-Score : {}".format(f1_score(y.iloc[train_idx], train_predict)))

test_predict = model1.predict(X.iloc[valid_idx])
cm_test = confusion_matrix(y.iloc[valid_idx], test_predict)
print(cm_test)
print("Test Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
print("Test F1-Score : {}".format(f1_score(y.iloc[valid_idx], test_predict)))

---Results---
[[261  26]
 [  8 130]]
Train Acc : 0.92
Train F1-Score : 0.8843537414965987
[[57  7]
 [ 9 34]]
Test Acc : 0.8504672897196262
Test F1-Score : 0.8095238095238095
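Before reading too much into these accuracy numbers, it is worth checking how balanced the censor target is, since the conclusion below notes that accuracy loses meaning when the classes are imbalanced. A minimal sketch using the Counter import from above (this check is not part of the original post):

# Inspect the class distribution of the target in each split
print("Train distribution :", Counter(y.iloc[train_idx]))
print("Valid distribution :", Counter(y.iloc[valid_idx]))

# Accuracy of always predicting the majority class,
# a useful reference point for the scores reported above
majority_ratio = y.iloc[train_idx].value_counts(normalize=True).max()
print("Majority-class baseline accuracy : {:.3f}".format(majority_ratio))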
* Tree plot visualization
plt.rcParams['figure.figsize'] = [40, 40]  # enlarge the figure before plotting so the tree stays readable
tree.plot_tree(model1, filled=True, feature_names=X.columns, class_names=['Dead', 'indicator'])
plt.show()
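export_graphviz is imported above but never used; the same fitted tree can also be rendered through Graphviz. A minimal sketch, assuming the graphviz Python package is installed (this rendering step is not in the original post):

import graphviz

# Export the fitted tree to DOT format and render it with Graphviz
dot_data = export_graphviz(model1, out_file=None, filled=True,
                           feature_names=X.columns, class_names=['Dead', 'indicator'])
graph = graphviz.Source(dot_data)
graph.render('decision_tree', format='png')  # writes decision_tree.png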
2. Random Forest
- A classification/regression model that ensembles many decision trees
- Typically used when the dataset has about 1,000 samples or fewer (roughly 1,000 to 30,000 samples: XGBoost; more than 30,000: LightGBM)
model2 = RandomForestClassifier(n_estimators=15, max_depth=50, criterion='gini',
                                max_features='sqrt',  # 'auto' in older scikit-learn; equivalent to 'sqrt' for classifiers
                                bootstrap=True, random_state=42)
rf = model2.fit(X.iloc[train_idx], y.iloc[train_idx])

rf_train = rf.predict(X.iloc[train_idx])
cm1_train = confusion_matrix(y.iloc[train_idx], rf_train)
print(cm1_train)
print("Train Acc : {}".format((cm1_train[0,0] + cm1_train[1,1])/cm1_train.sum()))
print("Train F1-Score : {}".format(f1_score(y.iloc[train_idx], rf_train)))

rf_test = rf.predict(X.iloc[valid_idx])
cm1_test = confusion_matrix(y.iloc[valid_idx], rf_test)
print(cm1_test)
print("Test Acc : {}".format((cm1_test[0,0] + cm1_test[1,1])/cm1_test.sum()))
print("Test F1-Score : {}".format(f1_score(y.iloc[valid_idx], rf_test)))

---Results---
[[287   0]
 [  2 136]]
Train Acc : 0.9952941176470588
Train F1-Score : 0.9927007299270074
[[56  8]
 [10 33]]
Test Acc : 0.8317757009345794
Test F1-Score : 0.785714285714285
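Because bootstrap=True, the forest can also report an out-of-bag (OOB) estimate, which gives a validation-style score without touching the hold-out split. A minimal sketch (the oob_score option is an addition, not part of the original post):

# Refit the same forest with OOB scoring enabled; each tree is evaluated
# on the training rows left out of its bootstrap sample
model2_oob = RandomForestClassifier(n_estimators=15, max_depth=50, criterion='gini',
                                    max_features='sqrt', bootstrap=True,
                                    oob_score=True, random_state=42)
model2_oob.fit(X.iloc[train_idx], y.iloc[train_idx])
print("OOB accuracy : {:.3f}".format(model2_oob.oob_score_))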
* Visualizing the most important features
# Importance scores, sorted from highest to lowest
feature_map = pd.DataFrame(sorted(zip(model2.feature_importances_, X.columns), reverse=True),
                           columns=['Score', 'Feature'])

# Importance Score Top 10
feature_map_10 = feature_map.iloc[:10]

plt.figure(figsize=(20, 10))
sns.barplot(x="Score", y="Feature", data=feature_map_10.sort_values(by="Score", ascending=False))
plt.title('Random Forest Importance Features')
plt.tight_layout()
plt.show()
Conclusion:
★ As the tree depth increases, evaluation scores such as accuracy and F1-score improve, but the risk of overfitting also grows.
★ Check the class distribution (accuracy is not meaningful when the classes are imbalanced) and choose an appropriate depth; see the sketch below.
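One way to make the depth choice concrete is to sweep max_depth and compare train and validation F1-scores, picking the smallest depth beyond which the validation score stops improving. A minimal sketch under that assumption (the depth range and selection rule are illustrative, not from the original post):

# Compare train vs. validation F1-score across depths to spot where overfitting begins
for depth in range(2, 11):
    dt = DecisionTreeClassifier(max_depth=depth, criterion='gini', random_state=42)
    dt.fit(X.iloc[train_idx], y.iloc[train_idx])
    f1_train = f1_score(y.iloc[train_idx], dt.predict(X.iloc[train_idx]))
    f1_valid = f1_score(y.iloc[valid_idx], dt.predict(X.iloc[valid_idx]))
    print("max_depth={:2d}  Train F1: {:.3f}  Valid F1: {:.3f}".format(depth, f1_train, f1_valid))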