캐글 대회 분석-1 Hill climbing을 활용한 Binary Classification

데이터 분석/머신러닝

캐글 대회 분석-1 Hill climbing을 활용한 Binary Classification

민서타 2023. 10. 19. 11:54

대회: PS3E23 소프트웨어 결함 분류 및 예측 --> 이진 분류 문제

목표: 코드에 대한 다양한 속성들이 주어진 상태에서 C 프로그램의 결함 예측
평가지표: AUC ROC Score

1. 데이터 로드 & EDA

1)피처별 분포 --> 전반적으로 오른쪽으로 치우쳐져 있어 로그 변환 필요

~~어느정도 균일한 분포가 됐쥬?~~

2) 상관관계 분석 -->높은 피쳐들간의 상관관계, 다중공선성 제거 or 차원 축소 필요

하지만 VIF가 20 이상인 피처를 제거하고 차원을 축소해 보았으나

주최측의 인위적인 데이터 생성으로 점수를 비교해본 결과 큰 의미는 없었음

3. 베이스라인 모델 구축

모델	LightGBM	XGBoost	Random Forest
Train score	0.8158	0.8453	1.0000
Validation score / Acc / f1_score	0.7963 / 0.8181 / 0.5007	0.7886 / 0.8122 / 0.4819	0.7764 / 0.8102 / 0.4823

-베이스라인 모델별 혼동행렬

4. 모델 개선: Hill Climbing 알고리즘으로 모델별 앙상블 가중치를 탐색하고, 최적 하이퍼파라미터는 Optuna로 선정

def hill_climbing(x, y, x_test, metric=None, weight_range=None):
    """Greedily blend model predictions to maximise a validation metric.

    Starting from the single best-scoring model, the routine repeatedly
    searches every remaining model and every candidate weight for the blend
    ``(1 - w) * current + w * model`` that improves the score the most,
    applies it to both the out-of-fold and the test predictions, and removes
    that model from the pool.  It stops when no blend improves the score or
    when no models remain.

    Args:
        x: DataFrame of out-of-fold predictions, one column per model,
            e.g. ``pd.DataFrame({'XGB': XGB_pred, 'Hist': hist_pred})``.
        y: True labels for the rows of ``x``.
        x_test: DataFrame of test-set predictions with the same column
            names as ``x`` (column order does not matter).
        metric: Callable ``metric(y_true, y_pred) -> float`` where a higher
            value is better.  Defaults to ``roc_auc_score``.
        weight_range: Iterable of candidate weights for the newly added
            model.  Defaults to ``np.arange(0, 0.51, 0.01)``.

    Returns:
        A two-element list ``[oof_ensemble, test_ensemble]`` holding the
        blended out-of-fold predictions and the blended test predictions.
    """
    if metric is None:
        # Resolved at call time so the default still tracks the file's import.
        metric = roc_auc_score
    if weight_range is None:
        weight_range = np.arange(0, 0.51, 0.01)

    # Score every model on its own and rank them best-first.
    scores = {col: metric(y, x[col]) for col in x.columns}
    ranked = sorted(scores, key=scores.get, reverse=True)

    # Put both frames in the same ranked column order.
    x = x[ranked]
    x_test = x_test[ranked]

    current_best_ensemble = x.iloc[:, 0]
    current_best_test_preds = x_test.iloc[:, 0]
    best_score = scores[ranked[0]]
    # Track candidates by name instead of dropping columns in place from a
    # slice of ``x`` (which may be a view and trigger SettingWithCopy issues).
    remaining = ranked[1:]

    while remaining:
        k_best, wgt_best = None, None
        for k in remaining:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * x[k]
                cv_score = metric(y, potential_ensemble)
                # Strict improvement only, so a zero weight never "wins".
                if cv_score > best_score:
                    best_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is None:
            # No remaining model improves the ensemble: converged.
            break

        current_best_ensemble = ((1 - wgt_best) * current_best_ensemble
                                 + wgt_best * x[k_best])
        current_best_test_preds = ((1 - wgt_best) * current_best_test_preds
                                   + wgt_best * x_test[k_best])
        remaining.remove(k_best)

    # Return only after the search has finished; returning from inside the
    # loop would stop the climb after a single blending step.
    return [current_best_ensemble, current_best_test_preds]

# Per-fold accumulators: validation scores and test-set predictions for the
# simple-average ensemble and for the hill-climbing ensemble.
ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds = list(), list()

# Build the K-fold splitter: n_splits = number of folds per repetition,
# n_repeats = number of repetitions (15 x 3 = 45 train/validation splits).
sk = RepeatedStratifiedKFold(n_splits = 15, n_repeats = 3, random_state = 61)
for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):
    # NOTE: X_test / Y_test are the held-out validation part of this fold;
    # `test` (defined outside this section) is the separate prediction set.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    # Ratio (rows - sum(Y)) / sum(Y); presumably negatives/positives for a
    # 0/1 target -- TODO confirm. Only used as XGBoost's scale_pos_weight.
    cls_weight = (Y_train.shape[0] - np.sum(Y_train)) / np.sum(Y_train)
    print('--------------------------------------------')

    # Random Forest model
    # NOTE(review): no random_state is passed, so results vary between runs.
    RF_md = RandomForestClassifier(n_estimators = 500,
                                   max_depth = 7,
                                   min_samples_split = 15,
                                   min_samples_leaf = 10).fit(X_train, Y_train)

    # Positive-class probability on the validation fold.
    RF_pred = RF_md.predict_proba(X_test)[:, 1]
    RF_score = roc_auc_score(Y_test, RF_pred)

    print('Fold', i, '==> RF oof ROC-AUC score is ==>', RF_score)

    RF_pred_test = RF_md.predict_proba(test)[:, 1]

    # HistGradientBoosting model
    hist_md = HistGradientBoostingClassifier(loss='log_loss', learning_rate=0.09494605702447576,
                                           max_depth=83, l2_regularization=0.00045512891761208057,
                                           max_iter=110, random_state=61).fit(X_train, Y_train)

    hist_pred = hist_md.predict_proba(X_test)[:, 1]
    hist_score = roc_auc_score(Y_test, hist_pred)

    print('Fold', i, '==> HGBM oof ROC-AUC score is ==>', hist_score)
    hist_pred_test = hist_md.predict_proba(test)[:, 1]

    # LightGBM model
    # NOTE(review): no random_state is passed here either.
    LGBM_md = LGBMClassifier(objective = 'binary',
                            n_estimators = 500,
                            max_depth = 7,
                            learning_rate = 0.01,
                            num_leaves = 20,
                            reg_alpha = 3,
                            reg_lambda = 3,
                            subsample = 0.7,
                            colsample_bytree = 0.7).fit(X_train, Y_train)

    lgb_pred = LGBM_md.predict_proba(X_test)[:, 1]
    lgb_score = roc_auc_score(Y_test, lgb_pred)
    print('Fold', i, '==> LGBM oof ROC-AUC score is ==>', lgb_score)

    lgb_pred_test = LGBM_md.predict_proba(test)[:, 1]

    # XGBoost model
    # `evaluation_metric` is not defined in this section; it must be set
    # earlier in the file.
    XGB_md = XGBClassifier(
    max_depth=5,
    colsample_bynode=0.682606021920177,
    reg_lambda=4.630616411012709,
    n_estimators=84,
    learning_rate=0.29465063270539604,
    random_state=61,
    scale_pos_weight=cls_weight,
    eval_metric=evaluation_metric
    ).fit(X_train, Y_train)

    xgb_pred = XGB_md.predict_proba(X_test)[:, 1]
    xgb_score = roc_auc_score(Y_test, xgb_pred)

    print('Fold', i, '==> XGB oof ROC-AUC score is ==>', xgb_score)
    xgb_pred_test = XGB_md.predict_proba(test)[:, 1]

    # CatBoost model
    # NOTE(review): no seed is passed to this estimator.
    Cat_md = CatBoostClassifier(loss_function = 'Logloss',
                            iterations = 500,
                            learning_rate = 0.01,
                            depth = 7,
                            random_strength = 0.5,
                            bagging_temperature = 0.7,
                            border_count = 30,
                            l2_leaf_reg = 5,
                            verbose = False,
                            task_type = 'CPU').fit(X_train, Y_train)

    cat_pred = Cat_md.predict_proba(X_test)[:, 1]
    cat_score = roc_auc_score(Y_test, cat_pred)

    print('Fold', i, '==> CatBoost oof ROC-AUC score is ==>', cat_score)

    cat_pred_test = Cat_md.predict_proba(test)[:, 1]


    ## Simple-average ensemble: equal weight (1/5) for each of the five models ##
    ens_pred_1 = (RF_pred + hist_pred + lgb_pred + xgb_pred + cat_pred) / 5
    ens_pred_2 = (RF_pred_test + hist_pred_test + lgb_pred_test + xgb_pred_test
                  + cat_pred_test) / 5
    ens_score_fold =roc_auc_score(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

    ## Hill-climbing ensemble ##
    # Validation-fold predictions, one column per model.
    x = pd.DataFrame({'RF': RF_pred,'LGBM': lgb_pred, 'XGB' : xgb_pred,
                      'Hist' : hist_pred,'Cat': cat_pred})
    y = Y_test

    # Test-set predictions under the same column names. The column order
    # differs from `x`, but hill_climbing selects columns by name.
    x_test = pd.DataFrame({'RF': RF_pred_test,
                           'Hist': hist_pred_test,
                           'LGBM': lgb_pred_test,
                           'XGB': xgb_pred_test,
                           'Cat': cat_pred_test})

    # hill_results[0]: blended validation predictions,
    # hill_results[1]: blended test predictions.
    hill_results = hill_climbing(x, y, x_test)
    hill_ens_score_fold = roc_auc_score(y, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])

    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

5. 결과

6. 총평

캐글은 실제 TP를 맞추는 것보단 퍼블릭 점수를 올리기 위한 대회이므로 모델별 앙상블을 많이한다

(현업에선 가성비가 떨어짐)

나의 경우 총 7가지의 모델을 돌려보았으나 결과값이 떨어져 총 5가지 모델만을 활용했다

(랜덤포레스트, 히스트그라디언트부스팅머신, 라이트GBM, XGB분류, 캣부스트)

힐 클라이밍은 이전 캐글 대회의 참가자의 알고리즘을 보며 공부한 것인데 구현이 굉장히 어렵고

현업에서 응용가능할 지에 대한 의구심이 들긴 했다. 하지만 코드 구현해보는 것에서 만족하기로 했다.