# Hyper Parameter 튜닝 - Random Forest


참고: [하이퍼파라미터 튜닝, emseoyk.log](https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D)

## 환경설정

In [5]:
import pandas as pd
import numpy as np

from sklearn import preprocessing # 전처리

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score

from sklearn.ensemble import RandomForestClassifier

## 데이터셋

In [6]:
cancer_df = pd.read_csv('data/breast_cancer.csv')

# list(cancer_df.columns)
y = cancer_df[['diagnosis']]
X = cancer_df.loc[:, 'radius_mean':'fractal_dimension_worst']

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
y_train = np.ravel(y_train, order='C') # KNN : A column-vector y was passed when a 1d array was expected

  y = column_or_1d(y, warn=True)


## 기계학습

### 1. Hyper Parameters for Random Forest

In [9]:
clf_rf = RandomForestClassifier(random_state = 777)

rf_hp = clf_rf.get_params()

pd.DataFrame.from_dict(rf_hp, orient='index', columns = ['초기값'] )

Unnamed: 0,초기값
bootstrap,True
ccp_alpha,0.0
class_weight,
criterion,gini
max_depth,
max_features,auto
max_leaf_nodes,
max_samples,
min_impurity_decrease,0.0
min_samples_leaf,1


###  2. 탐색

In [10]:
from sklearn.model_selection import GridSearchCV

params_rf = {'n_estimators': [100, 200, 300, 400, 500],
             'max_depth'   : [2, 3, 4, 6, 8],
             'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
             'max_features': ['log2', 'sqrt']
}

grid_rf = GridSearchCV(estimator  = clf_rf,
                       param_grid = params_rf,
                       scoring    = 'f1',
                       cv         = 10,
                       n_jobs     = -1)


grid_rf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=777),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 6, 8],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='f1')

### 3. Hyper Parameters

In [12]:
## 초모수
best_hyperparams = grid_rf.best_params_
print('Hyerparameters:\n', best_hyperparams)
## 성능
best_CV_score = grid_rf.best_score_
print('HP Classifier: {:.3f}'.format(best_CV_score))

Hyerparameters:
 {'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 100}
HP Classifier: 0.904


###  4. 최적 모형

In [13]:
# 최적모형 추출
top_model = grid_rf.best_estimator_

y_pred = top_model.predict(X_test)

print('Train F1: {:.3f}'.format(f1_score(y_test, y_pred, average = 'micro')))

Train F1: 0.947
