# Hyper Parameter 튜닝


참고: [하이퍼파라미터 튜닝, emseoyk.log](https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D)

## 환경설정

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing # 전처리

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier

## 데이터셋

In [2]:
cancer_df = pd.read_csv('data/breast_cancer.csv')

# list(cancer_df.columns)
y = cancer_df[['diagnosis']]
X = cancer_df.loc[:, 'radius_mean':'fractal_dimension_worst']

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
y_train = np.ravel(y_train, order='C') # KNN : A column-vector y was passed when a 1d array was expected

  y = column_or_1d(y, warn=True)


## 기계학습

### 1. Hyper Parameters for Decision Tree

In [3]:
clf_dt = DecisionTreeClassifier(random_state = 777)

dt_hp = clf_dt.get_params()

pd.DataFrame.from_dict(dt_hp, orient='index', columns = ['초기값'] )

Unnamed: 0,초기값
ccp_alpha,0.0
class_weight,
criterion,gini
max_depth,
max_features,
max_leaf_nodes,
min_impurity_decrease,0.0
min_samples_leaf,1
min_samples_split,2
min_weight_fraction_leaf,0.0


###  2. 탐색

In [4]:
from sklearn.model_selection import GridSearchCV

params_dt = {'max_depth': [2, 3, 4, 5, 6],
             'min_samples_leaf': [0.02, 0.04, 0.06, 0.08, 0.10],
             'max_features': [0.1, 0.2, 0.4, 0.6, 0.8]
            }

grid_dt = GridSearchCV(estimator  = clf_dt,
                       param_grid = params_dt,
                       scoring    = 'f1',
                       cv         = 10,
                       n_jobs     = -1)


grid_dt.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=777),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'max_features': [0.1, 0.2, 0.4, 0.6, 0.8],
                         'min_samples_leaf': [0.02, 0.04, 0.06, 0.08, 0.1]},
             scoring='f1')

### 3. Hyper Parameters

In [5]:
## 초모수
best_hyperparams = grid_dt.best_params_
print('Hyerparameters:\n', best_hyperparams)
## 성능
best_CV_score = grid_dt.best_score_
print('HP Classifier: {:.3f}'.format(best_CV_score))

Hyerparameters:
 {'max_depth': 4, 'max_features': 0.8, 'min_samples_leaf': 0.02}
HP Classifier: 0.925


###  4. 최적 모형

In [6]:
# 최적모형 추출
top_model = grid_dt.best_estimator_

y_pred = top_model.predict(X_test)

print('Train F1: {:.3f}'.format(f1_score(y_test, y_pred, average = 'micro')))

Train F1: 0.939
