# Hyper Parameter 튜닝 - Random Forest


참고: [하이퍼파라미터 튜닝, emseoyk.log](https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D)

## 환경설정

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing # 전처리

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE

from sklearn.ensemble import RandomForestRegressor

## 데이터셋

In [2]:
# 2. 데이터셋
mpg_df = pd.read_csv('data/auto-mpg.csv', index_col='car name')
mpg_df = mpg_df[mpg_df.horsepower != '?']

# 3. 훈련/시험 데이터셋
y = mpg_df[['mpg']]
X = mpg_df.loc[:, 'cylinders':'origin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 777)
y_train = np.ravel(y_train,order='C') 

## 기계학습

### 1. Hyper Parameters for Random Forest

In [3]:
reg_rf = RandomForestRegressor(random_state = 777)

reg_rf_hp = reg_rf.get_params()

pd.DataFrame.from_dict(reg_rf_hp, orient='index', columns = ['초기값'] )

Unnamed: 0,초기값
bootstrap,True
ccp_alpha,0.0
criterion,squared_error
max_depth,
max_features,auto
max_leaf_nodes,
max_samples,
min_impurity_decrease,0.0
min_samples_leaf,1
min_samples_split,2


###  2. 탐색

In [4]:
from sklearn.model_selection import GridSearchCV

params_rf = {'n_estimators': [100, 200, 300, 400, 500],
             'max_depth'   : [2, 3, 4, 6, 8],
             'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
             'max_features': ['log2', 'sqrt']
}

grid_rf = GridSearchCV(estimator  = reg_rf,
                       param_grid = params_rf,
                       scoring    = 'neg_mean_squared_error',
                       cv         = 10,
                       n_jobs     = -1)


grid_rf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(random_state=777),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 6, 8],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'n_estimators': [100, 200, 300, 400, 500]},
             scoring='neg_mean_squared_error')

### 3. Hyper Parameters

In [5]:
## 초모수
best_hyperparams = grid_rf.best_params_
print('Hyerparameters:\n', best_hyperparams)
## 성능
best_MSE_score = grid_rf.best_score_
print('HP Regression: {:.3f}'.format(best_MSE_score))

Hyerparameters:
 {'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 500}
HP Regression: -13.581


###  4. 최적 모형

In [6]:
# 최적모형 추출
top_model = grid_rf.best_estimator_

y_pred = top_model.predict(X_test)

print('Random Forest Regression - Tuning: {:.3f}'.format(MSE(y_test, y_pred)))

Random Forest Regression - Tuning: 13.342
