{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Hyper Parameter 튜닝 - Random Forest\n", "\n", "\n", "참고: [하이퍼파라미터 튜닝, emseoyk.log](https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D)\n", "\n", "## 환경설정" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from sklearn import preprocessing # preprocessing utilities\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import mean_squared_error as MSE\n", "\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 데이터셋" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# 2. Dataset\n", "# Keep the untouched frame under its own name so this cell can be re-run safely.\n", "mpg_raw = pd.read_csv('data/auto-mpg.csv', index_col='car name')\n", "\n", "# Rows whose horsepower is the '?' placeholder are dropped. That placeholder makes\n", "# pandas load the column as strings, so cast it to float afterwards instead of\n", "# handing an object-typed feature column to scikit-learn.\n", "mpg_df = mpg_raw[mpg_raw['horsepower'] != '?'].astype({'horsepower': 'float64'})\n", "\n", "# 3. Train/test split\n", "y = mpg_df[['mpg']]\n", "X = mpg_df.loc[:, 'cylinders':'origin']\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)\n", "# Flatten the single-column training target into a 1-D array.\n", "y_train = np.ravel(y_train, order='C')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 기계학습\n", "\n", "### 1. Hyper Parameters for Random Forest" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
초기값
bootstrapTrue
ccp_alpha0.0
criterionsquared_error
max_depthNone
max_featuresauto
max_leaf_nodesNone
max_samplesNone
min_impurity_decrease0.0
min_samples_leaf1
min_samples_split2
min_weight_fraction_leaf0.0
n_estimators100
n_jobsNone
oob_scoreFalse
random_state777
verbose0
warm_startFalse
\n", "
" ], "text/plain": [ " 초기값\n", "bootstrap True\n", "ccp_alpha 0.0\n", "criterion squared_error\n", "max_depth None\n", "max_features auto\n", "max_leaf_nodes None\n", "max_samples None\n", "min_impurity_decrease 0.0\n", "min_samples_leaf 1\n", "min_samples_split 2\n", "min_weight_fraction_leaf 0.0\n", "n_estimators 100\n", "n_jobs None\n", "oob_score False\n", "random_state 777\n", "verbose 0\n", "warm_start False" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Untuned regressor; get_params() returns its hyperparameters as a dict.\n", "reg_rf = RandomForestRegressor(random_state=777)\n", "reg_rf_hp = reg_rf.get_params()\n", "\n", "# One row per hyperparameter, shown in a single column.\n", "pd.DataFrame.from_dict(reg_rf_hp, orient='index', columns=['초기값'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. 탐색" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=10, estimator=RandomForestRegressor(random_state=777),\n", " n_jobs=-1,\n", " param_grid={'max_depth': [2, 3, 4, 6, 8],\n", " 'max_features': ['log2', 'sqrt'],\n", " 'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],\n", " 'n_estimators': [100, 200, 300, 400, 500]},\n", " scoring='neg_mean_squared_error')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "# Candidate values per hyperparameter; the search tries every combination.\n", "params_rf = dict(\n", "    n_estimators=[100, 200, 300, 400, 500],\n", "    max_depth=[2, 3, 4, 6, 8],\n", "    min_samples_leaf=[0.1, 0.2, 0.3, 0.4, 0.5],\n", "    max_features=['log2', 'sqrt'],\n", ")\n", "\n", "# 10-fold cross-validation on all available cores, scored with negated MSE.\n", "grid_rf = GridSearchCV(\n", "    estimator=reg_rf,\n", "    param_grid=params_rf,\n", "    scoring='neg_mean_squared_error',\n", "    cv=10,\n", "    n_jobs=-1,\n", ")\n", "\n", "grid_rf.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. 
Hyper Parameters" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hyperparameters:\n", " {'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 500}\n", "HP Regression: 13.581\n" ] } ], "source": [ "## Best hyperparameters found by the search\n", "best_hyperparams = grid_rf.best_params_\n", "print('Hyperparameters:\\n', best_hyperparams)\n", "## Performance\n", "# Assuming grid_rf was built with scoring='neg_mean_squared_error', best_score_ is the\n", "# negated MSE; flip the sign so the value is a real (positive) MSE that can be\n", "# compared with the test-set MSE printed in the next cell.\n", "best_MSE_score = -grid_rf.best_score_\n", "print('HP Regression: {:.3f}'.format(best_MSE_score))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4. 최적 모형" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random Forest Regression - Tuning: 13.342\n" ] } ], "source": [ "# Take the best estimator selected by the grid search\n", "top_model = grid_rf.best_estimator_\n", "\n", "# Predict on the held-out test split\n", "y_pred = top_model.predict(X_test)\n", "\n", "print('Random Forest Regression - Tuning: {:.3f}'.format(MSE(y_test, y_pred)))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" }, "vscode": { "interpreter": { "hash": "15d116a99bb54af1ff0aa2371fe08f9f0fc151477b85309a5584642f6865afea" } } }, "nbformat": 4, "nbformat_minor": 2 }