{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hyper Parameter 튜닝 - Random Forest\n",
    "\n",
    "\n",
    "참고: [하이퍼파라미터 튜닝, emseoyk.log](https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D)\n",
    "\n",
    "## 환경설정"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn import preprocessing # 전처리\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import accuracy_score, f1_score\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 데이터셋"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\statkclee\\anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_label.py:115: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    }
   ],
   "source": [
    "cancer_df = pd.read_csv('data/breast_cancer.csv')\n",
    "\n",
    "# list(cancer_df.columns)\n",
    "y = cancer_df[['diagnosis']]\n",
    "X = cancer_df.loc[:, 'radius_mean':'fractal_dimension_worst']\n",
    "\n",
    "le = preprocessing.LabelEncoder()\n",
    "y = le.fit_transform(y)\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)\n",
    "y_train = np.ravel(y_train, order='C') # KNN : A column-vector y was passed when a 1d array was expected"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 기계학습\n",
    "\n",
    "### 1. Hyper Parameters for Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>초기값</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>bootstrap</th>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ccp_alpha</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>class_weight</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>criterion</th>\n",
       "      <td>gini</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max_depth</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max_features</th>\n",
       "      <td>auto</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max_leaf_nodes</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max_samples</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_impurity_decrease</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_samples_leaf</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_samples_split</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min_weight_fraction_leaf</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>n_estimators</th>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>n_jobs</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>oob_score</th>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>random_state</th>\n",
       "      <td>777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>verbose</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>warm_start</th>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            초기값\n",
       "bootstrap                  True\n",
       "ccp_alpha                   0.0\n",
       "class_weight               None\n",
       "criterion                  gini\n",
       "max_depth                  None\n",
       "max_features               auto\n",
       "max_leaf_nodes             None\n",
       "max_samples                None\n",
       "min_impurity_decrease       0.0\n",
       "min_samples_leaf              1\n",
       "min_samples_split             2\n",
       "min_weight_fraction_leaf    0.0\n",
       "n_estimators                100\n",
       "n_jobs                     None\n",
       "oob_score                 False\n",
       "random_state                777\n",
       "verbose                       0\n",
       "warm_start                False"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_rf = RandomForestClassifier(random_state = 777)\n",
    "\n",
    "rf_hp = clf_rf.get_params()\n",
    "\n",
    "pd.DataFrame.from_dict(rf_hp, orient='index', columns = ['초기값'] )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  2. 탐색"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=777),\n",
       "             n_jobs=-1,\n",
       "             param_grid={'max_depth': [2, 3, 4, 6, 8],\n",
       "                         'max_features': ['log2', 'sqrt'],\n",
       "                         'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],\n",
       "                         'n_estimators': [100, 200, 300, 400, 500]},\n",
       "             scoring='f1')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "params_rf = {'n_estimators': [100, 200, 300, 400, 500],\n",
    "             'max_depth'   : [2, 3, 4, 6, 8],\n",
    "             'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],\n",
    "             'max_features': ['log2', 'sqrt']\n",
    "}\n",
    "\n",
    "grid_rf = GridSearchCV(estimator  = clf_rf,\n",
    "                       param_grid = params_rf,\n",
    "                       scoring    = 'f1',\n",
    "                       cv         = 10,\n",
    "                       n_jobs     = -1)\n",
    "\n",
    "\n",
    "grid_rf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Hyper Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hyerparameters:\n",
      " {'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 100}\n",
      "HP Classifier: 0.904\n"
     ]
    }
   ],
   "source": [
    "## 초모수\n",
    "best_hyperparams = grid_rf.best_params_\n",
    "print('Hyerparameters:\\n', best_hyperparams)\n",
    "## 성능\n",
    "best_CV_score = grid_rf.best_score_\n",
    "print('HP Classifier: {:.3f}'.format(best_CV_score))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  4. 최적 모형"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train F1: 0.947\n"
     ]
    }
   ],
   "source": [
    "# 최적모형 추출\n",
    "top_model = grid_rf.best_estimator_\n",
    "\n",
    "y_pred = top_model.predict(X_test)\n",
    "\n",
    "print('Train F1: {:.3f}'.format(f1_score(y_test, y_pred, average = 'micro')))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "15d116a99bb54af1ff0aa2371fe08f9f0fc151477b85309a5584642f6865afea"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}