{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Hyper Parameter 튜닝\n", "\n", "\n", "참고: [하이퍼파라미터 튜닝, emseoyk.log](https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D)\n", "\n", "## 환경설정" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from sklearn import preprocessing # preprocessing utilities (LabelEncoder)\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import accuracy_score, f1_score\n", "\n", "from sklearn.tree import DecisionTreeClassifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 데이터셋" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "cancer_df = pd.read_csv('data/breast_cancer.csv')\n", "\n", "# Select the target as a 1-D Series: LabelEncoder expects shape (n_samples,),\n", "# and a one-column DataFrame triggers sklearn's DataConversionWarning.\n", "y = cancer_df['diagnosis']\n", "X = cancer_df.loc[:, 'radius_mean':'fractal_dimension_worst']\n", "\n", "# Encode the diagnosis labels as integer class codes (returned as a 1-D array)\n", "le = preprocessing.LabelEncoder()\n", "y = le.fit_transform(y)\n", "\n", "# Stratified 80/20 split with a fixed seed so the notebook is reproducible\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 기계학습\n", "\n", "### 1. Hyper Parameters for Decision Tree" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
초기값
ccp_alpha0.0
class_weightNone
criteriongini
max_depthNone
max_featuresNone
max_leaf_nodesNone
min_impurity_decrease0.0
min_samples_leaf1
min_samples_split2
min_weight_fraction_leaf0.0
random_state777
splitterbest
\n", "
" ], "text/plain": [ " 초기값\n", "ccp_alpha 0.0\n", "class_weight None\n", "criterion gini\n", "max_depth None\n", "max_features None\n", "max_leaf_nodes None\n", "min_impurity_decrease 0.0\n", "min_samples_leaf 1\n", "min_samples_split 2\n", "min_weight_fraction_leaf 0.0\n", "random_state 777\n", "splitter best" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Seeded decision tree; tabulate its current parameter values\n", "clf_dt = DecisionTreeClassifier(random_state=777)\n", "dt_hp = clf_dt.get_params()\n", "\n", "pd.DataFrame.from_dict(dt_hp, orient='index', columns=['초기값'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. 탐색" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=777),\n", " n_jobs=-1,\n", " param_grid={'max_depth': [2, 3, 4, 5, 6],\n", " 'max_features': [0.1, 0.2, 0.4, 0.6, 0.8],\n", " 'min_samples_leaf': [0.02, 0.04, 0.06, 0.08, 0.1]},\n", " scoring='f1')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "# Candidate values per hyperparameter (5 x 5 x 5 combinations)\n", "params_dt = dict(\n", "    max_depth=[2, 3, 4, 5, 6],\n", "    min_samples_leaf=[0.02, 0.04, 0.06, 0.08, 0.10],\n", "    max_features=[0.1, 0.2, 0.4, 0.6, 0.8],\n", ")\n", "\n", "# 10-fold cross-validated grid search scored by F1, run with n_jobs=-1\n", "grid_dt = GridSearchCV(\n", "    estimator=clf_dt,\n", "    param_grid=params_dt,\n", "    scoring='f1',\n", "    cv=10,\n", "    n_jobs=-1,\n", ")\n", "\n", "grid_dt.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. 
Hyper Parameters" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hyperparameters:\n", " {'max_depth': 4, 'max_features': 0.8, 'min_samples_leaf': 0.02}\n", "HP Classifier: 0.925\n" ] } ], "source": [ "## Best hyperparameter combination found by the grid search\n", "best_hyperparams = grid_dt.best_params_\n", "print('Hyperparameters:\\n', best_hyperparams)\n", "## Cross-validated score (scoring='f1') of that combination\n", "best_CV_score = grid_dt.best_score_\n", "print('HP Classifier: {:.3f}'.format(best_CV_score))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4. 최적 모형" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Extract the best model found by the grid search\n", "top_model = grid_dt.best_estimator_\n", "\n", "# Predict on the held-out test split (not the training data)\n", "y_pred = top_model.predict(X_test)\n", "\n", "# Use the default binary F1 so the number is comparable to the search's scoring='f1';\n", "# average='micro' on a binary target is identical to accuracy.\n", "print('Test F1: {:.3f}'.format(f1_score(y_test, y_pred)))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" }, "vscode": { "interpreter": { "hash": "15d116a99bb54af1ff0aa2371fe08f9f0fc151477b85309a5584642f6865afea" } } }, "nbformat": 4, "nbformat_minor": 2 }