# 의사결정나무 모형

## 환경설정

In [18]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score

## 데이터셋

In [6]:
# Load the breast-cancer table; the path is relative to the working directory.
cancer_df = pd.read_csv('data/breast_cancer.csv')

# Tip: run list(cancer_df.columns) to inspect the available column names.

# Target: select 'diagnosis' with single brackets so y is a 1-D Series rather
# than a one-column DataFrame. scikit-learn documents a (n_samples,) target
# for single-output classification; a 2-D column vector only works with some
# estimators and triggers a column-vector warning in others.
y = cancer_df['diagnosis']

# Features: every column from 'radius_mean' through 'fractal_dimension_worst'
# (label-based .loc slices include both endpoints).
X = cancer_df.loc[:, 'radius_mean':'fractal_dimension_worst']

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

## 기계학습 - CV
- `n_jobs = -1`: 모든 가용 CPU 코어 사용
![K-겹 교차검증(K-fold cross-validation) 개념도](https://upload.wikimedia.org/wikipedia/commons/b/b5/K-fold_cross_validation_EN.svg)

In [21]:
# Hyperparameters for a deliberately small tree. A float min_samples_leaf is
# read by scikit-learn as a fraction of the training samples (see the
# DecisionTreeClassifier documentation), not as an absolute count.
tree_params = {
    'max_depth': 2,
    'min_samples_split': 5,
    'min_samples_leaf': 0.1,
    'random_state': 777,
}
clf = DecisionTreeClassifier(**tree_params)

# 10-fold cross-validation on the training split only, scored with
# micro-averaged F1. n_jobs=-1 uses every available CPU core.
F1_CV = cross_val_score(
    clf,
    X_train,
    y_train,
    scoring='f1_micro',
    cv=10,
    n_jobs=-1,
)

## 예측 성능

In [14]:
# Fit the tree on the full training split, then predict labels for both the
# training and the held-out test split so their scores can be compared.
clf.fit(X_train, y_train)
y_predict_train, y_predict_test = clf.predict(X_train), clf.predict(X_test)

In [20]:
# Compute the three micro-averaged F1 figures first, then report them:
# the mean over the cross-validation folds, the training split, and the
# held-out test split.
cv_f1 = F1_CV.mean()
train_f1 = f1_score(y_train, y_predict_train, average='micro')
test_f1 = f1_score(y_test, y_predict_test, average='micro')

print('CV F1: {:.2f}'.format(cv_f1))
print('Train F1: {:.3f}'.format(train_f1))
print('Test F1:  {:.3f}'.format(test_f1))

CV F1: 0.89
Train F1: 0.919
Test F1:  0.921
