# 의사결정나무 모형 - 연비 예측

## 환경설정

In [2]:
import pandas as pd
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE

## 데이터셋

In [3]:
# 2. 데이터셋
mpg_df = pd.read_csv('data/auto-mpg.csv', index_col='car name')
mpg_df = mpg_df[mpg_df.horsepower != '?']

# 3. 훈련/시험 데이터셋
y = mpg_df[['mpg']]
X = mpg_df.loc[:, 'cylinders':'origin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 777)

## 기계학습 - CV
- `n_jobs = -1`: 모든 가용 CPU 코어 사용
![](https://upload.wikimedia.org/wikipedia/commons/b/b5/K-fold_cross_validation_EN.svg)

In [6]:
clf = DecisionTreeRegressor(max_depth = 2, 
                            min_samples_split = 5,
                            min_samples_leaf = 0.1,
                            random_state = 777)

MSE_CV = cross_val_score(clf, X_train, y_train, cv= 10, scoring = 'neg_mean_squared_error', n_jobs = -1)

## 예측 성능

In [8]:
clf.fit(X_train, y_train)
y_predict_train = clf.predict(X_train)
y_predict_test = clf.predict(X_test)

In [10]:
print('CV MSE: {:.2f}'.format(MSE_CV.mean()))
print('Train MSE: {:.3f}'.format(MSE(y_train, y_predict_train)))
print('Test MSE:  {:.3f}'.format(MSE(y_test, y_predict_test)))

CV MSE: -20.51
Train MSE: 15.546
Test MSE:  19.739
