- Dataset 1: make_forge from mglearn
- Dataset 2: load_breast_cancer from sklearn.datasets (breast cancer dataset)
- Function: plot_feature_importances_cancer (plots how important each feature was for the model's predictions)
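A quick way to get oriented with the breast cancer dataset before modeling (a minimal sketch, not part of the original notebook) is to check its shape and class balance:

from sklearn.datasets import load_breast_cancer
import numpy as np
cancer = load_breast_cancer()
print(cancer.data.shape)           # (569, 30): 569 samples, 30 features
print(cancer.target_names)         # ['malignant' 'benign']
print(np.bincount(cancer.target))  # class counts: [212 357]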
In [1]:
#-*- coding:utf-8 -*-
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import mglearn
x, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(x, y)
    mglearn.plots.plot_2d_separator(clf, x, fill=False, eps=0.5, ax=ax, alpha=.7)
    mglearn.discrete_scatter(x[:, 0], x[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()
Out[1]: [figure: decision boundaries of LinearSVC (left) and LogisticRegression (right) on the forge dataset]
In [2]:
mglearn.plots.plot_linear_svc_regularization()
# Larger C means weaker regularization: the decision boundary can get more complex
# and training accuracy rises, but the model may overfit (and training takes longer)
# C is usually searched on a logarithmic scale, e.g. powers of 10, rather than being
# restricted to those values
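To make the effect of C concrete, here is a minimal sketch (assumed setup, not from the original notebook) that sweeps C on a log scale for LinearSVC on the forge data and reports training accuracy:

from sklearn.svm import LinearSVC
import mglearn
x, y = mglearn.datasets.make_forge()
for C in [0.01, 1, 100]:
    svc = LinearSVC(C=C, max_iter=10000).fit(x, y)  # max_iter raised to ensure convergence
    print("C={:>6}: training accuracy {:.3f}".format(C, svc.score(x, y)))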
In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
# Model with the default C=1 (10^0)
logreg = LogisticRegression().fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(x_test, y_test)))
In [4]:
# Model with C=100 (10^2)
logreg100 = LogisticRegression(C=100).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg100.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg100.score(x_test, y_test)))
In [5]:
# Model with C=0.01 (10^-2)
logreg001 = LogisticRegression(C=0.01).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg001.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg001.score(x_test, y_test)))
In [6]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
# If max_depth (the maximum depth of the tree) is not set, nodes keep being
# created until every leaf is pure
tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)
print("Training set score: {:.3f}".format(tree.score(x_train, y_train)))
print("Test set score: {:.3f}".format(tree.score(x_test, y_test)))
# The training set score is 100%, i.e. the tree is overfitting
In [7]:
tree4 = DecisionTreeClassifier(max_depth=4, random_state=0)
tree4.fit(x_train, y_train)
print("훈련 세트 점수 : {:.3f}".format(tree4.score(x_train, y_train)))
print("테스트 세트 점수 : {:.3f}".format(tree4.score(x_test, y_test)))
In [8]:
from sklearn.tree import export_graphviz
# class 0 is malignant, class 1 is benign in this dataset
export_graphviz(tree, out_file="tree.txt", class_names=["malignant", "benign"],
                feature_names=cancer.feature_names,
                impurity=False, filled=True)
In [9]:
import graphviz
# read back the exported dot source (graphviz.Source(text) would render it inline;
# here it is rendered via pydot below)
with open('tree.txt', 'r', encoding='utf8') as file:
    text = file.read()
In [10]:
import pydotplus
pydotplus.find_graphviz()  # locate the Graphviz executables installed on this system
In [11]:
import pydot
# parse the exported dot file into a pydot graph object
(graph,) = pydot.graph_from_dot_file('tree.txt', encoding='UTF-8')
In [12]:
from IPython.display import Image
graph.write_png("tree.png")   # save the rendered tree to disk
Image(graph.create_png())     # and display it inline
Out[12]: [figure: the fitted decision tree rendered as a PNG]
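The pydot/Graphviz round trip above requires the Graphviz binaries to be installed. On scikit-learn 0.21 or newer, sklearn.tree.plot_tree draws the same tree with matplotlib alone (a sketch, assuming that version is available):

from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(tree, class_names=["malignant", "benign"],
          feature_names=list(cancer.feature_names),
          impurity=False, filled=True)
plt.show()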
In [13]:
print("특성 중요도:\n{}".format(tree.feature_importances_))
In [14]:
import numpy as np
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
In [15]:
plot_feature_importances_cancer(tree)
In [16]:
# a feature can be highly important even when its relation to the class is not monotone
tree = mglearn.plots.plot_tree_not_monotone()
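The plot above illustrates that a high importance only says a feature was used, not in which direction it pushes the prediction. A standalone sketch of the same point (assumed, not from the original post): the class flips along a single feature, so the relation is non-monotone, yet the tree assigns that feature full importance:

import numpy as np
X = np.linspace(-3, 3, 200).reshape(-1, 1)
y_band = (np.abs(X.ravel()) < 1).astype(int)   # class 1 only in the middle band
tree_nm = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y_band)
print(tree_nm.feature_importances_)            # [1.] -- important, but direction-free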