# Load the iris dataset that ships with scikit-learn.
from sklearn.datasets import load_iris

# Keep the loaded dataset object around; later cells index into it by key.
iris_dataset = load_iris()

# Show which keys the dataset object exposes.
print(f"iris_dataset의 키 : {iris_dataset.keys()}")

iris_dataset의 키 : dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

# Print only the first 193 characters of the dataset description,
# followed by a "..." line to signal that the text was truncated.
print(iris_dataset['DESCR'][:193], "...", sep="\n")

Iris Plants Database
====================

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive att
...

# Names of the target classes.
print(f"타깃의 이름 : {iris_dataset['target_names']}")
# Names of the features.
print(f"특성의 이름 : {iris_dataset['feature_names']}")
# Python type of the object holding the feature values.
print(f"data의 타입 : {type(iris_dataset['data'])}")
# Shape of the feature array.
print(f"data의 크기 : {iris_dataset['data'].shape}")

타깃의 이름 : ['setosa' 'versicolor' 'virginica']
특성의 이름 : ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
data의 타입 : <class 'numpy.ndarray'>
data의 크기 : (150, 4)

# Print only the first five rows of the iris feature array.
print(f"data의 처음 다섯 행 : \n{iris_dataset['data'][:5]}")

data의 처음 다섯 행 : 
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

# Inspect the target array: its Python type, its shape, and its raw values.
print(f"target의 타입 : {type(iris_dataset['target'])}")
print(f"target의 크기 : {iris_dataset['target'].shape}")
print(f"타깃 : \n{iris_dataset['target']}")

target의 타입 : <class 'numpy.ndarray'>
target의 크기 : (150,)
타깃 : 
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

from sklearn.model_selection import train_test_split

# Split the data into train and test sets for the classification model.
#   x (independent variables): iris_dataset['data']
#   y (dependent variable):    iris_dataset['target']
# random_state=0 is passed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    random_state=0,
)

# Report the shapes of the training features and labels.
print(f"x_train 크기 : {x_train.shape}")
print(f"y_train 크기 : {y_train.shape}")
# Trailing bare expression: the notebook displays the first five training rows.
x_train[:5]

x_train 크기 : (112, 4)
y_train 크기 : (112,)

array([[5.9, 3. , 4.2, 1.5],
       [5.8, 2.6, 4. , 1.2],
       [6.8, 3. , 5.5, 2.1],
       [4.7, 3.2, 1.3, 0.2],
       [6.9, 3.1, 5.1, 2.3]])

# Report the shapes of the test features and labels.
print(f"x_test 크기 : {x_test.shape}")
print(f"y_test 크기 : {y_test.shape}")
# Trailing bare expression: the notebook displays the first five test rows.
x_test[:5]

x_test 크기 : (38, 4)
y_test 크기 : (38,)

array([[5.8, 2.8, 5.1, 2.4],
       [6. , 2.2, 4. , 1. ],
       [5.5, 4.2, 1.4, 0.2],
       [7.3, 2.9, 6.3, 1.8],
       [5. , 3.4, 1.5, 0.2]])

import pandas as pd
# NOTE: the next line is an IPython/Jupyter magic command, not plain Python;
# this cell only runs inside a notebook (or IPython) session.
%matplotlib inline
import matplotlib.pyplot as plt
import mglearn
# Build a DataFrame from the x_train data.
# Column names are the strings in iris_dataset.feature_names.
iris_dataframe = pd.DataFrame(x_train, columns=iris_dataset.feature_names)
# Draw a scatter-plot matrix of the features, with points colored by their
# class label (c=y_train) using the mglearn.cm3 colormap.
# hist_kwds={'bins':20} is forwarded to the histogram plots.
pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15,15), marker='o', hist_kwds={'bins':20}, s=60, alpha=.8, cmap=mglearn.cm3)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000018139CF2EB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BD4E2E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BD76978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BDA9048>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001813BDD16D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BDD1710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BE2A438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BE50AC8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001813BE82198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BEA7828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BED2EB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BF01588>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001813BF28C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BF5C2E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BF85978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001813BFB5048>]],
      dtype=object)

# Imports needed for the KNN classification model.
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Create the classifier with K (the number of neighbors) set to 5.
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the KNN model declared above on the training features and labels.
# Left as a trailing bare expression so the notebook displays the fitted
# estimator's repr.
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Declare an arbitrary new sample to classify: one row with four feature values.
x_new = np.array([[5, 2.9, 1, 0.2]])
print(f"X_new.shape : {x_new.shape}")

X_new.shape : (1, 4)

# Predict the class of the new sample with the fitted model.
prediction = knn.predict(x_new)
print(f"예측 : {prediction}")
# Index target_names with the predicted label(s) to get the class name(s).
print(f"예측한 타깃의 이름 : {iris_dataset['target_names'][prediction]}")

예측 : [0]
예측한 타깃의 이름 : ['setosa']

# Run the fitted model on the randomly selected test set.
y_pred = knn.predict(x_test)
print(f"테스트 셋의 대한 예측값 : \n{y_pred}")
# Shape of the prediction array.
print(y_pred.shape)

테스트 셋의 대한 예측값 : 
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
(38,)

# Test-set accuracy computed two ways, each printed with two decimals:
# 1) the fraction of predictions that equal the true labels,
print(f"테스트 셋의 정확도 : {np.mean(y_pred == y_test):.2f}")
# 2) the model's own score() on the same test data.
print(f"테스트 셋의 정확도 : {knn.score(x_test, y_test):.2f}")

테스트 셋의 정확도 : 0.97
테스트 셋의 정확도 : 0.97

RandomForest (0)	2019.06.23
SVC - Support Vector Classification (0)	2019.06.23
Regression, Tree (0)	2019.06.23
LinearRegression, DecisionTreeRegression (0)	2019.06.23
LinearRegression, Ridge, Lasso 알고리즘 (0)	2019.05.27

일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

김야키

붓꽃데이터를 이용한 KNN 분류모델 사용

'Python > 빅데이터분석' 카테고리의 다른 글

'Python/빅데이터분석'의 다른글

티스토리툴바

붓꽃데이터를 이용한 KNN 분류모델 사용

'Python > 빅데이터분석' 카테고리의 다른 글

'Python/빅데이터분석'의 다른글

관련글

티스토리툴바