728x90
In [1]:
# Widen the notebook container to 90% of the browser window so wide tables fit.
# `display` comes from the public IPython.display module; importing it from
# IPython.core.display is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
In [126]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
In [127]:
# Load the wine dataset from a CSV file that sits next to the notebook.
df = pd.read_csv("./wine.csv")
In [128]:
# Show the loaded frame (pandas elides the middle rows of a long frame).
df
Out[128]:
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 0 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 0 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 0 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 1 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1594 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 0 |
1595 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 1 |
1596 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 1 |
1597 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 0 |
1598 | 6.0 | 0.310 | 0.47 | 3.6 | 0.067 | 18.0 | 42.0 | 0.99549 | 3.39 | 0.66 | 11.0 | 1 |
1599 rows × 12 columns
In [61]:
# Count missing values per column to confirm no imputation is needed.
df.isnull().sum()
Out[61]:
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
In [62]:
# (rows, columns) of the full dataset.
df.shape
Out[62]:
(1599, 12)
In [64]:
df.iloc[:,:-1] # X data: all rows, every column except the last one
Out[64]:
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1594 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 |
1595 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 |
1596 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 |
1597 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 |
1598 | 6.0 | 0.310 | 0.47 | 3.6 | 0.067 | 18.0 | 42.0 | 0.99549 | 3.39 | 0.66 | 11.0 |
1599 rows × 11 columns
In [65]:
df.iloc[:,-1:] # y data: the last column only, kept as a one-column DataFrame
Out[65]:
quality | |
---|---|
0 | 0 |
1 | 0 |
2 | 0 |
3 | 1 |
4 | 0 |
... | ... |
1594 | 0 |
1595 | 1 |
1596 | 1 |
1597 | 0 |
1598 | 1 |
1599 rows × 1 columns
In [66]:
# Features are every column but the last; the label is the last column, kept
# as a one-column DataFrame by the -1: slice. Both are independent copies of df.
df_x, df_y = df.iloc[:, :-1].copy(), df.iloc[:, -1:].copy()
In [86]:
# Split features and labels into train and test parts; the fixed random_state
# makes the shuffle repeatable across runs.
train_input, test_input, train_target, test_target = train_test_split(
    df_x,
    df_y,
    random_state=42,
)
In [87]:
# Print the (rows, columns) of the training split, then of the test split.
for split in (train_input, test_input):
    print(split.shape)
(1199, 11) (400, 11)
In [88]:
# Per-column statistics of the training split only.
# ddof=0 gives the population standard deviation that np.std uses by default
# (pandas' own default would be ddof=1).
mean = train_input.mean(axis=0)
std = train_input.std(axis=0, ddof=0)
In [89]:
# Per-feature means computed from the training split.
mean
Out[89]:
fixed acidity 8.316931 volatile acidity 0.531902 citric acid 0.272244 residual sugar 2.547623 chlorides 0.088464 free sulfur dioxide 15.864887 total sulfur dioxide 46.766055 density 0.996765 pH 3.311259 sulphates 0.660017 alcohol 10.420183 dtype: float64
In [90]:
# Per-feature standard deviations computed from the training split.
std
Out[90]:
fixed acidity 1.713657 volatile acidity 0.180142 citric acid 0.195369 residual sugar 1.411277 chlorides 0.049336 free sulfur dioxide 10.249824 total sulfur dioxide 33.012617 density 0.001842 pH 0.154192 sulphates 0.175158 alcohol 1.058656 dtype: float64
In [91]:
# Standardize each training feature to a z-score: subtract the column mean,
# then divide by the column standard deviation.
train_scaled = train_input.sub(mean, axis="columns").div(std, axis="columns")
In [92]:
# Flatten the one-column label frame to 1-D and check its length.
train_target.values.ravel().shape
Out[92]:
(1199,)
In [93]:
# k-nearest-neighbours classifier with its default settings.
kn = KNeighborsClassifier()
# The labels are passed as a flat 1-D array rather than a one-column frame.
kn.fit(train_scaled, train_target.to_numpy().ravel())
Out[93]:
KNeighborsClassifier()
In [94]:
# Scale the test features with the mean/std computed from the training split,
# so the test data goes through exactly the same transform as the training data.
test_scaled = test_input.sub(mean, axis="columns").div(std, axis="columns")
In [95]:
# Accuracy of the fitted classifier on the standardized test split.
kn.score(test_scaled,test_target)
Out[95]:
0.7025
In [96]:
# Sweep k from 3 to 15 and print the test accuracy for each value.
# n_neighbors is changed on the already-fitted model; no refit is done here.
# NOTE(review): k is being compared on the test split, so the score of the
# chosen k is optimistic; consider a validation split or cross-validation.
for n in range(3, 16):
    kn.n_neighbors = n
    score = kn.score(test_scaled, test_target)
    print(n, score)
3 0.7125 4 0.695 5 0.7025 6 0.6875 7 0.6875 8 0.715 9 0.715 10 0.725 11 0.715 12 0.7125 13 0.7175 14 0.7225 15 0.7125
In [97]:
# Keep k=10, the value with the highest accuracy (0.725) in the sweep output.
kn.n_neighbors = 10
In [110]:
import pickle

# Bundle the fitted classifier with the training mean/std it was scaled with,
# and write the bundle to ./ml.pkl in binary mode.
ml_data = {
    'model': kn,
    'mean': mean,
    'std': std,
}
with open("./ml.pkl", "wb") as f:
    pickle.dump(ml_data, f)
In [111]:
# Inspect the bundle that was written to disk.
ml_data
Out[111]:
{'model': KNeighborsClassifier(n_neighbors=10), 'mean': fixed acidity 8.316931 volatile acidity 0.531902 citric acid 0.272244 residual sugar 2.547623 chlorides 0.088464 free sulfur dioxide 15.864887 total sulfur dioxide 46.766055 density 0.996765 pH 3.311259 sulphates 0.660017 alcohol 10.420183 dtype: float64, 'std': fixed acidity 1.713657 volatile acidity 0.180142 citric acid 0.195369 residual sugar 1.411277 chlorides 0.049336 free sulfur dioxide 10.249824 total sulfur dioxide 33.012617 density 0.001842 pH 0.154192 sulphates 0.175158 alcohol 1.058656 dtype: float64}
In [108]:
test_scaled.iloc[10].values.reshape(1,-1) # reshape is used a lot: here it turns one row into a 2-D array with a single row
Out[108]:
array([[ 1.27392452e+00, -1.21579562e-01, 1.88236489e+00, -1.04602461e-01, 3.75717860e-01, -9.62444565e-01, -9.62239831e-01, 2.90765142e-01, -1.43496105e+00, -9.52314295e-05, 1.30336580e+00]])
In [109]:
# Predict the class of test row 10. iloc[[10]] keeps the row as a one-row
# DataFrame, so the column names the model was fitted with are preserved and
# scikit-learn does not warn about missing feature names.
kn.predict(test_scaled.iloc[[10]])
Out[109]:
array([1])
'두두의 IT > Machine Learning' 카테고리의 다른 글
[머신러닝3] K-최근접 이웃/표준점수(z-score) (0) | 2022.01.05 |
---|---|
[머신러닝2] 불확실성/샘플링편향/훈련세트/테스트세트 (0) | 2022.01.05 |
[머신러닝1] 개념/통계와의 차이/이진분류/K-최근접 이웃(K-Nearest Neighbors) (0) | 2022.01.05 |