# 3.6 Multilabel classification
'''
A single sample can carry several label variables at the same time.
For example, a wafer bin map may show both a scratch and an edge pattern.
'''
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]
# Use a KNN classifier, which supports multilabel targets
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
knn_clf.predict([some_digit])
# >>> array([[False, True]])
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average='macro')
# >>> 0.976410265560605
'''
How the F1 score is computed for multiclass/multilabel tasks:
* The original F1 score is defined for binary classification.
* For multiclass/multilabel problems, a per-class F1 score is computed first,
  and the averaged F1 score is then obtained in one of the following ways:
  * macro : unweighted arithmetic mean of the per-class scores
  * weighted : like macro, but a weighted mean where each class's weight is proportional
    to its share of true labels (its "support")
* From the sklearn f1_score docstring (the `average` parameter):
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \
default='binary'
This parameter is required for multiclass/multilabel targets.
If ``None``, the scores for each class are returned. Otherwise, this
determines the type of averaging performed on the data:
``'binary'``:
Only report results for the class specified by ``pos_label``.
This is applicable only if targets (``y_{true,pred}``) are binary.
``'micro'``:
Calculate metrics globally by counting the total true positives,
false negatives and false positives.
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
``'weighted'``:
Calculate metrics for each label, and find their average weighted
by support (the number of true instances for each label). This
alters 'macro' to account for label imbalance; it can result in an
F-score that is not between precision and recall.
``'samples'``:
Calculate metrics for each instance, and find their average (only
meaningful for multilabel classification where this differs from
:func:`accuracy_score`).
'''
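# A minimal sketch of my own (not from the book) relating the averaged scores to the
# per-label F1 scores; y_multilabel and y_train_knn_pred come from the code above.
per_label_f1 = f1_score(y_multilabel, y_train_knn_pred, average=None)  # one F1 per label column
support = y_multilabel.sum(axis=0)                                     # number of true instances per label
print(per_label_f1.mean())                        # macro: unweighted mean
print(np.average(per_label_f1, weights=support))  # weighted: support-weighted mean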
# 3.7 Multioutput classification
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test
# original / noise-added image (quick look with the pyplot state machine)
plt.subplot(2, 1, 1)
plt.imshow(X_train_mod[0].reshape(28, 28), cmap='binary')
plt.subplot(2, 1, 2)
plt.imshow(X_train[0].reshape(28, 28), cmap='binary')
plt.show()
fig = plt.figure()
ax1 = fig.add_subplot(1, 2, 1)
ax1.imshow(X_train_mod[0].reshape(28,28), cmap='binary')
# ax1.axis("off")
ax2 = fig.add_subplot(1, 2, 2)
ax2.imshow(X_train[0].reshape(28,28), cmap='binary')
# ax2.axis("off")
plt.show()
# Train a classifier that cleans the noise out of an image
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[0]])
fig = plt.figure()
ax1 = fig.add_subplot(1, 3, 1)
ax1.imshow(X_test[0].reshape(28,28), cmap='binary')
ax2 = fig.add_subplot(1, 3, 2)
ax2.imshow(X_test_mod[0].reshape(28,28), cmap='binary')
ax3 = fig.add_subplot(1, 3, 3)
ax3.imshow(clean_digit.reshape(28,28), cmap='binary')
plt.show()
import os
import tarfile
import urllib.request
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
if not os.path.isdir(housing_path):
os.makedirs(housing_path)
tgz_path = os.path.join(housing_path, "housing.tgz")
urllib.request.urlretrieve(housing_url, tgz_path)
housing_tgz = tarfile.open(tgz_path)
housing_tgz.extractall(path=housing_path)
housing_tgz.close()
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
csv_path = os.path.join(housing_path, "housing.csv")
return pd.read_csv(csv_path)
housing = load_housing_data()
housing.head()
housing.info()
housing['ocean_proximity'].value_counts()
housing.describe()
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))
plt.show()
# 2.3.4 Create a test set
'''
The usual way to split train/test data: shuffle the data (or generate random indices)
and split it according to a predefined ratio.
'''
import numpy as np
def split_train_test(data, test_ratio):
shuffled_indices = np.random.permutation(len(data))
test_set_size = int(len(data) * test_ratio)
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]
return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(housing, 0.2)
'''
Problems with the approach above:
1) Re-running the program generates different random numbers.
2) When new data arrives, the random numbers / shuffle change as well.
-> The split is not reproducible.
-> Instead, assume each instance has a unique, immutable identifier and use a hash of
   that identifier to decide whether it goes into the test set.
-> Because old and new instances all keep their unique identifier, the train/test split
   stays stable (a small stability check follows the id-based splits below).
'''
from zlib import crc32
# Decide test-set membership from the CRC32 hash of a given identifier
def test_set_check(identifier, test_ratio):
return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2 ** 32
def split_train_test_by_id(data, test_ratio, id_column):
ids = data[id_column]
in_test_set = ids.apply(lambda ids: test_set_check(ids, test_ratio))
return data.iloc[~in_test_set.values], data.iloc[in_test_set.values]
# Use the pandas DataFrame row index as the identifier
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
# Alternatively, build an id from longitude & latitude in the dataset
housing_with_id['id'] = housing_with_id['longitude'] * 1800 + housing_with_id['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
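# A small check of my own (not from the book): test-set membership depends only on the
# hash of each row's id, so existing rows keep the same assignment even if more data is
# appended later.
in_test_full = housing_with_id['id'].apply(lambda id_: test_set_check(id_, 0.2))
in_test_head = housing_with_id['id'].iloc[:100].apply(lambda id_: test_set_check(id_, 0.2))
print((in_test_full.iloc[:100] == in_test_head).all())  # True: the same rows end up in the test set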
'''
The same split using scikit-learn's built-in function
'''
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
'''
Convert the numeric median_income into a categorical income_cat attribute
'''
housing['income_cat'] = pd.cut(housing['median_income'],
bins=[0, 0.5, 3.0, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])
'''
Stratified sampling based on income_cat
'''
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
start_train_set = housing.loc[train_index]
start_test_set = housing.loc[test_index]
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
compare_income_cat_ratio = pd.DataFrame(
{
'origin': housing['income_cat'].value_counts() / len(housing),
'stratified': start_test_set['income_cat'].value_counts() / len(start_test_set),
'random': test_set['income_cat'].value_counts() / len(test_set)
},
columns=['origin', 'stratified', 'random']
)
print(compare_income_cat_ratio)
'''
Remove income_cat to put the data back in its original state
'''
for set_ in (start_train_set, start_test_set):
    set_.drop('income_cat', axis=1, inplace=True)
'''
2.4 Explore and visualize the data to gain insights
'''
housing = start_train_set.copy()
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.8)
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.8,
s=housing['population'] / 100, label='population', figsize=(10, 7),
c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True,
sharex=False)
plt.legend()
'''
2.5 Prepare the data for machine learning algorithms
Write the transformations as functions so they can be applied easily to any dataset:
* you gradually build a library of transformations you can reuse in future projects,
* you can use them in a live system to transform new data before feeding it to the algorithm,
* and you can easily try different transformations and see which combination works best.
'''
# Start again from a clean copy of the stratified training set and separate predictors and labels
housing = start_train_set.drop('median_house_value', axis=1)
housing_labels = start_train_set['median_house_value'].copy()
# 2.5.1 Data cleaning
housing.dropna(subset=['total_bedrooms'])      # option 1: drop rows with missing values
housing.drop('total_bedrooms', axis=1)         # option 2: drop the whole attribute
median = housing['total_bedrooms'].median()    # option 3: fill missing values with the median
housing['total_bedrooms'].fillna(median, inplace=True)
housing.describe()
# sklearn SimpleImputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)
imputer.statistics_
imputer.fill_value
housing_num.median().values
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
index=housing_num.index)
'''
The scikit-learn design philosophy
Main design principles
* Consistency: all objects share a consistent and simple interface.
  - Estimators: objects that estimate a set of model parameters from a dataset.
    The estimation itself is done by the fit() method, which takes the dataset as its
    parameter (two parameters, X and y, for supervised learning).
    Every other parameter needed for the estimation is treated as a hyperparameter and
    stored as an instance variable.
  - Transformers: estimators that can also transform a dataset. The transformation is done
    by the transform() method, which takes the dataset as a parameter and returns the
    transformed dataset; fit_transform() runs fit() and transform() in one step.
'''
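# A small illustration of my own (not from the book) of the API consistency described
# above: SimpleImputer is both an estimator (fit) and a transformer (transform), and
# fit_transform() is the shortcut for doing both in one step.
imputer_demo = SimpleImputer(strategy='median')
out_a = imputer_demo.fit(housing_num).transform(housing_num)  # estimator, then transformer
out_b = imputer_demo.fit_transform(housing_num)               # equivalent shortcut
print(np.allclose(out_a, out_b))
print(imputer_demo.statistics_[:3], imputer_demo.strategy)    # learned params end in "_", hyperparameters don't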
# 2.5.2 Handling text and categorical attributes
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)
# Use OrdinalEncoder to encode categories as numbers
# (only appropriate when the categories actually have an order)
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
ordinal_encoder.categories_
# one-hot encoding for categorical data
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot # scipy sparse matrix
housing_cat_1hot.toarray().shape
cat_encoder.categories_
# 2.5.3 Custom transformers
'''
Duck typing: an object's type is determined by the attributes and methods it provides,
not by inheritance or by implementing an interface.
Useful for custom cleanup steps or for combining attributes into new features.
'''
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedroom_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
def __init__(self, add_bedrooms_per_room=True):
self.add_bedrooms_per_room = add_bedrooms_per_room
def fit(self, X, y=None):
return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedroom_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
attr_adder = CombinedAttributeAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.values)
# 2.5.4 Feature scaling
# 2.5.5 Transformation pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('attribs_adder', CombinedAttributeAdder(add_bedrooms_per_room=True)),
('std_scaler', StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)
'''
Pipeline takes a list of (name, estimator) pairs describing a sequence of steps.
The last step can be a transformer or any other kind of estimator;
all earlier steps must be transformers.
The names can be anything (as long as they contain no double underscore).
Calling the pipeline's fit() method calls fit_transform() on every transformer in order,
passing the output of each step as the input of the next; on the last step it calls fit() only.
The pipeline object exposes the same methods as its last estimator. Here the last estimator
is the transformer StandardScaler(), so the pipeline has a transform() method that applies
all transformations to the data in order.
'''
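# A quick check of my own (not from the book): the fitted pipeline exposes the same
# transform() as its last step, and intermediate steps can be inspected via named_steps.
print(num_pipeline.transform(housing_num)[:1])              # runs all three steps in order
print(num_pipeline.named_steps['imputer'].statistics_[:3])  # medians learned by the imputer
print(num_pipeline.named_steps['std_scaler'].mean_[:3])     # means learned by the scaler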
'''
ColumnTransformer applies a different transformer to each group of columns and combines
the results, so all columns can be handled in a single transformer:
ColumnTransformer([
    (name, transformer, list of column names the transformer applies to)
])
'''
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = list(housing_cat)
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
('cat', OneHotEncoder(), cat_attribs)
])
housing_prepared = full_pipeline.fit_transform(housing)
# 2.6 Select and train a model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_label = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("예측 : ", lin_reg.predict(some_data_prepared))
print("레이블 : ", list(some_label))
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
# 2.6.2 Cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
print('score : ', scores)
    print('mean : ', scores.mean())
print('std : ', scores.std())
display_scores(tree_rmse_scores)
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
# RandomForest
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
# 2.7 Model tuning
# 2.7.1 Grid search
from sklearn.model_selection import GridSearchCV
param_grid = [
    # GridSearchCV explores each dict in this list as a separate grid:
    # the first grid is searched with bootstrap left at its default (True),
    # the second grid with bootstrap=False.
    {'n_estimators': [3],
     'max_features': [2, 4]},
    {'bootstrap': [False],
     'n_estimators': [3, 10, 30],
     'max_features': [4, 8]}
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
# Best hyperparameter combination found by the search
grid_search.best_params_
# Inspect the evaluation score of each parameter combination
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
# 2.7.2 Randomized search
# 2.7.3 Ensemble methods
# 2.7.4 Analyze the best models and their errors
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
pd.DataFrame(sorted(zip(feature_importances, attributes), reverse=True))
##TODO : 3.1 MNIST
# The "hello world" of machine learning.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()
# mnist is a dict-like object; pull out the feature matrix X and the target vector y
X, y = mnist['data'], mnist['target']
print(X.shape)
print(y.shape)
# Plot one digit image
ind_image = 0
some_digit = X[ind_image]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap='binary')
plt.axis('off')
plt.show()
print(y[ind_image])
y = y.astype(np.uint8)
##TODO : 3.2 Training a binary classifier
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
# stochastic gradient descent
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([some_digit])
##TODO : 3.3 Performance measures
# Implementing cross-validation by hand
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
# StratifiedKFold builds the folds so that the class ratio is preserved:
# ratio(y==True) ≈ ratio(y_train_fold==True) ≈ ratio(y_test_fold==True)
# (the ratio check after the counting loop below confirms this)
for train_index, test_index in skfolds.split(X_train, y_train_5):
print(np.sum(y_train_5[train_index]), np.sum(y_train_5[test_index]))
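# A small check of my own (not from the book): the positive-class ratio is (almost)
# the same in the full training set and in every train/test fold.
for train_index, test_index in skfolds.split(X_train, y_train_5):
    print(y_train_5.mean(), y_train_5[train_index].mean(), y_train_5[test_index].mean())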
for train_index, test_index in skfolds.split(X_train, y_train_5):
clone_clf = clone(sgd_clf)
X_train_folds = X_train[train_index]
y_train_folds = y_train_5[train_index]
X_test_folds = X_train[test_index]
y_test_folds = y_train_5[test_index]
clone_clf.fit(X_train_folds, y_train_folds)
y_pred = clone_clf.predict(X_test_folds)
n_correct = sum(y_test_folds == y_pred)
print(n_correct/len(y_test_folds))
# Build a dummy classifier that always predicts "not 5"
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
def fit(self, X, y=None):
return self
def predict(self, X):
return np.zeros((len(X), 1),dtype=bool)
##TODO : check book code
# 3.3.2 confusion matrix
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
'''
cross_val_predict performs k-fold cross-validation like cross_val_score, but instead of
returning the evaluation scores it returns the predictions made on each validation fold.
In other words, every sample's prediction comes from a model that never saw that sample
during training (a "clean", out-of-fold prediction).
'''
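# A quick check of my own (not from the book): cross_val_predict returns exactly one
# out-of-fold prediction per training sample.
print(y_train_pred.shape, y_train_5.shape)  # one prediction per sample
print((y_train_pred == y_train_5).mean())   # overall out-of-fold accuracy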
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)
'''
Confusion matrix layout (rows = actual class, columns = predicted class):
                | Pred negative  | Pred positive
Actual negative | True Negative  | False Positive
Actual positive | False Negative | True Positive
'''
# 3.3.3 Precision and Recall
'''
Recall : of the actual positives, the fraction correctly predicted positive
Precision : of the predicted positives, the fraction that are actually positive
'''
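# A minimal sketch of my own, computing the same metrics straight from the confusion
# matrix entries: precision = TP / (TP + FP), recall = TP / (TP + FN).
tn, fp, fn, tp = confusion_matrix(y_train_5, y_train_pred).ravel()
print(tp / (tp + fp))  # should match precision_score(y_train_5, y_train_pred)
print(tp / (tp + fn))  # should match recall_score(y_train_5, y_train_pred)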
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_5, y_train_pred)
recall_score(y_train_5, y_train_pred)
'''
F1 score = harmonic mean of precision and recall
         = 2 * precision * recall / (precision + recall)
'''
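# A small check of my own: F1 as the harmonic mean computed by hand, which should match
# f1_score(y_train_5, y_train_pred) below.
p = precision_score(y_train_5, y_train_pred)
r = recall_score(y_train_5, y_train_pred)
print(2 * p * r / (p + r))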
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
## 3.3.4 Precision/recall trade-off
## y_scores = decision score of the first sample (some_digit)
y_scores = sgd_clf.decision_function([some_digit])
y_scores
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
method = 'decision_function')
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_train_5, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], 'b--', label='precision')
    plt.plot(thresholds, recalls[:-1], 'g--', label='recall')
    plt.legend()
plot_precision_recall_vs_threshold(precision, recall, thresholds)
plt.show()
'''
Precision vs recall (PR) curve
'''
def plot_precision_vs_recall(precisions, recalls):
plt.plot(recalls[:-1], precisions[:-1], 'b--')
plot_precision_vs_recall(precision, recall)
plt.show()
# Find the lowest threshold that gives at least 90% precision (and thus the best recall at that precision)
threshold_90_precision = thresholds[np.argmax(precision >= 0.9)]
print(threshold_90_precision)
'''
The ROC curve looks similar to the precision/recall curve.
Rule of thumb: prefer the PR curve when the positive class is rare or when false positives
matter more than false negatives; otherwise use the ROC curve.
'''
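# A single-number summary of the ROC curve plotted below is the area under the curve (AUC);
# roc_auc_score computes it directly from the scores.
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_train_5, y_scores))  # 1.0 = perfect classifier, 0.5 = purely random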
# ROC curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
plt.plot(fpr, tpr, linewidth=2, label=label)
plt.plot([0,1], [0,1], 'k--')
plot_roc_curve(fpr, tpr)
plt.show()
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probs_forest = cross_val_predict(forest_clf, X_train, y_train_5,
cv=3, method='predict_proba')
y_scores_forest = y_probs_forest[:,1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "random forest")
plt.legend(loc='lower right')
plt.show()
# 3.4 Multiclass classification
'''
Solving a multiclass task with binary classifiers:
1) OvR (One-versus-the-rest): train one binary model per class (m models for m classes)
   and predict the class whose model gives the highest score.
2) OvO (One-versus-one): train one binary model for every pair of classes
   (m * (m - 1) / 2 models, i.e. 45 for MNIST's 10 digits).
   -> less efficient overall, but each individual task is smaller and easier,
   -> so it suits algorithms such as SVMs that prefer many classifiers trained on
      small training sets over one classifier trained on a huge one.
'''
# Let scikit-learn pick the strategy automatically (SVC trains OvO under the hood,
# but decision_function still returns one score per class)
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
svm_clf.predict([some_digit])
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores
np.argmax(some_digit_scores)
svm_clf.classes_
# Force OvO explicitly
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SVC())
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
len(ovo_clf.estimators_)
# Force OvR explicitly
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf.fit(X_train, y_train)
ovr_clf.predict([some_digit])
len(ovr_clf.estimators_)
# Evaluate the SGDClassifier's accuracy with cross-validation
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
# Scale the inputs with StandardScaler to improve performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
# 3.5 Error analysis
'''
Look for ways to improve the model by analyzing the kinds of errors it makes.
'''
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
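# A typical next step in error analysis (as in the book) is to look at the confusion
# matrix as an image; bright off-diagonal cells mark the digit pairs the model confuses most.
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()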