Hands-On Machine Learning, Chapter 2

# 3.6 Multilabel Classification
'''
A case where a single sample carries several label variables at once.
For example, a wafer bin map can show both a scratch and an edge pattern.
'''

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

y_train_large  = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

# use a KNN classifier, which supports multilabel targets
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

knn_clf.predict([some_digit])
>>> array([[False,  True]])

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average='macro')
>>> 0.976410265560605

'''
How the F1 score is computed for multiclass / multilabel tasks:
 * the original F1 score is defined for binary classification.
 * for multiclass / multilabel problems, the binary F1 score is computed per class first
   (the "per-class F1 score") and then combined into a single averaged score.
 * macro    : plain arithmetic mean of the per-class scores.
 * weighted : like macro, but each class is weighted by its proportion of true instances
              (its "support").

From the sklearn f1_score docstring:
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \
            default='binary'
        This parameter is required for multiclass/multilabel targets.
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).
'''
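# Quick check of 'macro' averaging (illustrative, assumes y_multilabel and
# y_train_knn_pred from above): average=None returns one F1 score per label,
# and their unweighted mean should match average='macro'.
per_label_f1 = f1_score(y_multilabel, y_train_knn_pred, average=None)
print(per_label_f1, per_label_f1.mean())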


# 3.7 Multioutput Classification

noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test

# original / noise-added image

plt.subplot(1, 2, 1)
plt.imshow(X_train_mod[0].reshape(28, 28), cmap='binary')
plt.subplot(1, 2, 2)
plt.imshow(X_train[0].reshape(28, 28), cmap='binary')
plt.show()


fig = plt.figure()

ax1 = fig.add_subplot(1, 2, 1)
ax1.imshow(X_train_mod[0].reshape(28,28), cmap='binary')
# ax1.axis("off")

ax2 = fig.add_subplot(1, 2, 2)
ax2.imshow(X_train[0].reshape(28,28), cmap='binary')
# ax2.axis("off")
plt.show()


# build a classifier that removes the noise from an image
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[0]])

fig = plt.figure()

ax1 = fig.add_subplot(1, 3, 1)
ax1.imshow(X_test[0].reshape(28,28), cmap='binary')

ax2 = fig.add_subplot(1, 3, 2)
ax2.imshow(X_test_mod[0].reshape(28,28), cmap='binary')

ax3 = fig.add_subplot(1, 3, 3)
ax3.imshow(clean_digit.reshape(28,28), cmap='binary')

plt.show()
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


fetch_housing_data()

import pandas as pd


def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


housing = load_housing_data()
housing.head()

housing.info()

housing['ocean_proximity'].value_counts()

housing.describe()

import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(20, 15))
plt.show()

# 2.3.4 Test data
'''
The usual way to split train/test data: generate random numbers (or shuffle the rows),
then split according to a predefined ratio.
'''
import numpy as np


def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


train_set, test_set = split_train_test(housing, 0.2)

'''
The method above has problems: 1) rerunning the program draws different random numbers,
and 2) when new data arrives, the random draw / shuffle changes as well
-> consistency cannot be guaranteed.
-> Instead, assuming each instance has a unique and immutable identifier, use a hash of
   that identifier to decide the train/test split.
-> Because every instance (old or new) keeps its unique value, the split stays stable.
'''
from zlib import crc32


# hash the given identifier and check whether it falls in the test-set range
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2 ** 32


def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda ids: test_set_check(ids, test_ratio))
    return data.iloc[~in_test_set.values], data.iloc[in_test_set.values]


# use the pandas DataFrame row index as the identifier
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

# build an id from longitude & latitude instead (more stable than the row index)
housing_with_id['id'] = housing_with_id['longitude'] * 1800 + housing_with_id['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

'''
Using the split function provided by sklearn
'''

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

'''
numeric to categorical
'''
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins=[0, 1.5, 3.0, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])
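
# Quick look at the resulting category proportions (illustrative addition):
housing['income_cat'].value_counts(normalize=True).sort_index()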

'''
Stratified sampling based on income_cat
'''
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_income_cat_ratio = pd.DataFrame(
    {
        'origin': housing['income_cat'].value_counts() / len(housing),
        'stratified': strat_test_set['income_cat'].value_counts() / len(strat_test_set),
        'random': test_set['income_cat'].value_counts() / len(test_set)
    },
    columns=['origin', 'stratified', 'random']
)
print(compare_income_cat_ratio)

'''
remove income_cat to put the data back into its original state
'''
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)

'''
2.4 Discover and Visualize the Data to Gain Insights
'''
housing = strat_train_set.copy()
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.8)

housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.8,
             s=housing['population'] / 100, label='population', figsize=(10, 7),
             c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True,
             sharex=False)
plt.legend()

'''
2.5 Prepare the Data for Machine Learning Algorithms

Write transformation functions rather than transforming by hand, so that:
 - the transformations can easily be reproduced on any dataset
 - you gradually build a library of transformation functions for future projects
 - they can be used in the live system to transform new data before feeding it to the algorithm
 - you can easily try several transformations and see which combination works best
'''
# recover and split X, y
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

# 2.5.1 Data Cleaning
housing.dropna(subset=['total_bedrooms'])       # option 1: drop rows with missing values
housing.drop('total_bedrooms', axis=1)          # option 2: drop the whole attribute
median = housing['total_bedrooms'].median()     # option 3: fill missing values with the median
housing['total_bedrooms'].fillna(median, inplace=True)
housing.describe()

# sklearn SimpleImputer
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

housing_num = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)

imputer.statistics_            # per-column medians learned by fit()
imputer.fill_value             # only used with strategy='constant' (None here)

housing_num.median().values    # the same medians, computed directly from the data

X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

'''
The design philosophy of scikit-learn
Main design principles:
* Consistency: all objects share a consistent and simple interface.
 - Estimators: objects that estimate a set of model parameters from a dataset. The estimation
   itself is done by the fit() method, which takes a dataset as its parameter (two, X and y,
   for supervised learning). Any other parameter needed for the estimation is considered a
   hyperparameter and is stored as an instance variable.
 - Transformers: estimators that transform a dataset. The transformation is done by the
   transform() method, which takes the dataset as a parameter and returns the transformed dataset.
'''
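# Small illustration of the conventions above (a sketch, reusing the SimpleImputer from 2.5.1):
# hyperparameters are constructor arguments kept as plain instance variables, while parameters
# learned by fit() get a trailing underscore.
demo_imputer = SimpleImputer(strategy='median')
demo_imputer.strategy        # hyperparameter, set in __init__
demo_imputer.fit(housing_num)
demo_imputer.statistics_     # learned parameter (per-column medians)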

# 2.5.2 Handling Text and Categorical Attributes
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

# use OrdinalEncoder for categorical -> numerical encoding
# only appropriate when the categories are actually ordinal
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

ordinal_encoder.categories_

# one-hot encoding for categorical data
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot  # scipy sparse matrix
housing_cat_1hot.toarray().shape

cat_encoder.categories_

# 2.5.3 custom transformer

'''
Duck typing: an object's type is determined by the attributes and methods it provides,
not by inheritance or explicit interface implementation.
Useful for special cleanup steps, combining attributes, and other custom transformations.
'''

from sklearn.base import BaseEstimator, TransformerMixin

# column indices in housing.values
rooms_ix, bedroom_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedroom_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


attr_adder = CombinedAttributeAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.values)

# 2.5.4 Feature Scaling
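# Placeholder section in the original notes; a minimal sketch of the two common scaling
# options (illustrative, shown on a single numeric column):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
income = housing_num[['median_income']]
MinMaxScaler().fit_transform(income)    # min-max scaling: rescales each feature to [0, 1]
StandardScaler().fit_transform(income)  # standardization: zero mean, unit variance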

# 2.5.5 Transformation Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributeAdder(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler())
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

'''
Pipeline takes a list of (name, estimator) pairs that define a sequence of steps.

All but the last step must be transformers; the last step can be a transformer or any
other estimator. The names can be anything (as long as they contain no double underscores).

Calling the pipeline's fit() method calls fit_transform() on every transformer in order,
passing the output of each step as the input of the next, and finally calls only fit()
on the last step.

The pipeline object exposes the same methods as its last estimator. Here the last estimator
is the transformer StandardScaler(), so the pipeline has a transform() method that applies
all the transformations to the data in order.
'''
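# Quick check of the behavior described above (illustrative): the fitted pipeline exposes
# transform() because its last step is a transformer, and each step is reachable by name.
num_pipeline.named_steps['std_scaler']   # the fitted StandardScaler
num_pipeline.transform(housing_num)      # applies imputer -> attribs_adder -> std_scaler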

'''
A single transformer that handles all columns by applying the appropriate transformer
to each subset of columns:

ColumnTransformer([
    (name, transformer, list of column names the transformer applies to)
])
'''
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = list(housing_cat)

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

# 2.6 Select and Train a Model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_label = housing_labels.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)
print("예측  : ", lin_reg.predict(some_data_prepared))
print("레이블 : ", list(some_label))

from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)

tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

# 2.6.2 Better Evaluation Using Cross-Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print('score : ', scores)
    print('mean : ', scores.mean())
    print('std : ', scores.std())

display_scores(tree_rmse_scores)

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

# RandomForest
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                             scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)



# 2.7 Model tuning
# 2.7.1 Grid search
from sklearn.model_selection import GridSearchCV
param_grid = [
    # Two separate dicts: GridSearchCV treats each dict as its own grid, so the
    # first keeps the default bootstrap=True while the second tries combinations
    # with bootstrap=False.
    {'n_estimators': [3],
     'max_features': [2, 4]},
    {'bootstrap': [False],
     'n_estimators': [3, 10, 30],
     'max_features': [4, 8]}
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

# best parameter combination found by the search
grid_search.best_params_
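
# The model refit with those parameters is also available:
grid_search.best_estimator_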

# evaluation score for each combination tried
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)


# 2.7.2 random search
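# Placeholder section in the original notes; a minimal RandomizedSearchCV sketch
# (illustrative, the parameter ranges here are assumptions): instead of trying every
# combination, sample n_iter random combinations from the given distributions.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {'n_estimators': randint(low=1, high=200),
                  'max_features': randint(low=1, high=8)}
rnd_search = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)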

# 2.7.3 Ensemble Methods

# 2.7.4 Analyze the Best Models and Their Errors
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
pd.DataFrame(sorted(zip(feature_importances, attributes), reverse=True))


 

##TODO : 3.1 MNIST
# The "Hello world" of machine learning.

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

# the dataset is dict-like; pick out the feature matrix X and the target y

X, y = mnist['data'], mnist['target']
print(X.shape)
print(y.shape)


# look at one sample image
ind_image = 0
some_digit = X[ind_image]
some_digit_image = some_digit.reshape(28, 28)

plt.imshow(some_digit_image, cmap='binary')
plt.axis('off')
plt.show()

print(y[ind_image])

y = y.astype(np.uint8)



##TODO : 3.2 Training a Binary Classifier
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

# stochastic gradient descent
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

sgd_clf.predict([some_digit])

##TODO : 3.3 Performance Measures
# implementing cross-validation by hand
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# StratifiedKFold builds folds that preserve the class ratios:
# ratio(y==True) is (approximately) the same in the full set, each train fold, and each test fold
for train_index, test_index in skfolds.split(X_train, y_train_5):
    print(np.sum(y_train_5[train_index]), np.sum(y_train_5[test_index]))


for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_folds = X_train[test_index]
    y_test_folds = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_folds)
    n_correct = sum(y_test_folds == y_pred)
    print(n_correct/len(y_test_folds))


# build a dummy classifier that always predicts "not 5"
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        return self
    def predict(self, X):
        return np.zeros((len(X), 1),dtype=bool)


##TODO : check book code


# 3.3.2 confusion matrix
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

'''
cross_val_predict performs k-fold cross-validation like cross_val_score,
but instead of returning the evaluation scores it returns the predictions made on each
validation fold. So every training instance gets a "clean" out-of-fold prediction,
produced by a model that never saw that instance during training.
'''
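# A rough manual equivalent (illustrative sketch): each fold's model predicts only the
# samples it did not train on, so every training instance gets exactly one
# out-of-fold prediction.
y_oof = np.zeros(len(X_train), dtype=bool)
for train_idx, val_idx in StratifiedKFold(n_splits=3).split(X_train, y_train_5):
    fold_clf = clone(sgd_clf)
    fold_clf.fit(X_train[train_idx], y_train_5[train_idx])
    y_oof[val_idx] = fold_clf.predict(X_train[val_idx])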

from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)

'''
Confusion matrix layout (rows = actual class, columns = predicted class):
                  pred negative  | pred positive
 actual negative | True Negative | False Positive
 actual positive | False Negative| True Positive
'''

# 3.3.3 Precision and Recall
'''
Recall    : out of the actual positives, the fraction the model predicted correctly
Precision : out of the predicted positives, the fraction that are actually positive
'''
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_5, y_train_pred)
recall_score(y_train_5, y_train_pred)


'''
f1 score = harmonic mean(recall, precision) 
'''
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
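
# The same numbers computed by hand from the confusion matrix entries (illustrative):
tn, fp, fn, tp = confusion_matrix(y_train_5, y_train_pred).ravel()
precision_manual = tp / (tp + fp)
recall_manual = tp / (tp + fn)
f1_manual = 2 * precision_manual * recall_manual / (precision_manual + recall_manual)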


## 3.3.4 Trade off

# y_scores = decision score for the first sample (some_digit)
y_scores = sgd_clf.decision_function([some_digit])
y_scores

threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred


y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method = 'decision_function')
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], 'b--', label='precision')
    plt.plot(thresholds, recalls[:-1], 'g--', label='recall')
    plt.legend()

plot_precision_recall_vs_threshold(precision, recall, thresholds)
plt.show()


'''
Precision vs. recall curve (PR curve)
'''

def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls[:-1], precisions[:-1], 'b--')
plot_precision_vs_recall(precision, recall)
plt.show()


# find the lowest threshold that gives at least 90% precision (and therefore the best recall)
threshold_90_precision = thresholds[np.argmax(precision >= 0.9)]
print(threshold_90_precision)



'''
The ROC curve looks similar to the precision/recall curve.
Rule of thumb: prefer the PR curve when the positive class is rare, or when false positives
matter more than false negatives; otherwise use the ROC curve.
'''

# ROC curve for the SGD classifier
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], 'k--')

plot_roc_curve(fpr, tpr)
plt.show()


from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probs_forest = cross_val_predict(forest_clf, X_train, y_train_5,
                                   cv=3, method='predict_proba')


y_scores_forest = y_probs_forest[:,1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "random forest")
plt.legend(loc='lower right')
plt.show()
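
# Quantify the comparison with the area under the ROC curve (addition, not in the
# original notes):
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)          # SGD classifier
roc_auc_score(y_train_5, y_scores_forest)   # random forest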


# 3.4 Multiclass Classification
'''
Solving a multiclass task with binary classifiers:
1) OvR (one-versus-the-rest): train m binary models for m classes -> pick the class with
   the highest score
2) OvO (one-versus-one): train one binary model for every pair of classes
    -> less efficient overall, but each individual task can be easier
    -> preferred for algorithms like SVM that scale poorly with training set size,
       since it trains many classifiers on small training sets
'''

# let scikit-learn pick the strategy automatically (it uses OvO under the hood for SVC)
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
svm_clf.predict([some_digit])


some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores

np.argmax(some_digit_scores)

svm_clf.classes_

# force OvO
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SVC())
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
len(ovo_clf.estimators_)

# force OvR
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf.fit(X_train, y_train)
ovr_clf.predict([some_digit])
len(ovr_clf.estimators_)


# evaluate the SGDClassifier's accuracy with cross-validation
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')

# scale the inputs with StandardScaler to improve performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")


# 3.5 Error Analysis
'''
Look for ways to improve the model by analyzing the errors it makes.
'''

y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
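
# The book's next step (sketch): visualize the confusion matrix, then a row-normalized
# version with the diagonal zeroed out so the remaining bright cells are the errors.
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()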