마할라노비스 거리를 이용한 Elliptic Envelope 아웃라이어 디텍션

  • 작성중
outlier 제거 후 방
In [1]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import matplotlib.pyplot as plt
In [2]:
from sklearn.datasets import make_classification 
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
np.random.seed(2019)
In [172]:
# 400개가 바
X, y = make_classification(n_samples = 500, flip_y=0.20, n_features=20, class_sep=5 ,
                           n_informative=20, n_redundant=0, n_clusters_per_class=1, random_state=4)

test_X = X[250:]
test_y = y[250:]

X = X[:250]
y = y[:250]
In [173]:
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y,
            s=100, edgecolor="k", linewidth=2)

plt.xlabel("$X_1$")
plt.ylabel("$X_2$")
Out[173]:
Text(0, 0.5, '$X_2$')
In [174]:
oof = np.zeros(len(y))
skf = StratifiedKFold(n_splits=11, random_state=42)
preds = np.zeros(len(test_y))

for train_index, val_index in skf.split(X, y):

    clf = QuadraticDiscriminantAnalysis()
    clf.fit(X,y)
    oof[val_index] = clf.predict_proba(X[val_index,:])[:,1]
    preds += clf.predict_proba(test_X)[:,1] / skf.n_splits
    
print(roc_auc_score(y, oof))

print(confusion_matrix(y, np.where(oof > 0.5, 1, 0)))
print("")
print("test")
print("")
print(roc_auc_score(test_y, preds))
print(confusion_matrix(test_y, np.where(preds > 0.5, 1, 0)))
0.9609133126934984
[[106   8]
 [ 13 123]]

test

0.880713918849512
[[115  17]
 [ 15 103]]
In [185]:
X_0 = X[y == 0]
X_1 = X[y == 1]

from sklearn.covariance import EllipticEnvelope
ell = EllipticEnvelope(contamination=0.08, assume_centered = False)
ell.fit(X_0)
outliers_0 = ell.predict(X_0)
ell2 = EllipticEnvelope(contamination=0.08, assume_centered = False)
ell2.fit(X_1)
outliers_1 = ell2.predict(X_1)

inlier_0_index = np.where(outliers_0 == 1)
outlier_0_index = np.where(outliers_0 == -1)
inlier_1_index = np.where(outliers_1 == 1)
outlier_1_index = np.where(outliers_1 == -1)

X_0_removed = X_0[inlier_0_index]
X_1_removed = X_1[inlier_1_index]

X_0_out = X_0[outlier_0_index]
X_1_out = X_1[outlier_1_index]

X_removed = np.concatenate( (X_0_removed, X_1_removed, X_0_out, X_1_out), axis = 0)
y_removed = np.concatenate( (np.zeros(len(X_0_removed)), np.ones(len(X_1_removed)), np.ones(len(X_0_out)), np.zeros(len(X_1_out))), axis = 0)
In [186]:
X_removed.shape
Out[186]:
(250, 20)
In [187]:
oof = np.zeros(len(y_removed))
skf = StratifiedKFold(n_splits=11, random_state=42)
preds2 = np.zeros(len(test_y))

for train_index, val_index in skf.split(X_removed, y_removed):

    clf = QuadraticDiscriminantAnalysis()
    clf.fit(X_removed,y_removed)
    oof[val_index] = clf.predict_proba(X_removed[val_index,:])[:,1]
    preds2 += clf.predict_proba(test_X)[:,1] / skf.n_splits
    
print(roc_auc_score(y_removed, oof))
print(confusion_matrix(y_removed, np.where(oof > 0.5, 1, 0)))
print("")
print("test")
print("")
print(roc_auc_score(test_y, preds2))
print(confusion_matrix(test_y, np.where(preds2 > 0.5, 1, 0)))
0.9992270531400966
[[115   0]
 [  4 131]]

test

0.8820942475603493
[[116  16]
 [ 15 103]]
In [178]:
import seaborn as sns
sns.distplot(oof)
Out[178]:
<matplotlib.axes._subplots.AxesSubplot at 0x24d0c25bef0>
In [ ]:
 
In [ ]:
 

댓글 남기기