- 작성 중
In [1]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import matplotlib.pyplot as plt
In [2]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
# Seed NumPy's global random number generator so code that draws from it is repeatable.
np.random.seed(2019)
In [172]:
# Generate 500 synthetic two-class samples with 20 informative features
# (flip_y=0.20 injects label noise), then keep the first 250 rows as the
# training split and hold the remaining 250 rows out as the test split.
features, labels = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=20,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=5,
    flip_y=0.20,
    random_state=4,
)
X, test_X = features[:250], features[250:]
y, test_y = labels[:250], labels[250:]
In [173]:
# Scatter the first two feature columns of the training split, coloured by class label.
marker_style = dict(marker='o', s=100, edgecolor="k", linewidth=2)
plt.scatter(X[:, 0], X[:, 1], c=y, **marker_style)
plt.xlabel("$X_1$")
plt.ylabel("$X_2$")
Out[173]:
In [174]:
# Baseline experiment: 11-fold stratified cross-validation of a QDA classifier
# on the training split (X, y).
#   oof   - out-of-fold class-1 probability for every training row
#   preds - class-1 probability on the held-out test split, averaged over folds
oof = np.zeros(len(y))
# shuffle=True is needed for random_state to have any effect; current
# scikit-learn releases raise a ValueError when a seed is given without it.
skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=42)
preds = np.zeros(len(test_y))

for train_index, val_index in skf.split(X, y):
    clf = QuadraticDiscriminantAnalysis()
    # Fit on the training folds only. Fitting on all of (X, y) would let the
    # model see the validation rows and make the out-of-fold score optimistic.
    clf.fit(X[train_index], y[train_index])
    oof[val_index] = clf.predict_proba(X[val_index, :])[:, 1]
    preds += clf.predict_proba(test_X)[:, 1] / skf.n_splits

# Cross-validated metrics on the training split (probabilities thresholded at 0.5).
print(roc_auc_score(y, oof))
print(confusion_matrix(y, np.where(oof > 0.5, 1, 0)))
print("")
print("test")
print("")
# Metrics of the fold-averaged predictions on the held-out test split.
print(roc_auc_score(test_y, preds))
print(confusion_matrix(test_y, np.where(preds > 0.5, 1, 0)))
In [185]:
from sklearn.covariance import EllipticEnvelope

# Partition the training rows by their class label.
X_0, X_1 = X[y == 0], X[y == 1]

# Fit one elliptic envelope per class (class 0 first, then class 1) and score the
# same rows it was fitted on; the code below treats 1 as inlier and -1 as outlier.
ell = EllipticEnvelope(assume_centered=False, contamination=0.08)
outliers_0 = ell.fit(X_0).predict(X_0)
ell2 = EllipticEnvelope(assume_centered=False, contamination=0.08)
outliers_1 = ell2.fit(X_1).predict(X_1)

# Row positions of inliers / outliers within each class subset.
inlier_0_index, outlier_0_index = np.where(outliers_0 == 1), np.where(outliers_0 == -1)
inlier_1_index, outlier_1_index = np.where(outliers_1 == 1), np.where(outliers_1 == -1)

X_0_removed, X_0_out = X_0[inlier_0_index], X_0[outlier_0_index]
X_1_removed, X_1_out = X_1[inlier_1_index], X_1[outlier_1_index]

# No row is dropped: inliers keep their class label, while rows flagged as
# outliers are appended with the opposite class label.
feature_blocks = (X_0_removed, X_1_removed, X_0_out, X_1_out)
label_blocks = (
    np.zeros(len(X_0_removed)),
    np.ones(len(X_1_removed)),
    np.ones(len(X_0_out)),
    np.zeros(len(X_1_out)),
)
X_removed = np.concatenate(feature_blocks, axis=0)
y_removed = np.concatenate(label_blocks, axis=0)
In [186]:
# Display the dimensions of X_removed, the array assembled by concatenating the
# per-class inlier rows followed by the per-class outlier rows.
X_removed.shape
Out[186]:
In [187]:
# Second experiment: the same 11-fold stratified QDA cross-validation, run on
# the outlier-relabelled training data (X_removed, y_removed).
#   oof    - out-of-fold class-1 probability for every row of X_removed
#   preds2 - class-1 probability on the held-out test split, averaged over folds
oof = np.zeros(len(y_removed))
# shuffle=True is needed for random_state to have any effect; current
# scikit-learn releases raise a ValueError when a seed is given without it.
skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=42)
preds2 = np.zeros(len(test_y))

for train_index, val_index in skf.split(X_removed, y_removed):
    clf = QuadraticDiscriminantAnalysis()
    # Fit on the training folds only. Fitting on the full relabelled set would
    # let the model see the validation rows and inflate the out-of-fold score.
    clf.fit(X_removed[train_index], y_removed[train_index])
    oof[val_index] = clf.predict_proba(X_removed[val_index, :])[:, 1]
    preds2 += clf.predict_proba(test_X)[:, 1] / skf.n_splits

# Cross-validated metrics against the relabelled targets (threshold 0.5).
print(roc_auc_score(y_removed, oof))
print(confusion_matrix(y_removed, np.where(oof > 0.5, 1, 0)))
print("")
print("test")
print("")
# Metrics of the fold-averaged predictions on the untouched test split.
print(roc_auc_score(test_y, preds2))
print(confusion_matrix(test_y, np.where(preds2 > 0.5, 1, 0)))
In [178]:
import seaborn as sns
# Plot the distribution of the values currently stored in `oof`.
# NOTE(review): the execution counts are out of order, so which experiment's
# `oof` was plotted originally is unclear; top to bottom it is the most recent one.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(oof)
Out[178]:
In [ ]:
In [ ]: