Predicting tabular data with a neural network and LightGBM (binary classification)¶
In [1]:
import os

import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow import metrics
from tensorflow.keras import backend as K
from tensorflow.keras import callbacks
from tensorflow.keras.layers import (Dense, Input, Activation, Flatten,
                                     BatchNormalization, Add, Dropout,
                                     LeakyReLU, ReLU, Conv2D, MaxPooling2D,
                                     Conv2DTranspose, UpSampling2D)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
In [2]:
os.listdir('../input')
Out[2]:
In [3]:
submission = pd.read_csv("../input/sample_submission.csv")
In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
In [5]:
submission.head(3)
Out[5]:
LightGBM handles category-dtype columns automatically without explicit encoding, so here is a helper that converts object columns to category¶
In [6]:
def to_category_columns(df):
    # Cast every object-dtype column to pandas' category dtype
    string_columns = df.columns[df.dtypes == 'object']
    for c in string_columns:
        df[c] = df[c].astype('category')
    return df
A helper that label-encodes string and category columns¶
In [7]:
def to_label_encoding(df):
    le = preprocessing.LabelEncoder()
    string_columns = df.columns[(df.dtypes == 'object') | (df.dtypes == 'category')]
    for c in string_columns:
        # fit_transform refits the encoder per column; NaN becomes the string 'nan'
        df[c] = le.fit_transform(df[c].astype(str))
    return df
In [8]:
raw_transaction = pd.read_csv("../input/train_transaction.csv")
raw_identity = pd.read_csv("../input/train_identity.csv")
COMPETITION_raw_transaction = pd.read_csv("../input/test_transaction.csv")
COMPETITION_raw_identity = pd.read_csv("../input/test_identity.csv")
In [9]:
# Convert string columns to the category dtype
raw_transaction = to_category_columns(raw_transaction)
raw_identity = to_category_columns(raw_identity)
COMPETITION_raw_transaction = to_category_columns(COMPETITION_raw_transaction)
COMPETITION_raw_identity = to_category_columns(COMPETITION_raw_identity)
In [10]:
raw_transaction.head(2)
Out[10]:
In [11]:
raw_identity.head(2)
Out[11]:
In [12]:
feature_list = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6']
In [13]:
train_raw_X = raw_transaction[feature_list]
train_raw_y = raw_transaction[['isFraud']]
COMPETITION_X = COMPETITION_raw_transaction[feature_list]
Baseline LightGBM model for comparison¶
In [14]:
train_X, valid_X, train_y, valid_y = train_test_split(train_raw_X, train_raw_y, test_size=0.2, random_state=1493)

params = {
    'learning_rate': 0.1,
    'max_depth': 16,
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'is_training_metric': True,
    'num_leaves': 144,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'seed': 2018
}

train_ds = lgb.Dataset(train_X, label=train_y)
valid_ds = lgb.Dataset(valid_X, label=valid_y)
model = lgb.train(params, train_ds, num_boost_round=1000, valid_sets=[train_ds, valid_ds],
                  verbose_eval=100, early_stopping_rounds=100)
In [17]:
predict = model.predict(COMPETITION_X, num_iteration=model.best_iteration)  # use the early-stopped best iteration
submission['isFraud'] = predict
In [19]:
submission.to_csv("./submission/submission_fraud_0829_baseline2.csv", index=False)
Building a basic neural-network model¶
In [17]:
def create_nn_model(input_shape):
    # Fully connected stack with BatchNorm after each hidden layer
    inp = Input(shape=(input_shape,))
    x = Dense(1024, activation='relu', kernel_initializer='he_normal')(inp)
    x = BatchNormalization()(x)
    x = Dense(1024, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(256, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=[out])
    return model
Converting categorical variables to numbers¶
- Label encoding gives the integers an artificial ordinal meaning, so embeddings or one-hot encoding tend to predict better (a one-hot sketch follows the next cell)
In [88]:
# Encode train and competition data together so they share one label mapping
X = pd.concat([train_raw_X, COMPETITION_X], ignore_index=True)
X = to_label_encoding(X)
input_data = StandardScaler().fit_transform(X)
input_data = np.nan_to_num(input_data)
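As an aside, here is a minimal one-hot sketch of the same preprocessing, assuming `pd.get_dummies` over the concatenated frame (this cell is illustrative and was not part of the original run). One-hot encoding avoids the artificial ordering that label encoding introduces, at the cost of a much wider input matrix.
In [ ]:
# Illustrative sketch: one-hot encode instead of label encoding
X_oh = pd.concat([train_raw_X, COMPETITION_X], ignore_index=True)
X_oh = pd.get_dummies(X_oh, dummy_na=True)  # expands category columns, keeps numeric ones
input_data_oh = StandardScaler().fit_transform(X_oh)
input_data_oh = np.nan_to_num(input_data_oh)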
Applying a callback function to compute AUROC¶
In [27]:
from tensorflow.keras.callbacks import Callback

class roc_callback(Callback):
    def __init__(self, training_data, validation_data):
        self.x, self.y = training_data
        self.x_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # Score both the training and validation folds with AUROC after each epoch
        roc = roc_auc_score(self.y, self.model.predict(self.x))
        roc_val = roc_auc_score(self.y_val, self.model.predict(self.x_val))
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc, 4)), str(round(roc_val, 4))), end=100 * ' ' + '\n')
Using KFold for experimentation (StratifiedKFold or a time-series split would be more appropriate; a sketch follows the next cell)¶
- If categorical variables are not handled properly, suspicious accuracy values like those below show up
In [71]:
N_FOLDS = 2
folds = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1493)
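A minimal sketch of the alternatives mentioned above, assuming the same fold count (this cell is illustrative and was not part of the original run): StratifiedKFold keeps the fraud ratio roughly equal across folds, while TimeSeriesSplit respects the temporal order of the transactions.
In [ ]:
# Illustrative sketch of the recommended splitters
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

# StratifiedKFold balances the class ratio per fold
strat_folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1493)
for trn_idx, val_idx in strat_folds.split(train_raw_X, train_raw_y.values.ravel()):
    print(len(trn_idx), len(val_idx), train_raw_y.values[val_idx].mean())

# TimeSeriesSplit assumes rows are sorted by time and never shuffles
ts_folds = TimeSeriesSplit(n_splits=N_FOLDS)
for trn_idx, val_idx in ts_folds.split(train_raw_X):
    print(len(trn_idx), len(val_idx))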
In [93]:
predictions = np.zeros((len(COMPETITION_X), 1))
In [ ]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_raw_X, train_raw_y)):
    # input_data holds the scaled train rows first, then the competition rows
    tr_x, tr_y = input_data[trn_idx], train_raw_y.values[trn_idx]
    vl_x, vl_y = input_data[val_idx], train_raw_y.values[val_idx]
    test_input = input_data[len(train_raw_X):, :]

    nn_model = create_nn_model(tr_x.shape[1])
    nn_model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam())
    es = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=40, verbose=1, mode='auto', restore_best_weights=True)
    rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=30, min_lr=1e-6, mode='auto', verbose=1)
    history = nn_model.fit(tr_x, tr_y, validation_data=(vl_x, vl_y),
                           callbacks=[es, rlr, roc_callback(training_data=(tr_x, tr_y), validation_data=(vl_x, vl_y))],
                           epochs=10, batch_size=10000, verbose=1)

    # Rough per-fold accuracy from the raw probabilities
    cv_predict = nn_model.predict(vl_x)
    v = np.squeeze(vl_y)
    p = np.squeeze(cv_predict)
    accuracy = 1 - np.mean(np.abs(v - p))

    # Average each fold's predictions on the competition set
    test_predict = nn_model.predict(test_input)
    predictions += test_predict / N_FOLDS
In [ ]:
np.sort(predictions)
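To finish the pipeline, a minimal sketch that writes the averaged fold predictions in the same submission format used for the LightGBM baseline above (the file name is illustrative only).
In [ ]:
# Illustrative sketch: save the averaged neural-network predictions
submission['isFraud'] = predictions.ravel()
submission.to_csv("./submission/submission_fraud_nn.csv", index=False)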