A simple LSTM recipe that uses pre-trained global word vectors.
Training with GloVe and CRAWL global embedding vectors¶
In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
Preparing the embedding files¶
In [2]:
EMBEDDING_FILES = [
    'input/crawl-300d-2M.vec',
    'input/glove.840B.300d.txt'
]
NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220
Extracting entries from the embedding¶
In [ ]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
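For reference, a minimal sketch of how get_coefs parses one line of a GloVe/fastText text file; the sample line and its 4-dimensional vector are made up for illustration (the real files carry 300-dimensional vectors).
In [ ]:
# Illustration only: parse a made-up embedding line into (word, vector).
sample_line = 'hello 0.1 -0.2 0.33 0.47'
word, vec = get_coefs(*sample_line.strip().split(' '))
print(word, vec.shape)   # hello (4,)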
Loading the embedding into a dictionary¶
In [ ]:
def load_embeddings(path):
    with open(path, 'rt', encoding='UTF8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)
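One caveat, stated as an assumption about the input files rather than something the notebook handles: fastText .vec files such as crawl-300d-2M.vec usually begin with a "count dim" header line. With the loader above that header simply becomes a junk dictionary entry that no real word ever looks up, so it is harmless; a slightly more defensive variant (hypothetical helper, not part of the original) could skip short lines explicitly.
In [ ]:
# Defensive sketch: skip lines too short to be word vectors, e.g. a header line.
def load_embeddings_safe(path, min_tokens=10):
    with open(path, 'rt', encoding='UTF8') as f:
        return dict(get_coefs(*line.strip().split(' '))
                    for line in f if len(line.strip().split(' ')) > min_tokens)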
Building the matrix from words¶
In [3]:
def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            # Words missing from the embedding keep the all-zero vector.
            pass
    return embedding_matrix
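A tiny self-contained check (illustration only, with a fake three-word index and a fake two-line embedding file) makes the shape convention easy to see: the matrix has len(word_index) + 1 rows, and out-of-vocabulary words stay at zero.
In [ ]:
# Illustration only: fake word index and fake embedding file for build_matrix.
import tempfile, os
fake_index = {'hello': 1, 'world': 2, 'unseen': 3}
tmp = tempfile.NamedTemporaryFile('w', suffix='.vec', delete=False, encoding='UTF8')
tmp.write('hello ' + ' '.join(['0.1'] * 300) + '\n')
tmp.write('world ' + ' '.join(['0.2'] * 300) + '\n')
tmp.close()
print(build_matrix(fake_index, tmp.name).shape)   # (4, 300); row 0 and 'unseen' stay all zeros
os.unlink(tmp.name)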
Modeling¶
In [4]:
def build_model(embedding_matrix, num_aux_targets):
    words = Input(shape=(MAX_LEN,))
    # Frozen embedding layer initialized with the pre-trained (concatenated) vectors.
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    # Summarize the sequence with max and average pooling, then two residual dense blocks.
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    # Main toxicity output plus auxiliary targets, both with sigmoid activations.
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)
    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
Preprocessing special characters¶
In [5]:
def preprocess(data):
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text
    data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
    return data
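A minimal illustration (made-up string, not competition data) of what preprocess does: every listed punctuation or symbol character is replaced by a space, so the tokenizer later sees cleanly separated tokens.
In [ ]:
# Illustration only: listed symbols are replaced with spaces.
demo = pd.Series(["you're (not) wrong!"])
print(preprocess(demo).tolist())   # ['you re  not  wrong ']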
Loading the data¶
In [6]:
train = pd.read_csv('input/train.csv', encoding='utf-8')
test = pd.read_csv('input/test.csv', encoding='utf-8')
In [7]:
train.head()
Preparing and transforming the text data¶
In [9]:
x_train = preprocess(train['comment_text'])
y_train = np.where(train['target'] >= 0.5, 1, 0)
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = preprocess(test['comment_text'])
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)
# crawl and glove matrices concatenated along the last axis (300 + 300 = 600 columns)
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)
checkpoint_predictions = []
weights = []
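Before training, a quick toy illustration (made-up sentences) of the fit_on_texts / texts_to_sequences / pad_sequences step above: words are mapped to integer ids and every sequence is left-padded with zeros to a common length.
In [ ]:
# Toy illustration of the tokenize-and-pad pipeline used above.
toy_tok = text.Tokenizer()
toy_tok.fit_on_texts(['the cat sat', 'the dog'])
toy_seqs = toy_tok.texts_to_sequences(['the cat sat', 'the dog'])
print(sequence.pad_sequences(toy_seqs, maxlen=4))
# [[0 1 2 3]
#  [0 0 1 4]]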
Training¶
In [ ]:
for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, y_aux_train.shape[-1])
    for global_epoch in range(EPOCHS):
        model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            callbacks=[
                # Decay the learning rate by 0.6 per global epoch (1e-3, 6e-4, ...).
                LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))
            ]
        )
        # Predict on the test set after every epoch; later checkpoints get larger weights.
        checkpoint_predictions.append(model.predict(x_test, batch_size=2048)[0].flatten())
        weights.append(2 ** global_epoch)
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)
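# Side note (illustration only): np.average with weights is just a weighted mean,
# so the 2 ** global_epoch weights make later checkpoints dominate. Made-up example:
#     np.average([[0.2, 0.8], [0.4, 0.6]], weights=[1, 2], axis=0)
#     -> array([0.33333333, 0.66666667])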
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': predictions
})
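The cell ends with the submission DataFrame; to actually produce a file for the competition it would typically be written out as below (the filename is an assumption, not shown in the original).
In [ ]:
# Assumed final step (not in the original cell): write the submission file.
submission.to_csv('submission.csv', index=False)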