Glove, Crawl 글로벌 임베딩 벡터를 이용한 LSTM 파이썬 치트코드

글로벌 벡터를 이용한 간단한 LSTM 이용법입니다.

simple lstm

GLOVE, CRAWL 글로벌 임베딩 벡터를 이용한 훈련

In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
Using TensorFlow backend.

임베딩 파일 경로 및 하이퍼파라미터 설정

In [2]:
# Pretrained word-vector files; one embedding matrix is built per file and
# the matrices are concatenated along the feature axis.
EMBEDDING_FILES = [
    'input/crawl-300d-2M.vec',
    'input/glove.840B.300d.txt',
]

# Training schedule.
NUM_MODELS = 2      # independently initialised models to train
EPOCHS = 4          # single-epoch fit() calls per model
BATCH_SIZE = 512    # batch size passed to fit()

# Network and input sizes.
MAX_LEN = 220       # padded sequence length fed to the model
LSTM_UNITS = 128    # units per LSTM direction
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4  # width of the pooled features / dense blocks

임베딩 파일의 한 줄에서 단어와 벡터(계수)를 분리

In [ ]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: the token (first field of an embedding line).
        *arr: the remaining fields, converted element-wise to float32.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

임베딩에서 딕셔너리 형태로 추출

In [ ]:
def load_embeddings(path):
    """Read a text-format embedding file into a ``{word: vector}`` dict.

    Each data line is expected to look like ``word v1 v2 ...`` with fields
    separated by single spaces. Files in the word2vec/fastText ``.vec`` text
    layout start with a ``<vocab_size> <dim>`` header line; that header is
    skipped so it is not stored as a bogus word with a one-element vector
    (NumPy would silently broadcast such a vector across a whole row when it
    is assigned into an embedding matrix).

    Args:
        path: location of the UTF-8 encoded embedding file.

    Returns:
        dict mapping each word to the array returned by ``get_coefs``. When a
        word appears more than once, the last occurrence wins.
    """
    embeddings = {}
    with open(path, 'rt', encoding='UTF8') as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split(' ')
            # Only the very first line can be a header, and only when it is
            # exactly two integer fields (vocabulary size and dimension).
            if (line_no == 0 and len(fields) == 2
                    and all(field.isdecimal() for field in fields)):
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

단어로 매트릭스 만들기

In [3]:
def build_matrix(word_index, path):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Args:
        word_index: mapping of word -> integer row index.
        path: embedding file handed to ``load_embeddings``.

    Returns:
        Array of shape ``(len(word_index) + 1, 300)``. Row ``i`` holds the
        pretrained vector of the word whose index is ``i``; rows for words
        missing from the embedding file stay all-zero.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, 300))
    for token, row in word_index.items():
        vector = vectors.get(token)
        # Out-of-vocabulary words keep their zero row.
        if vector is not None:
            matrix[row] = vector
    return matrix
    

모델링

In [4]:
def build_model(embedding_matrix, num_aux_targets):
    """Assemble and compile the two-output bidirectional LSTM model.

    Args:
        embedding_matrix: pretrained embedding weights; its shape is unpacked
            into the Embedding layer's leading positional arguments and the
            layer is frozen (``trainable=False``).
        num_aux_targets: number of units in the auxiliary sigmoid output.

    Returns:
        A Keras ``Model`` taking ``MAX_LEN``-long sequences and producing
        ``[result, aux_result]``, compiled with binary cross-entropy and Adam.
    """
    tokens = Input(shape=(MAX_LEN,))
    embedded = Embedding(
        *embedding_matrix.shape,
        weights=[embedding_matrix],
        trainable=False
    )(tokens)
    features = SpatialDropout1D(0.3)(embedded)

    # Two stacked bidirectional LSTM layers, both returning full sequences.
    for _ in range(2):
        recurrent = CuDNNLSTM(LSTM_UNITS, return_sequences=True)
        features = Bidirectional(recurrent)(features)

    # Summarise the sequence with max- and average-pooling side by side.
    max_pooled = GlobalMaxPooling1D()(features)
    avg_pooled = GlobalAveragePooling1D()(features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two dense blocks, each added back onto its own input (skip connection).
    for _ in range(2):
        projected = Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)
        pooled = add([pooled, projected])

    main_output = Dense(1, activation='sigmoid')(pooled)
    aux_output = Dense(num_aux_targets, activation='sigmoid')(pooled)

    network = Model(inputs=tokens, outputs=[main_output, aux_output])
    network.compile(loss='binary_crossentropy', optimizer='adam')
    return network
    

특수문자 전처리

In [5]:
def preprocess(data):
    """Replace punctuation and special symbols in every entry with a space.

    Args:
        data: pandas Series of raw texts; entries are cast to ``str`` first.

    Returns:
        pandas Series of strings in which every character listed in ``punct``
        has been replaced by one space (string lengths are preserved).
    """
    # The backslash in the last chunk is spelled "\\" explicitly. The value is
    # unchanged, but the literal no longer relies on an invalid escape
    # sequence, which newer Python versions warn about.
    punct = (
        "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`"
        + '""“”’'
        + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    )
    # One translation table handles all characters in a single pass per text,
    # instead of one full-string replace() call per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))
    return data.astype(str).apply(lambda entry: entry.translate(to_space))

데이터 로드

In [6]:
# Load the training and test CSV files as UTF-8.
train = pd.read_csv('input/train.csv', encoding='utf-8')
test = pd.read_csv('input/test.csv', encoding='utf-8')
In [7]:
# Preview the first rows of the training DataFrame.
train.head()
Out[7]:
id target comment_text severe_toxicity obscene identity_attack insult threat asian atheist article_id rating funny wow sad likes disagree sexual_explicit identity_annotator_count toxicity_annotator_count
0 59848 0.000000 This is so cool. It’s like, ‘would you want yo… 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN 2006 rejected 0 0 0 0 0 0.0 0 4
1 59849 0.000000 Thank you!! This would make my life a lot less… 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN 2006 rejected 0 0 0 0 0 0.0 0 4
2 59852 0.000000 This is such an urgent design problem; kudos t… 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN 2006 rejected 0 0 0 0 0 0.0 0 4
3 59855 0.000000 Is this something I’ll be able to install on m… 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN 2006 rejected 0 0 0 0 0 0.0 0 4
4 59856 0.893617 haha you guys are a bunch of losers. 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 2006 rejected 0 0 0 1 0 0.0 4 47

5 rows × 45 columns

텍스트 데이터 준비 및 변환

In [9]:
# Strip special characters from the raw comments of both splits.
x_train = preprocess(train['comment_text'])
x_test = preprocess(test['comment_text'])

# Main target: 1 when the 'target' score is at least 0.5, else 0.
y_train = np.where(train['target'] >= 0.5, 1, 0)
# Auxiliary targets: the raw score columns, used for the second model output.
y_aux_train = train[
    ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
]

# Fit one vocabulary over train and test texts together.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

# Turn texts into integer sequences and pad them to MAX_LEN.
x_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

# One matrix per embedding file (crawl and glove), joined on the last axis.
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, embedding_path)
     for embedding_path in EMBEDDING_FILES],
    axis=-1)

# Filled by the training loop: per-checkpoint test predictions and weights.
checkpoint_predictions, weights = [], []

훈련

In [ ]:
# Train NUM_MODELS fresh models; after every epoch of every model, store the
# test-set predictions of the main output together with an averaging weight.
for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, y_aux_train.shape[-1])
    for global_epoch in range(EPOCHS):
        # Learning rate for this epoch: 1e-3 decayed by 0.6 per epoch.
        step_lr = 1e-3 * (0.6 ** global_epoch)
        # Keep the schedule a one-argument callable that ignores its input.
        lr_callback = LearningRateScheduler(lambda epoch: step_lr)
        model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            callbacks=[lr_callback]
        )
        # predict() returns one array per model output; index 0 is the main one.
        main_output = model.predict(x_test, batch_size=2048)[0]
        checkpoint_predictions.append(main_output.flatten())
        # Later epochs get exponentially larger weight in the final average.
        weights.append(2 ** global_epoch)

# Weighted average over all stored checkpoints (across models and epochs).
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

# NOTE(review): `submission` is only built here; nothing in this section
# writes it to disk.
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': predictions
})

댓글 남기기