A Simple GRU Using Keras on GPU¶
In [ ]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, CuDNNGRU, Bidirectional, GlobalMaxPool1D
from keras.models import Model
import matplotlib.pyplot as plt
In [2]:
# tab-separated, CP949-encoded raw data; hold out 10% for evaluation
train_raw = pd.read_csv("raw.txt", sep='\t', encoding='cp949')
train, test = train_test_split(train_raw, test_size=0.1, random_state=58)
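Before going further, it can be worth confirming the random split kept the class balance; a minimal sanity check, assuming the label column is named label as it is used later in this notebook:

In [ ]:
# distribution of classes in each split should be roughly the same
print(train['label'].value_counts())
print(test['label'].value_counts())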
In [ ]:
EMBED_SIZE = 300      # dimensionality of the word embedding vectors
MAX_FEATURE = 100000  # maximum vocabulary size kept by the tokenizer
MAX_LEN = 300         # maximum sequence length after padding/truncation
Handling missing values¶
In [ ]:
# replace missing sentences with a placeholder token
train_x = train['sentence'].fillna("_na_").values
test_x = test['sentence'].fillna("_na_").values
Tokenizing the text and fitting the tokenizer¶
In [ ]:
tokenizer = Tokenizer(num_words=MAX_FEATURE)
# fit on train and test text together so both share one vocabulary
# (competition-style; strictly, fitting only on train avoids test leakage)
tokenizer.fit_on_texts(list(train_x) + list(test_x))
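Note that num_words only caps the vocabulary when texts are converted to sequences; word_index still records every token seen during fitting. A quick check of the full vocabulary size (a minimal sketch):

In [ ]:
# word_index contains every distinct token, not just the top MAX_FEATURE
print(len(tokenizer.word_index))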
Converting text to integer sequences¶
In [3]:
train_s = tokenizer.texts_to_sequences(train_x)
test_s = tokenizer.texts_to_sequences(test_x)
Padding shorter sequences with zeros to a fixed length¶
In [ ]:
train_p = pad_sequences(train_s, maxlen=MAX_LEN)
test_p = pad_sequences(test_s, maxlen=MAX_LEN)
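A quick shape check confirms padding produced the fixed-length input the model expects:

In [ ]:
# both arrays should be (num_samples, MAX_LEN)
print(train_p.shape, test_p.shape)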
Separating the labels¶
In [ ]:
train_y = train['label']
test_y = test['label']
One-hot encoding the labels for classification¶
In [3]:
train_dummy_y = pd.get_dummies(train_y)
test_dummy_y = pd.get_dummies(test_y)
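pd.get_dummies returns a DataFrame with one column per class, ordered by sorted label value; that column order is what np.argmax maps back to at prediction time. A quick inspection (sketch):

In [ ]:
# the column order here determines which class index argmax returns later
print(train_dummy_y.columns.tolist())
print(train_dummy_y.head())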
Building the Embedding layer and compiling the bidirectional GRU model¶
In [4]:
inp = Input(shape=(MAX_LEN,))
x = Embedding(MAX_FEATURE, EMBED_SIZE)(inp)                 # word vectors learned from scratch
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)   # GPU-optimized bidirectional GRU
x = GlobalMaxPool1D()(x)                                    # keep the strongest feature per dimension
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
# the three classes are mutually exclusive, so softmax + categorical_crossentropy
# is the correct pairing (sigmoid + binary_crossentropy would inflate the accuracy metric)
x = Dense(3, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', 'mae'])
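CuDNNGRU only runs on a CUDA-enabled GPU. To reproduce this notebook on CPU, the plain GRU layer is a near drop-in substitute (slower, and not numerically bit-identical because the recurrent activations differ); a hedged sketch of the swap:

In [ ]:
# CPU fallback: plain GRU instead of CuDNNGRU, same shape contract
from keras.layers import GRU
inp_cpu = Input(shape=(MAX_LEN,))
x_cpu = Embedding(MAX_FEATURE, EMBED_SIZE)(inp_cpu)
x_cpu = Bidirectional(GRU(64, return_sequences=True))(x_cpu)
x_cpu = GlobalMaxPool1D()(x_cpu)
x_cpu = Dense(16, activation="relu")(x_cpu)
x_cpu = Dropout(0.1)(x_cpu)
x_cpu = Dense(3, activation="softmax")(x_cpu)
model_cpu = Model(inputs=inp_cpu, outputs=x_cpu)
model_cpu.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])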
In [5]:
print(model.summary())
In [6]:
history = model.fit(train_p, train_dummy_y, batch_size=512, epochs=24, validation_data=(test_p, test_dummy_y))
Using argmax to pick the class with the highest predicted probability¶
In [7]:
pred_y = model.predict(test_p, batch_size=1024, verbose=1)
answer = np.argmax(pred_y, axis=1)   # index of the highest-probability class
plt.plot(history.history['acc'], label='train acc')
plt.plot(history.history['val_acc'], label='val acc')
plt.legend()
plt.show()
confusion_matrix(test_y, answer)
Out[7]:
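For a per-class breakdown in a single call, sklearn's classification_report is convenient. This assumes, as the confusion matrix above already does, that the label values are the integer class indices 0 to 2 matching the argmax output:

In [ ]:
from sklearn.metrics import classification_report
# precision, recall, and F1 per class, plus the averages
print(classification_report(test_y, answer))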
Checking precision, recall, and F-score¶
In [9]:
precision_recall_fscore_support(test_y, answer, average='micro')
Out[9]:
In [10]:
precision_recall_fscore_support(test_y, answer, average='macro')
Out[10]:
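Micro averaging pools all predictions before computing the metrics, so for single-label classification it equals overall accuracy; macro averaging computes each metric per class and takes the unweighted mean, weighting rare classes equally. A toy illustration of the difference:

In [ ]:
from sklearn.metrics import precision_recall_fscore_support
y_true = [0, 0, 0, 1, 2]
y_pred = [0, 0, 1, 1, 2]
# micro: pooled over all samples -> 4/5 = 0.8 for precision, recall, and F1
print(precision_recall_fscore_support(y_true, y_pred, average='micro'))
# macro: unweighted mean of the per-class scores, so the minority classes count equally
print(precision_recall_fscore_support(y_true, y_pred, average='macro'))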