#!/usr/bin/env python
# coding: utf-8

# In[1]:


import lzma

import numpy as np
import pandas as pd

from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.models import Sequential, load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

get_ipython().run_line_magic('matplotlib', 'inline')
pd.set_option('display.max_colwidth', 200)


# In[2]:


import tensorflow as tf

# Let TensorFlow allocate GPU memory on demand instead of claiming it all up
# front; guard the call so the notebook also runs on machines without a GPU.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# In[3]:


train_folder = 'wmt-2020-pl-en/train'


def read_xz_file(fname):
    """Read an xz-compressed text file and return a list of stripped lines."""
    with lzma.open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f]


train_data = read_xz_file(f'{train_folder}/in.tsv.xz')
train_labels = read_xz_file(f'{train_folder}/expected.tsv.xz')

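# In[ ]:


# Sanity check, a minimal sketch: the corpus is expected to be line-aligned,
# i.e. line i of in.tsv.xz is a source sentence (presumably Polish, given the
# pl-en folder name) and line i of expected.tsv.xz its English counterpart.
# Only the filenames from the cell above are assumed here.
assert len(train_data) == len(train_labels), 'source/target line counts differ'
print(train_data[0])
print(train_labels[0])
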
# In[4]:


dev_folder = 'wmt-2020-pl-en/dev-0'


def read_file(fname):
    """Read a plain-text file and return a list of stripped lines."""
    with open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f]


dev_data = read_file(f'{dev_folder}/in.tsv')
dev_labels = read_file(f'{dev_folder}/expected.tsv')

# In[5]:


def tokenize(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


src_tokenizer = tokenize(train_data)
# +1 because Tokenizer indices start at 1; index 0 is reserved for padding.
src_vocab_size = len(src_tokenizer.word_index) + 1
dst_tokenizer = tokenize(train_labels)
dst_vocab_size = len(dst_tokenizer.word_index) + 1

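# In[ ]:


# What fit_on_texts gives us, shown on a toy corpus (illustration only; the
# sentences below are made up and not part of the training data). Indices are
# assigned by descending word frequency, starting at 1.
toy = Tokenizer()
toy.fit_on_texts(['the cat sat', 'the dog sat down'])
print(toy.word_index)                                 # e.g. {'the': 1, 'sat': 2, 'cat': 3, ...}
print(toy.texts_to_sequences(['the cat sat down']))   # words mapped to integer ids
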
# In[6]:


def encode_sequences(tokenizer, length, lines):
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post', value=0)
    return seq


seq_len = 50

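# In[ ]:


# How the padding behaves, a small sketch reusing the toy tokenizer from the
# cell above: sequences shorter than `length` are padded with 0 on the right
# ('post'), longer ones are truncated, so every row has exactly `length`
# columns.
demo = encode_sequences(toy, 6, ['the cat sat'])
print(demo)          # e.g. [[1 3 2 0 0 0]]
print(demo.shape)    # (1, 6)
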
# In[7]:


train_x = encode_sequences(src_tokenizer, seq_len, train_data)
train_y = encode_sequences(dst_tokenizer, seq_len, train_labels)

# In[8]:


hidden_size = 256

# Encoder-decoder without attention: encode the source sentence into a single
# vector, repeat it for every target position, decode with a second LSTM.
model = Sequential()
model.add(Embedding(src_vocab_size, hidden_size, input_length=seq_len, mask_zero=True))
model.add(LSTM(hidden_size))
model.add(RepeatVector(seq_len))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(Dense(dst_vocab_size, activation='softmax'))

rms = optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

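# In[ ]:


# A closer look at the architecture above: the first LSTM reads the embedded
# source sentence and returns one state vector, RepeatVector copies that
# vector seq_len times so the second LSTM can emit one hidden state per target
# position, and the Dense softmax scores the target vocabulary at each
# position. sparse_categorical_crossentropy lets us feed integer word ids as
# labels instead of one-hot vectors.
model.summary()
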
# In[9]:


filename = 'model'

# In[ ]:


checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

model.fit(
    train_x,
    # sparse_categorical_crossentropy expects one integer label per timestep,
    # so give the targets an explicit trailing axis of size 1.
    train_y.reshape(train_y.shape[0], train_y.shape[1], 1),
    epochs=10,
    batch_size=256,
    validation_split=0.2,
    callbacks=[checkpoint]
)

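# In[ ]:


# model.fit returns a History object with per-epoch losses. If the call above
# is captured as `history = model.fit(...)`, the curves can be inspected with
# the inline matplotlib already enabled in the first cell. A sketch under that
# assumption:
import matplotlib.pyplot as plt

def plot_history(history):
    # Overlay training and validation loss to spot over- or underfitting.
    plt.plot(history.history['loss'], label='train loss')
    plt.plot(history.history['val_loss'], label='val loss')
    plt.xlabel('epoch')
    plt.legend()

# plot_history(history)  # uncomment after capturing the History object
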
# In[28]:


dev_x = encode_sequences(src_tokenizer, seq_len, dev_data)
dev_y = encode_sequences(dst_tokenizer, seq_len, dev_labels)

# Reload the checkpoint with the best val_loss before predicting.
model = load_model(filename)
dev_preds = np.argmax(model.predict(dev_x), axis=-1)
dev_preds_texts = dst_tokenizer.sequences_to_texts(dev_preds)
with open(f'{dev_folder}/out.tsv', 'w') as f:
    f.write('\n'.join(dev_preds_texts) + '\n')

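# In[ ]:


# Rough quality check on dev: corpus BLEU between the predictions and the
# references. A sketch assuming the `sacrebleu` package is installed; it is
# not used elsewhere in this notebook.
import sacrebleu

bleu = sacrebleu.corpus_bleu(dev_preds_texts, [dev_labels])
print(bleu.score)
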
# In[27]:


# Spot-check: decode the first dev prediction back to text.
dst_tokenizer.sequences_to_texts([dev_preds[0]])

# In[ ]:


test_folder = 'wmt-2020-pl-en/test-A'

test_data = read_file(f'{test_folder}/in.tsv')

test_x = encode_sequences(src_tokenizer, seq_len, test_data)
test_preds = np.argmax(model.predict(test_x), axis=-1)
test_preds_texts = dst_tokenizer.sequences_to_texts(test_preds)
with open(f'{test_folder}/out.tsv', 'w') as f:
    f.write('\n'.join(test_preds_texts) + '\n')