#!/usr/bin/env python
# coding: utf-8
# In[1]:
import lzma
import pandas as pd
import numpy as np
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.models import Sequential, load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.backend import clear_session
get_ipython().run_line_magic('matplotlib', 'inline')
pd.set_option('display.max_colwidth', 200)
# In[2]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    # Allocate GPU memory on demand instead of grabbing it all up front.
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# In[3]:
train_folder = 'wmt-2020-pl-en/train'
def read_xz_file(fname):
    with lzma.open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f]
train_data = read_xz_file(f'{train_folder}/in.tsv.xz')
train_labels = read_xz_file(f'{train_folder}/expected.tsv.xz')
# In[4]:
dev_folder = 'wmt-2020-pl-en/dev-0'
def read_file(fname):
    with open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f]
dev_data = read_file(f'{dev_folder}/in.tsv')
dev_labels = read_file(f'{dev_folder}/expected.tsv')
# In[5]:
def tokenize(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
src_tokenizer = tokenize(train_data)
# +1 because Tokenizer word indices start at 1; index 0 is reserved for padding.
src_vocab_size = len(src_tokenizer.word_index) + 1
dst_tokenizer = tokenize(train_labels)
dst_vocab_size = len(dst_tokenizer.word_index) + 1
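# Optional sanity check: these two sizes drive the embedding input dim and
# the softmax output dim of the model built below.
print(f'source vocab: {src_vocab_size}, target vocab: {dst_vocab_size}')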
# In[6]:
def encode_sequences(tokenizer, length, lines):
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post', value=0)
    return seq
seq_len = 50
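# Optional sanity check: one source sentence becomes a (1, seq_len) array of
# word ids, right-padded with zeros.
print(encode_sequences(src_tokenizer, seq_len, [train_data[0]]).shape)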
# In[7]:
train_x = encode_sequences(src_tokenizer, seq_len, train_data)
train_y = encode_sequences(dst_tokenizer, seq_len, train_labels)
# In[8]:
hidden_size = 256
# Simple encoder-decoder: the encoder LSTM compresses the source sentence into
# a single vector, RepeatVector feeds that vector to the decoder at every
# timestep, and the Dense softmax predicts one target word per position.
model = Sequential()
model.add(Embedding(src_vocab_size, hidden_size, input_length=seq_len, mask_zero=True))
model.add(LSTM(hidden_size))
model.add(RepeatVector(seq_len))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(Dense(dst_vocab_size, activation='softmax'))
rms = optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')
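# Optional: print the layer shapes to confirm the encoder-decoder wiring.
model.summary()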
# In[9]:
filename = 'model'
# In[ ]:
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# The targets get a trailing singleton axis so they match what
# sparse_categorical_crossentropy expects: one integer label per timestep.
history = model.fit(
    train_x,
    train_y.reshape(train_y.shape[0], train_y.shape[1], 1),
    epochs=10,
    batch_size=256,
    validation_split=0.2,
    callbacks=[checkpoint]
)
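# Optional sketch: plot the loss curves captured by fit() above (relies on the
# %matplotlib inline magic from the first cell when run as a notebook).
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()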
# In[28]:
dev_x = encode_sequences(src_tokenizer, seq_len, dev_data)
dev_y = encode_sequences(dst_tokenizer, seq_len, dev_labels)
model = load_model(filename)
# Greedy decoding: take the most probable target word at every position.
dev_preds = np.argmax(model.predict(dev_x), axis=-1)
dev_preds_texts = dst_tokenizer.sequences_to_texts(dev_preds)
with open(f'{dev_folder}/out.tsv', 'w') as f:
    f.writelines(line + '\n' for line in dev_preds_texts)
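# Optional evaluation sketch, assuming the sacrebleu package is available
# (it is not imported elsewhere in this script).
import sacrebleu
print('dev BLEU:', sacrebleu.corpus_bleu(dev_preds_texts, [dev_labels]).score)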
# In[27]:
# Spot-check the first decoded dev prediction.
dst_tokenizer.sequences_to_texts([dev_preds[0]])
# In[ ]:
test_folder = 'wmt-2020-pl-en/test-A'
test_data = read_file(f'{test_folder}/in.tsv')
test_x = encode_sequences(src_tokenizer, seq_len, test_data)
test_preds = np.argmax(model.predict(test_x), axis=-1)
test_preds_texts = dst_tokenizer.sequences_to_texts(test_preds)
with open(f'{test_folder}/out.tsv', 'w') as f:
    f.writelines(line + '\n' for line in test_preds_texts)