challenging-america-word-ga.../run.py

282 lines
8.2 KiB
Python

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import regex as re
import csv
import torch
from torch import nn
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
# In[2]:
# Ask PyTorch to release any unused cached GPU memory, then pick the compute
# device: the GPU when CUDA is usable, the CPU otherwise.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# In[3]:
def clean_text(text):
    """Normalise a raw text fragment before tokenisation.

    Steps, in order:
      1. lower-case the text and drop/replace the escaped line-break markers;
      2. expand common English contractions;
      3. remove every remaining Unicode punctuation character.

    The contractions must be expanded *before* punctuation is stripped:
    the apostrophe is itself punctuation, so stripping first would leave
    nothing for the contraction rules to match.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    # NOTE(review): each literal below matches four consecutive backslash
    # characters followed by "n" -- confirm this agrees with how line breaks
    # are escaped in the input files.
    text = text.lower().replace('-\\\\\\\\n', '').replace('\\\\\\\\n', ' ')
    # "n't" is handled first so that "don't" becomes "do not" rather than
    # "don not"; the bare "'t" rule is kept for any remaining cases.
    text = (
        text.replace("n't", " not")
        .replace("'t", " not")
        .replace("'s", " is")
        .replace("'ll", " will")
        .replace("'m", " am")
        .replace("'ve", " have")
    )
    # `re` is the third-party `regex` module in this file; the Unicode
    # property class \p{P} is not available in the standard-library `re`.
    text = re.sub(r'\p{P}', '', text)
    return text
# In[4]:
# Load the training inputs and the expected gap words. Both files are read as
# header-less, tab-separated tables with quote handling disabled.
train_data = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
train_labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
# Keep input columns 6 and 7 (used below as left and right context) and put
# the label column (named 0) next to them.
# NOTE(review): if `on_bad_lines='skip'` drops a row from one file but not the
# other, this positional concat pairs contexts with the wrong label -- verify.
train_data = pd.concat([train_data[[6, 7]], train_labels], axis=1)
# In[5]:
class TrainCorpus:
    """Iterable of tokenised training sentences.

    Each row of ``data`` is turned into one sentence made of column 6
    (left context), column 0 (gap word) and column 7 (right context).
    Every call to ``iter()`` starts a fresh pass over the frame.
    """

    def __init__(self, data):
        """Store the frame to iterate; it must have columns 6, 0 and 7."""
        self.data = data

    def __iter__(self):
        """Yield one list of tokens per row of the frame."""
        for left, gap, right in zip(self.data[6], self.data[0], self.data[7]):
            # Join with spaces: gluing the three parts together directly
            # would fuse the last left word, the gap word and the first
            # right word into a single token.
            text = ' '.join((str(left), str(gap), str(right)))
            yield word_tokenize(clean_text(text))
# In[6]:
# Word2Vec is used here only to build a frequency-filtered vocabulary
# (words seen fewer than 10 times are dropped).
train_sentences = TrainCorpus(train_data.head(80000))
w2v_model = Word2Vec(vector_size=100, min_count=10)
# In[7]:
w2v_model.build_vocab(corpus_iterable=train_sentences)
# Work on copies: appending '<unk>' to gensim's own lookup structures would
# leave the KeyedVectors with a key that has no matching vector row.
key_to_index = dict(w2v_model.wv.key_to_index)
index_to_key = list(w2v_model.wv.index_to_key)
# '<unk>' takes the last index and stands in for out-of-vocabulary words.
index_to_key.append('<unk>')
key_to_index['<unk>'] = len(index_to_key) - 1
vocab_size = len(index_to_key)
print(vocab_size)
# In[8]:
class TrainDataset(torch.utils.data.IterableDataset):
    """Stream (input, target) index windows for next-word training.

    Each row of ``data`` is rebuilt as "left context, gap word, right
    context" (columns 6, 0 and 7), cleaned, tokenised and mapped to
    vocabulary indices. A window of ``context_size`` tokens slides over the
    sentence; the target is the same window shifted one token forward.

    Args:
        data: frame with columns 6, 0 and 7.
        index_to_key: list mapping an index to its word.
        key_to_index: dict mapping a word to its index; must contain
            '<unk>', which is used for out-of-vocabulary tokens.
        reversed: when true, each sentence is reversed after tokenisation,
            so the model learns to predict a word from what follows it.
    """

    # Number of tokens in each input/target window.
    context_size = 5

    def __init__(self, data, index_to_key, key_to_index, reversed=False):
        self.reversed = reversed
        self.data = data
        self.index_to_key = index_to_key
        self.key_to_index = key_to_index
        self.vocab_size = len(key_to_index)

    def __iter__(self):
        """Yield pairs of int64 arrays, each of length ``context_size``."""
        size = self.context_size
        unk_index = self.key_to_index['<unk>']
        for left, gap, right in zip(self.data[6], self.data[0], self.data[7]):
            # Join with spaces so the boundary words are not fused into one
            # token by plain concatenation.
            text = ' '.join((str(left), str(gap), str(right)))
            tokens = word_tokenize(clean_text(text))
            if self.reversed:
                tokens.reverse()
            # Look every token up once instead of once per window.
            indices = [self.key_to_index.get(token, unk_index) for token in tokens]
            for end in range(size, len(indices)):
                input_window = indices[end - size:end]
                target_window = indices[end - size + 1:end + 1]
                yield (
                    np.asarray(input_window, dtype=np.int64),
                    np.asarray(target_window, dtype=np.int64),
                )
# In[9]:
class Model(nn.Module):
    """Embedding -> 2-layer LSTM -> linear next-token language model.

    Inputs are batch-major: ``x`` has shape (batch, sequence) and the
    returned logits have shape (batch, sequence, vocab_size).

    Args:
        embed_size: dimensionality of the token embeddings.
        vocab_size: number of distinct token indices.
    """

    def __init__(self, embed_size, vocab_size):
        super().__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.lstm_size = 128
        self.num_layers = 2
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_size)
        # batch_first=True: callers feed (batch, sequence) index tensors.
        # With the default layout the LSTM would run its recurrence across
        # the batch dimension and treat every position as a separate sample.
        self.lstm = nn.LSTM(
            input_size=self.embed_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            batch_first=True,
        )
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state=None):
        """Return ``(logits, state)`` for a batch of index sequences.

        Args:
            x: integer tensor of shape (batch, sequence).
            prev_state: optional ``(h, c)`` tuple to start the LSTM from;
                ``None`` starts from zeros.

        Returns:
            logits of shape (batch, sequence, vocab_size) and the final
            ``(h, c)`` LSTM state.
        """
        embed = self.embed(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` state on the model's device.

        Args:
            sequence_length: size of the state's second dimension. Despite
                the name, nn.LSTM interprets that dimension as the batch
                size.

        Returns:
            Two separate zero tensors of shape
            (num_layers, sequence_length, lstm_size).
        """
        target_device = self.embed.weight.device
        shape = (self.num_layers, sequence_length, self.lstm_size)
        hidden = torch.zeros(shape, device=target_device)
        cell = torch.zeros(shape, device=target_device)
        return (hidden, cell)
# In[10]:
from torch.utils.data import DataLoader
from torch.optim import Adam
def train(dataset, model, max_epochs, batch_size):
    """Train ``model`` in place on ``dataset`` with Adam and cross-entropy.

    Args:
        dataset: yields ``(input, target)`` pairs of equal-length index
            sequences.
        model: module whose call returns ``(logits, state)`` with logits of
            shape (batch, sequence, vocab).
        max_epochs: number of full passes over ``dataset``.
        batch_size: number of pairs per optimisation step.

    Returns:
        None. Progress is printed every 1000 batches.
    """
    model.train()
    # Send batches to wherever the model's weights live instead of relying
    # on a module-level global staying in sync with the model.
    target_device = next(model.parameters()).device
    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)
    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()
            x = x.to(target_device)
            y = y.to(target_device)
            y_pred, _ = model(x)
            # CrossEntropyLoss wants the class dimension second:
            # (batch, vocab, sequence) against targets of (batch, sequence).
            loss = criterion(y_pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()
            if batch % 1000 == 0:
                print(f'epoch: {epoch}, update in batch {batch}/???, loss: {loss.item()}')
# In[11]:
# Forward direction: first 80000 rows, tokens kept in reading order.
train_dataset_front = TrainDataset(
    data=train_data.head(80000),
    index_to_key=index_to_key,
    key_to_index=key_to_index,
    reversed=False,
)
# Backward direction: last 80000 rows, tokens reversed.
train_dataset_back = TrainDataset(
    data=train_data.tail(80000),
    index_to_key=index_to_key,
    key_to_index=key_to_index,
    reversed=True,
)
# In[12]:
# One model per direction, both with 100-dimensional embeddings.
model_front = Model(embed_size=100, vocab_size=vocab_size).to(device)
model_back = Model(embed_size=100, vocab_size=vocab_size).to(device)
# In[13]:
train(dataset=train_dataset_front, model=model_front, max_epochs=1, batch_size=64)
# In[14]:
train(dataset=train_dataset_back, model=model_back, max_epochs=1, batch_size=64)
# In[30]:
def predict_probs(left_tokens, right_tokens):
    """Build the output line with the most likely gap words.

    The forward model reads ``left_tokens`` and the backward model reads
    ``right_tokens``; their next-token distributions at the last position
    are averaged, and the 30 most probable words are reported.

    Args:
        left_tokens: tokens preceding the gap, in reading order.
        right_tokens: tokens following the gap, already reversed so that
            the token nearest to the gap comes last.

    Returns:
        A string of the form ``"word:prob word:prob ... :rest"``, where
        ``rest`` is the probability mass not assigned to the listed words.
        '<unk>' (the last vocabulary index) is never listed.
    """
    model_front.eval()
    model_back.eval()
    # Use one mapping for both the membership test and the lookup.
    vocab = train_dataset_front.key_to_index
    unk_index = vocab['<unk>']
    x_left = torch.tensor([[vocab.get(w, unk_index) for w in left_tokens]]).to(device)
    x_right = torch.tensor([[vocab.get(w, unk_index) for w in right_tokens]]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        y_pred_left, _ = model_front(x_left)
        y_pred_right, _ = model_back(x_right)
    last_word_logits_left = y_pred_left[0][-1]
    last_word_logits_right = y_pred_right[0][-1]
    probs_left = torch.nn.functional.softmax(last_word_logits_left, dim=0).cpu().numpy()
    probs_right = torch.nn.functional.softmax(last_word_logits_right, dim=0).cpu().numpy()
    probs = (probs_left + probs_right) / 2
    # Drop the final entry ('<unk>') so it can never be proposed as a word.
    candidates = probs[:-1]
    top_indices = np.argsort(candidates)[::-1][:30]
    parts = []
    sum_prob = 0.0
    for index in top_indices:
        prob = float(candidates[index])
        sum_prob += prob
        parts.append(f'{index_to_key[index]}:{prob}')
    # Clamp so rounding error can never produce a negative remainder.
    parts.append(f':{max(1.0 - sum_prob, 0.0)}')
    return ' '.join(parts)
# In[16]:
# NOTE(review): `on_bad_lines='skip'` may drop input rows, in which case the
# output files would have fewer lines than the inputs -- verify.
dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)


def _write_predictions(data, out_path):
    """Write one prediction line per row of ``data`` to ``out_path``.

    Column 6 is used as the left context and column 7 as the right context.
    The right context is reversed so that the token closest to the gap comes
    last. Rows with fewer than 6 tokens on either side get the uninformative
    line ':1.0'.

    Args:
        data: frame with columns 6 and 7.
        out_path: path of the output file; it is overwritten.
    """
    # Explicit encoding: predicted words may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open(out_path, 'w', encoding='utf-8') as file:
        for _, row in data.iterrows():
            left_words = word_tokenize(clean_text(str(row[6])))
            right_words = word_tokenize(clean_text(str(row[7])))
            right_words.reverse()
            if len(left_words) < 6 or len(right_words) < 6:
                prediction = ':1.0'
            else:
                prediction = predict_probs(left_words[-5:], right_words[-5:])
            file.write(prediction + '\n')


# In[39]:
_write_predictions(dev_data, 'dev-0/out.tsv')
# In[41]:
_write_predictions(test_data, 'test-A/out.tsv')