challenging-america-word-ga.../run.py

260 lines
7.0 KiB
Python

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import regex as re
import csv
import torch
from torch import nn
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
# In[2]:
# Release any cached CUDA memory, then pick the compute device:
# the GPU when CUDA is available, the CPU otherwise.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# In[3]:
def clean_text(text):
    """Normalise raw corpus text before tokenisation.

    Steps, in order: lower-case the text, apply the two line-break
    replacements, expand a handful of English contractions, and finally
    remove every Unicode punctuation character.

    The contractions are expanded *before* punctuation is stripped: the
    apostrophe is itself punctuation, so stripping first would leave
    nothing for the contraction replacements to match.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    # NOTE(review): each literal below denotes four backslash characters
    # followed by 'n'. If the corpus marks line breaks with a single
    # backslash followed by 'n', these replacements never fire -- confirm
    # against the data before changing them.
    text = text.lower().replace('-\\\\\\\\n', '').replace('\\\\\\\\n', ' ')
    # Naive suffix rewrites (e.g. "don't" becomes "don not"); only the plain
    # ASCII apostrophe is recognised.
    text = (
        text.replace("'t", " not")
        .replace("'s", " is")
        .replace("'ll", " will")
        .replace("'m", " am")
        .replace("'ve", " have")
    )
    # `re` is the third-party `regex` package here, which supports \p{P}.
    return re.sub(r'\p{P}', '', text)
# In[4]:
# Load the training inputs and their expected words. Both files are read as
# headerless, unquoted TSV, and malformed lines are silently skipped.
train_data = pd.read_csv(
    'train/in.tsv.xz',
    header=None,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)
train_labels = pd.read_csv(
    'train/expected.tsv',
    header=None,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)
# Keep only input columns 6 and 7 and attach the labels side by side; the
# resulting frame has columns 6, 7 and 0 (the label).
# NOTE(review): the two files skip bad lines independently, and the concat
# pairs rows by position, so a line skipped in only one file would shift
# every later label -- confirm no lines are actually skipped.
train_data = pd.concat([train_data.loc[:, [6, 7]], train_labels], axis=1)
# In[5]:
class TrainCorpus:
    """Re-iterable stream of tokenised training texts.

    Each iteration pass walks the wrapped frame row by row and yields one
    token list per row, built from columns 6, 0 and 7 (in that order).
    Being a class with ``__iter__`` rather than a one-shot generator, it can
    be iterated more than once.

    Args:
        data: A DataFrame whose rows can be indexed by 6, 0 and 7.
            Presumably 6 is the left context, 0 the expected gap word and
            7 the right context -- verify against the caller.
    """

    def __init__(self, data):
        self.data = data

    def __iter__(self):
        for _, row in self.data.iterrows():
            # Join with spaces: plain concatenation would glue the middle
            # field onto the last word of column 6 and the first word of
            # column 7, producing tokens that never occur in the text.
            text = ' '.join((str(row[6]), str(row[0]), str(row[7])))
            yield word_tokenize(clean_text(text))
# In[6]:
# Build the vocabulary from the first 100000 training rows. Word2Vec is used
# here for its vocabulary scan (words seen fewer than 10 times are dropped);
# no call that trains its vectors is visible in this script.
train_sentences = TrainCorpus(train_data.head(100000))
w2v_model = Word2Vec(min_count=10, vector_size=100)
# In[7]:
w2v_model.build_vocab(corpus_iterable=train_sentences)
index_to_key = w2v_model.wv.index_to_key
key_to_index = w2v_model.wv.key_to_index
# Register an out-of-vocabulary marker as the final vocabulary entry.
key_to_index['<unk>'] = len(index_to_key)
index_to_key.append('<unk>')
vocab_size = len(index_to_key)
print(vocab_size)
# In[8]:
class TrainDataset(torch.utils.data.IterableDataset):
    """Iterable dataset of sliding 5-token windows for next-word training.

    For every row, the text built from columns 6, 0 and 7 is cleaned,
    tokenised and mapped to vocabulary ids (unknown words map to the
    ``'<unk>'`` id). Each position ``i >= 5`` then yields a pair of int64
    arrays of length 5: the ids of tokens ``[i-5, i)`` as input and the ids
    of tokens ``[i-4, i]`` (the same window shifted by one) as target.
    Rows with fewer than 6 tokens yield nothing.

    Args:
        data: A DataFrame whose rows can be indexed by 6, 0 and 7.
        index_to_key: List mapping vocabulary id to word.
        key_to_index: Dict mapping word to vocabulary id; must contain
            ``'<unk>'``.
    """

    def __init__(self, data, index_to_key, key_to_index):
        self.data = data
        self.index_to_key = index_to_key
        self.key_to_index = key_to_index
        self.vocab_size = len(key_to_index)

    def __iter__(self):
        window = 5
        unk_index = self.key_to_index['<unk>']
        lookup = self.key_to_index.get
        for _, row in self.data.iterrows():
            # Join with spaces: plain concatenation would glue the middle
            # field onto its neighbouring words and create bogus tokens.
            text = ' '.join((str(row[6]), str(row[0]), str(row[7])))
            tokens = word_tokenize(clean_text(text))
            # Map the whole row once instead of re-mapping every window.
            ids = [lookup(token, unk_index) for token in tokens]
            for i in range(window, len(ids)):
                yield (
                    np.asarray(ids[i - window:i], dtype=np.int64),
                    np.asarray(ids[i - window + 1:i + 1], dtype=np.int64),
                )
# In[9]:
class Model(nn.Module):
    """Embedding -> 2-layer GRU -> linear next-word language model.

    Inputs are integer id tensors of shape ``(batch, seq)``; the model
    returns un-normalised logits of shape ``(batch, seq, vocab_size)``.

    Args:
        embed_size: Dimensionality of the token embeddings.
        vocab_size: Number of distinct token ids (size of the output layer).
    """

    def __init__(self, embed_size, vocab_size):
        super().__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.gru_size = 128
        self.num_layers = 2
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_size)
        # batch_first=True because callers feed (batch, seq) tensors. With
        # the default layout the GRU would treat the batch axis as time and
        # run its recurrence across unrelated samples.
        self.gru = nn.GRU(
            input_size=self.embed_size,
            hidden_size=self.gru_size,
            num_layers=self.num_layers,
            dropout=0.2,
            batch_first=True,
        )
        self.fc = nn.Linear(self.gru_size, vocab_size)

    def forward(self, x, prev_state=None):
        """Run the model on a batch of id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq)``.
            prev_state: Optional initial GRU hidden state of shape
                ``(num_layers, batch, gru_size)``; zeros when ``None``.

        Returns:
            A ``(logits, state)`` pair: logits of shape
            ``(batch, seq, vocab_size)`` and the final GRU hidden state.
        """
        embed = self.embed(x)
        output, state = self.gru(embed, prev_state)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself.
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return an all-zero initial hidden state usable as ``prev_state``.

        Args:
            sequence_length: Size of the hidden state's second dimension.
                Despite the name, for this GRU that is the batch size.

        Returns:
            A zero tensor of shape ``(num_layers, sequence_length, gru_size)``
            on the same device as the model's parameters. A GRU takes a
            single state tensor, not an LSTM-style ``(h, c)`` pair.
        """
        return torch.zeros(
            self.num_layers,
            sequence_length,
            self.gru_size,
            device=self.fc.weight.device,
        )
# In[10]:
from torch.utils.data import DataLoader
from torch.optim import Adam
def train(dataset, model, max_epochs, batch_size):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Args:
        dataset: Dataset yielding ``(input_ids, target_ids)`` pairs of equal
            length; batched by a ``DataLoader`` without shuffling.
        model: Module whose call returns ``(logits, state)`` with logits of
            shape ``(batch, seq, vocab)``. Its parameters are updated in
            place and it is left in training mode.
        max_epochs: Number of full passes over ``dataset``.
        batch_size: Number of samples per optimisation step.

    Returns:
        None. Progress is printed every 1000 batches.
    """
    model.train()
    # Move batches to wherever the model lives instead of relying on a
    # module-level device variable that may not match it.
    model_device = next(model.parameters()).device
    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)
    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            x = x.to(model_device)
            y = y.to(model_device)
            optimizer.zero_grad()
            y_pred, _ = model(x)
            # CrossEntropyLoss wants the class axis second:
            # (batch, seq, vocab) -> (batch, vocab, seq).
            loss = criterion(y_pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()
            if batch % 1000 == 0:
                print(f'epoch: {epoch}, update in batch {batch}/???, loss: {loss.item()}')
# In[11]:
# Windowed training set over the first 100000 rows, sharing the vocabulary
# mappings built above.
train_dataset = TrainDataset(
    data=train_data.head(100000),
    index_to_key=index_to_key,
    key_to_index=key_to_index,
)
# In[12]:
# 100-dimensional embeddings, placed on the selected device.
model = Model(embed_size=100, vocab_size=vocab_size).to(device)
# In[13]:
# A single epoch with batches of 64 windows.
train(dataset=train_dataset, model=model, max_epochs=1, batch_size=64)
# In[58]:
def predict_probs(tokens):
    """Format the model's next-word distribution after ``tokens``.

    The tokens are mapped to vocabulary ids (unknown words use the
    ``'<unk>'`` id), run through the global ``model``, and the logits at the
    last position are turned into probabilities. The 30 most probable words
    other than ``'<unk>'`` are reported; everything else, including the
    ``'<unk>'`` mass, is folded into the trailing remainder entry.

    Args:
        tokens: List of context words, oldest first.

    Returns:
        A string of space-separated ``word:probability`` entries in
        ascending vocabulary-id order, followed by ``:remainder``. For an
        empty context the whole mass is left unassigned (``':1.0'``).
    """
    if not tokens:
        return ':1.0'

    top_k = 30
    unk_index = key_to_index['<unk>']

    model.eval()
    token_ids = [key_to_index.get(token, unk_index) for token in tokens]
    x = torch.tensor([token_ids]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        y_pred, _ = model(x)
    probs = torch.softmax(y_pred[0][-1], dim=0).cpu().numpy()

    # Stable descending sort, so ties go to the lower vocabulary id. Look at
    # one extra candidate so that dropping '<unk>' still leaves top_k words.
    ranked = np.argsort(-probs, kind='stable')
    best = [int(i) for i in ranked[:top_k + 1] if i != unk_index][:top_k]
    best.sort()

    entries = [f'{index_to_key[i]}:{probs[i]}' for i in best]
    listed_mass = sum(float(probs[i]) for i in best)
    # Clamp so float rounding can never emit a negative probability.
    entries.append(f':{max(0.0, 1.0 - listed_mass)}')
    return ' '.join(entries)
# In[56]:
# Load the dev-0 and test-A inputs as headerless, unquoted TSV, silently
# skipping malformed lines.
# NOTE(review): any skipped line means fewer prediction rows than input
# lines are written later -- confirm that is acceptable.
dev_data, test_data = (
    pd.read_csv(
        in_path,
        header=None,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )
    for in_path in ('dev-0/in.tsv.xz', 'test-A/in.tsv.xz')
)
# In[59]:
def write_predictions(data, out_path):
    """Write one prediction line per row of ``data`` to ``out_path``.

    Column 6 of each row (the text left of the gap) is cleaned and
    tokenised, and its last five tokens are passed to ``predict_probs``.
    Rows whose left text has fewer than six tokens get the uninformative
    line ``':1.0'`` instead.

    Args:
        data: DataFrame whose rows can be indexed by 6.
        out_path: Path of the output file; it is overwritten.
    """
    # Explicit UTF-8: predicted words may be non-ASCII, and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(out_path, 'w', encoding='utf-8') as file:
        for _, row in data.iterrows():
            left_words = word_tokenize(clean_text(str(row[6])))
            # NOTE(review): six tokens are required although only five are
            # used -- kept as in the original; confirm the threshold.
            if len(left_words) < 6:
                prediction = ':1.0'
            else:
                prediction = predict_probs(left_words[-5:])
            file.write(prediction + '\n')


write_predictions(dev_data, 'dev-0/out.tsv')
# In[60]:
write_predictions(test_data, 'test-A/out.tsv')