260 lines
7.0 KiB
Python
260 lines
7.0 KiB
Python
#!/usr/bin/env python
|
|
# coding: utf-8
|
|
|
|
# In[1]:
|
|
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import regex as re
|
|
import csv
|
|
import torch
|
|
from torch import nn
|
|
from gensim.models import Word2Vec
|
|
from nltk.tokenize import word_tokenize
|
|
|
|
|
|
# In[2]:
|
|
|
|
|
|
# Release unused memory held by PyTorch's CUDA caching allocator, then pick
# the device string used for the model and for every batch: the GPU when one
# is available, otherwise the CPU.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
|
|
|
|
|
|
# In[3]:
|
|
|
|
|
|
def clean_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. delete the line-break marker when it directly follows a hyphen and
         replace every remaining marker with a space
         (NOTE(review): presumably the corpus encodes line breaks with this
         literal marker -- confirm against the data files);
      3. expand the contraction suffixes 't, 's, 'll, 'm and 've;
      4. strip every Unicode punctuation character.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = text.replace('-\\\\\\\\n', '').replace('\\\\\\\\n', ' ')

    # Contractions must be expanded BEFORE punctuation is stripped: the
    # apostrophe is itself punctuation, so stripping first would leave
    # nothing for these replacements to match.
    contractions = (
        ("'t", " not"),
        ("'s", " is"),
        ("'ll", " will"),
        ("'m", " am"),
        ("'ve", " have"),
    )
    for suffix, expansion in contractions:
        text = text.replace(suffix, expansion)

    # `re` is the third-party `regex` module here, which supports \p{P}.
    return re.sub(r'\p{P}', '', text)
|
|
|
|
|
|
# In[4]:
|
|
|
|
|
|
# Load the training inputs and the expected gap words. Both files are
# headerless, tab-separated, read without quote handling, and malformed
# lines are skipped.
train_data = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
train_labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Keep input columns 6 and 7 and place the label frame (column 0) next to
# them, so each row carries the labels 6, 7 and 0.
# NOTE(review): presumably 6/7 are the left/right context of the gap -- confirm.
train_data = pd.concat([train_data[[6, 7]], train_labels], axis=1)
|
|
|
|
|
|
# In[5]:
|
|
|
|
|
|
class TrainCorpus:
    """Re-iterable stream of tokenised training rows.

    Each iteration walks the wrapped DataFrame from the start, so the object
    can be consumed several times (unlike a plain generator).

    Args:
        data: DataFrame whose rows expose the labels 6, 0 and 7; the three
            values are combined, in that order, into one text per row.
    """

    def __init__(self, data):
        self.data = data

    def __iter__(self):
        """Yield one token list per row of the wrapped DataFrame."""
        for _, row in self.data.iterrows():
            # Join with spaces: plain concatenation would glue the middle
            # value onto the last word before it and the first word after
            # it, producing one fused token instead of three real words.
            text = ' '.join(str(row[column]) for column in (6, 0, 7))
            yield word_tokenize(clean_text(text))
|
|
|
|
|
|
# In[6]:
|
|
|
|
|
|
# The Word2Vec object is created without a corpus, so nothing is trained at
# this point; it is configured for 100-dimensional vectors and ignores words
# seen fewer than 10 times.
w2v_model = Word2Vec(
    min_count=10,
    vector_size=100,
)

# Token stream over the first 100,000 training rows.
train_sentences = TrainCorpus(train_data.iloc[:100000])
|
|
|
|
|
|
# In[7]:
|
|
|
|
|
|
# Scan the corpus once to collect the vocabulary.
w2v_model.build_vocab(corpus_iterable=train_sentences)

# Work directly on the model's own lookup structures (same objects, no copy).
index_to_key = w2v_model.wv.index_to_key
key_to_index = w2v_model.wv.key_to_index

# Register '<unk>' in the slot right after the last real word: the current
# length of the list is exactly the index the appended entry will receive.
key_to_index['<unk>'] = len(index_to_key)
index_to_key.append('<unk>')

vocab_size = len(index_to_key)
print(vocab_size)
|
|
|
|
|
|
# In[8]:
|
|
|
|
|
|
class TrainDataset(torch.utils.data.IterableDataset):
    """Stream of (input, target) index windows for next-word training.

    Every row is turned into one text (labels 6, 0 and 7, in that order),
    cleaned, tokenised and mapped to vocabulary indices. A window of five
    consecutive tokens is the input; the target is the same window shifted
    one token to the right.

    Args:
        data: DataFrame whose rows expose the labels 6, 0 and 7.
        index_to_key: list mapping a vocabulary index to its word.
        key_to_index: dict mapping a word to its vocabulary index; must
            contain '<unk>', which stands in for unknown words.
    """

    def __init__(self, data, index_to_key, key_to_index):
        self.data = data
        self.index_to_key = index_to_key
        self.key_to_index = key_to_index
        self.vocab_size = len(key_to_index)

    def __iter__(self):
        """Yield pairs of int64 arrays, each of length 5."""
        unk_index = self.key_to_index['<unk>']
        for _, row in self.data.iterrows():
            # Join with spaces: plain concatenation would fuse the middle
            # value with its neighbouring words into a single bogus token.
            text = ' '.join(str(row[column]) for column in (6, 0, 7))
            tokens = word_tokenize(clean_text(text))

            # Map the row to indices once instead of once per window.
            indices = [self.key_to_index.get(token, unk_index) for token in tokens]

            for i in range(5, len(indices)):
                input_window = indices[i - 5:i]
                target_window = indices[i - 4:i + 1]
                yield (
                    np.asarray(input_window, dtype=np.int64),
                    np.asarray(target_window, dtype=np.int64),
                )
|
|
|
|
|
|
# In[9]:
|
|
|
|
|
|
class Model(nn.Module):
    """Next-word language model: embedding -> 2-layer GRU -> linear logits.

    Args:
        embed_size: dimensionality of the token embeddings.
        vocab_size: number of distinct token indices (also the width of the
            output logits).
    """

    def __init__(self, embed_size, vocab_size):
        super().__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.gru_size = 128
        self.num_layers = 2

        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_size)
        # batch_first=True: inputs arrive as (batch, sequence) index tensors.
        # With the default layout the GRU would treat the batch dimension as
        # time and run its recurrence across unrelated samples.
        self.gru = nn.GRU(
            input_size=self.embed_size,
            hidden_size=self.gru_size,
            num_layers=self.num_layers,
            dropout=0.2,
            batch_first=True,
        )
        self.fc = nn.Linear(self.gru_size, vocab_size)

    def forward(self, x, prev_state=None):
        """Compute per-position vocabulary logits.

        Args:
            x: integer tensor of token indices, shape (batch, sequence).
            prev_state: optional initial hidden state of shape
                (num_layers, batch, gru_size). The pair returned by
                `init_state` is accepted as well; None means zeros.

        Returns:
            (logits, state): logits of shape (batch, sequence, vocab_size)
            and the final hidden state of shape (num_layers, batch, gru_size).
        """
        # A GRU takes a single hidden tensor; unwrap the (h, h) pair that
        # `init_state` hands out so the two methods work together.
        if isinstance(prev_state, tuple):
            prev_state = prev_state[0]

        embed = self.embed(x)
        output, state = self.gru(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a pair of zero hidden-state tensors.

        Args:
            sequence_length: size of the middle dimension of the state.
                NOTE(review): the GRU expects this dimension to equal the
                batch size; the parameter name is kept for compatibility.

        Returns:
            A tuple holding the same zero tensor twice, shape
            (num_layers, sequence_length, gru_size), on the model's device.
        """
        # Follow the model's own parameters instead of a module-level global.
        target_device = self.embed.weight.device
        zeros = torch.zeros(
            self.num_layers, sequence_length, self.gru_size, device=target_device
        )
        return (zeros, zeros)
|
|
|
|
|
|
# In[10]:
|
|
|
|
|
|
from torch.utils.data import DataLoader
|
|
from torch.optim import Adam
|
|
|
|
def train(dataset, model, max_epochs, batch_size):
    """Fit `model` on `dataset` with Adam (lr=0.001) and cross-entropy loss.

    Batches are moved to the module-level `device` before the forward pass.
    The current loss is printed on every 1000th batch of each epoch.

    Args:
        dataset: dataset yielding (input, target) pairs of index arrays.
        model: module whose call returns (logits, state).
        max_epochs: number of full passes over the dataset.
        batch_size: number of examples per batch.
    """
    model.train()

    loader = DataLoader(dataset, batch_size=batch_size)
    loss_fn = nn.CrossEntropyLoss()
    optimiser = Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for step, (inputs, targets) in enumerate(loader):
            optimiser.zero_grad()

            inputs, targets = inputs.to(device), targets.to(device)
            logits, _ = model(inputs)

            # CrossEntropyLoss expects the class dimension at position 1,
            # hence the swap of the last two dimensions.
            loss = loss_fn(logits.transpose(1, 2), targets)
            loss.backward()
            optimiser.step()

            if step % 1000 != 0:
                continue
            print(f'epoch: {epoch}, update in batch {step}/???, loss: {loss.item()}')
|
|
|
|
|
|
# In[11]:
|
|
|
|
|
|
# Training windows are drawn from the first 100,000 rows and share the
# vocabulary lookups built earlier in this script.
train_dataset = TrainDataset(
    data=train_data.head(100000),
    index_to_key=index_to_key,
    key_to_index=key_to_index,
)


# In[12]:


# 100-dimensional embeddings over the full vocabulary, placed on `device`.
model = Model(embed_size=100, vocab_size=vocab_size)
model = model.to(device)


# In[13]:


# One epoch with batches of 64 examples.
train(train_dataset, model, max_epochs=1, batch_size=64)
|
|
|
|
|
|
# In[58]:
|
|
|
|
|
|
def predict_probs(tokens):
    """Format the model's next-word distribution for a context.

    The tokens are mapped to vocabulary indices (unknown words become
    '<unk>'), fed through the module-level `model`, and the distribution at
    the last position is reduced to its 30 most probable words.

    Args:
        tokens: non-empty list of context words.

    Returns:
        A string of space-separated `word:probability` entries, most
        probable first, followed by a final `:remainder` entry carrying the
        probability mass not assigned to any listed word.
    """
    model.eval()

    unk_index = key_to_index['<unk>']
    token_ids = [key_to_index.get(token, unk_index) for token in tokens]
    x = torch.tensor([token_ids]).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        y_pred, _ = model(x)
        probs = torch.softmax(y_pred[0][-1], dim=0)

    # '<unk>' is never offered as a guess; pushing its score below every
    # real probability keeps it out of the top-k, so its mass ends up in
    # the remainder entry.
    candidates = probs.clone()
    candidates[unk_index] = -1.0
    top_k = min(30, candidates.numel() - 1)
    top_probs, top_indices = torch.topk(candidates, top_k)

    entries = []
    listed_mass = 0.0
    for prob, index in zip(top_probs.tolist(), top_indices.tolist()):
        listed_mass += prob
        entries.append(f'{index_to_key[index]}:{prob}')

    # Rounding can push the listed mass a hair above 1; never emit a
    # negative remainder.
    entries.append(f':{max(0.0, 1 - listed_mass)}')
    return ' '.join(entries)
|
|
|
|
|
|
# In[56]:
|
|
|
|
|
|
# Load the development and test inputs: headerless, tab-separated, read
# without quote handling, malformed lines skipped.
dev_data = pd.read_csv(
    'dev-0/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
test_data = pd.read_csv(
    'test-A/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
|
|
|
|
|
|
# In[59]:
|
|
|
|
|
|
# Write one prediction line per dev-0 row, based on the text in column 6.
# The encoding is explicit so that non-ASCII vocabulary words are written
# identically on every platform.
with open('dev-0/out.tsv', 'w', encoding='utf-8') as file:
    for index, row in dev_data.iterrows():
        left_text = clean_text(str(row[6]))
        left_words = word_tokenize(left_text)
        # Only the last five tokens are consumed, so five tokens of context
        # are enough; with fewer, fall back to putting all probability mass
        # on the remainder entry.
        if len(left_words) < 5:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:])
        file.write(prediction + '\n')
|
|
|
|
|
|
# In[60]:
|
|
|
|
|
|
# Write one prediction line per test-A row, based on the text in column 6.
# The encoding is explicit so that non-ASCII vocabulary words are written
# identically on every platform.
with open('test-A/out.tsv', 'w', encoding='utf-8') as file:
    for index, row in test_data.iterrows():
        left_text = clean_text(str(row[6]))
        left_words = word_tokenize(left_text)
        # Only the last five tokens are consumed, so five tokens of context
        # are enough; with fewer, fall back to putting all probability mass
        # on the remainder entry.
        if len(left_words) < 5:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:])
        file.write(prediction + '\n')
|
|
|