final solution
This commit is contained in:
parent d8961a9410
commit 6f93a318b8
@@ -0,0 +1,3 @@
.ipynb_checkpoints/
processed_train.txt
model/
@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
File diff suppressed because it is too large
Binary file not shown.
File diff suppressed because it is too large
@@ -0,0 +1 @@
FileId Year LeftContext RightContext
@@ -0,0 +1 @@
Word
@@ -0,0 +1,281 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import regex as re
import csv
import torch
from torch import nn
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


# In[2]:


torch.cuda.empty_cache()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# In[3]:


def clean_text(text):
    # the corpus stores line breaks as literal '\n' sequences; de-hyphenate across them
    text = text.lower().replace('-\\n', '').replace('\\n', ' ')
    # expand contractions before stripping punctuation, otherwise the
    # apostrophes are already gone and none of these patterns can match
    text = text.replace("'t", " not").replace("'s", " is").replace("'ll", " will").replace("'m", " am").replace("'ve", " have")
    text = re.sub(r'\p{P}', '', text)

    return text
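
# Not in the original notebook: a minimal sanity check of clean_text,
# assuming the raw corpus really does encode line breaks as literal '\n'.
assert clean_text('Some-\\nbody said: "it\'s FINE"') == 'somebody said it is fine'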
# In[4]:


train_data = pd.read_csv('train/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv('train/expected.tsv', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)

# columns 6 and 7 hold the left and right context of the gap;
# the expected gap word arrives as column 0 of train_labels
train_data = train_data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)
# In[5]:


class TrainCorpus:
    def __init__(self, data):
        self.data = data

    def __iter__(self):
        for _, row in self.data.iterrows():
            # join left context, gap word and right context with spaces,
            # otherwise the words at the seams get glued together
            text = str(row[6]) + ' ' + str(row[0]) + ' ' + str(row[7])
            text = clean_text(text)
            yield word_tokenize(text)
# In[6]:


train_sentences = TrainCorpus(train_data.head(80000))
w2v_model = Word2Vec(vector_size=100, min_count=10)


# In[7]:


w2v_model.build_vocab(corpus_iterable=train_sentences)

key_to_index = w2v_model.wv.key_to_index
index_to_key = w2v_model.wv.index_to_key

# register an out-of-vocabulary token as the last index
index_to_key.append('<unk>')
key_to_index['<unk>'] = len(index_to_key) - 1

vocab_size = len(index_to_key)
print(vocab_size)
# In[8]:


class TrainDataset(torch.utils.data.IterableDataset):
    def __init__(self, data, index_to_key, key_to_index, reversed=False):
        self.reversed = reversed
        self.data = data
        self.index_to_key = index_to_key
        self.key_to_index = key_to_index
        self.vocab_size = len(key_to_index)

    def __iter__(self):
        for _, row in self.data.iterrows():
            text = str(row[6]) + ' ' + str(row[0]) + ' ' + str(row[7])
            text = clean_text(text)
            tokens = word_tokenize(text)
            if self.reversed:
                tokens = list(reversed(tokens))
            # slide a 5-token window over the text; the target is the same
            # window shifted one token to the right
            for i in range(5, len(tokens)):
                input_context = tokens[i-5:i]
                target_context = tokens[i-4:i+1]

                input_embed = [self.key_to_index[word] if word in self.key_to_index else self.key_to_index['<unk>'] for word in input_context]
                target_embed = [self.key_to_index[word] if word in self.key_to_index else self.key_to_index['<unk>'] for word in target_context]

                yield np.asarray(input_embed, dtype=np.int64), np.asarray(target_embed, dtype=np.int64)
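
# Illustration (added, not in the original): for tokens [t0, t1, ..., t6]
# the dataset yields (input, target) pairs ([t0..t4], [t1..t5]),
# ([t1..t5], [t2..t6]), and so on - the target is the input window shifted
# by one, which is the usual next-word setup for an LSTM language model.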
# In[9]:


class Model(nn.Module):
    def __init__(self, embed_size, vocab_size):
        super(Model, self).__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.lstm_size = 128
        self.num_layers = 2

        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_size)
        # batch_first=True because the DataLoader yields (batch, seq) tensors;
        # without it the LSTM would treat the batch axis as the time axis
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.lstm_size, num_layers=self.num_layers, dropout=0.2, batch_first=True)
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embed(x)
        output, state = self.lstm(embed, prev_state)
        # return raw logits; CrossEntropyLoss applies log-softmax itself
        logits = self.fc(output)
        return logits, state

    def init_state(self, batch_size):
        zeros = torch.zeros(self.num_layers, batch_size, self.lstm_size).to(device)
        return (zeros, zeros)
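
# A quick shape check, added here rather than taken from the original
# notebook; the vocabulary size of 1000 is an arbitrary stand-in.
_model = Model(100, 1000)
_x = torch.randint(0, 1000, (64, 5))  # a batch of 64 five-token windows
_logits, _state = _model(_x)
print(_logits.shape)  # torch.Size([64, 5, 1000]) - one distribution per position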
# In[10]:


from torch.utils.data import DataLoader
from torch.optim import Adam

def train(dataset, model, max_epochs, batch_size):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            x = x.to(device)
            y = y.to(device)

            y_pred, (state_h, state_c) = model(x)
            # CrossEntropyLoss expects (batch, classes, seq), hence the transpose
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

            if batch % 1000 == 0:
                print(f'epoch: {epoch}, batch: {batch}, loss: {loss.item()}')
# In[11]:


train_dataset_front = TrainDataset(train_data.head(80000), index_to_key, key_to_index, False)
train_dataset_back = TrainDataset(train_data.tail(80000), index_to_key, key_to_index, True)


# In[12]:


model_front = Model(100, vocab_size).to(device)
model_back = Model(100, vocab_size).to(device)


# In[13]:


train(train_dataset_front, model_front, 1, 64)


# In[14]:


train(train_dataset_back, model_back, 1, 64)
# In[30]:


def predict_probs(left_tokens, right_tokens):
    model_front.eval()
    model_back.eval()

    x_left = torch.tensor([[key_to_index[w] if w in key_to_index else key_to_index['<unk>'] for w in left_tokens]]).to(device)
    x_right = torch.tensor([[key_to_index[w] if w in key_to_index else key_to_index['<unk>'] for w in right_tokens]]).to(device)
    y_pred_left, (state_h_left, state_c_left) = model_front(x_left)
    y_pred_right, (state_h_right, state_c_right) = model_back(x_right)

    # distribution over the word following the left context, and over the
    # word "following" the reversed right context, i.e. the gap word
    last_word_logits_left = y_pred_left[0][-1]
    last_word_logits_right = y_pred_right[0][-1]
    probs_left = torch.nn.functional.softmax(last_word_logits_left, dim=0).detach().cpu().numpy()
    probs_right = torch.nn.functional.softmax(last_word_logits_right, dim=0).detach().cpu().numpy()

    # average the forward and backward distributions
    probs = [np.mean(k) for k in zip(probs_left, probs_right)]

    # keep the 30 most probable candidates; the last vocabulary index
    # is <unk>, so it is excluded
    top_words = []
    for index in range(len(probs)):
        if len(top_words) < 30:
            top_words.append((probs[index], index))
        else:
            worst_word = min(top_words, key=lambda w: w[0])
            if worst_word[0] < probs[index] and index != len(probs) - 1:
                top_words.remove(worst_word)
                top_words.append((probs[index], index))

    prediction = ''
    sum_prob = 0.0
    for word_prob, word_index in top_words:
        sum_prob += word_prob
        word_text = index_to_key[word_index]
        prediction += f'{word_text}:{word_prob} '
    prediction += f':{1 - sum_prob}'

    return prediction
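
# For reference (added; the values are illustrative only): each line produced
# by predict_probs follows the challenge's expected output format, e.g.
#   the:0.21 a:0.09 his:0.04 :0.66
# where the trailing ':p' entry is the probability mass left for all other words.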
# In[16]:


dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
# In[39]:


with open('dev-0/out.tsv', 'w') as file:
    for index, row in dev_data.iterrows():
        left_text = clean_text(str(row[6]))
        right_text = clean_text(str(row[7]))
        left_words = word_tokenize(left_text)
        right_words = word_tokenize(right_text)
        # the backward model consumes the right context in reverse order,
        # so the token nearest the gap comes last
        right_words.reverse()
        if len(left_words) < 6 or len(right_words) < 6:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:], right_words[-5:])
        file.write(prediction + '\n')


# In[41]:


with open('test-A/out.tsv', 'w') as file:
    for index, row in test_data.iterrows():
        left_text = clean_text(str(row[6]))
        right_text = clean_text(str(row[7]))
        left_words = word_tokenize(left_text)
        right_words = word_tokenize(right_text)
        right_words.reverse()
        if len(left_words) < 6 or len(right_words) < 6:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:], right_words[-5:])
        file.write(prediction + '\n')
Binary file not shown.
File diff suppressed because it is too large
File diff suppressed because it is too large
Binary file not shown.