final solution
This commit is contained in:
parent d8961a9410
commit 6f93a318b8
3 .gitignore vendored Normal file
@@ -0,0 +1,3 @@
.ipynb_checkpoints/
processed_train.txt
model/
1 config.txt Normal file
@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
10519 dev-0/expected.tsv Normal file
File diff suppressed because it is too large
BIN dev-0/in.tsv.xz Normal file
Binary file not shown.
10519 dev-0/out.tsv Normal file
File diff suppressed because it is too large
1 in-header.tsv Normal file
@@ -0,0 +1 @@
FileId Year LeftContext RightContext
1 out-header.tsv Normal file
@@ -0,0 +1 @@
Word
281 run.py Normal file
@@ -0,0 +1,281 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import regex as re
import csv
import torch
from torch import nn
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

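# Note: word_tokenize relies on NLTK's 'punkt' tokenizer data; if it is not installed,
# uncomment the following two lines once before running the script.
# import nltk
# nltk.download('punkt')
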
# In[2]:

torch.cuda.empty_cache()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# In[3]:

def clean_text(text):
    # lowercase, join words hyphenated across lines and turn escaped newlines into spaces
    text = text.lower().replace('-\\n', '').replace('\\n', ' ')
    # strip punctuation and expand the most common contractions
    text = re.sub(r'\p{P}', '', text)
    text = text.replace("'t", " not").replace("'s", " is").replace("'ll", " will").replace("'m", " am").replace("'ve", " have")

    return text

# In[4]:

train_data = pd.read_csv('train/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv('train/expected.tsv', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)

# keep columns 6 and 7 and append the expected gap word from train_labels
train_data = train_data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)

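# Assumed layout (not verified here): columns 6 and 7 of train/in.tsv.xz hold the left
# and right context of the gap, and train/expected.tsv holds the missing word, so after
# the concat above each row is (left context, right context, gap word).
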
# In[5]:

class TrainCorpus:
    def __init__(self, data):
        self.data = data

    def __iter__(self):
        # yield one tokenized sentence per row, built from row[6], the gap word (row[0]) and row[7]
        for _, row in self.data.iterrows():
            text = str(row[6]) + str(row[0]) + str(row[7])
            text = clean_text(text)
            yield word_tokenize(text)

# In[6]:

train_sentences = TrainCorpus(train_data.head(80000))
w2v_model = Word2Vec(vector_size=100, min_count=10)

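# Word2Vec is used here only to build the vocabulary (build_vocab in the next cell);
# its embeddings are never trained or queried; the LSTM learns its own nn.Embedding.
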
# In[7]:

w2v_model.build_vocab(corpus_iterable=train_sentences)

key_to_index = w2v_model.wv.key_to_index
index_to_key = w2v_model.wv.index_to_key

# add an out-of-vocabulary token at the end of the vocabulary
index_to_key.append('<unk>')
key_to_index['<unk>'] = len(index_to_key) - 1

vocab_size = len(index_to_key)
print(vocab_size)

# In[8]:

class TrainDataset(torch.utils.data.IterableDataset):
    def __init__(self, data, index_to_key, key_to_index, reversed=False):
        self.reversed = reversed
        self.data = data
        self.index_to_key = index_to_key
        self.key_to_index = key_to_index
        self.vocab_size = len(key_to_index)

    def __iter__(self):
        for _, row in self.data.iterrows():
            text = str(row[6]) + str(row[0]) + str(row[7])
            text = clean_text(text)
            tokens = word_tokenize(text)
            if self.reversed:
                tokens = list(reversed(tokens))
            for i in range(5, len(tokens), 1):
                # sliding window: a 5-token context and the same window shifted by one as the target
                input_context = tokens[i-5:i]
                target_context = tokens[i-4:i+1]

                input_embed = [self.key_to_index[word] if word in self.key_to_index else self.key_to_index['<unk>'] for word in input_context]
                target_embed = [self.key_to_index[word] if word in self.key_to_index else self.key_to_index['<unk>'] for word in target_context]

                yield np.asarray(input_embed, dtype=np.int64), np.asarray(target_embed, dtype=np.int64)

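# Example of one sample (hypothetical tokens, for illustration): for
# tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat'] the first window yields
#   input  = ids of ['the', 'cat', 'sat', 'on', 'the']
#   target = ids of ['cat', 'sat', 'on', 'the', 'mat']
# i.e. the target is the input shifted one position forward.
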
# In[9]:

class Model(nn.Module):
    def __init__(self, embed_size, vocab_size):
        super(Model, self).__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.lstm_size = 128
        self.num_layers = 2

        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_size)
        # batch_first=True so the LSTM recurs over the 5-token context, not over the batch dimension
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.lstm_size, num_layers=self.num_layers, dropout=0.2, batch_first=True)
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embed(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        zeros = torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device)
        return (zeros, zeros)

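# Shape sketch, assuming batch_first=True as set above (illustrative only):
#   x: (batch, 5) int64 -> embed: (batch, 5, 100) -> lstm: (batch, 5, 128) -> logits: (batch, 5, vocab_size)
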
# In[10]:

from torch.utils.data import DataLoader
from torch.optim import Adam

def train(dataset, model, max_epochs, batch_size):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            x = x.to(device)
            y = y.to(device)

            y_pred, (state_h, state_c) = model(x)
            # CrossEntropyLoss expects (batch, vocab, seq) logits against (batch, seq) targets
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

            if batch % 1000 == 0:
                print(f'epoch: {epoch}, update in batch {batch}/???, loss: {loss.item()}')

# In[11]:

# the forward model reads the left context; the backward model reads the reversed right context
train_dataset_front = TrainDataset(train_data.head(80000), index_to_key, key_to_index, False)
train_dataset_back = TrainDataset(train_data.tail(80000), index_to_key, key_to_index, True)

# In[12]:

model_front = Model(100, vocab_size).to(device)
model_back = Model(100, vocab_size).to(device)

# In[13]:

train(train_dataset_front, model_front, 1, 64)

# In[14]:

train(train_dataset_back, model_back, 1, 64)

# In[30]:

def predict_probs(left_tokens, right_tokens):
    model_front.eval()
    model_back.eval()

    x_left = torch.tensor([[train_dataset_front.key_to_index[w] if w in key_to_index else train_dataset_front.key_to_index['<unk>'] for w in left_tokens]]).to(device)
    x_right = torch.tensor([[train_dataset_front.key_to_index[w] if w in key_to_index else train_dataset_front.key_to_index['<unk>'] for w in right_tokens]]).to(device)
    y_pred_left, (state_h_left, state_c_left) = model_front(x_left)
    y_pred_right, (state_h_right, state_c_right) = model_back(x_right)

    # distributions over the gap word, one predicted from the left and one from the right context
    last_word_logits_left = y_pred_left[0][-1]
    last_word_logits_right = y_pred_right[0][-1]
    probs_left = torch.nn.functional.softmax(last_word_logits_left, dim=0).detach().cpu().numpy()
    probs_right = torch.nn.functional.softmax(last_word_logits_right, dim=0).detach().cpu().numpy()

    # average the forward and backward distributions
    probs = [np.mean(k) for k in zip(probs_left, probs_right)]

    # keep the 30 most probable words; the last vocabulary index ('<unk>') is never swapped in
    top_words = []
    for index in range(len(probs)):
        if len(top_words) < 30:
            top_words.append((probs[index], [index]))
        else:
            worst_word = None
            for word in top_words:
                if not worst_word:
                    worst_word = word
                elif word[0] < worst_word[0]:
                    worst_word = word
            if worst_word[0] < probs[index] and index != len(probs) - 1:
                top_words.remove(worst_word)
                top_words.append((probs[index], [index]))

    # output line: 'word:prob' pairs followed by ':remaining_probability_mass'
    prediction = ''
    sum_prob = 0.0
    for word in top_words:
        sum_prob += word[0]
        word_prob = word[0]
        word_text = index_to_key[word[1][0]]
        prediction += f'{word_text}:{word_prob} '
    prediction += f':{1 - sum_prob}'

    return prediction

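# Usage note: both arguments are expected to be 5-token lists, and the right context must
# already be reversed (as done in the prediction loops below), so the backward model
# predicts the gap word as its "next" token.
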
# In[16]:

dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)

# In[39]:

with open('dev-0/out.tsv', 'w') as file:
    for index, row in dev_data.iterrows():
        left_text = clean_text(str(row[6]))
        right_text = clean_text(str(row[7]))
        left_words = word_tokenize(left_text)
        right_words = word_tokenize(right_text)
        right_words.reverse()
        if len(left_words) < 6 or len(right_words) < 6:
            # too little context on one side: fall back to an uninformative prediction
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:], right_words[-5:])
        file.write(prediction + '\n')

# In[41]:

with open('test-A/out.tsv', 'w') as file:
    for index, row in test_data.iterrows():
        left_text = clean_text(str(row[6]))
        right_text = clean_text(str(row[7]))
        left_words = word_tokenize(left_text)
        right_words = word_tokenize(right_text)
        right_words.reverse()
        if len(left_words) < 6 or len(right_words) < 6:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:], right_words[-5:])
        file.write(prediction + '\n')
BIN test-A/in.tsv.xz Normal file
Binary file not shown.
7414 test-A/out.tsv Normal file
File diff suppressed because it is too large
432022 train/expected.tsv Normal file
File diff suppressed because it is too large
BIN train/in.tsv.xz Normal file
Binary file not shown.