#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import regex as re
import csv
import torch
from torch import nn
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


# In[2]:


torch.cuda.empty_cache()
device = 'cuda' if torch.cuda.is_available() else 'cpu'


# In[3]:


def clean_text(text):
    # Lowercase, rejoin words split across lines ('-\n'), and turn remaining literal '\n' markers into spaces.
    text = text.lower().replace('-\\n', '').replace('\\n', ' ')
    # Expand common contractions before punctuation (including apostrophes) is stripped,
    # otherwise the replacements below would never match.
    text = text.replace("'t", " not").replace("'s", " is").replace("'ll", " will").replace("'m", " am").replace("'ve", " have")
    # Remove all Unicode punctuation.
    text = re.sub(r'\p{P}', '', text)
    return text


# In[4]:


train_data = pd.read_csv('train/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv('train/expected.tsv', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)

# Keep only the left (column 6) and right (column 7) contexts and append the expected gap word (column 0).
train_data = train_data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)


# In[5]:


class TrainCorpus:
    def __init__(self, data):
        self.data = data

    def __iter__(self):
        for _, row in self.data.iterrows():
            # Join left context, gap word and right context into one training sentence.
            text = str(row[6]) + ' ' + str(row[0]) + ' ' + str(row[7])
            text = clean_text(text)
            yield word_tokenize(text)


# In[6]:


train_sentences = TrainCorpus(train_data.head(80000))
w2v_model = Word2Vec(vector_size=100, min_count=10)


# In[7]:


# Only the Word2Vec vocabulary is used here (as a token <-> index mapping); its embeddings are never trained.
w2v_model.build_vocab(corpus_iterable=train_sentences)
key_to_index = w2v_model.wv.key_to_index
index_to_key = w2v_model.wv.index_to_key

# Add an empty-string entry that serves as the out-of-vocabulary token.
index_to_key.append('')
key_to_index[''] = len(index_to_key) - 1

vocab_size = len(index_to_key)
print(vocab_size)


# In[8]:


class TrainDataset(torch.utils.data.IterableDataset):
    def __init__(self, data, index_to_key, key_to_index, reversed=False):
        self.reversed = reversed
        self.data = data
        self.index_to_key = index_to_key
        self.key_to_index = key_to_index
        self.vocab_size = len(key_to_index)

    def __iter__(self):
        for _, row in self.data.iterrows():
            text = str(row[6]) + ' ' + str(row[0]) + ' ' + str(row[7])
            text = clean_text(text)
            tokens = word_tokenize(text)
            if self.reversed:
                tokens = list(reversed(tokens))
            # Slide a 5-token window over the text; the target is the same window shifted one token ahead.
            for i in range(5, len(tokens), 1):
                input_context = tokens[i-5:i]
                target_context = tokens[i-4:i+1]
                input_embed = [self.key_to_index[word] if word in self.key_to_index else self.key_to_index[''] for word in input_context]
                target_embed = [self.key_to_index[word] if word in self.key_to_index else self.key_to_index[''] for word in target_context]
                yield np.asarray(input_embed, dtype=np.int64), np.asarray(target_embed, dtype=np.int64)


# In[9]:


class Model(nn.Module):
    def __init__(self, embed_size, vocab_size):
        super(Model, self).__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.lstm_size = 128
        self.num_layers = 2
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_size)
        # batch_first=True so the (batch, sequence) layout produced by the DataLoader is interpreted correctly.
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.lstm_size,
                            num_layers=self.num_layers, dropout=0.2, batch_first=True)
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embed(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, batch_size):
        # Hidden and cell states have shape (num_layers, batch, lstm_size).
        zeros = torch.zeros(self.num_layers, batch_size, self.lstm_size).to(device)
        return (zeros, zeros)
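# Sanity-check cell (not part of the original notebook): push a dummy batch of token
# indices through an untrained Model to confirm that, with batch_first=True, the logits
# come out as (batch, sequence, vocab_size). The batch size of 4 and the 5-token context
# length mirror the window used by TrainDataset; the vocabulary size of 1000 is an
# arbitrary assumption for illustration only.

# In[ ]:


_check_vocab = 1000
_check_model = Model(100, _check_vocab).to(device)
_check_x = torch.randint(0, _check_vocab, (4, 5), dtype=torch.int64).to(device)
with torch.no_grad():
    _check_logits, (_check_h, _check_c) = _check_model(_check_x)
print(_check_logits.shape)  # expected: torch.Size([4, 5, 1000])
print(_check_h.shape)       # expected: torch.Size([2, 4, 128]) -- (num_layers, batch, lstm_size)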
# In[10]:


from torch.utils.data import DataLoader
from torch.optim import Adam


def train(dataset, model, max_epochs, batch_size):
    model.train()
    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)

            y_pred, (state_h, state_c) = model(x)
            # CrossEntropyLoss expects (batch, vocab, sequence) logits against (batch, sequence) targets.
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

            if batch % 1000 == 0:
                print(f'epoch: {epoch}, batch: {batch}, loss: {loss.item()}')


# In[11]:


# The forward model reads the first 80,000 rows left-to-right; the backward model reads the last 80,000 rows reversed.
train_dataset_front = TrainDataset(train_data.head(80000), index_to_key, key_to_index, False)
train_dataset_back = TrainDataset(train_data.tail(80000), index_to_key, key_to_index, True)


# In[12]:


model_front = Model(100, vocab_size).to(device)
model_back = Model(100, vocab_size).to(device)


# In[13]:


train(train_dataset_front, model_front, 1, 64)


# In[14]:


train(train_dataset_back, model_back, 1, 64)


# In[30]:


def predict_probs(left_tokens, right_tokens):
    model_front.eval()
    model_back.eval()

    # Map both five-token contexts to vocabulary indices, falling back to the out-of-vocabulary token.
    x_left = torch.tensor([[key_to_index[w] if w in key_to_index else key_to_index[''] for w in left_tokens]]).to(device)
    x_right = torch.tensor([[key_to_index[w] if w in key_to_index else key_to_index[''] for w in right_tokens]]).to(device)

    y_pred_left, (state_h_left, state_c_left) = model_front(x_left)
    y_pred_right, (state_h_right, state_c_right) = model_back(x_right)

    # Take the distribution predicted after the last token of each context and average the two directions.
    last_word_logits_left = y_pred_left[0][-1]
    last_word_logits_right = y_pred_right[0][-1]
    probs_left = torch.nn.functional.softmax(last_word_logits_left, dim=0).detach().cpu().numpy()
    probs_right = torch.nn.functional.softmax(last_word_logits_right, dim=0).detach().cpu().numpy()
    probs = [np.mean(k) for k in zip(probs_left, probs_right)]

    # Keep the 30 most probable words, skipping the out-of-vocabulary index (the last one).
    top_words = []
    for index in range(len(probs)):
        if len(top_words) < 30:
            top_words.append((probs[index], index))
        else:
            worst_word = min(top_words, key=lambda word: word[0])
            if worst_word[0] < probs[index] and index != len(probs) - 1:
                top_words.remove(worst_word)
                top_words.append((probs[index], index))

    # Format the output as 'word:prob ... :remaining_probability_mass'.
    prediction = ''
    sum_prob = 0.0
    for word_prob, word_id in top_words:
        sum_prob += word_prob
        word_text = index_to_key[word_id]
        prediction += f'{word_text}:{word_prob} '
    prediction += f':{1 - sum_prob}'
    return prediction


# In[16]:


dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)


# In[39]:


with open('dev-0/out.tsv', 'w') as file:
    for index, row in dev_data.iterrows():
        left_text = clean_text(str(row[6]))
        right_text = clean_text(str(row[7]))
        left_words = word_tokenize(left_text)
        right_words = word_tokenize(right_text)
        # The backward model was trained on reversed text, so the right context is fed in reverse.
        right_words.reverse()

        if len(left_words) < 6 or len(right_words) < 6:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:], right_words[-5:])
        file.write(prediction + '\n')


# In[41]:


with open('test-A/out.tsv', 'w') as file:
    for index, row in test_data.iterrows():
        left_text = clean_text(str(row[6]))
        right_text = clean_text(str(row[7]))
        left_words = word_tokenize(left_text)
        right_words = word_tokenize(right_text)
        right_words.reverse()

        if len(left_words) < 6 or len(right_words) < 6:
            prediction = ':1.0'
        else:
            prediction = predict_probs(left_words[-5:], right_words[-5:])
        file.write(prediction + '\n')
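# Usage sketch (not part of the original notebook): call predict_probs on a hand-written
# pair of contexts and print the resulting 'word:prob ... :rest' line, the same format
# written to dev-0/out.tsv and test-A/out.tsv. The example sentence is an arbitrary
# assumption for illustration; any input with at least five tokens on each side works.

# In[ ]:


example_left = word_tokenize(clean_text('the people of this town gathered at the'))
example_right = word_tokenize(clean_text('hall to hear the mayor speak last night'))
example_right.reverse()
if len(example_left) >= 5 and len(example_right) >= 5:
    print(predict_probs(example_left[-5:], example_right[-5:]))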