Replace the KenLM notebook export in run.py with a PyTorch bigram model and
keep the previous script unchanged as run3.py.

run.py now reads the first 35000 training rows, builds a vocabulary capped
at 30000 tokens, trains an embedding -> linear -> softmax bigram model on the
CPU, saves the weights to model1.bin and writes dev-0/out.tsv and
test-A/out.tsv.

Review amendments to the added lines of run.py:
  * call predict() instead of the fallback string `prediction` at the end of
    the script (the string is not callable);
  * stop passing a second argument to get_words_from_line(), and condition the
    prediction on the last non-empty token instead of the trailing ''
    boundary token; fall back to `prediction` when a row has no tokens;
  * do not rebind the parameter `f` to the output file handle;
  * give nn.Softmax an explicit dim;
  * add one-line docstrings and a final newline.
Hunk line counts are updated accordingly; the e252b0b post-image id on the
run.py index line predates these amendments.

diff --git a/run.py b/run.py
old mode 100755
new mode 100644
index dca5564..e252b0b
--- a/run.py
+++ b/run.py
@@ -1,22 +1,74 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# In[2]:
-
-
-from nltk import trigrams, word_tokenize
-import pandas as pd
-import csv
+from itertools import islice
 import regex as re
-from collections import Counter, defaultdict
-import kenlm
-from english_words import english_words_alpha_set
-from math import log10
+import sys
+from torchtext.vocab import build_vocab_from_iterator
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+import pandas as pd
+from torch.utils.data import DataLoader
+import csv
+
+def data_preprocessing(text):
+    """Lower-case text, drop escaped line breaks, expand 'll/'ve/'s and strip punctuation."""
+    return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' ').replace("'ll", " will").replace("-", "").replace("'ve", " have").replace("'s", " is"))
+
+def get_words_from_line(line):
+    """Yield the lower-cased tokens of *line*, wrapped in '' boundary tokens."""
+    line = line.rstrip()
+    yield ''
+    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+        yield m.group(0).lower()
+    yield ''
 
 
-# In[3]:
+def get_word_lines_from_file(data):
+    """Yield one token generator per line of *data*."""
+    for line in data:
+        yield get_words_from_line(line)
 
 
+class SimpleBigramNeuralLanguageModel(nn.Module):
+    """Bigram language model: embedding -> linear -> softmax over the vocabulary."""
+    def __init__(self, vocabulary_size, embedding_size):
+        super(SimpleBigramNeuralLanguageModel, self).__init__()
+        self.model = nn.Sequential(
+            nn.Embedding(vocabulary_size, embedding_size),
+            nn.Linear(embedding_size, vocabulary_size),
+            nn.Softmax(dim=-1)
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+
+def look_ahead_iterator(gen):
+    """Yield (previous, current) pairs for consecutive items of *gen*."""
+    prev = None
+    for item in gen:
+        if prev is not None:
+            yield (prev, item)
+        prev = item
+
+class Bigrams(IterableDataset):
+    """Iterable dataset of consecutive token-id pairs drawn from *text_file*."""
+    def __init__(self, text_file, vocabulary_size):
+        self.vocab = build_vocab_from_iterator(
+            get_word_lines_from_file(text_file),
+            max_tokens = vocabulary_size,
+            specials = [''])
+        self.vocab.set_default_index(self.vocab[''])
+        self.vocabulary_size = vocabulary_size
+        self.text_file = text_file
+
+    def __iter__(self):
+        return look_ahead_iterator(
+            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
+
+in_file = 'train/in.tsv.xz'
+out_file = 'train/expected.tsv'
+
 train_set = pd.read_csv(
     'train/in.tsv.xz',
     sep='\t',
@@ -31,116 +83,74 @@ train_labels = pd.read_csv(
     quoting=csv.QUOTE_NONE,
     nrows=35000)
 
-
-# In[4]:
-
-
 data = pd.concat([train_set, train_labels], axis=1)
-
-
-# In[5]:
-
-
 data = train_set[6] + train_set[0] + train_set[7]
-
-
-# In[6]:
-
-
-def data_preprocessing(text):
-    return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' ').replace("'ll", " will").replace("-", "").replace("'ve", " have").replace("'s", " is"))
-
-
-# In[8]:
-
-
 data = data.apply(data_preprocessing)
+
+vocab_size = 30000
+embed_size = 150
+
+
+bigram_data = Bigrams(data, vocab_size)
+
+device = 'cpu'
+model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+data = DataLoader(bigram_data, batch_size=5000)
+optimizer = torch.optim.Adam(model.parameters())
+criterion = torch.nn.NLLLoss()
+
+model.train()
+step = 0
+for x, y in data:
+    x = x.to(device)
+    y = y.to(device)
+    optimizer.zero_grad()
+    ypredicted = model(x)
+    loss = criterion(torch.log(ypredicted), y)
+    if step % 100 == 0:
+        print(step, loss)
+    step += 1
+    loss.backward()
+    optimizer.step()
+
+torch.save(model.state_dict(), 'model1.bin')
+
+vocab = bigram_data.vocab
 prediction = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'
 
-
-# In[25]:
-
-
-with open("train_file.txt", "w+") as f:
-    for text in data:
-        f.write(text + "\n")
-
-
-# In[27]:
-
-
-KENLM_BUILD_PATH='../kenlm/build/bin/lmplz'
-
-
-# In[28]:
-
-
-get_ipython().system('$KENLM_BUILD_PATH -o 4 < train_file.txt > kenlm_model.arpa')
-
-
-# In[29]:
-
-
-import os
-print(os.getcwd())
-model = kenlm.Model('kenlm_model.arpa')
-
-
-# In[30]:
-
-
-def predict(before, after):
-    result = ''
-    prob = 0.0
-    best = []
-    for word in english_words_alpha_set:
-        text = ' '.join([before, word, after])
-        text_score = model.score(text, bos=False, eos=False)
-        if len(best) < 12:
-            best.append((word, text_score))
-        else:
-            is_better = False
-            worst_score = None
-            for score in best:
-                if not worst_score:
-                    worst_score = score
-                else:
-                    if worst_score[1] > score[1]:
-                        worst_score = score
-            if worst_score[1] < text_score:
-                best.remove(worst_score)
-                best.append((word, text_score))
-    probs = sorted(best, key=lambda tup: tup[1], reverse=True)
-    pred_str = ''
-    for word, prob in probs:
-        pred_str += f'{word}:{prob} '
-    pred_str += f':{log10(0.99)}'
+def predict_word(w):
+    """Return the 8 most probable successors of the token list *w* as 'word:prob ' pairs."""
+    ixs = torch.tensor(vocab.forward(w)).to(device)
+    out = model(ixs)
+    top = torch.topk(out[0], 8)
+    top_indices = top.indices.tolist()
+    top_probs = top.values.tolist()
+    top_words = vocab.lookup_tokens(top_indices)
+    pred_str = ""
+    for word, prob in zip(top_words, top_probs):
+        pred_str += f"{word}:{prob} "
     return pred_str
 
 
-# In[31]:
+def predict(f):
+    """Write next-word predictions for column 6 of {f}/in.tsv.xz to {f}/out.tsv."""
+    x = pd.read_csv(f'{f}/in.tsv.xz', sep='\t', header=None, quoting=csv.QUOTE_NONE, on_bad_lines='skip', encoding="UTF-8")[6]
+    x = x.apply(data_preprocessing)
 
-
-def make_prediction(path, result_path):
-    data = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
-    with open(result_path, 'w', encoding='utf-8') as file_out:
-        for _, row in data.iterrows():
-            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))
-            if len(before) < 2 or len(after) < 2:
-                pred = prediction
+    with open(f'{f}/out.tsv', "w+", encoding="UTF-8") as out:
+        for row in x:
+            # Keep the last real token; '' only marks the line boundaries.
+            before = None
+            for token in get_words_from_line(str(row)):
+                if token:
+                    before = token
+            if before is None:
+                pred_str = prediction
             else:
-                pred = predict(before[-1], after[0])
-            file_out.write(pred + '\n')
+                pred_str = predict_word([before])
+            pred_str = pred_str.strip()
+            out.write(pred_str + "\n")
 
 
-# In[32]:
-
-
-make_prediction("dev-0/in.tsv.xz", "dev-0/out.tsv")
-
-
-# In[33]:
-
-
-make_prediction("test-A/in.tsv.xz", "test-A/out.tsv")
-
+predict("dev-0")
+predict("test-A")
diff --git a/run3.py b/run3.py
new file mode 100755
index 0000000..dca5564
--- /dev/null
+++ b/run3.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[2]:
+
+
+from nltk import trigrams, word_tokenize
+import pandas as pd
+import csv
+import regex as re
+from collections import Counter, defaultdict
+import kenlm
+from english_words import english_words_alpha_set
+from math import log10
+
+
+# In[3]:
+
+
+train_set = pd.read_csv(
+    'train/in.tsv.xz',
+    sep='\t',
+    header=None,
+    quoting=csv.QUOTE_NONE,
+    nrows=35000)
+
+train_labels = pd.read_csv(
+    'train/expected.tsv',
+    sep='\t',
+    header=None,
+    quoting=csv.QUOTE_NONE,
+    nrows=35000)
+
+
+# In[4]:
+
+
+data = pd.concat([train_set, train_labels], axis=1)
+
+
+# In[5]:
+
+
+data = train_set[6] + train_set[0] + train_set[7]
+
+
+# In[6]:
+
+
+def data_preprocessing(text):
+    return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' ').replace("'ll", " will").replace("-", "").replace("'ve", " have").replace("'s", " is"))
+
+
+# In[8]:
+
+
+data = data.apply(data_preprocessing)
+prediction = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'
+
+
+# In[25]:
+
+
+with open("train_file.txt", "w+") as f:
+    for text in data:
+        f.write(text + "\n")
+
+
+# In[27]:
+
+
+KENLM_BUILD_PATH='../kenlm/build/bin/lmplz'
+
+
+# In[28]:
+
+
+get_ipython().system('$KENLM_BUILD_PATH -o 4 < train_file.txt > kenlm_model.arpa')
+
+
+# In[29]:
+
+
+import os
+print(os.getcwd())
+model = kenlm.Model('kenlm_model.arpa')
+
+
+# In[30]:
+
+
+def predict(before, after):
+    result = ''
+    prob = 0.0
+    best = []
+    for word in english_words_alpha_set:
+        text = ' '.join([before, word, after])
+        text_score = model.score(text, bos=False, eos=False)
+        if len(best) < 12:
+            best.append((word, text_score))
+        else:
+            is_better = False
+            worst_score = None
+            for score in best:
+                if not worst_score:
+                    worst_score = score
+                else:
+                    if worst_score[1] > score[1]:
+                        worst_score = score
+            if worst_score[1] < text_score:
+                best.remove(worst_score)
+                best.append((word, text_score))
+    probs = sorted(best, key=lambda tup: tup[1], reverse=True)
+    pred_str = ''
+    for word, prob in probs:
+        pred_str += f'{word}:{prob} '
+    pred_str += f':{log10(0.99)}'
+    return pred_str
+
+
+# In[31]:
+
+
+def make_prediction(path, result_path):
+    data = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
+    with open(result_path, 'w', encoding='utf-8') as file_out:
+        for _, row in data.iterrows():
+            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))
+            if len(before) < 2 or len(after) < 2:
+                pred = prediction
+            else:
+                pred = predict(before[-1], after[0])
+            file_out.write(pred + '\n')
+
+
+# In[32]:
+
+
+make_prediction("dev-0/in.tsv.xz", "dev-0/out.tsv")
+
+
+# In[33]:
+
+
+make_prediction("test-A/in.tsv.xz", "test-A/out.tsv")
+