Use nltk and pandas.
This commit is contained in:
parent
730e401d24
commit
ca72f4ea4a
157
run.py
157
run.py
@ -1,111 +1,66 @@
|
||||
from encodings import search_function
|
||||
import lzma
|
||||
from re import L
|
||||
import regex as re
|
||||
import string
|
||||
import queue
|
||||
# text = lzma.open('train/in.tsv.xz').read()
|
||||
def read_file(file):
    """Yield the cleaned word list of column 7 for every tab-separated line.

    For each line of *file* the eighth tab-separated field is taken, literal
    backslash-n sequences become spaces, real newlines are dropped, the text
    is lower-cased, runs of spaces are collapsed to one, and every character
    that is not a word character, apostrophe or whitespace is removed.  The
    result is split on single spaces, so a field that starts with a space
    yields an empty string as its first element.

    Raises IndexError for a line with fewer than eight fields.
    """
    for raw_line in file:
        column = raw_line.split("\t")[7]
        flattened = column.replace("\\n", " ").replace("\n", "").lower()
        single_spaced = re.sub(' +', ' ', flattened)
        cleaned = re.sub(r"[^\w\d'\s]+", '', single_spaced)
        yield cleaned.split(" ")
|
||||
|
||||
def get_words(file):
    """Yield every word of *file* one by one.

    Flattens the per-line word lists produced by ``read_file`` into a single
    stream, preserving order (empty-string tokens included).
    """
    for line_words in read_file(file):
        for word in line_words:
            yield word
|
||||
|
||||
def set_bigram_count(first_word, second_word, bigrams):
    """Add one occurrence of the pair to the *bigrams* counter, in place.

    The pair is stored under the key ``"<first_word>_<second_word>"``; a key
    seen for the first time starts at 1.  Returns None.
    """
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1
|
||||
|
||||
def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Add one occurrence of the triple to the *trigrams* counter, in place.

    The triple is stored under the key
    ``"<first_word>_<second_word>_<third_word>"``; a key seen for the first
    time starts at 1.  Returns None.
    """
    key = f"{first_word}_{second_word}_{third_word}"
    trigrams[key] = trigrams.get(key, 0) + 1
|
||||
|
||||
def load_train():
    """Count leading bigrams and (expected word + bigram) trigrams, then print them.

    ``train/in.tsv.xz`` is read through ``read_file`` while
    ``train/expected.tsv`` is consumed one line per training row, so row *i*
    of the input is paired with line *i* of the expected file.  For every row
    the first two words (skipping a leading empty token) form the bigram, and
    the cleaned expected word is prepended to form the trigram.

    Rows that do not contain two usable words are skipped instead of raising
    IndexError; their expected line is still consumed so the two files stay
    aligned.  Both files are closed when the function returns.
    """
    trigrams = {}
    bigrams = {}
    with open('train/expected.tsv', 'r') as expected, \
            lzma.open('train/in.tsv.xz', mode='rt') as file:
        for words in read_file(file):
            # Read the expected line before any skip so the streams stay in step.
            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
            # A leading empty token means the text began with a space.
            mv = 0 if words[0] else 1
            if len(words) < mv + 2:
                continue
            set_bigram_count(words[0+mv], words[1+mv], bigrams)
            set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
    print(bigrams)
    print(trigrams)
|
||||
import pandas as pd
|
||||
import csv
|
||||
from collections import Counter, defaultdict
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
from nltk import trigrams
|
||||
|
||||
|
||||
class WordPred:
|
||||
|
||||
def predict(search_for_words):
|
||||
trigrams = {}
|
||||
bigrams = {}
|
||||
index = 0
|
||||
expected = open('train/expected.tsv', 'r')
|
||||
with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
||||
for words in read_file(file):
|
||||
expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
|
||||
mv = 0
|
||||
if not words[0]:
|
||||
mv = 1
|
||||
for search_for_word in search_for_words:
|
||||
if search_for_word[0] == words[0+mv] and search_for_word[1] == words[1+mv]:
|
||||
set_bigram_count(words[0+mv], words[1+mv], bigrams)
|
||||
set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
|
||||
def __init__(self):
|
||||
self.tokenizer = RegexpTokenizer(r"\w+")
|
||||
self.model = defaultdict(lambda: defaultdict(lambda: 0))
|
||||
|
||||
if index == 100000:
|
||||
break
|
||||
index += 1
|
||||
def read_train_data(self, file):
|
||||
data = pd.read_csv(file, compression='xz', sep="\t", error_bad_lines=False, index_col=0, header=None)
|
||||
for row in data[:140000].itertuples():
|
||||
if len(row)<8:
|
||||
continue
|
||||
text = str(row[6]) + ' ' + str(row[7])
|
||||
tokens = self.tokenizer.tokenize(text)
|
||||
for w1, w2, w3 in trigrams(tokens, pad_right=True, pad_left=True):
|
||||
if w1 and w2 and w3:
|
||||
self.model[(w2, w3)][w1] += 1
|
||||
|
||||
print(len(search_for_words))
|
||||
print(len(bigrams))
|
||||
print(len(trigrams))
|
||||
for word_pair in self.model:
|
||||
num_n_grams = float(sum(self.model[word_pair].values()))
|
||||
for word in self.model[word_pair]:
|
||||
self.model[word_pair][word] /= num_n_grams
|
||||
|
||||
left_context_search_for_word = {}
|
||||
for bigram in bigrams:
|
||||
max_count = 0
|
||||
for trigram in trigrams:
|
||||
if bigram == '_'.join(trigram.split("_")[1:3]) and trigrams[trigram] > max_count:
|
||||
max_count = trigrams[trigram]
|
||||
left_context = trigram.split("_")[0]
|
||||
left_context_search_for_word[bigram] = left_context
|
||||
def generate_outputs(self, input_file, output_file):
|
||||
data = pd.read_csv(input_file, compression='xz', sep='\t', error_bad_lines=False, index_col=0, header=None, quoting=csv.QUOTE_NONE)
|
||||
with open(output_file, 'w') as f:
|
||||
for row in data.iterrows():
|
||||
text = str(row[7])
|
||||
tokens = self.tokenizer.tokenize(text)
|
||||
if len(tokens) < 4:
|
||||
prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
|
||||
else:
|
||||
prediction = word_gap_prediction.predict_probs(tokens[0], tokens[1])
|
||||
f.write(prediction + '\n')
|
||||
|
||||
for index, search_for_word in enumerate(search_for_words):
|
||||
hash_search_for_word = '_'.join(search_for_word)
|
||||
if hash_search_for_word in left_context_search_for_word:
|
||||
left_context = left_context_search_for_word[hash_search_for_word]
|
||||
print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}")
|
||||
def predict_probs(self, word1, word2):
|
||||
predictions = dict(self.model[word1, word2])
|
||||
most_common = dict(Counter(predictions).most_common(6))
|
||||
|
||||
total_prob = 0.0
|
||||
str_prediction = ''
|
||||
|
||||
for word, prob in most_common.items():
|
||||
total_prob += prob
|
||||
str_prediction += f'{word}:{prob} '
|
||||
|
||||
if total_prob == 0.0:
|
||||
return 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
|
||||
|
||||
if 1 - total_prob >= 0.01:
|
||||
str_prediction += f":{1-total_prob}"
|
||||
else:
|
||||
print(f"{index+1}: ??? {' '.join(search_for_word)}")
|
||||
str_prediction += f":0.01"
|
||||
|
||||
def load_dev():
|
||||
search_for_words = []
|
||||
with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
|
||||
index = 0
|
||||
for words in read_file(file):
|
||||
if words[0]:
|
||||
search_for_words.append([words[0], words[1]])
|
||||
else:
|
||||
search_for_words.append([words[1], words[2]])
|
||||
if index == 100:
|
||||
break
|
||||
index += 1
|
||||
print(search_for_words)
|
||||
return search_for_words
|
||||
|
||||
if __name__ == "__main__":
|
||||
# load_train()
|
||||
# load_dev()
|
||||
predict(load_dev())
|
||||
# with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
||||
# index = 0
|
||||
# for _ in get_words(file):
|
||||
# index += 1
|
||||
# print(index) # 141820215
|
||||
return str_prediction
|
||||
|
||||
word_gap_prediction = WordPred()
|
||||
word_gap_prediction.read_train_data('./train/in.tsv.xz')
|
||||
# word_gap_prediction.generate_outputs('dev-0/in.tsv.xz', 'dev-0/out.tsv')
|
||||
# word_gap_prediction.generate_outputs('test-A/in.tsv.xz', 'test-A/out.tsv')
|
@ -7,7 +7,8 @@ import queue
|
||||
# text = lzma.open('train/in.tsv.xz').read()
|
||||
def read_file(file):
|
||||
for line in file:
|
||||
yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', line.split("\t")[7].replace("\\n"," ").replace("\n","").lower())).split(" ")
|
||||
text = line.split("\t")
|
||||
yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n"," ").replace("\n","").lower())).split(" ")
|
||||
|
||||
def get_words(file):
|
||||
for words in read_file(file):
|
||||
@ -26,10 +27,7 @@ def set_trigram_count(first_word, second_word, third_word, trigrams):
|
||||
trigrams[f"{first_word}_{second_word}_{third_word}"] += 1
|
||||
|
||||
def load_train():
|
||||
trigrams = {}
|
||||
bigrams = {}
|
||||
index = 0
|
||||
expected = open('train/expected.tsv', 'r')
|
||||
with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
||||
for words in read_file(file):
|
||||
expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
|
||||
|
133
run_nc_old.py
Normal file
133
run_nc_old.py
Normal file
@ -0,0 +1,133 @@
|
||||
from encodings import search_function
|
||||
import lzma
|
||||
from re import L
|
||||
import regex as re
|
||||
import string
|
||||
import queue
|
||||
# text = lzma.open('train/in.tsv.xz').read()
|
||||
def read_file(file):
    """Yield the cleaned word list of columns 6 and 7 for every TSV line.

    Fields 6 and 7 (zero-based) of each tab-separated line are joined with a
    space.  Literal backslash-n sequences become spaces, real newlines are
    removed, the text is lower-cased, runs of spaces are collapsed, and any
    character that is not a word character, apostrophe or whitespace is
    stripped.  The cleaned text is split on single spaces, so an empty
    field 6 produces an empty string as the first element.

    Raises IndexError for a line with fewer than eight fields.
    """
    for raw_line in file:
        columns = raw_line.split("\t")
        joined = ' '.join([columns[6], columns[7]])
        normalised = joined.replace("\\n", " ").replace("\n", "").lower()
        collapsed = re.sub(' +', ' ', normalised)
        yield re.sub(r"[^\w\d'\s]+", '', collapsed).split(" ")
|
||||
|
||||
def get_words(file):
    """Iterate over all words in *file* as one flat stream.

    Each word list yielded by ``read_file`` is unpacked in order, so callers
    see individual words rather than per-line lists.
    """
    for words_in_line in read_file(file):
        for token in words_in_line:
            yield token
|
||||
|
||||
def set_bigram_count(first_word, second_word, bigrams):
    """Record one more sighting of ``first_word second_word`` in *bigrams*.

    *bigrams* is mutated in place; its keys have the form
    ``"<first_word>_<second_word>"`` and its values are occurrence counts.
    Returns None.
    """
    pair_key = f"{first_word}_{second_word}"
    seen_so_far = bigrams.get(pair_key, 0)
    bigrams[pair_key] = seen_so_far + 1
|
||||
|
||||
def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Record one more sighting of the three-word sequence in *trigrams*.

    *trigrams* is mutated in place; its keys have the form
    ``"<first_word>_<second_word>_<third_word>"`` and its values are
    occurrence counts.  Returns None.
    """
    triple_key = f"{first_word}_{second_word}_{third_word}"
    seen_so_far = trigrams.get(triple_key, 0)
    trigrams[triple_key] = seen_so_far + 1
|
||||
|
||||
def load_train():
    """Build and print bigram and trigram counts from the training data.

    Each row yielded by ``read_file`` for ``train/in.tsv.xz`` is paired with
    the next line of ``train/expected.tsv``.  The first two words of the row
    (after skipping a leading empty token) are counted as a bigram; the
    cleaned expected word followed by those two words is counted as a
    trigram.  Both tables are printed once all rows have been processed.

    Rows without two usable words are skipped rather than raising
    IndexError, and the expected file is now closed on exit (it used to be
    left open).
    """
    bigrams = {}
    trigrams = {}
    with open('train/expected.tsv', 'r') as expected, \
            lzma.open('train/in.tsv.xz', mode='rt') as file:
        for words in read_file(file):
            # Always consume one expected line per row to keep both files aligned.
            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
            # Skip the empty token produced when the text starts with a space.
            mv = 0 if words[0] else 1
            if len(words) < mv + 2:
                continue
            set_bigram_count(words[0+mv], words[1+mv], bigrams)
            set_trigram_count(expected_word, words[0+mv], words[1+mv], trigrams)
    print(bigrams)
    print(trigrams)
|
||||
|
||||
|
||||
|
||||
def _best_left_contexts(bigrams, trigrams):
    """Map each key of *bigrams* to the left word of its most frequent trigram.

    Trigram keys are ``"<left>_<bigram key>"``.  A single pass over
    *trigrams* keeps, per bigram, the first trigram with the strictly highest
    count.  Bigrams with no matching trigram are omitted from the result.
    """
    best_count = {}
    best_context = {}
    for trigram, count in trigrams.items():
        left_context, _, bigram = trigram.partition("_")
        if bigram in bigrams and count > best_count.get(bigram, 0):
            best_count[bigram] = count
            best_context[bigram] = left_context
    return best_context


def predict(search_for_words):
    """Print the most likely left-context word for every searched word pair.

    *search_for_words* is a sequence of ``[first, second]`` word lists.  The
    training rows (``train/in.tsv.xz`` through ``read_file``, paired line by
    line with ``train/expected.tsv``) are scanned, stopping after the row
    with index 100000.  Rows whose leading bigram equals a searched pair feed
    the exact tables; rows that only share the first word feed the ``_nc``
    fallback tables.

    For each searched pair one line is printed: ``"<n>: <left> <pair>
    <ratio>"`` where ratio is trigram count / bigram count.  If the pair was
    never seen, the most frequent fallback bigram with the same first word is
    used; if there is none, ``"<n>: ??? <pair>"`` is printed.

    Fixes over the previous version: the expected file is closed; the
    fallback reads the ``_nc`` context table (it used the exact table and
    raised KeyError); the fallback trigram key is built from strings (it
    concatenated a list with a str and raised TypeError); one line is printed
    per query; short rows are skipped instead of raising IndexError; the
    best-context search is a single pass instead of a nested scan.
    """
    trigrams = {}
    bigrams = {}
    trigrams_nc = {}
    bigrams_nc = {}

    with open('train/expected.tsv', 'r') as expected, \
            lzma.open('train/in.tsv.xz', mode='rt') as file:
        for index, words in enumerate(read_file(file)):
            # Consume the expected line before any skip so both files stay aligned.
            expected_word = re.sub(r"[^\w\d'\s]+", '', expected.readline().replace("\n", "").lower())
            # A leading empty token means the text started with a space.
            mv = 0 if words[0] else 1
            if len(words) >= mv + 2:
                first, second = words[0+mv], words[1+mv]
                for search_for_word in search_for_words:
                    if search_for_word[0] != first:
                        continue
                    if search_for_word[1] == second:
                        set_bigram_count(first, second, bigrams)
                        set_trigram_count(expected_word, first, second, trigrams)
                    else:
                        set_bigram_count(first, second, bigrams_nc)
                        set_trigram_count(expected_word, first, second, trigrams_nc)
            if index == 100000:
                break

    print(len(search_for_words))
    print(len(bigrams))
    print(len(trigrams))
    print(len(bigrams_nc))
    print(len(trigrams_nc))

    left_context_search_for_word = _best_left_contexts(bigrams, trigrams)
    left_context_search_for_word_nc = _best_left_contexts(bigrams_nc, trigrams_nc)

    # For every first word keep the most frequent fallback bigram.
    fallback_by_first_word = {}
    for lfc in left_context_search_for_word_nc:
        first = lfc.split("_")[0]
        current = fallback_by_first_word.get(first)
        if current is None or bigrams_nc[lfc] > bigrams_nc[current]:
            fallback_by_first_word[first] = lfc

    for index, search_for_word in enumerate(search_for_words):
        hash_search_for_word = '_'.join(search_for_word)
        if hash_search_for_word in left_context_search_for_word:
            left_context = left_context_search_for_word[hash_search_for_word]
            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams['_'.join([left_context]+search_for_word)]/bigrams[hash_search_for_word]}")
            continue
        lfc = fallback_by_first_word.get(search_for_word[0])
        if lfc is None:
            print(f"{index+1}: ??? {' '.join(search_for_word)}")
        else:
            left_context = left_context_search_for_word_nc[lfc]
            print(f"{index+1}: {left_context} {' '.join(search_for_word)} {trigrams_nc[f'{left_context}_{lfc}']/bigrams_nc[lfc]}")
|
||||
|
||||
def load_dev():
    """Collect the leading word pair of the first 101 rows of the dev set.

    ``dev-0/in.tsv.xz`` is read through ``read_file``.  For each row the
    first two words are taken, skipping a leading empty token when present.
    Reading stops after the row with index 100.  The collected list of
    ``[first, second]`` pairs is printed and returned.
    """
    pairs = []
    with lzma.open('dev-0/in.tsv.xz', mode='rt') as file:
        for row_number, words in enumerate(read_file(file)):
            # An empty first token shifts the pair one position to the right.
            offset = 0 if words[0] else 1
            pairs.append([words[offset], words[offset + 1]])
            if row_number == 100:
                break
    print(pairs)
    return pairs
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: collect the dev word pairs, then print a predicted
    # left-context word for each of them.
    #
    # Alternatives that were tried and left disabled:
    #   load_train()  - count and print all training bigrams/trigrams
    #   load_dev()    - only print the dev word pairs
    #   counting every word of train/in.tsv.xz via get_words(); the original
    #   note next to that loop records the value 141820215.
    dev_pairs = load_dev()
    predict(dev_pairs)
|
||||
|
Loading…
Reference in New Issue
Block a user