This commit is contained in:
Michal Maciaszek 2020-12-08 14:14:52 +01:00
parent 44755ca275
commit 07afcec739
6 changed files with 45803 additions and 45817 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,35 +1,42 @@
import pickle import pickle
import sys import sys
from math import log from math import log
import regex as re import regex as re
def count_prob(bigrams, unigrams):
    """Return an add-one smoothed probability, clamped to at most 1.0.

    bigrams  -- observed count of the n-gram (numerator event)
    unigrams -- observed count of its context (denominator event)
    """
    smoothed = (bigrams + 1.0) / (unigrams + 1)
    # Smoothing can push the ratio above 1 when the context count is tiny;
    # cap it so downstream products stay valid probabilities.
    return smoothed if smoothed <= 1.0 else 1.0
def main(): def main():
ngrams = pickle.load(open('ngrams.pkl', 'rb')) ngrams = pickle.load(open('ngrams_2.pkl', 'rb'))
vocabulary_size = len(ngrams[1]) vocabulary_size = len(ngrams[1])
# a = ngrams[1]
# print(a)
# lookfor1 = str(".")
# #lookfor = tuple(lookfor1)
# # print(lookfor)
# b = a.get((',',),0)
for line in sys.stdin: for line in sys.stdin:
words = re.findall(r'.*\t.*\t.* (.*?) (.*?)\t(.*?) (.*?) ', line.lower())[0] words = re.findall(r'.*\t.*\t.* (.*?)\t(.*?) ', line.lower())[0]
left_words = [str(words[0]), str(words[1])] #print(words)
right_words = [str(words[2]), str(words[3])] left_word = [str(words[0])]
right_word = [str(words[1])]
probabilities = [] probabilities = []
for word in ngrams[1].keys(): for word in ngrams[1].keys():
word = str(word[0]) word = str(word[0])
pre_ngram = tuple(left_words + [word]) pre_ngram = tuple(left_word + [word])
post_ngram = tuple([word] + right_words)
pre_ngram_prob = get_prob(ngrams[3].get(pre_ngram, 0), ngrams[2].get(tuple(left_words), 0), post_ngram = tuple([word] + right_word)
vocabulary_size) #print(pre_ngram)
post_ngram_prob = get_prob(ngrams[3].get(post_ngram, 0), ngrams[2].get(post_ngram[0:2], 0), #print("bigram:", ngrams[2].get(pre_ngram, 0), "unigram", ngrams[1].get(word[0],0))
vocabulary_size) pre_ngram_prob = count_prob(ngrams[2].get(pre_ngram, 0), ngrams[1].get((word[0],),0) + vocabulary_size)
#if pre_ngram_prob>0:
post_ngram_prob = count_prob(ngrams[2].get(post_ngram, 0), ngrams[1].get((word[0],),0) + vocabulary_size)
probabilities.append((word, pre_ngram_prob * post_ngram_prob)) probabilities.append((word, pre_ngram_prob * post_ngram_prob))
probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50] probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]
probability = 1.0 probability = 1.0

View File

@@ -1,5 +1,5 @@
# Build the n-gram counts from the training set, then write predictions
# for every evaluation split with the same model.
xzcat train/train.tsv.xz | python3 ./train.py
cat dev-0/in.tsv | python3 ./predict.py > dev-0/out.tsv
cat dev-1/in.tsv | python3 ./predict.py > dev-1/out.tsv
cat test-A/in.tsv | python3 ./predict.py > test-A/out.tsv

File diff suppressed because it is too large Load Diff

View File

@@ -1,57 +1,36 @@
import pickle #!/usr/bin/python3
import sys
from math import log import sys
import regex as re import regex as re
import pickle
def into_words(sentence):
    """Lowercase *sentence* and split it into punctuation and word tokens."""
    lowered = sentence.lower()
    # \p{P} matches one punctuation character; [^\p{P}\s]+ matches a run of
    # characters containing neither punctuation nor whitespace (a "word").
    return re.findall(r'\p{P}|[^\p{P}\s]+', lowered)
def count_prob(bigrams, unigrams):
prob = (bigrams + 1.0) / (unigrams + 1)
if prob > 1.0:
return 1.0
else:
return prob
def main():
    """Count 1- and 2-grams from TSV lines on stdin and pickle the counts.

    Reads the 5th tab-separated column of every stdin line, tokenizes it
    with into_words, accumulates n-gram counts for n in [1, 2], keeps only
    the 1000 most frequent unigrams and the 100000 most frequent bigrams,
    and dumps the resulting dict to 'ngrams_2.pkl'.
    """
    ngrams = {1: {}, 2: {}}
    lowest_ngram = 1
    highest_ngram = 2
    counter = 0
    for line in sys.stdin:  # for each input line
        text = line.split('\t')[4]  # the 5th column holds the sentence text
        tokens = into_words(text)  # split into tokens
        number_of_tokens = len(tokens)
        for n in range(lowest_ngram, highest_ngram + 1):  # for each n-gram order
            # one window per position: number_of_tokens - n + 1 windows
            for i in range(number_of_tokens - n + 1):
                ngram = tuple(tokens[i:i + n])
                # dict.get replaces the original's separate membership test
                ngrams[n][ngram] = ngrams[n].get(ngram, 0) + 1
        if counter % 1000 == 0:
            print('counter = ', counter)  # progress report every 1000 lines
        counter += 1
    # Sort by the stored count directly (item[1]); the original re-looked the
    # key up in the same dict, doing a redundant hash lookup per entry.
    # Truncation bounds the pickled model's size.
    ngrams[1] = dict(sorted(ngrams[1].items(), key=lambda item: item[1], reverse=True)[:1000])
    ngrams[2] = dict(sorted(ngrams[2].items(), key=lambda item: item[1], reverse=True)[:100000])
    # with-statement closes the file even if pickling fails; the original
    # passed an anonymous open() handle and leaked it.
    with open('ngrams_2.pkl', 'wb') as out:
        pickle.dump(ngrams, out)
if __name__ == '__main__': if __name__ == '__main__':