improved
This commit is contained in:
parent
44755ca275
commit
07afcec739
39972
dev-0/out.tsv
39972
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
23256
dev-1/out.tsv
23256
dev-1/out.tsv
File diff suppressed because it is too large
Load Diff
37
predict.py
37
predict.py
@ -1,35 +1,42 @@
|
|||||||
import pickle
|
import pickle
|
||||||
import sys
|
import sys
|
||||||
from math import log
|
from math import log
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
def count_prob(bigrams, unigrams):
|
||||||
def get_prob(count, total, classes):
|
prob = (bigrams + 1.0) / (unigrams + 1)
|
||||||
prob = (count + 1.0) / (total + classes)
|
|
||||||
if prob > 1.0:
|
if prob > 1.0:
|
||||||
return 1.0
|
return 1.0
|
||||||
else:
|
else:
|
||||||
return prob
|
return prob
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ngrams = pickle.load(open('ngrams.pkl', 'rb'))
|
ngrams = pickle.load(open('ngrams_2.pkl', 'rb'))
|
||||||
vocabulary_size = len(ngrams[1])
|
vocabulary_size = len(ngrams[1])
|
||||||
|
# a = ngrams[1]
|
||||||
|
# print(a)
|
||||||
|
# lookfor1 = str(".")
|
||||||
|
# #lookfor = tuple(lookfor1)
|
||||||
|
# # print(lookfor)
|
||||||
|
# b = a.get((',',),0)
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
words = re.findall(r'.*\t.*\t.* (.*?) (.*?)\t(.*?) (.*?) ', line.lower())[0]
|
words = re.findall(r'.*\t.*\t.* (.*?)\t(.*?) ', line.lower())[0]
|
||||||
left_words = [str(words[0]), str(words[1])]
|
#print(words)
|
||||||
right_words = [str(words[2]), str(words[3])]
|
left_word = [str(words[0])]
|
||||||
|
right_word = [str(words[1])]
|
||||||
|
|
||||||
probabilities = []
|
probabilities = []
|
||||||
for word in ngrams[1].keys():
|
for word in ngrams[1].keys():
|
||||||
word = str(word[0])
|
word = str(word[0])
|
||||||
pre_ngram = tuple(left_words + [word])
|
pre_ngram = tuple(left_word + [word])
|
||||||
post_ngram = tuple([word] + right_words)
|
|
||||||
pre_ngram_prob = get_prob(ngrams[3].get(pre_ngram, 0), ngrams[2].get(tuple(left_words), 0),
|
post_ngram = tuple([word] + right_word)
|
||||||
vocabulary_size)
|
#print(pre_ngram)
|
||||||
post_ngram_prob = get_prob(ngrams[3].get(post_ngram, 0), ngrams[2].get(post_ngram[0:2], 0),
|
#print("bigram:", ngrams[2].get(pre_ngram, 0), "unigram", ngrams[1].get(word[0],0))
|
||||||
vocabulary_size)
|
pre_ngram_prob = count_prob(ngrams[2].get(pre_ngram, 0), ngrams[1].get((word[0],),0) + vocabulary_size)
|
||||||
|
#if pre_ngram_prob>0:
|
||||||
|
post_ngram_prob = count_prob(ngrams[2].get(post_ngram, 0), ngrams[1].get((word[0],),0) + vocabulary_size)
|
||||||
|
|
||||||
probabilities.append((word, pre_ngram_prob * post_ngram_prob))
|
probabilities.append((word, pre_ngram_prob * post_ngram_prob))
|
||||||
probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]
|
probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]
|
||||||
probability = 1.0
|
probability = 1.0
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
xzcat train/train.tsv.xz | ./train.py
|
xzcat train/train.tsv.xz | python3 ./train.py
|
||||||
|
|
||||||
cat dev-0/in.tsv | ./predict.py > dev-0/out.tsv
|
cat dev-0/in.tsv | python3 ./predict.py > dev-0/out.tsv
|
||||||
cat dev-1/in.tsv | ./predict.py > dev-1/out.tsv
|
cat dev-1/in.tsv | python3 ./predict.py > dev-1/out.tsv
|
||||||
cat test-A/in.tsv | ./predict.py > test-A/out.tsv
|
cat test-A/in.tsv | python3 ./predict.py > test-A/out.tsv
|
||||||
|
28264
test-A/out.tsv
28264
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
77
train.py
77
train.py
@ -1,57 +1,36 @@
|
|||||||
import pickle
|
#!/usr/bin/python3
|
||||||
import sys
|
|
||||||
from math import log
|
import sys
|
||||||
import regex as re
|
import regex as re
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
|
def into_words(sentence):
|
||||||
|
return re.findall(r'\p{P}|[^\p{P}\s]+', sentence.lower())
|
||||||
|
|
||||||
def count_prob(bigrams, unigrams):
|
|
||||||
prob = (bigrams + 1.0) / (unigrams + 1)
|
|
||||||
if prob > 1.0:
|
|
||||||
return 1.0
|
|
||||||
else:
|
|
||||||
return prob
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ngrams = pickle.load(open('ngrams_2.pkl', 'rb'))
|
ngrams = {1: {}, 2: {}}
|
||||||
vocabulary_size = len(ngrams[1])
|
lowest_ngram = 1
|
||||||
for line in sys.stdin:
|
highest_ngram = 2
|
||||||
words = re.findall(r'.*\t.*\t.* (.*?)\t(.*?) ', line.lower())[0]
|
|
||||||
#print(words)
|
|
||||||
left_word = [str(words[0])]
|
|
||||||
right_word = [str(words[1])]
|
|
||||||
|
|
||||||
probabilities = []
|
|
||||||
for word in ngrams[1].keys():
|
|
||||||
word = str(word[0])
|
|
||||||
pre_ngram = tuple(left_word + [word])
|
|
||||||
post_ngram = tuple([word] + right_word)
|
|
||||||
#print(pre_ngram)
|
|
||||||
pre_ngram_prob = count_prob(ngrams[2].get(pre_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000)
|
|
||||||
#if pre_ngram_prob>0:
|
|
||||||
post_ngram_prob = count_prob(ngrams[2].get(post_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000)
|
|
||||||
|
|
||||||
probabilities.append((word, pre_ngram_prob * post_ngram_prob))
|
|
||||||
probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]
|
|
||||||
probability = 1.0
|
|
||||||
text = ''
|
|
||||||
counter = 0
|
counter = 0
|
||||||
has_log_prob0 = False
|
for line in sys.stdin: #dla kazdej linii z pliku
|
||||||
for p in probabilities:
|
line = line.split('\t')[4] # podziel na 4
|
||||||
word = p[0]
|
tokens = into_words(line) #na slowa
|
||||||
prob = p[1]
|
number_of_tokens = len(tokens) #ile slow?
|
||||||
if counter == 0 and (probability - prob <= 0.0):
|
for n in range(lowest_ngram, highest_ngram+1): #dla kazdego ngram
|
||||||
text = word + ':' + str(log(0.95)) + ' :' + str(log(0.05))
|
for i in range(0, number_of_tokens-n+1): #i tyle ile jest slow -n gram + 1
|
||||||
has_log_prob0 = True
|
ngram = tuple(tokens[i:i+n])
|
||||||
break
|
if ngram in ngrams[n]:
|
||||||
if counter > 0 and (probability - prob <= 0.0):
|
ngrams[n][ngram] += 1
|
||||||
text += ':' + str(log(probability))
|
else:
|
||||||
has_log_prob0 = True
|
ngrams[n][ngram] = 1
|
||||||
break
|
if counter % 1000 == 0:
|
||||||
text += word + ':' + str(log(prob)) + ' '
|
print('counter = ', counter)
|
||||||
probability -= prob
|
|
||||||
counter += 1
|
counter += 1
|
||||||
if not has_log_prob0:
|
ngrams[1] = dict(sorted(ngrams[1].items(), key=lambda item: ngrams[1][item[0]], reverse=True)[:1000])
|
||||||
text += ':' + str(log(0.0001))
|
ngrams[2] = dict(sorted(ngrams[2].items(), key=lambda item: ngrams[2][item[0]], reverse=True)[:100000])
|
||||||
print(text)
|
pickle.dump(ngrams, open('ngrams_2.pkl', 'wb'))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
Reference in New Issue
Block a user