my brilliant solution2

This commit is contained in:
Dominika Grajewska 2020-03-22 18:35:08 +01:00
parent 73b72d2df3
commit 874b7f6266
5 changed files with 10497 additions and 0 deletions

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

24
dev-0/predict.py Executable file
View File

@ -0,0 +1,24 @@
#!/usr/bin/env python3
import sys
import pickle
from tokenize import tokenize
import math
def classify(terms, model):
    """Return "S" (skeptic) or "P" (paranormal) for a tokenized document.

    Scores the document under both classes with add-one (Laplace) smoothed
    multinomial naive Bayes, working in log space, and picks the larger
    score. Ties go to "P".

    Args:
        terms: iterable of term strings making up the document.
        model: the tuple pickled by train.py, in the order
            (pskeptic, vocabulary_size, skeptic_words_total,
            paranormal_words_total, skeptics_count, paranormal_count).

    Returns:
        "S" if the skeptic score is strictly greater, otherwise "P".
    """
    (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total,
     skeptics_count, paranormal_count) = model

    # math.log(0) raises ValueError; a class that never occurred in training
    # gets a log prior of -inf so it can never win instead of crashing.
    log_prob_skeptic = math.log(pskeptic) if pskeptic > 0 else -math.inf
    log_prob_paranormal = math.log(1 - pskeptic) if pskeptic < 1 else -math.inf

    # The smoothing denominators do not depend on the term.
    skeptic_denominator = skeptic_words_total + vocabulary_size
    paranormal_denominator = paranormal_words_total + vocabulary_size

    for term in terms:
        # .get() treats unseen terms as count 0 without inserting them into
        # the model dictionaries.
        log_prob_skeptic += math.log(
            (skeptics_count.get(term, 0) + 1) / skeptic_denominator)
        log_prob_paranormal += math.log(
            (paranormal_count.get(term, 0) + 1) / paranormal_denominator)

    return "S" if log_prob_skeptic > log_prob_paranormal else "P"


def main():
    """Load model.pkl and print one predicted label per line of stdin."""
    # NOTE: pickle can execute arbitrary code while loading; only use a
    # model.pkl produced by a trusted train.py run.
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    for line in sys.stdin:
        document = line.rstrip()
        print(classify(tokenize(document), model))


if __name__ == '__main__':
    main()

3
dev-0/tokenize.py Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env python3
def tokenize(d):
    """Split document *d* into terms on single space characters.

    The separator is passed explicitly, so consecutive spaces produce
    empty-string terms and an empty document produces ``['']``.
    """
    separator = ' '
    terms = d.split(separator)
    return terms

46
dev-0/train.py Executable file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env python3
import sys
import pickle
import math
def tokenize(d):
    """Return the list of terms in *d*, cutting at every single space.

    Only the space character separates terms; runs of spaces therefore
    yield empty strings, and other whitespace stays inside the terms.
    """
    pieces = d.split(sep=' ')
    return pieces
def parse_example(line):
    """Split one training line into ``(label, document)``.

    The line is right-stripped and split on tabs; the first field (with
    surrounding whitespace removed) is the label and the second field is the
    document. Returns None when the line has no second field (for example a
    blank line), so callers can skip it instead of failing on an IndexError.
    """
    fields = line.rstrip().split('\t')
    if len(fields) < 2:
        return None
    return fields[0].strip(), fields[1]


def build_model(examples):
    """Accumulate naive Bayes statistics from ``(label, terms)`` pairs.

    Args:
        examples: iterable of ``(label, terms)`` where ``terms`` is a list of
            term strings. Label 'S' counts as skeptic; every other label
            counts as paranormal.

    Returns:
        The tuple ``(pskeptic, vocabulary_size, skeptic_words_total,
        paranormal_words_total, skeptic_count, paranormal_count)``.

    Raises:
        ValueError: if ``examples`` is empty, since the class prior would
            otherwise be a division by zero.
    """
    documents_total = 0
    skeptic_documents_total = 0
    vocabulary = set()
    skeptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}

    for label, terms in examples:
        documents_total += 1
        vocabulary.update(terms)
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            counts = skeptic_count
        else:
            paranormal_words_total += len(terms)
            counts = paranormal_count
        for term in terms:
            counts[term] = counts.get(term, 0) + 1

    if documents_total == 0:
        raise ValueError("no training documents were read")

    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)
    return (pskeptic, vocabulary_size, skeptic_words_total,
            paranormal_words_total, skeptic_count, paranormal_count)


def train():
    """Read labelled documents from stdin and pickle the model to model.pkl.

    Each input line is expected to be ``label<TAB>document``. Lines without a
    document field are skipped. The resulting model tuple is written to
    ``model.pkl`` in the current working directory.
    """
    def examples():
        for line in sys.stdin:
            parsed = parse_example(line)
            if parsed is None:
                continue
            label, document = parsed
            yield label, tokenize(document)

    model = build_model(examples())
    # The context manager guarantees the pickle is flushed and closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
train()

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff