my brilliant solution2

2020-03-22 18:35:08 +01:00 · 2020-03-22 18:35:08 +01:00 · 874b7f6266
commit 874b7f6266
parent 73b72d2df3
5 changed files with 10497 additions and 0 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-0/predict.py
+++ b/dev-0/predict.py
@ -0,0 +1,24 @@
 #!/usr/bin/env python3
 import sys
 import pickle
 from tokenize import tokenize
 import math
 # Classify each document read from stdin as "S" or "P" using the pickled
 # Naive Bayes model, printing one label per input line.
 #
 # NOTE(security): pickle.load can execute arbitrary code embedded in the file.
 # Only load a model.pkl that you produced yourself with the training script.
 with open('model.pkl', 'rb') as model_file:
 	model = pickle.load(model_file)
 # Model tuple: prior P(skeptic), vocabulary size, total token count per class,
 # and the per-class token -> occurrence-count mappings.
 pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptics_count, paranormal_count = model
 # The add-one smoothing denominators do not depend on the term, so they are
 # computed once instead of once per token.
 skeptic_denominator = skeptic_words_total + vocabulary_size
 paranormal_denominator = paranormal_words_total + vocabulary_size
 for line in sys.stdin:
 	document = line.rstrip()
 	terms = tokenize(document)
 	# Start from the log prior of each class, then add per-term log likelihoods.
 	log_prob_skeptic = math.log(pskeptic)
 	log_prob_paranormal = math.log(1-pskeptic)
 	for term in terms:
 		# .get(term, 0) scores unseen terms as count 0 without inserting them
 		# into the model dictionaries (which would grow with every new token).
 		log_prob_skeptic += math.log((skeptics_count.get(term, 0) + 1) / skeptic_denominator)
 		log_prob_paranormal += math.log((paranormal_count.get(term, 0) + 1) / paranormal_denominator)
 	# Ties fall through to "P".
 	if log_prob_skeptic > log_prob_paranormal:
 		print("S")
 	else:
 		print("P")
--- a/dev-0/tokenize.py
+++ b/dev-0/tokenize.py
@ -0,0 +1,3 @@
 #!/usr/bin/env python3
 def tokenize(d):
    """Return the list of pieces obtained by splitting *d* on single spaces.

    Consecutive spaces yield empty-string tokens, and an empty input yields
    a list holding one empty string.
    """
    separator = ' '
    tokens = d.split(separator)
    return tokens
--- a/dev-0/train.py
+++ b/dev-0/train.py
@ -0,0 +1,46 @@
 #!/usr/bin/env python3
 import sys
 import pickle
 import math
 def tokenize(d):
 	"""Split document *d* on the space character and return the tokens.

 	Only a literal single space separates tokens, so runs of spaces produce
 	empty-string tokens and other whitespace stays inside tokens.
 	"""
 	return str.split(d, ' ')
 def train():
 	"""Fit a two-class Naive Bayes model from stdin and pickle it to model.pkl.

 	Each input line is expected to hold a label and a document separated by a
 	TAB. The label 'S' counts towards the skeptic class; any other label
 	counts towards the paranormal class. Lines without a document column
 	(including blank lines) are skipped and reported on stderr.

 	The pickled model is the tuple
 	(pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total,
 	skeptic_count, paranormal_count).

 	Raises:
 		ValueError: if no usable labelled document was read.
 	"""
 	documents_total = 0
 	skeptic_documents_total = 0
 	skipped_lines = 0
 	vocabulary = set()
 	skeptic_words_total = 0
 	paranormal_words_total = 0
 	skeptic_count = {}
 	paranormal_count = {}
 	for line in sys.stdin:
 		fields = line.rstrip().split('\t')
 		if len(fields) < 2:
 			# No document column: indexing fields[1] would raise IndexError.
 			skipped_lines += 1
 			continue
 		label = fields[0].strip()
 		terms = tokenize(fields[1])
 		vocabulary.update(terms)
 		documents_total += 1
 		# Pick the per-class accumulator once, then count terms in one loop.
 		if label == 'S':
 			skeptic_documents_total += 1
 			skeptic_words_total += len(terms)
 			class_count = skeptic_count
 		else:
 			paranormal_words_total += len(terms)
 			class_count = paranormal_count
 		for term in terms:
 			class_count[term] = class_count.get(term, 0) + 1
 	if skipped_lines:
 		sys.stderr.write("train: skipped %d line(s) without a document column\n" % skipped_lines)
 	if documents_total == 0:
 		# Without this guard the prior below fails with a bare ZeroDivisionError.
 		raise ValueError("no labelled documents were read from stdin")
 	pskeptic = skeptic_documents_total / documents_total
 	vocabulary_size = len(vocabulary)
 	model = (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count)
 	# The context manager flushes and closes the file even if dump() fails.
 	with open("model.pkl", "wb") as model_file:
 		pickle.dump(model, model_file)
 train()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv