my brilliant solution2

2020-03-22 18:35:08 +01:00 · 2020-03-22 18:35:08 +01:00 · 874b7f6266
parent 73b72d2df3
commit 874b7f6266
5 changed files with 10497 additions and 0 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-0/predict.py
+++ b/dev-0/predict.py
@ -0,0 +1,47 @
+#!/usr/bin/env python3
+"""Classify each stdin line as skeptic ("S") or paranormal ("P").
+
+Loads the naive Bayes model tuple from ``model.pkl`` and scores every line
+with add-one smoothed log-probabilities, printing one label per line.
+"""
+import sys
+import pickle
+# NOTE(review): this resolves to the sibling tokenize.py, which shadows the
+# standard-library ``tokenize`` module; consider renaming that file.
+from tokenize import tokenize
+import math
+
+
+def _log(p):
+    """Return log(p), mapping p <= 0 to -inf instead of raising ValueError."""
+    return math.log(p) if p > 0 else float('-inf')
+
+
+def main():
+    """Print "S" or "P" for every document read from stdin."""
+    # NOTE: unpickling can run arbitrary code; only load a trusted model.pkl.
+    with open('model.pkl', 'rb') as model_file:
+        model = pickle.load(model_file)
+    (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total,
+     skeptics_count, paranormal_count) = model
+    # The smoothing denominators and class priors do not depend on the line.
+    skeptic_denominator = skeptic_words_total + vocabulary_size
+    paranormal_denominator = paranormal_words_total + vocabulary_size
+    # A model trained on a single class has a prior of exactly 0 or 1.
+    log_prior_skeptic = _log(pskeptic)
+    log_prior_paranormal = _log(1 - pskeptic)
+    for line in sys.stdin:
+        document = line.rstrip()
+        log_prob_skeptic = log_prior_skeptic
+        log_prob_paranormal = log_prior_paranormal
+        for term in tokenize(document):
+            # .get() scores unseen terms as 0 without growing the model dicts.
+            skeptic_hits = skeptics_count.get(term, 0)
+            paranormal_hits = paranormal_count.get(term, 0)
+            log_prob_skeptic += math.log((skeptic_hits + 1) / skeptic_denominator)
+            log_prob_paranormal += math.log((paranormal_hits + 1) / paranormal_denominator)
+        print("S" if log_prob_skeptic > log_prob_paranormal else "P")
+
+
+if __name__ == "__main__":
+    main()
--- a/dev-0/tokenize.py
+++ b/dev-0/tokenize.py
@ -0,0 +1,9 @
+#!/usr/bin/env python3
+def tokenize(d):
+    """Split document *d* on single space characters.
+
+    Runs of spaces produce empty-string tokens and an empty document
+    yields ``['']``; other whitespace is left inside the tokens.
+    """
+    separator = ' '
+    return d.split(separator)
--- a/dev-0/train.py
+++ b/dev-0/train.py
@ -0,0 +1,69 @
+#!/usr/bin/env python3
+"""Train the skeptic/paranormal naive Bayes model from labelled lines.
+
+Each stdin line is ``label<TAB>document``; the fitted model tuple is
+pickled to ``model.pkl``.
+"""
+import sys
+import pickle
+import math
+from collections import Counter
+
+
+def tokenize(d):
+    """Split document *d* on single space characters."""
+    return d.split(' ')
+
+
+def train(lines=None, model_path="model.pkl"):
+    """Count per-class term frequencies and pickle the resulting model.
+
+    Args:
+        lines: iterable of ``label<TAB>document`` strings; defaults to
+            ``sys.stdin``.
+        model_path: file the model tuple is written to.
+
+    The label ``S`` marks a skeptic document; any other label is counted
+    as paranormal. Lines without a tab-separated document are skipped.
+
+    Raises:
+        ValueError: if no labelled document was read.
+    """
+    if lines is None:
+        lines = sys.stdin
+    documents_total = 0
+    skeptic_documents_total = 0
+    vocabulary = set()
+    skeptic_words_total = 0
+    paranormal_words_total = 0
+    skeptic_count = Counter()
+    paranormal_count = Counter()
+    for line in lines:
+        fields = line.rstrip().split('\t')
+        if len(fields) < 2:
+            # Blank or malformed line: there is no document column to learn from.
+            continue
+        label = fields[0].strip()
+        terms = tokenize(fields[1])
+        vocabulary.update(terms)
+        documents_total += 1
+        if label == 'S':
+            skeptic_documents_total += 1
+            skeptic_words_total += len(terms)
+            skeptic_count.update(terms)
+        else:
+            paranormal_words_total += len(terms)
+            paranormal_count.update(terms)
+    if documents_total == 0:
+        raise ValueError("no labelled documents to train on")
+    pskeptic = skeptic_documents_total / documents_total
+    vocabulary_size = len(vocabulary)
+    # Plain dicts keep the pickle free of a dependency on collections.Counter.
+    model = (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total,
+             dict(skeptic_count), dict(paranormal_count))
+    with open(model_path, "wb") as model_file:
+        pickle.dump(model, model_file)
+
+
+if __name__ == "__main__":
+    train()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv