tokenizer_nltk

Dominika Grajewska 2020-03-26 12:26:28 +01:00
parent 73b72d2df3
commit c88dde7f62
4 changed files with 10454 additions and 0 deletions

dev-0/out.tsv (normal file, 5272 lines)

File diff suppressed because it is too large.

test-A/out.tsv (normal file, 5152 lines)

File diff suppressed because it is too large.

test-A/predict.py (executable file, 24 lines)

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
import math
import pickle
import sys
from tokenizator import tokenizator

# Load the trained naive Bayes model: class prior, vocabulary size,
# per-class total word counts and per-class term-frequency dictionaries.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptics_count, paranormal_count = model

for line in sys.stdin:
    document = line.rstrip()
    terms = tokenizator(document)
    # Start from the log prior of each class.
    log_prob_skeptic = math.log(pskeptic)
    log_prob_paranormal = math.log(1 - pskeptic)
    for term in terms:
        # Laplace (add-one) smoothing keeps unseen terms from producing log(0).
        log_prob_skeptic += math.log((skeptics_count.get(term, 0) + 1) / (skeptic_words_total + vocabulary_size))
        log_prob_paranormal += math.log((paranormal_count.get(term, 0) + 1) / (paranormal_words_total + vocabulary_size))
    # Emit the label of the more probable class.
    if log_prob_skeptic > log_prob_paranormal:
        print("S")
    else:
        print("P")

test-A/tokenizator.py (executable file, 6 lines)

@@ -0,0 +1,6 @@
#!/usr/bin/env python3
import nltk

# Make sure the NLTK Punkt tokenizer data is available before first use.
nltk.download('punkt', quiet=True)
from nltk.tokenize import word_tokenize


def tokenizator(d):
    # Split a raw document into a list of word tokens.
    return word_tokenize(d)
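
A quick sanity check of the tokenizer in an interactive session (the sample sentence is arbitrary, and the exact split can vary slightly between NLTK versions):

>>> from tokenizator import tokenizator
>>> tokenizator("Ghosts aren't real, are they?")
['Ghosts', 'are', "n't", 'real', ',', 'are', 'they', '?']

Downloading the Punkt data at import time keeps predict.py self-contained, at the cost of a download check every time the script starts.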