paranormal-or-skeptic/code_regression2

61 lines
1.8 KiB
Plaintext
Raw Normal View History

2020-06-08 19:11:20 +02:00
import pickle
import re
def calculate_words(linetxt):
    """Count how often each space-separated token occurs in *linetxt*.

    The text is split on single space characters, so consecutive spaces
    produce empty-string tokens.

    Args:
        linetxt: Text whose tokens should be counted.

    Returns:
        A dict mapping each token to its number of occurrences.  The entry
        for the empty string ``''`` is always present and always equal to 1,
        regardless of how many empty tokens the split produced.
    """
    word_counts = {}
    for token in linetxt.split(' '):
        word_counts[token] = word_counts.get(token, 0) + 1
    # NOTE(review): the count for '' is overwritten with 1 after counting --
    # presumably so empty tokens created by repeated spaces carry no extra
    # weight; confirm this is the intended behaviour.
    word_counts[''] = 1
    return word_counts
# Ordered (pattern, replacement) clean-up steps applied by tokenize_list.
# The order matters: later steps rely on what earlier ones removed.
_CLEANUP_STEPS = (
    # URLs that carry an explicit scheme ("scheme://host/path") are deleted.
    (re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'), ''),
    # A literal backslash followed by one or more "n" characters.
    (re.compile(r'\\n+'), ' '),
    # Anything left that starts with "http", up to the next whitespace.
    (re.compile(r'http\S+'), ' '),
    # Single-lowercase-letter path markers such as "/r/".
    (re.compile(r'\/[a-z]\/'), ' '),
    # Every character that is not a lowercase ASCII letter.
    (re.compile(r'[^a-z]'), ' '),
    # Collapse runs of whitespace into one space.
    (re.compile(r'\s{2,}'), ' '),
    # Words of 1-3 characters that are followed by a non-word character and
    # are either at the start of the text or preceded by a non-word
    # character.  Matches do not overlap, so a short word at the very end,
    # or directly after another removed short word, is kept.
    (re.compile(r'\W\w{1,3}\W|\A\w{1,3}\W'), ' '),
    # A single leading whitespace character.
    (re.compile(r'^\s'), ''),
)


def tokenize_list(string_input):
    """Normalise raw text into a space-separated string of lowercase words.

    Despite the name, the result is a string, not a list; callers split it
    on spaces themselves.  The function does not lower-case its input: any
    uppercase letter is replaced by a space like every other character
    outside ``a-z``.

    Args:
        string_input: Raw text, possibly containing literal ``\\n`` escape
            sequences (a backslash followed by ``n``) and URLs.

    Returns:
        The cleaned text after all steps in ``_CLEANUP_STEPS`` were applied.
    """
    # Literal backslash-n sequences (not real newlines) become spaces first.
    text = string_input.replace('\\n', ' ')
    for pattern, replacement in _CLEANUP_STEPS:
        text = pattern.sub(replacement, text)
    return text
def prediction(input, output):
    """Classify every line of a TSV file with the pickled linear model.

    The model is read from ``model_linear_reg.pkl`` in the current working
    directory and unpacked as ``(weights, word, vocabulary)``:
    ``weights[0]`` is the starting value of each score, ``word`` maps a
    token to its index in ``weights``, and ``vocabulary`` is used only for
    membership tests.

    For each input line the first tab-separated field is lower-cased,
    cleaned with ``tokenize_list`` and counted with ``calculate_words``.
    One line is written to *output* per input line: ``1`` when the score
    is greater than 0.5, otherwise ``0``.

    Args:
        input: Path of the UTF-8 encoded, tab-separated input file.
        output: Path of the file to (over)write with the 0/1 labels.
    """
    # NOTE(security): unpickling executes code embedded in the file; only
    # load model files that come from a trusted source.
    with open('model_linear_reg.pkl', 'rb') as model_f:
        weights, word, vocabulary = pickle.load(model_f)

    # The output file is opened first so it is created/truncated even when
    # the input file turns out to be missing.
    with open(output, 'w') as output_f, \
            open(input, encoding='utf-8') as input_f:
        for line in input_f:
            # Only the first column (the text) is used; any further
            # columns, such as a timestamp, are ignored.
            text = line.rstrip('\n').split('\t', 1)[0]
            cleaned = tokenize_list(text.lower())
            line_vocabulary = calculate_words(cleaned)

            y_hat = weights[0]
            # NOTE(review): this iterates over every occurrence of a token
            # and multiplies by its count each time, so a token seen k
            # times contributes k * k * weight.  Kept as is -- confirm it
            # matches how the model was trained before changing it.
            for token in cleaned.split(' '):
                if token in vocabulary:
                    y_hat += weights[word[token]] * line_vocabulary[token]

            output_f.write("1\n" if y_hat > 0.5 else "0\n")
def main():
    """Write predictions for the ``dev-0`` and ``test-A`` data splits.

    Each split's ``in.tsv`` is read and its ``out.tsv`` is (over)written,
    with paths relative to the current working directory.
    """
    prediction("dev-0/in.tsv", "dev-0/out.tsv")
    prediction("test-A/in.tsv", "test-A/out.tsv")


# Only run when executed as a script, so importing this module does not
# trigger any file I/O.
if __name__ == "__main__":
    main()