#!/usr/bin/python3
import pickle
import re
import sys
from collections import Counter

from nltk.corpus import stopwords
# Lazily cached English stopword set: stopwords.words() re-reads the corpus
# on every call, so build the set once instead of once per post.
_STOP_WORDS = None


def clear_post(post):
    """Normalize one raw post into a list of cleaned tokens.

    Replaces URLs with the placeholder token ``internetlink``, strips
    punctuation/markup/digits, collapses whitespace, and drops English
    stopwords.

    Args:
        post: Raw post text (a single string).

    Returns:
        List of cleaned tokens with English stopwords removed.  May
        contain a single empty string if nothing survives cleaning
        (matches the original behavior of ``''.split(' ')``).
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))

    # NOTE(review): this replaces the two-character sequence backslash+n,
    # not actual newlines — presumably the input stores escaped newlines;
    # confirm against the data format before changing to '\n'.
    post = post.replace('\\n', ' ')
    # Collapse URLs (optionally parenthesized) into one placeholder token.
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    # Sentence punctuation acts as a token separator.
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    # Drop angle brackets and @mentions entirely.
    post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)', '', post)
    # Remove remaining punctuation, digits, and stray typographic characters.
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    # Dashes used as separators become spaces.
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    # Collapse space runs and trim the trailing one before splitting.
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    tokens = post.split(' ')

    return [w for w in tokens if w not in _STOP_WORDS]
def calc_prob(posts, weights, word_to_index_mapping):
    """Score each post with a linear bag-of-words model and print 1 or 0.

    For every post, computes
        y_hat = weights[0] + sum(weights[index[token]] * count(token))
    over the post's whitespace-separated tokens and prints ``"1"`` if
    ``y_hat > 0.5`` else ``"0"``, one line per post.  Tokens absent from
    the mapping contribute nothing.

    Args:
        posts: Iterable of space-joined, already-cleaned post strings.
        weights: Sequence of model weights; index 0 is the bias term.
        word_to_index_mapping: Maps token -> index into ``weights``.
    """
    for post in posts:
        # Count whole tokens once each.  The original looped over every
        # token OCCURRENCE and multiplied by post.count(token), which
        # (a) counted substring matches, not whole tokens, and
        # (b) effectively squared the contribution of repeated tokens.
        counts = Counter(post.split(' '))
        y_hat = weights[0]
        for token, n in counts.items():
            index = word_to_index_mapping.get(token)
            if index is not None:  # unknown token: no contribution
                y_hat += weights[index] * n
        # NOTE(review): thresholds the raw linear score at 0.5 with no
        # sigmoid, exactly as the original did — confirm against how the
        # model was trained.
        print("1" if y_hat > 0.5 else "0")
def main():
    """Classify posts read from stdin using a pickled linear model.

    Usage: ``script.py MODEL`` where MODEL is a pickle file holding
    ``[weights, lowest_loss_weights, word_to_index_mapping]``.

    Each stdin line must be ``text<TAB>timestamp``; the text is cleaned
    with :func:`clear_post` and one ``"1"``/``"0"`` line is printed per
    post via :func:`calc_prob`.
    """
    if len(sys.argv) != 2:
        print("Expected model")
        return

    model_path = sys.argv[1]

    posts = []
    for line in sys.stdin:
        # Input format: text<TAB>timestamp; the timestamp is unused here.
        # NOTE(review): a line without a tab raises ValueError — confirm
        # the producer guarantees the field is always present.
        text, _timestamp = line.rstrip('\n').split('\t')
        posts.append(" ".join(clear_post(text)))

    # SECURITY: pickle.load executes arbitrary code — only load model
    # files from a trusted source.
    with open(model_path, 'rb') as f:
        pickle_list = pickle.load(f)

    weights = pickle_list[0]
    lowest_loss_weights = pickle_list[1]  # loaded but unused in this path
    word_to_index_mapping = pickle_list[2]

    calc_prob(posts, weights, word_to_index_mapping)


if __name__ == "__main__":
    main()