BitSearch/twitter_pred.py
Krzysztof Szubiczuk 42de0bde5e model pickling
2022-01-27 17:56:37 +01:00

70 lines
2.0 KiB
Python
Raw Permalink Blame History

# %%
import pickle
import json
import re
# %%
# Unpickle the object stored in the file 'model_pkl' and bind it as `model`.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('model_pkl', 'rb') as f:
    model = pickle.load(f)
# %%
# Unpickle the object stored in the file 'vectorizer_pkl' and bind it as
# `vectorizer`; it is later used through its `.transform()` method.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('vectorizer_pkl', 'rb') as f:
    vectorizer = pickle.load(f)
# %%
# Substitution table used for text clean-up: each key is the replacement text
# and each value is the list of regex patterns that get replaced by that key.
# Insertion order matters, because the rules are applied top to bottom and
# later patterns see the output of earlier ones.
#
# Patterns are raw strings so that regex escapes such as "\w" or "\(" are not
# treated as (invalid) Python string escapes; the resulting pattern text is
# unchanged.
change_dict = {
    # tokens
    " username ": [r'@\w+|@'],
    " url ": [r'http\S*'],
    " emoji ": [
        r"[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*",
        # Triple-quoted because the character class contains both ' and ".
        r'''[^\w\s,.?!:;#'"\(\)\$\-\+%\[\]\|]''',
    ],
    " number ": [r"[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}', '\n', r'^rt[\s]+', r'\s\:\s'],
    # NOTE(review): "<EFBFBD>" looks like a mangled U+FFFD replacement
    # character (UTF-8 bytes EF BF BD) - confirm the intended pattern against
    # the original file before changing it. Kept byte-for-byte here.
    "'": ["<EFBFBD>"],
    '?': [r"\s\?"],
    '!': [r"\s\!"],
    '".': [r'\s"\.'],
    '",': [r'\s"\,'],
    '" ': [r'\s"\s'],
}
def clean_lines(line, change_dict):
    """Lower-case ``line`` and apply every regex substitution in ``change_dict``.

    Args:
        line: Value to clean; it is converted with ``str()`` first.
        change_dict: Mapping of replacement text to a list of regex patterns.
            Entries are applied in the mapping's iteration order, and each
            pattern operates on the result of the previous substitution.

    Returns:
        The lower-cased string after all substitutions have been applied.
    """
    cleaned = str(line).lower()
    for replacement in change_dict:
        for pattern in change_dict[replacement]:
            cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of characters that extend a run beyond two repeats.

    Args:
        line: String to scan.

    Returns:
        A list of indices, in ascending order, of every character that is the
        third or later member of a run of identical consecutive characters.
        For example ``"aaab"`` yields ``[2]``.
    """
    surplus = []
    repeats = 0  # how many times in a row the current char has matched its predecessor
    for position, char in enumerate(line):
        if position == 0:
            # The first character has no predecessor to compare against.
            continue
        if line[position - 1] == char:
            repeats += 1
        else:
            repeats = 0
        if repeats >= 2:
            surplus.append(position)
    return surplus
def truncate_duplicated_letters_to_two(line):
    """Collapse every run of three or more identical characters to two.

    Args:
        line: String to process.

    Returns:
        A copy of ``line`` in which no character occurs more than twice in a
        row, e.g. ``"sooooo goood"`` becomes ``"soo good"``. Comparison is
        exact, so ``"aAa"`` is left unchanged.
    """
    # A single linear regex pass replaces the earlier per-character scan,
    # which tested every index against a list of indices (quadratic on long
    # inputs) and built the result with repeated string concatenation.
    # DOTALL makes "." match newlines too, so runs of "\n" are truncated like
    # any other character.
    return re.sub(r"(.)\1{2,}", r"\1\1", line, flags=re.DOTALL)
def clean_data(l):
    """Clean every item of ``l`` and return the results as a list.

    Each item is passed through ``clean_lines`` with the module-level
    ``change_dict``, then through ``truncate_duplicated_letters_to_two``, and
    finally stripped of leading and trailing whitespace.

    Args:
        l: Iterable of items to clean.

    Returns:
        A list of cleaned strings, in the same order as the input.
    """
    cleaned = []
    for raw in l:
        normalized = clean_lines(raw, change_dict)
        cleaned.append(truncate_duplicated_letters_to_two(normalized).strip())
    return cleaned
# %%
# Hard-coded sample texts that are run through the pipeline below.
text_to_predict = [
    "ethereum is great asset",
    "etherum is goin down",
]
# Apply the same text clean-up as clean_data() before vectorizing.
data_clean = clean_data(text_to_predict)
# Transform the cleaned texts with the unpickled vectorizer, predict with the
# unpickled model, and convert the prediction result to a plain Python list.
test_matrix = vectorizer.transform(data_clean)
data_predicted = model.predict(test_matrix).tolist()
# %%
# Count the predictions equal to 1 and to -1; any other value is ignored.
positives = sum(1 for label in data_predicted if label == 1)
negatives = sum(1 for label in data_predicted if label == -1)
# %%
# Share of predictions equal to 1 ("pos_perc") and to -1 ("neg_perc") among
# those two classes. The values are fractions between 0 and 1, not percentages.
# When no prediction equals 1 or -1 the total is zero, so both shares are
# reported as 0.0 instead of raising ZeroDivisionError.
rated_total = positives + negatives
data_to_send = {
    "pos_perc": positives / rated_total if rated_total else 0.0,
    "neg_perc": negatives / rated_total if rated_total else 0.0,
}