70 lines
2.0 KiB
Python
70 lines
2.0 KiB
Python
# %%
|
||
import pickle
|
||
import json
|
||
import re
|
||
# %%
# Restore the pickled object stored in 'model_pkl'; the relative path is
# resolved against the current working directory.
with open("model_pkl", "rb") as model_file:
    model = pickle.load(model_file)

# %%
# Restore the pickled object stored in 'vectorizer_pkl' the same way.
with open("vectorizer_pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
|
||
# %%
|
||
# Regex substitutions used by clean_data() via clean_lines(): each key is the
# replacement text and each value is the list of patterns it replaces.
# Entries are applied in insertion order, so the order below matters.
# Patterns are raw strings so that regex escapes such as \w or \s are not
# parsed as (invalid) Python string escapes.
change_dict = {
    # tokens
    " username ": [r'@\w+|@'],
    " url ": [r'http\S*'],
    " emoji ": [
        r"[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*",
        r"""[^\w\s,.?!:;#'"\(\)\$\-\+%\[\]\|]""",
    ],
    " number ": [r"[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    # '\n' is intentionally a normal (non-raw) string: a literal newline.
    ' ': [r'\s{2,}', '\n', r'^rt[\s]+', r'\s\:\s'],
    # NOTE(review): "<EFBFBD>" looks like the UTF-8 bytes of U+FFFD (the
    # replacement character) shown as hex text; kept verbatim here -- confirm
    # what the pattern is really meant to match.
    "'": ["<EFBFBD>"],
    '?': [r"\s\?"],
    '!': [r"\s\!"],
    '".': [r'\s"\.'],
    '",': [r'\s"\,'],
    '" ': [r'\s"\s'],
}
|
||
|
||
def clean_lines(line, change_dict):
    """Lower-case ``line`` and apply every regex substitution in ``change_dict``.

    Args:
        line: Value to clean; it is converted with ``str()`` first.
        change_dict: Mapping of replacement text to a list of regex patterns.
            Entries are applied in the mapping's iteration order, and each
            pattern operates on the result of the previous substitutions.

    Returns:
        The cleaned string.
    """
    cleaned = str(line).lower()
    for replacement in change_dict:
        for pattern in change_dict[replacement]:
            cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
||
|
||
def get_rep_idx_to_cut_out_from_str(line):
    """Return the indexes of characters that extend a run beyond two repeats.

    Args:
        line: String to scan.

    Returns:
        Ascending list of indexes whose character is the third or later
        consecutive occurrence of the same character.
    """
    surplus_positions = []
    streak = 0  # how many times the current character repeated its predecessor
    for pos in range(1, len(line)):
        if line[pos] == line[pos - 1]:
            streak += 1
        else:
            streak = 0
        if streak >= 2:
            surplus_positions.append(pos)
    return surplus_positions
|
||
|
||
def truncate_duplicated_letters_to_two(line):
    """Collapse every run of three or more identical characters to two.

    Works in a single pass: the length of the current run is tracked and
    only the first two characters of each run are kept. This yields the same
    result as removing the indexes reported by
    ``get_rep_idx_to_cut_out_from_str`` without a list-membership test per
    character or repeated string concatenation.

    Args:
        line: String to process.

    Returns:
        ``line`` with each run of identical characters limited to length two.
    """
    kept = []
    run_length = 0
    previous = None  # no character seen yet, so the first one starts a run
    for char in line:
        run_length = run_length + 1 if char == previous else 1
        if run_length <= 2:
            kept.append(char)
        previous = char
    return "".join(kept)
|
||
|
||
def clean_data(l):
    """Clean every item of ``l`` for vectorization.

    Each item is normalised by ``clean_lines`` with the module-level
    ``change_dict``, then has runs of repeated characters limited to two by
    ``truncate_duplicated_letters_to_two``, and is finally stripped of
    leading and trailing whitespace.

    Args:
        l: Iterable of texts to clean.

    Returns:
        List of cleaned strings, in input order.
    """
    cleaned = []
    for raw in l:
        normalised = clean_lines(raw, change_dict)
        cleaned.append(truncate_duplicated_letters_to_two(normalised).strip())
    return cleaned
|
||
# %%
# Clean the sample texts, vectorize them and predict a label for each one.
text_to_predict = ["ethereum is great asset", "etherum is goin down"]
data_clean = clean_data(text_to_predict)
test_matrix = vectorizer.transform(data_clean)
data_predicted = model.predict(test_matrix).tolist()

# %%
# Count predictions equal to 1 and to -1; any other value is ignored.
positives = data_predicted.count(1)
negatives = data_predicted.count(-1)

# %%
# Share of each label among the counted predictions. When nothing was
# predicted as 1 or -1 the total is zero, so report 0.0 for both shares
# instead of raising ZeroDivisionError.
total = positives + negatives
data_to_send = {
    "pos_perc": positives / total if total else 0.0,
    "neg_perc": negatives / total if total else 0.0,
}