model pickling

This commit is contained in:
Krzysztof Szubiczuk 2022-01-27 17:56:37 +01:00
parent 4b4ede27c5
commit 42de0bde5e
5 changed files with 197 additions and 1 deletions

BIN
model_pkl Normal file

Binary file not shown.

View File

@ -9,6 +9,7 @@ from sklearn.feature_extraction.text \
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from copy import deepcopy
# %% [markdown]
### Reading data - this part needs changing when the data changes
@ -101,6 +102,7 @@ def train_model_and_predict(train_set, test_set,
# print(confusion_matrix(predictions,y_test_arr))
print("Classification report")
print(classification_report(predictions,y_test_arr))
return
# %%
vectorizers = [
("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
@ -119,4 +121,6 @@ for vec in vectorizers:
model = LogisticRegression(max_iter=1000),
colname_text = 'Tweet',
colname_sent = 'sent_score')
# %%
# %% [markdown]
### TODO:
#### models to test: SVM, Random Trees, Bayes

122
twitter_model.py Normal file
View File

@ -0,0 +1,122 @@
# %%
import pandas as pd
import os
import re
import numpy as np
from sklearn.feature_extraction.text \
import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from copy import deepcopy
# %% [markdown]
### Reading data - this part needs changing when the data changes
# %%
# Locate the example CSV in the current working directory.  os.path.join is
# used instead of manual '/' concatenation so the path is built with the
# platform's separator.
path = os.getcwd()
filename = 'BTC_tweets_daily_example.csv'
filepath = os.path.join(path, filename)
# The first row of the file is the header; fields are comma separated.
data_all = pd.read_csv(
    filepath,
    header=0,
    delimiter=',',
    # encoding_errors='surrogateescape'
)
# %% [markdown]
### Function definitions
# %%
# Mapping of replacement text -> list of regex patterns replaced by that text.
# clean_lines walks the entries in insertion order, so the token rules run
# before the whitespace/punctuation standardization rules.
# Patterns are raw strings so regex escapes such as \w or \s are passed to
# the regex engine untouched instead of being invalid Python string escapes.
change_dict = {
    # tokens
    " username ": [r'@\w+|@'],
    " url ": [r'http\S*'],
    " emoji ": [
        r"[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*",
        r"[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]",
    ],
    " number ": [r"[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}', r'\n', r'^rt[\s]+', r'\s\:\s'],
    # NOTE(review): this pattern looks like a rendered U+FFFD replacement
    # character (bytes EF BF BD) -- confirm against the file on disk.
    "'": ["<EFBFBD>"],
    '?': [r"\s\?"],
    '!': [r"\s\!"],
    '".': [r"\s\"\."],
    '",': [r"\s\"\,"],
    '" ': [r"\s\"\s"],
}
def clean_lines(line, change_dict):
    """Lower-case ``line`` and apply every regex replacement in ``change_dict``.

    Parameters
    ----------
    line : object
        Value to clean; it is converted with ``str`` before processing.
    change_dict : dict
        Maps a replacement string to a list of regex patterns.  Entries are
        applied in the dict's iteration order, each pattern in list order.

    Returns
    -------
    str
        The text after all substitutions.
    """
    cleaned = str(line).lower()
    for replacement, patterns in change_dict.items():
        for pattern in patterns:
            cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def get_rep_idx_to_cut_out_from_str(line):
    """Return indices of characters that are the third or later in a run.

    A run is a sequence of identical consecutive characters; the first two
    characters of every run are never reported.
    """
    indices = []
    repeats = 0  # how many times the current character repeated its predecessor
    for position in range(1, len(line)):
        if line[position] == line[position - 1]:
            repeats += 1
        else:
            repeats = 0
        if repeats >= 2:
            indices.append(position)
    return indices
def truncate_duplicated_letters_to_two(line):
    """Shorten every run of identical consecutive characters to at most two.

    The string is scanned once and the kept characters are joined at the
    end, so the cost is linear in the length of ``line`` (no per-character
    list lookups and no repeated string concatenation).

    Parameters
    ----------
    line : str
        Text to process.

    Returns
    -------
    str
        ``line`` with the third and later characters of each run removed.
    """
    kept = []
    previous = None
    run_length = 0
    for char in line:
        # Extend the current run or start a new one.
        run_length = run_length + 1 if char == previous else 1
        if run_length <= 2:
            kept.append(char)
        previous = char
    return "".join(kept)
# %% [markdown]
### Cleaning
# %%
def clean_data_frame(df, text_colanme = "Tweet", is_sent_colname = True, sent_colname = "sent_score"):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text.
    text_colanme : str
        Name of the column with the text to clean.  The cleaned text is
        written back to this same column.
    is_sent_colname : bool
        When true, rows with a missing value in ``sent_colname`` are dropped.
    sent_colname : str
        Name of the sentiment column used for the missing-value filter.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy; returned whether or not the sentiment filter ran.
    """
    cleaned = df.copy()
    texts = [clean_lines(x, change_dict) for x in cleaned.loc[:, text_colanme].values.tolist()]
    texts = [truncate_duplicated_letters_to_two(x).strip() for x in texts]
    # Write to the requested column rather than a hard-coded one.
    cleaned[text_colanme] = texts
    if is_sent_colname:
        cleaned = cleaned.dropna(subset=[sent_colname])
    return cleaned
# %%
data_all_clean = clean_data_frame(data_all)
# %% [markdown]
### Testing models
# %%
# Copy the two columns so that adding 'random_number' below does not write
# into a slice of data_all_clean.
data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']].copy()
idx = data_model.index
# np.random.rand draws uniformly from [0, 1), so the 0.8 threshold sends
# roughly 80% of the rows to the training set (randn would draw from a
# standard normal distribution instead).
data_model['random_number'] = np.random.rand(len(idx))
train_set = data_model[data_model['random_number'] <= 0.8]
test_set = data_model[data_model['random_number'] > 0.8]
# %%
def train_model_and_predict(train_set, test_set,
                            vectorizer,
                            # vectorizer_name,
                            model,
                            colname_text = 'Tweet',
                            colname_sent = 'sent_score'):
    """Fit ``vectorizer`` and ``model`` on the training data and predict the test data.

    Parameters
    ----------
    train_set, test_set
        Containers indexable by column name (``train_set[colname_text]``).
    vectorizer
        Object with ``fit_transform`` and ``transform``; it is fitted on the
        training text only and then applied to the test text.
    model
        Object with ``fit`` and ``predict``; the result of ``predict`` must
        provide ``tolist``.
    colname_text : str
        Column holding the text.
    colname_sent : str
        Column holding the training labels.

    Returns
    -------
    dict
        ``"model"`` (fitted model), ``"vectorizer"`` (fitted vectorizer,
        needed to transform new text for the model), ``"predictions"``
        (list of predictions for ``test_set``) and ``"test_set"``.
    """
    X_train = vectorizer.fit_transform(train_set[colname_text])
    X_test = vectorizer.transform(test_set[colname_text])
    model.fit(X_train, train_set[colname_sent])
    predictions = model.predict(X_test).tolist()
    return {"model": model,
            "vectorizer": vectorizer,
            "predictions": predictions,
            "test_set": test_set}
# %%
# Keep a reference to the vectorizer: train_model_and_predict fits it, and
# the same fitted object is required later to transform new text.
tweet_vectorizer = TfidfVectorizer(norm=None)
results_model = train_model_and_predict(train_set, test_set,
                                        vectorizer = tweet_vectorizer,
                                        # vectorizer_name = vec[0],
                                        model = LogisticRegression(max_iter=1000),
                                        colname_text = 'Tweet',
                                        colname_sent = 'sent_score')
# %%
tweet_model = results_model["model"]
# %%
import pickle
# %%
with open('model_pkl', 'wb') as files:
    pickle.dump(tweet_model, files)
# twitter_pred.py loads 'vectorizer_pkl' next to 'model_pkl', so persist the
# fitted vectorizer as well.
with open('vectorizer_pkl', 'wb') as files:
    pickle.dump(tweet_vectorizer, files)
# %%

70
twitter_pred.py Normal file
View File

@ -0,0 +1,70 @@
# %%
import pickle
import json
import re
# %%
# Load the pickled model and vectorizer from the working directory.
# NOTE: pickle.load can execute arbitrary code; only load files that come
# from a trusted source.
with open('model_pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# %%
with open('vectorizer_pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
# %%
# Mapping of replacement text -> list of regex patterns replaced by that text.
# clean_lines walks the entries in insertion order, so the token rules run
# before the whitespace/punctuation standardization rules.
# Patterns are raw strings so regex escapes such as \w or \s are passed to
# the regex engine untouched instead of being invalid Python string escapes.
change_dict = {
    # tokens
    " username ": [r'@\w+|@'],
    " url ": [r'http\S*'],
    " emoji ": [
        r"[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*",
        r"[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]",
    ],
    " number ": [r"[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}', r'\n', r'^rt[\s]+', r'\s\:\s'],
    # NOTE(review): this pattern looks like a rendered U+FFFD replacement
    # character (bytes EF BF BD) -- confirm against the file on disk.
    "'": ["<EFBFBD>"],
    '?': [r"\s\?"],
    '!': [r"\s\!"],
    '".': [r"\s\"\."],
    '",': [r"\s\"\,"],
    '" ': [r"\s\"\s"],
}
def clean_lines(line, change_dict):
    """Lower-case ``line`` and apply every regex replacement in ``change_dict``.

    Parameters
    ----------
    line : object
        Value to clean; it is converted with ``str`` before processing.
    change_dict : dict
        Maps a replacement string to a list of regex patterns.  Entries are
        applied in the dict's iteration order, each pattern in list order.

    Returns
    -------
    str
        The text after all substitutions.
    """
    cleaned = str(line).lower()
    for replacement, patterns in change_dict.items():
        for pattern in patterns:
            cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def get_rep_idx_to_cut_out_from_str(line):
    """Return indices of characters that are the third or later in a run.

    A run is a sequence of identical consecutive characters; the first two
    characters of every run are never reported.
    """
    indices = []
    repeats = 0  # how many times the current character repeated its predecessor
    for position in range(1, len(line)):
        if line[position] == line[position - 1]:
            repeats += 1
        else:
            repeats = 0
        if repeats >= 2:
            indices.append(position)
    return indices
def truncate_duplicated_letters_to_two(line):
    """Shorten every run of identical consecutive characters to at most two.

    The string is scanned once and the kept characters are joined at the
    end, so the cost is linear in the length of ``line`` (no per-character
    list lookups and no repeated string concatenation).

    Parameters
    ----------
    line : str
        Text to process.

    Returns
    -------
    str
        ``line`` with the third and later characters of each run removed.
    """
    kept = []
    previous = None
    run_length = 0
    for char in line:
        # Extend the current run or start a new one.
        run_length = run_length + 1 if char == previous else 1
        if run_length <= 2:
            kept.append(char)
        previous = char
    return "".join(kept)
def clean_data(l):
    """Clean every item of ``l`` and return the results as a list.

    Each item goes through ``clean_lines`` with the module-level
    ``change_dict``, then has repeated characters truncated and surrounding
    whitespace stripped.
    """
    cleaned = []
    for raw in l:
        substituted = clean_lines(raw, change_dict)
        cleaned.append(truncate_duplicated_letters_to_two(substituted).strip())
    return cleaned
# %%
# Clean the example texts, vectorize them with the loaded vectorizer and
# classify them with the loaded model.
text_to_predict = ["ethereum is great asset", "etherum is goin down"]
data_clean = clean_data(text_to_predict)
test_matrix = vectorizer.transform(data_clean)
data_predicted = model.predict(test_matrix).tolist()
# %%
# Count predictions equal to 1 and to -1; any other value is ignored.
positives = sum(1 for x in data_predicted if x == 1)
negatives = sum(1 for x in data_predicted if x == -1)
# %%
# Shares are relative to the counted predictions only.  When nothing was
# counted, report 0.0 for both instead of dividing by zero.
total_counted = positives + negatives
data_to_send = {"pos_perc": positives / total_counted if total_counted else 0.0,
                "neg_perc": negatives / total_counted if total_counted else 0.0}

BIN
vectorizer_pkl Normal file

Binary file not shown.