diff --git a/model_pkl b/model_pkl
new file mode 100644
index 0000000..e3a58bf
Binary files /dev/null and b/model_pkl differ
diff --git a/twitter.py b/twitter.py
index d340896..b393e6f 100644
--- a/twitter.py
+++ b/twitter.py
@@ -9,6 +9,7 @@ from sklearn.feature_extraction.text \
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix,classification_report
+from sklearn.model_selection import train_test_split
 from copy import deepcopy
 # %% [markdown]
 ### Reading data - this part need changing when data
@@ -101,6 +102,7 @@ def train_model_and_predict(train_set, test_set,
     # print(confusion_matrix(predictions,y_test_arr))
     print("Classification report")
     print(classification_report(predictions,y_test_arr))
+    return
 # %%
 vectorizers = [
     ("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
@@ -119,4 +121,6 @@ for vec in vectorizers:
                             model = LogisticRegression(max_iter=1000),
                             colname_text = 'Tweet',
                             colname_sent = 'sent_score')
-# %%
+# %% [markdown]
+### TODO:
+#### models to test: SVM, Random Trees, Bayes
\ No newline at end of file
diff --git a/twitter_model.py b/twitter_model.py
new file mode 100644
index 0000000..03515aa
--- /dev/null
+++ b/twitter_model.py
@@ -0,0 +1,122 @@
+# %%
+import pandas as pd
+import os
+import re
+import numpy as np
+from sklearn.feature_extraction.text \
+    import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import confusion_matrix,classification_report
+from sklearn.model_selection import train_test_split
+from copy import deepcopy
+# %% [markdown]
+### Reading data - this part need changing when data
+# %%
+path = os.getcwd()
+filename = 'BTC_tweets_daily_example.csv'
+filepath = path+'/'+filename
+data_all = pd.read_csv(filepath, header=0,
+                       delimiter=',',
+                       # encoding_errors='surrogateescape'
+                       )
+# %% [markdown]
+### Function definitions
+# %%
+change_dict = {
+    # tokens
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
+    # standardization
+    ', ': ['\s,'],
+    '. ': ['\s\.'],
+    ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
+    "'": ["�"],
+    '?': ["\s\?"],
+    '!': ["\s\!"],
+    '".': ["\s\"\."],
+    '",': ["\s\"\,"],
+    '" ': ["\s\"\s"]
+    }
+
+def clean_lines(line, change_dict):
+    line = str(line).lower()
+    for change_to, change_regex_list in change_dict.items():
+        for change_regex in change_regex_list:
+            line = re.sub(change_regex, change_to, line)
+    return line
+
+def get_rep_idx_to_cut_out_from_str(line):
+    occurrence = 0
+    idx_to_cut = []
+    for idx, letter in enumerate(line):
+        if idx > 0:
+            occurrence = occurrence+1 if line[idx-1] == letter else 0
+            if occurrence >= 2:
+                idx_to_cut.append(idx)
+    return idx_to_cut
+
+def truncate_duplicated_letters_to_two(line):
+    idx_to_cut = get_rep_idx_to_cut_out_from_str(line)
+    str_out = ''
+    for i, s in enumerate(line):
+        if i not in idx_to_cut:
+            str_out += s
+    return str_out
+# %% [markdown]
+### Cleaning
+# %%
+def clean_data_frame(df, text_colname = "Tweet", is_sent_colname = True, sent_colname = "sent_score"):
+    data_all = deepcopy(df)
+    text = [clean_lines(x, change_dict) for x in data_all.loc[:, text_colname].values.tolist()]
+    text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
+    # data_all_clean = deepcopy(df)
+    data_all[text_colname] = text
+    if is_sent_colname:
+        data_all_clean = data_all.dropna(subset = [sent_colname], inplace=False)
+    return data_all_clean
+# %%
+data_all_clean = clean_data_frame(data_all)
+# %% [markdown]
+### Testing models
+# %%
+data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']]
+idx = data_model.index
+data_model['random_number'] = np.random.randn(len(idx))
+train_set = data_model[data_model['random_number'] <= 0.8]
+test_set = data_model[data_model['random_number'] > 0.8]
+# %%
+def train_model_and_predict(train_set, test_set,
+                            vectorizer,
+                            # vectorizer_name,
+                            model,
+                            colname_text = 'Tweet',
+                            colname_sent = 'sent_score'):
+    train_matrix = vectorizer.fit_transform(train_set[colname_text])
+    test_matrix = vectorizer.transform(test_set[colname_text])
+    X_train = train_matrix
+    X_test = test_matrix
+    y_train = train_set[colname_sent]
+    y_test = test_set[colname_sent]
+    model.fit(X_train,y_train)
+    predictions = model.predict(X_test).tolist()
+    y_test_arr = np.asarray(y_test)
+    return {"model": model, "predictions": predictions, "test_set": test_set}
+# %%
+results_model = train_model_and_predict(train_set, test_set,
+                                        vectorizer = TfidfVectorizer(norm=None),
+                                        # vectorizer_name = vec[0],
+                                        model = LogisticRegression(max_iter=1000),
+                                        colname_text = 'Tweet',
+                                        colname_sent = 'sent_score')
+# %%
+tweet_model = results_model["model"]
+# %%
+import pickle
+
+# %%
+with open('model_pkl', 'wb') as files:
+    pickle.dump(tweet_model, files)
+# %%
diff --git a/twitter_pred.py b/twitter_pred.py
new file mode 100644
index 0000000..a6169f2
--- /dev/null
+++ b/twitter_pred.py
@@ -0,0 +1,70 @@
+# %%
+import pickle
+import json
+import re
+# %%
+with open('model_pkl' , 'rb') as f:
+    model = pickle.load(f)
+# %%
+with open('vectorizer_pkl' , 'rb') as f:
+    vectorizer = pickle.load(f)
+# %%
+change_dict = {
+    # tokens
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
+    # standardization
+    ', ': ['\s,'],
+    '. ': ['\s\.'],
+    ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
+    "'": ["�"],
+    '?': ["\s\?"],
+    '!': ["\s\!"],
+    '".': ["\s\"\."],
+    '",': ["\s\"\,"],
+    '" ': ["\s\"\s"]
+    }
+
+def clean_lines(line, change_dict):
+    line = str(line).lower()
+    for change_to, change_regex_list in change_dict.items():
+        for change_regex in change_regex_list:
+            line = re.sub(change_regex, change_to, line)
+    return line
+
+def get_rep_idx_to_cut_out_from_str(line):
+    occurrence = 0
+    idx_to_cut = []
+    for idx, letter in enumerate(line):
+        if idx > 0:
+            occurrence = occurrence+1 if line[idx-1] == letter else 0
+            if occurrence >= 2:
+                idx_to_cut.append(idx)
+    return idx_to_cut
+
+def truncate_duplicated_letters_to_two(line):
+    idx_to_cut = get_rep_idx_to_cut_out_from_str(line)
+    str_out = ''
+    for i, s in enumerate(line):
+        if i not in idx_to_cut:
+            str_out += s
+    return str_out
+
+def clean_data(l):
+    text = [clean_lines(x, change_dict) for x in l]
+    text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
+    return text
+# %%
+text_to_predict = ["ethereum is great asset", "etherum is goin down"]
+data_clean = clean_data(text_to_predict)
+test_matrix = vectorizer.transform(data_clean)
+data_predicted = model.predict(test_matrix).tolist()
+
+# %%
+positives = sum([1 for x in data_predicted if x == 1])
+negatives = sum([1 for x in data_predicted if x == -1])
+# %%
+data_to_send = {"pos_perc": positives/(positives+negatives),
+                "neg_perc": negatives/(positives+negatives)}
\ No newline at end of file
diff --git a/vectorizer_pkl b/vectorizer_pkl
new file mode 100644
index 0000000..a4d03aa
Binary files /dev/null and b/vectorizer_pkl differ
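
Note on vectorizer_pkl: twitter_pred.py unpickles 'vectorizer_pkl', but none of the scripts in this diff write that file (twitter_model.py pickles only the model). A minimal sketch of how the fitted vectorizer could be saved, assuming the TfidfVectorizer instance is kept in a local variable (tweet_vectorizer is a hypothetical name) and reusing train_model_and_predict from twitter_model.py, which fits the vectorizer it is given in place via fit_transform:

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Hypothetical step, not part of this diff: keep a reference to the vectorizer
# so its fitted state can be pickled for twitter_pred.py to load later.
tweet_vectorizer = TfidfVectorizer(norm=None)
results_model = train_model_and_predict(train_set, test_set,
                                        vectorizer = tweet_vectorizer,
                                        model = LogisticRegression(max_iter=1000),
                                        colname_text = 'Tweet',
                                        colname_sent = 'sent_score')

# Dump the now-fitted vectorizer next to model_pkl.
with open('vectorizer_pkl', 'wb') as f:
    pickle.dump(tweet_vectorizer, f)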