Compare commits: master...sent_analy
5 commits

Author | SHA1 | Date
---|---|---
 | 42de0bde5e |
 | 4b4ede27c5 |
 | 8a94bb7f1f |
 | eea6f1b259 |
 | de6152b9f8 |
.gitignore (vendored): 3 lines changed

@@ -513,6 +513,9 @@ FodyWeavers.xsd
 *.msm
 *.msp
 
+# Big csv files
+*.csv
+
 # JetBrains Rider
 
 ### VisualStudio Patch ###
twitter.py: 91 lines changed

@@ -1,35 +1,49 @@
 # %%
+# from platform import java_ver
 import pandas as pd
 import os
 import re
+import numpy as np
+from sklearn.feature_extraction.text \
+    import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import confusion_matrix,classification_report
+from sklearn.model_selection import train_test_split
+from copy import deepcopy
 # %% [markdown]
 ### Reading data - this part need changing when data
 # %%
 path = os.getcwd()
-filename = 'training_data_clean.csv'
+filename = 'BTC_tweets_daily_example.csv'
 filepath = path+'/'+filename
-data = pd.read_csv(filepath, header=None,
-                   delimiter=',', encoding_errors='surrogateescape')
-data.columns = ['index', 'id','date', 'query', 'user', 'text']
+data_all = pd.read_csv(filepath, header=0,
+                       delimiter=',',
+                       # encoding_errors='surrogateescape'
+                       )
 # %% [markdown]
 ### Function definitions
 # %%
 change_dict = {
     # tokens
-    "USERNAME": ['@\w+|@'],
-    "URL": ['http\S*'],
-    "EMOJI": ["[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
     # standardization
     ', ': ['\s,'],
     '. ': ['\s\.'],
-    ' ': ['\s{2,}'],
+    ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
     "'": ["�"],
-    '?': ["\s\?+|\?+"],
-    '!': ["\s\!+|\!+"]
+    '?': ["\s\?"],
+    '!': ["\s\!"],
+    '".': ["\s\"\."],
+    '",': ["\s\"\,"],
+    '" ': ["\s\"\s"]
 }
 
 def clean_lines(line, change_dict):
-    line = line.lower()
+    line = str(line).lower()
     for change_to, change_regex_list in change_dict.items():
         for change_regex in change_regex_list:
             line = re.sub(change_regex, change_to, line)

@@ -55,7 +69,58 @@ def truncate_duplicated_letters_to_two(line):
 # %% [markdown]
 ### Cleaning
 # %%
-text = [clean_lines(x, change_dict) for x in data.loc[:, 'text'].values.tolist()]
+text = [clean_lines(x, change_dict) for x in data_all.loc[:, 'Tweet'].values.tolist()]
 text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
-data.text = text
+data_all_clean = deepcopy(data_all)
+data_all_clean.Tweet = text
+data_all_clean = data_all_clean.dropna(subset = ["sent_score"], inplace=False)
+# %% [markdown]
+### Testing models
 # %%
+data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']]
+idx = data_model.index
+data_model['random_number'] = np.random.randn(len(idx))
+train_set = data_model[data_model['random_number'] <= 0.8]
+test_set = data_model[data_model['random_number'] > 0.8]
+# %%
+def train_model_and_predict(train_set, test_set,
+                            vectorizer, vectorizer_name,
+                            model,
+                            colname_text = 'Tweet',
+                            colname_sent = 'sent_score'):
+    train_matrix = vectorizer.fit_transform(train_set[colname_text])
+    test_matrix = vectorizer.transform(test_set[colname_text])
+    X_train = train_matrix
+    X_test = test_matrix
+    y_train = train_set[colname_sent]
+    y_test = test_set[colname_sent]
+    model.fit(X_train,y_train)
+    predictions = model.predict(X_test)
+    y_test_arr = np.asarray(y_test)
+    print(f"{vectorizer_name}")
+    # print("Confussion matrix")
+    # print(confusion_matrix(predictions,y_test_arr))
+    print("Classification report")
+    print(classification_report(predictions,y_test_arr))
+    return
+# %%
+vectorizers = [
+    ("CountVectorizer", CountVectorizer(token_pattern=r'\b\w+\b')),
+    ("HashingVectorizer, n_features=2**15", HashingVectorizer(n_features=2**15, analyzer='word', token_pattern=r'\b\w+\b')),
+    # ("HashingVectorizer, n_features=2**20", HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
+    ("TfidfVectorizer", TfidfVectorizer()),
+    ("TfidfVectorizer, smooth_idf=False", TfidfVectorizer(smooth_idf=False)),
+    ("TfidfVectorizer, sublinear_tf=True", TfidfVectorizer(sublinear_tf=True)),
+    ("TfidfVectorizer, norm=None", TfidfVectorizer(norm=None)),
+]
+# %%
+for vec in vectorizers:
+    train_model_and_predict(train_set, test_set,
+                            vectorizer = vec[1],
+                            vectorizer_name = vec[0],
+                            model = LogisticRegression(max_iter=1000),
+                            colname_text = 'Tweet',
+                            colname_sent = 'sent_score')
+# %% [markdown]
+### TODO:
+#### models to test: SVM, Random Trees, Bayes
twitter_model.py (new file): 122 lines

@@ -0,0 +1,122 @@
+# %%
+import pandas as pd
+import os
+import re
+import numpy as np
+from sklearn.feature_extraction.text \
+    import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import confusion_matrix,classification_report
+from sklearn.model_selection import train_test_split
+from copy import deepcopy
+# %% [markdown]
+### Reading data - this part need changing when data
+# %%
+path = os.getcwd()
+filename = 'BTC_tweets_daily_example.csv'
+filepath = path+'/'+filename
+data_all = pd.read_csv(filepath, header=0,
+                       delimiter=',',
+                       # encoding_errors='surrogateescape'
+                       )
+# %% [markdown]
+### Function definitions
+# %%
+change_dict = {
+    # tokens
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
+    # standardization
+    ', ': ['\s,'],
+    '. ': ['\s\.'],
+    ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
+    "'": ["�"],
+    '?': ["\s\?"],
+    '!': ["\s\!"],
+    '".': ["\s\"\."],
+    '",': ["\s\"\,"],
+    '" ': ["\s\"\s"]
+}
+
+def clean_lines(line, change_dict):
+    line = str(line).lower()
+    for change_to, change_regex_list in change_dict.items():
+        for change_regex in change_regex_list:
+            line = re.sub(change_regex, change_to, line)
+    return line
+
+def get_rep_idx_to_cut_out_from_str(line):
+    occurence = 0
+    idx_to_cut = []
+    for idx, letter in enumerate(line):
+        if idx > 0:
+            occurence = occurence+1 if line[idx-1] == letter else 0
+            if occurence >= 2:
+                idx_to_cut.append(idx)
+    return idx_to_cut
+
+def truncate_duplicated_letters_to_two(line):
+    idx_to_cut = get_rep_idx_to_cut_out_from_str(line)
+    str_out =''
+    for i,s in enumerate(line):
+        if i not in idx_to_cut:
+            str_out += s
+    return str_out
+# %% [markdown]
+### Cleaning
+# %%
+def clean_data_frame(df, text_colanme = "Tweet", is_sent_colname = True, sent_colname = "sent_score"):
+    data_all = deepcopy(df)
+    text = [clean_lines(x, change_dict) for x in data_all.loc[:, text_colanme].values.tolist()]
+    text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
+    # data_all_clean = deepcopy(df)
+    data_all.Tweet = text
+    if is_sent_colname:
+        data_all_clean = data_all.dropna(subset = [sent_colname], inplace=False)
+    return data_all_clean
+# %%
+data_all_clean = clean_data_frame(data_all)
+# %% [markdown]
+### Testing models
+# %%
+data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']]
+idx = data_model.index
+data_model['random_number'] = np.random.randn(len(idx))
+train_set = data_model[data_model['random_number'] <= 0.8]
+test_set = data_model[data_model['random_number'] > 0.8]
+# %%
+def train_model_and_predict(train_set, test_set,
+                            vectorizer,
+                            # vectorizer_name,
+                            model,
+                            colname_text = 'Tweet',
+                            colname_sent = 'sent_score'):
+    train_matrix = vectorizer.fit_transform(train_set[colname_text])
+    test_matrix = vectorizer.transform(test_set[colname_text])
+    X_train = train_matrix
+    X_test = test_matrix
+    y_train = train_set[colname_sent]
+    y_test = test_set[colname_sent]
+    model.fit(X_train,y_train)
+    predictions = model.predict(X_test).tolist()
+    y_test_arr = np.asarray(y_test)
+    return {"model": model, "predictions": predictions, "test_set": test_set}
+# %%
+results_model = train_model_and_predict(train_set, test_set,
+                                        vectorizer = TfidfVectorizer(norm=None),
+                                        # vectorizer_name = vec[0],
+                                        model = LogisticRegression(max_iter=1000),
+                                        colname_text = 'Tweet',
+                                        colname_sent = 'sent_score')
+# %%
+tweet_model = results_model["model"]
+# %%
+import pickle
+
+# %%
+with open('model_pkl', 'wb') as files:
+    pickle.dump(tweet_model, files)
+# %%
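twitter_model.py pickles only the fitted LogisticRegression into model_pkl, while twitter_pred.py below also loads a vectorizer_pkl (added as a binary later in this compare). The code that writes vectorizer_pkl is not shown in the diff; a minimal sketch of how it could be produced, assuming a fitted TfidfVectorizer is available (the name tweet_vectorizer and the placeholder corpus are illustrative):

```python
# Assumed sketch; not part of the diff. Persist the fitted vectorizer so that
# twitter_pred.py can call vectorizer.transform() on new tweets.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

tweet_vectorizer = TfidfVectorizer(norm=None)  # same settings used for results_model above
tweet_vectorizer.fit(["example tweet one", "example tweet two"])  # placeholder corpus

with open('vectorizer_pkl', 'wb') as files:
    pickle.dump(tweet_vectorizer, files)
```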
twitter_pred.py (new file): 70 lines

@@ -0,0 +1,70 @@
+# %%
+import pickle
+import json
+import re
+# %%
+with open('model_pkl' , 'rb') as f:
+    model = pickle.load(f)
+# %%
+with open('vectorizer_pkl' , 'rb') as f:
+    vectorizer = pickle.load(f)
+# %%
+change_dict = {
+    # tokens
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": ["[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
+    # standardization
+    ', ': ['\s,'],
+    '. ': ['\s\.'],
+    ' ': ['\s{2,}', '\n', '^rt[\s]+', '\s\:\s'],
+    "'": ["�"],
+    '?': ["\s\?"],
+    '!': ["\s\!"],
+    '".': ["\s\"\."],
+    '",': ["\s\"\,"],
+    '" ': ["\s\"\s"]
+}
+
+def clean_lines(line, change_dict):
+    line = str(line).lower()
+    for change_to, change_regex_list in change_dict.items():
+        for change_regex in change_regex_list:
+            line = re.sub(change_regex, change_to, line)
+    return line
+
+def get_rep_idx_to_cut_out_from_str(line):
+    occurence = 0
+    idx_to_cut = []
+    for idx, letter in enumerate(line):
+        if idx > 0:
+            occurence = occurence+1 if line[idx-1] == letter else 0
+            if occurence >= 2:
+                idx_to_cut.append(idx)
+    return idx_to_cut
+
+def truncate_duplicated_letters_to_two(line):
+    idx_to_cut = get_rep_idx_to_cut_out_from_str(line)
+    str_out =''
+    for i,s in enumerate(line):
+        if i not in idx_to_cut:
+            str_out += s
+    return str_out
+
+def clean_data(l):
+    text = [clean_lines(x, change_dict) for x in l]
+    text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
+    return text
+# %%
+text_to_predict = ["ethereum is great asset", "etherum is goin down"]
+data_clean = clean_data(text_to_predict)
+test_matrix = vectorizer.transform(data_clean)
+data_predicted = model.predict(test_matrix).tolist()
+
+# %%
+positives = sum([1 for x in data_predicted if x == 1])
+negatives = sum([1 for x in data_predicted if x == -1])
+# %%
+data_to_send = {"pos_perc": positives/(positives+negatives),
+                "neg_perc": negatives/(positives+negatives)}
vectorizer_pkl (new binary file): Binary file not shown.