123 lines
4.3 KiB
Python
123 lines
4.3 KiB
Python
# %%
|
||
# from platform import java_ver
|
||
import pandas as pd
|
||
import os
|
||
import re
|
||
import numpy as np
|
||
from sklearn.feature_extraction.text \
|
||
import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
|
||
from sklearn.pipeline import Pipeline
|
||
from sklearn.linear_model import LogisticRegression
|
||
from sklearn.metrics import confusion_matrix,classification_report
|
||
from copy import deepcopy
|
||
# %% [markdown]
|
||
### Reading data - this part needs changing when the data source changes
|
||
# %%
|
||
# Locate the input CSV in the current working directory.
path = os.getcwd()
filename = 'BTC_tweets_daily_example.csv'
# os.path.join uses the host OS separator instead of a hard-coded '/'.
filepath = os.path.join(path, filename)
# The first row is treated as the header and fields are comma-separated.
data_all = pd.read_csv(
    filepath,
    header=0,
    delimiter=',',
    # encoding_errors='surrogateescape'
)
|
||
# %% [markdown]
|
||
### Function definitions
|
||
# %%
|
||
# Maps a replacement string to the list of regex patterns it replaces.
# Patterns are written as raw strings so that regex escapes such as \w or \s
# are not parsed as (invalid) Python string escapes; the resulting pattern
# text is unchanged. Entry order is significant when the substitutions are
# applied one after another in dict order.
change_dict = {
    # tokens
    " username ": [r'@\w+|@'],
    " url ": [r'http\S*'],
    " emoji ": [
        r"[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*",
        r"""[^\w\s,.?!:;#'"\(\)\$\-\+%\[\]\|]""",
    ],
    " number ": [r"[\+\-\$]?[\d]+[,\.\:k]?[\d]?[%]?"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    # '\n' is deliberately a real newline character, not a regex escape.
    ' ': [r'\s{2,}', '\n', r'^rt[\s]+', r'\s\:\s'],
    # NOTE(review): "<EFBFBD>" looks like a mis-encoded U+FFFD replacement
    # character (UTF-8 bytes EF BF BD) - confirm the intended pattern.
    "'": ["<EFBFBD>"],
    '?': [r"\s\?"],
    '!': [r"\s\!"],
    '".': [r'\s"\.'],
    '",': [r'\s"\,'],
    '" ': [r'\s"\s'],
}
|
||
|
||
def clean_lines(line, change_dict):
    """Lower-case ``line`` and apply every substitution in ``change_dict``.

    ``line`` is first converted with ``str()``. ``change_dict`` maps a
    replacement string to an iterable of regex patterns; the patterns are
    applied in the dict's iteration order, each one operating on the result
    of the previous substitution.

    Returns the resulting string.
    """
    cleaned = str(line).lower()
    # Flatten {replacement: [patterns]} into ordered (pattern, replacement) pairs.
    substitutions = (
        (pattern, replacement)
        for replacement, patterns in change_dict.items()
        for pattern in patterns
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
||
|
||
def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of characters that extend a run beyond two.

    Walks ``line`` pairwise and counts how many times in a row a character
    equals its predecessor. Every position where that count reaches two or
    more (i.e. the third and later characters of a run) is collected.

    Returns a list of integer indices in ascending order.
    """
    surplus_positions = []
    repeats = 0  # consecutive "same as previous character" hits so far
    for position, (previous, current) in enumerate(zip(line, line[1:]), start=1):
        repeats = repeats + 1 if previous == current else 0
        if repeats >= 2:
            surplus_positions.append(position)
    return surplus_positions
|
||
|
||
def truncate_duplicated_letters_to_two(line):
    """Shorten every run of identical consecutive characters to at most two.

    The positions to drop come from ``get_rep_idx_to_cut_out_from_str``;
    all other characters are kept in their original order.

    Returns the shortened string.
    """
    # A set gives O(1) membership tests; looking each index up in the list
    # returned by the helper made the whole pass quadratic in len(line).
    idx_to_cut = set(get_rep_idx_to_cut_out_from_str(line))
    # join builds the result in one pass instead of repeated string +=.
    return ''.join(s for i, s in enumerate(line) if i not in idx_to_cut)
|
||
# %% [markdown]
|
||
### Cleaning
|
||
# %%
|
||
# Normalise each tweet with the regex table, squeeze runs of repeated
# characters down to two, and trim surrounding whitespace.
text = [
    truncate_duplicated_letters_to_two(clean_lines(tweet, change_dict)).strip()
    for tweet in data_all.loc[:, 'Tweet'].values.tolist()
]
# Work on an independent copy so the raw frame keeps the original tweets.
data_all_clean = deepcopy(data_all)
data_all_clean['Tweet'] = text
# Rows without a sentiment label are dropped.
data_all_clean = data_all_clean.dropna(subset=["sent_score"])
|
||
# %% [markdown]
|
||
### Testing models
|
||
# %%
|
||
# Keep only the text and the label; .copy() makes the frame independent so
# the helper column added below does not write into a slice of data_all_clean.
data_model = data_all_clean.loc[:, ['Tweet', 'sent_score']].copy()
idx = data_model.index
# One uniform draw in [0, 1) per row. np.random.rand (uniform) is used rather
# than np.random.randn (standard normal) so the 0.8 threshold below sends
# roughly 80% of the rows to the training set.
data_model['random_number'] = np.random.rand(len(idx))
train_set = data_model[data_model['random_number'] <= 0.8]
test_set = data_model[data_model['random_number'] > 0.8]
|
||
# %%
|
||
def train_model_and_predict(train_set, test_set,
                            vectorizer, vectorizer_name,
                            model,
                            colname_text='Tweet',
                            colname_sent='sent_score'):
    """Fit a vectorizer and a model on the training set and report test results.

    Args:
        train_set: table indexed by column name; supplies the training text
            and labels.
        test_set: table with the same columns; used for evaluation.
        vectorizer: object with ``fit_transform`` and ``transform``; it is
            fitted on the training text only.
        vectorizer_name: label printed above the report.
        model: object with ``fit`` and ``predict``.
        colname_text: name of the text column.
        colname_sent: name of the label column.

    Prints the vectorizer label and a classification report; returns None.
    """
    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(train_set[colname_text])
    X_test = vectorizer.transform(test_set[colname_text])
    y_train = train_set[colname_sent]
    y_test_arr = np.asarray(test_set[colname_sent])

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"{vectorizer_name}")
    # print("Confusion matrix")
    # print(confusion_matrix(y_test_arr, predictions))
    print("Classification report")
    # scikit-learn metrics take (y_true, y_pred); passing them the other way
    # round swaps precision with recall and reports support of the predictions.
    print(classification_report(y_test_arr, predictions))
|
||
# %%
|
||
# (label, vectorizer) pairs; each vectorizer is evaluated with the same model.
vectorizers = [
    (
        "CountVectorizer",
        CountVectorizer(token_pattern=r'\b\w+\b'),
    ),
    (
        "HashingVectorizer, n_features=2**15",
        HashingVectorizer(
            n_features=2**15,
            analyzer='word',
            token_pattern=r'\b\w+\b',
        ),
    ),
    # ("HashingVectorizer, n_features=2**20", HashingVectorizer(n_features=2**20, analyzer='word', token_pattern=r'\b\w+\b')),
    (
        "TfidfVectorizer",
        TfidfVectorizer(),
    ),
    (
        "TfidfVectorizer, smooth_idf=False",
        TfidfVectorizer(smooth_idf=False),
    ),
    (
        "TfidfVectorizer, sublinear_tf=True",
        TfidfVectorizer(sublinear_tf=True),
    ),
    (
        "TfidfVectorizer, norm=None",
        TfidfVectorizer(norm=None),
    ),
]
|
||
# %%
|
||
# Train and report once per vectorizer, with a fresh LogisticRegression each time.
for vec in vectorizers:
    vectorizer_label, vectorizer_instance = vec
    train_model_and_predict(
        train_set,
        test_set,
        vectorizer_instance,
        vectorizer_label,
        LogisticRegression(max_iter=1000),
        colname_text='Tweet',
        colname_sent='sent_score',
    )
|
||
# %%
|