From eea6f1b25962966162bec781b746317d4c1f7a19 Mon Sep 17 00:00:00 2001 From: Krzysztof Szubiczuk Date: Tue, 4 Jan 2022 18:02:13 +0100 Subject: [PATCH] Improved data cleaning - removed more emojis and tokenized numbers --- twitter.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/twitter.py b/twitter.py index bb27103..068c47a 100644 --- a/twitter.py +++ b/twitter.py @@ -6,30 +6,35 @@ import re ### Reading data - this part need changing when data # %% path = os.getcwd() -filename = 'training_data_clean.csv' +filename = 'BTC_tweets_daily_example.csv' filepath = path+'/'+filename -data = pd.read_csv(filepath, header=None, - delimiter=',', encoding_errors='surrogateescape') -data.columns = ['index', 'id','date', 'query', 'user', 'text'] +data_all = pd.read_csv(filepath, header=0, + delimiter=',', + # encoding_errors='surrogateescape' + ) +# data.columns = ['index', 'id','date', 'query', 'user', 'text'] +# %% +data = data_all.loc[:,['Tweet', 'Sentiment']] # %% [markdown] ### Function definitions # %% change_dict = { # tokens - "USERNAME": ['@\w+|@'], - "URL": ['http\S*'], - "EMOJI": ["[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"], + " username ": ['@\w+|@'], + " url ": ['http\S*'], + " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"], + " number ": ["[\+\-\$]?[\d]+[,\.]?[\d]+[%]?"], # standardization ', ': ['\s,'], '. ': ['\s\.'], - ' ': ['\s{2,}'], + ' ': ['\s{2,}', '\n'], "'": ["�"], - '?': ["\s\?+|\?+"], - '!': ["\s\!+|\!+"] + '?': ["\s\?"], + '!': ["\s\!"], } def clean_lines(line, change_dict): - line = line.lower() + line = str(line).lower() for change_to, change_regex_list in change_dict.items(): for change_regex in change_regex_list: line = re.sub(change_regex, change_to, line) @@ -55,7 +60,6 @@ def truncate_duplicated_letters_to_two(line): # %% [markdown] ### Cleaning # %% -text = [clean_lines(x, change_dict) for x in data.loc[:, 'text'].values.tolist()] +text = [clean_lines(x, change_dict) for x in data.loc[:, 'Tweet'].values.tolist()] text = [truncate_duplicated_letters_to_two(x).strip() for x in text] -data.text = text -# %% +data.Tweet = text \ No newline at end of file