From eea6f1b25962966162bec781b746317d4c1f7a19 Mon Sep 17 00:00:00 2001
From: Krzysztof Szubiczuk <krzysztof.szubiczuk@gmail.com>
Date: Tue, 4 Jan 2022 18:02:13 +0100
Subject: [PATCH] Improved data cleaning - removed more emojis and tokenized
 numbers

---
 twitter.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/twitter.py b/twitter.py
index bb27103..068c47a 100644
--- a/twitter.py
+++ b/twitter.py
@@ -6,30 +6,35 @@ import re
 ### Reading data - this part need changing when data
 # %%
 path = os.getcwd()
-filename = 'training_data_clean.csv'
+filename = 'BTC_tweets_daily_example.csv'  # looked up in the current working directory
 filepath = path+'/'+filename
-data = pd.read_csv(filepath, header=None,
-    delimiter=',', encoding_errors='surrogateescape')
-data.columns = ['index', 'id','date', 'query', 'user', 'text']
+data_all = pd.read_csv(filepath, header=0,  # row 0 of the file supplies the column names
+    delimiter=',', 
+    # encoding_errors='surrogateescape'
+    )
+# data.columns = ['index', 'id','date', 'query', 'user', 'text']
+# %%
+data = data_all.loc[:,['Tweet', 'Sentiment']]  # keep only the Tweet and Sentiment columns
 # %% [markdown]
 ### Function definitions
 # %%
 change_dict = {
     # tokens
-    "USERNAME": ['@\w+|@'],
-    "URL": ['http\S*'],
-    "EMOJI": ["[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": [r"[\+\-\$]?\d+(?:[,\.]\d+)*%?"],  # optional sign/$, digits with ,/. groups, optional %; single digits match too
     # standardization
     ', ': ['\s,'],
     '. ': ['\s\.'],
-    ' ': ['\s{2,}'],
+    ' ': ['\s{2,}', '\n'],
     "'": ["�"],
-    '?': ["\s\?+|\?+"],
-    '!': ["\s\!+|\!+"]
+    '?': ["\s\?"],
+    '!': ["\s\!"],
     }
 
 def clean_lines(line, change_dict):
-    line = line.lower()
+    line = str(line).lower()
     for change_to, change_regex_list in change_dict.items():
         for change_regex in change_regex_list:
             line = re.sub(change_regex, change_to, line)
@@ -55,7 +60,6 @@ def truncate_duplicated_letters_to_two(line):
 # %% [markdown]
 ### Cleaning
 # %%
-text = [clean_lines(x, change_dict) for x in data.loc[:, 'text'].values.tolist()]
+text = [clean_lines(x, change_dict) for x in data.loc[:, 'Tweet'].values.tolist()]
 text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
-data.text = text
-# %%
+data['Tweet'] = text  # item assignment always writes the column; attribute assignment can silently create a plain attribute
\ No newline at end of file