From bc8df9aa8639134ed1da804750e4ff54f258b8f7 Mon Sep 17 00:00:00 2001
From: Krzysztof Szubiczuk <krzszu1@st.amu.edu.pl>
Date: Mon, 3 Jan 2022 18:40:57 +0100
Subject: [PATCH] Functions to clean, normalize and create tokens in text data.
 Data source needs to be changed to the proper one

---
 twitter.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 twitter.py

diff --git a/twitter.py b/twitter.py
new file mode 100644
index 0000000..bb27103
--- /dev/null
+++ b/twitter.py
@@ -0,0 +1,61 @@
+# %%
+import pandas as pd
+import os
+import re
+# %% [markdown]
+### Reading data - this part needs changing once the proper data source is available
+# %%
+# Load the CSV from the current working directory; it is read without a header row.
+path = os.getcwd()
+filename = 'training_data_clean.csv'
+filepath = os.path.join(path, filename)
+data = pd.read_csv(filepath, header=None, delimiter=',', encoding_errors='surrogateescape')
+data.columns = ['index', 'id','date', 'query', 'user', 'text']
+# %% [markdown]
+### Function definitions
+# %%
+change_dict = {  # replacement text -> list of regexes; applied in insertion order
+    # tokens: matches are swapped for an upper-case placeholder
+    "USERNAME": [r'@\w+|@'],
+    "URL": [r'http\S*'],
+    "EMOJI": [r"[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
+    # standardization of spacing and punctuation (raw strings keep the regex escapes literal)
+    ', ': [r'\s,'],
+    '. ': [r'\s\.'],
+    ' ': [r'\s{2,}'],
+    "'": ["�"],
+    '?': [r"\s\?+|\?+"],
+    '!': [r"\s\!+|\!+"],
+    }
+
+def clean_lines(line, change_dict):
+    """Return lowercased ``line`` with each regex match replaced by its dict key."""
+    line = line.lower()
+    for new, rx in ((key, r) for key, rxs in change_dict.items() for r in rxs):
+        line = re.sub(rx, new, line)
+    return line
+
+def get_rep_idx_to_cut_out_from_str(line):
+    """Return the indices of characters that are third or later in a run of repeats."""
+    run = 0  # consecutive repeats of the current character seen so far
+    surplus = []
+    for pos, (prev, cur) in enumerate(zip(line, line[1:]), start=1):
+        run = run + 1 if prev == cur else 0
+        if run >= 2:
+            surplus.append(pos)
+    return surplus
+
+def truncate_duplicated_letters_to_two(line):
+    """Return ``line`` with every run of 3+ identical characters cut down to two."""
+    # Set membership is O(1); scanning the index list for each character was O(n^2).
+    idx_to_cut = set(get_rep_idx_to_cut_out_from_str(line))
+    # join builds the result in one pass instead of repeated string concatenation.
+    kept = (s for i, s in enumerate(line) if i not in idx_to_cut)
+    return ''.join(kept)
+# %% [markdown]
+### Cleaning
+# %%
+# Clean each tweet, then cap repeated-character runs at two and strip the ends.
+text = [clean_lines(tweet, change_dict) for tweet in data['text'].tolist()]
+data['text'] = text = [truncate_duplicated_letters_to_two(t).strip() for t in text]
+# %%