# twitter.py
#
# Provenance (recovered from a git patch whose line structure was lost):
#   commit:  bc8df9aa8639134ed1da804750e4ff54f258b8f7
#   author:  Krzysztof Szubiczuk
#   date:    Mon, 3 Jan 2022 18:40:57 +0100
#   subject: Functions to clean, normalize and create token in text data.
#            Data source needs to be changed to proper one
#
# The file uses the "percent" cell format (`# %%`), so it can be run either
# as a plain script or cell-by-cell in an interactive window.  The cells that
# touch the file system are guarded with `if __name__ == "__main__":` so the
# cleaning helpers can be imported without reading the CSV.

# %%
import os
import re

import pandas as pd

# %% [markdown]
### Reading data - this part needs changing once the proper data source is available
# %%
if __name__ == "__main__":
    path = os.getcwd()
    filename = 'training_data_clean.csv'
    filepath = os.path.join(path, filename)
    data = pd.read_csv(filepath, header=None,
                       delimiter=',', encoding_errors='surrogateescape')
    data.columns = ['index', 'id', 'date', 'query', 'user', 'text']

# %% [markdown]
### Function definitions
# %%
# Maps a replacement string to the list of regex patterns it replaces.
# Rules are applied in insertion order, and later rules see the output of
# earlier ones (e.g. the whitespace-collapsing rule cleans up the double
# spaces that the ", " and ". " rules can leave behind).
# All patterns are raw strings so that `\w`, `\s`, ... reach the regex engine
# instead of being treated as (invalid) string escapes.
change_dict = {
    # tokens
    "USERNAME": [r'@\w+|@'],
    "URL": [r'http\S*'],
    "EMOJI": [r"[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
    # standardization
    ', ': [r'\s,'],
    # NOTE(review): in the recovered source this key contained a stray line
    # break after the space.  Any extra whitespace it inserted would be
    # collapsed by the r'\s{2,}' rule right below, so ". " yields the same
    # cleaned text.
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}'],
    # U+FFFD (the Unicode replacement character) is mapped to an apostrophe.
    "'": ["\ufffd"],
    '?': [r"\s\?+|\?+"],
    '!': [r"\s\!+|\!+"],
}


def clean_lines(line, change_dict):
    """Lower-case *line* and apply every substitution rule in *change_dict*.

    Args:
        line: Text to clean.
        change_dict: Mapping ``replacement -> list of regex patterns``.  Rules
            are applied in the mapping's iteration order; each pattern is
            replaced with its key via ``re.sub``.

    Returns:
        The cleaned text.
    """
    line = line.lower()
    for change_to, change_regex_list in change_dict.items():
        for change_regex in change_regex_list:
            line = re.sub(change_regex, change_to, line)
    return line


def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of characters that extend a run beyond two.

    For every run of identical consecutive characters, the indices of the
    third and all following characters of the run are returned, in
    ascending order.
    """
    idx_to_cut = []
    occurrence = 0  # how many times the current character has just repeated
    for idx in range(1, len(line)):
        occurrence = occurrence + 1 if line[idx - 1] == line[idx] else 0
        if occurrence >= 2:
            idx_to_cut.append(idx)
    return idx_to_cut


def truncate_duplicated_letters_to_two(line):
    """Shorten every run of 3+ identical characters in *line* to two.

    Example: ``"sooooo goooood"`` becomes ``"soo good"``.
    """
    # A set gives O(1) membership tests; with the join below the whole
    # function is linear in len(line) instead of quadratic.
    idx_to_cut = set(get_rep_idx_to_cut_out_from_str(line))
    return ''.join(s for i, s in enumerate(line) if i not in idx_to_cut)


# %% [markdown]
### Cleaning
# %%
if __name__ == "__main__":
    text = [clean_lines(x, change_dict)
            for x in data.loc[:, 'text'].values.tolist()]
    text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
    data['text'] = text
# %%