# %%
import os
import re

import pandas as pd

# %% [markdown]
### Reading data - this part needs changing when the data changes
# %%
# Guarded so that importing this file (e.g. to reuse the cleaning helpers)
# does not try to read the CSV. Running the file as a script, or cell by cell
# in an interactive session, still executes this block and defines the same
# module-level names (path, filename, filepath, data).
if __name__ == "__main__":
    path = os.getcwd()
    filename = 'training_data_clean.csv'
    filepath = os.path.join(path, filename)
    data = pd.read_csv(filepath, header=None, delimiter=',',
                       encoding_errors='surrogateescape')
    data.columns = ['index', 'id', 'date', 'query', 'user', 'text']

# %% [markdown]
### Function definitions
# %%
# Maps each replacement string to the list of regex patterns it replaces.
# clean_lines() applies the entries in insertion order on the lower-cased
# line, so the upper-case tokens can never be re-matched by a later
# (lower-case) pattern, and whitespace is collapsed only after the comma and
# period rules have run.
change_dict = {
    # tokens
    "USERNAME": [r'@\w+|@'],
    "URL": [r'http\S*'],
    "EMOJI": [r"[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}'],
    # NOTE(review): this matches a literal U+FFFD replacement character only;
    # bytes decoded via 'surrogateescape' become lone surrogates instead.
    # Confirm this is the character actually present in the data.
    "'": ["�"],
    '?': [r"\s\?+|\?+"],
    '!': [r"\s\!+|\!+"],
}


def clean_lines(line, change_dict):
    """Lower-case *line* and apply every substitution in *change_dict*.

    Args:
        line: The text to normalize.
        change_dict: Mapping of replacement string -> list of regex patterns.
            Entries are applied in the mapping's iteration order, each
            pattern via ``re.sub`` on the result of the previous one.

    Returns:
        The normalized string.
    """
    line = line.lower()
    for change_to, change_regex_list in change_dict.items():
        for change_regex in change_regex_list:
            line = re.sub(change_regex, change_to, line)
    return line


def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of characters that extend a run beyond two.

    For every run of identical consecutive characters, the indices of the
    third and all later characters of the run are returned, in ascending
    order.

    >>> get_rep_idx_to_cut_out_from_str("aaaab")
    [2, 3]
    """
    idx_to_cut = []
    repeats = 0  # how many times the current character repeated its predecessor
    for idx in range(1, len(line)):
        repeats = repeats + 1 if line[idx] == line[idx - 1] else 0
        if repeats >= 2:
            idx_to_cut.append(idx)
    return idx_to_cut


def truncate_duplicated_letters_to_two(line):
    """Shorten every run of three or more identical characters to two.

    >>> truncate_duplicated_letters_to_two("cooool")
    'cool'
    """
    # A set keeps the membership test O(1), so the whole pass is linear.
    idx_to_cut = set(get_rep_idx_to_cut_out_from_str(line))
    return ''.join(ch for i, ch in enumerate(line) if i not in idx_to_cut)


# %% [markdown]
### Cleaning
# %%
# Same guard as the loading cell: this step needs `data`, which only exists
# when the file is executed rather than imported.
if __name__ == "__main__":
    text = [clean_lines(x, change_dict) for x in data['text'].tolist()]
    text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
    data['text'] = text
# %%