Functions to clean, normalize, and create tokens in text data. The data source needs to be changed to the proper one.

2022-01-03 18:40:57 +01:00 · 2022-01-03 18:40:57 +01:00 · bc8df9aa86
commit bc8df9aa86
parent 46736a9370
1 changed files with 61 additions and 0 deletions
--- a/twitter.py
+++ b/twitter.py
@ -0,0 +1,61 @@
 # %%
 import pandas as pd
 import os
 import re
 # %% [markdown]
 ### Reading data - this part needs changing when the proper data source is available
 # %%
 # The input CSV is looked up in the current working directory.
 path = os.getcwd()
 filename = 'training_data_clean.csv'
 # os.path.join uses the platform's separator instead of a hard-coded '/'.
 filepath = os.path.join(path, filename)
 # header=None: the first row is read as data, not as column names.
 # encoding_errors='surrogateescape': undecodable bytes are kept as lone
 # surrogate code points instead of aborting the read.
 data = pd.read_csv(
    filepath,
    header=None,
    delimiter=',',
    encoding_errors='surrogateescape',
 )
 # Name the six columns explicitly, since the file supplies no header.
 data.columns = ['index', 'id', 'date', 'query', 'user', 'text']
 # %% [markdown]
 ### Function definitions
 # %%
 # Substitution table passed to clean_lines: each key is the replacement
 # text and each value is a list of regex patterns rewritten to that key.
 # Entries are applied in insertion order, so the token rules run before the
 # punctuation/whitespace standardization rules.
 # Patterns are raw strings so regex escapes (\w, \S, \s, ...) are not parsed
 # as (invalid) Python string escapes; the resulting values are unchanged.
 change_dict = {
    # tokens
    # "@name" mentions, or a lone "@".
    "USERNAME": [r'@\w+|@'],
    # Anything starting with "http" up to the next whitespace.
    "URL": [r'http\S*'],
    # Text emoticons such as ":d" / "d:" / "xd", plus any whitespace-delimited
    # chunk containing "&".
    "EMOJI": [r"[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
    # standardization
    # A whitespace character followed by a comma / full stop.
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    # Collapse runs of two or more whitespace characters to one space.
    ' ': [r'\s{2,}'],
    # NOTE(review): this matches the literal text "<EFBFBD>"; it looks like a
    # rendering of the U+FFFD replacement character (UTF-8 bytes EF BF BD) -
    # confirm the intended pattern.
    "'": ["<EFBFBD>"],
    # Collapse repeated "?" / "!" (with an optional preceding whitespace
    # character) to a single mark.
    '?': [r"\s\?+|\?+"],
    '!': [r"\s\!+|\!+"],
    }
 def clean_lines(line, change_dict):
    """Lower-case *line* and apply every substitution rule in *change_dict*.

    Args:
        line: Text to clean.
        change_dict: Mapping of replacement text to a list of regex
            patterns; rules are applied in the mapping's iteration order.

    Returns:
        The lower-cased text after all substitutions have been applied.
    """
    cleaned = line.lower()
    # Flatten the mapping into (pattern, replacement) pairs, keeping order.
    rules = (
        (pattern, replacement)
        for replacement, patterns in change_dict.items()
        for pattern in patterns
    )
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
 def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of characters that extend a run beyond two.

    Walks *line* comparing each character with its predecessor; from the
    third consecutive identical character onwards, the index is recorded.

    Args:
        line: Text to scan.

    Returns:
        List of indices, in ascending order, of the surplus characters.
    """
    surplus_positions = []
    repeats = 0  # how many times in a row the previous character recurred
    pairs = zip(line, line[1:])
    for position, (previous, current) in enumerate(pairs, start=1):
        if previous != current:
            repeats = 0
            continue
        repeats += 1
        if repeats >= 2:
            surplus_positions.append(position)
    return surplus_positions
 def truncate_duplicated_letters_to_two(line):
    """Shorten every run of three or more identical characters to two.

    Runs of one or two identical characters are left untouched; the
    comparison is exact (case-sensitive) and applies to any character,
    including whitespace and punctuation.

    Args:
        line: Text to process.

    Returns:
        A copy of *line* in which no character occurs more than twice in a
        row.
    """
    # (.) captures one character (DOTALL lets it be a newline as well) and
    # \1{2,} greedily consumes the rest of its run, which is then replaced by
    # two copies. A single regex pass avoids a per-character list lookup and
    # repeated string concatenation.
    return re.sub(r'(.)\1{2,}', r'\1\1', line, flags=re.DOTALL)
 # %% [markdown]
 ### Cleaning
 # %%
 # First pass: lower-case each entry and apply the change_dict substitutions.
 text = [
    clean_lines(entry, change_dict)
    for entry in data['text'].values.tolist()
 ]
 # Second pass: cap character runs at two, then trim surrounding whitespace.
 text = [
    truncate_duplicated_letters_to_two(cleaned).strip()
    for cleaned in text
 ]
 # Write the cleaned strings back over the original column.
 data['text'] = text
 # %%