Functions to clean, normalize, and tokenize text data. The data source needs to be changed to the proper one.

This commit is contained in:
Krzysztof Szubiczuk 2022-01-03 18:40:57 +01:00
parent 46736a9370
commit bc8df9aa86
1 changed file with 61 additions and 0 deletions

twitter.py (new file, 61 additions)

@@ -0,0 +1,61 @@
# %%
import pandas as pd
import os
import re
# %% [markdown]
### Reading data - this part needs changing when the proper data source is ready
# %%
path = os.getcwd()
filename = 'training_data_clean.csv'
filepath = os.path.join(path, filename)
# The raw file has no header row; surrogateescape keeps undecodable bytes
# instead of raising, so they can be normalized later.
data = pd.read_csv(filepath, header=None,
                   delimiter=',', encoding_errors='surrogateescape')
data.columns = ['index', 'id', 'date', 'query', 'user', 'text']
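# %% [markdown]
### Sanity check (illustrative cell, not part of the original commit)
# %%
# A minimal spot-check, assuming the CSV loaded with the six expected columns.
print(data.shape)
print(data.head())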
# %% [markdown]
### Function definitions
# %%
change_dict = {
    # tokens: replace entities with placeholder tokens
    "USERNAME": [r'@\w+|@'],
    "URL": [r'http\S*'],
    "EMOJI": [r"[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
    # standardization: normalize spacing and punctuation
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}'],
    "'": ['\ufffd'],  # U+FFFD replacement character produced by decoding errors
    '?': [r'\s\?+|\?+'],
    '!': [r'\s\!+|\!+']
}
def clean_lines(line, change_dict):
    """Lowercase a line and apply every regex substitution in change_dict."""
    line = line.lower()
    for change_to, change_regex_list in change_dict.items():
        for change_regex in change_regex_list:
            line = re.sub(change_regex, change_to, line)
    return line
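# %% [markdown]
### Example: clean_lines on a made-up tweet (illustrative; the sample text is invented)
# %%
sample = "@user123 check this http://t.co/abc :) sooo cool!!!"
print(clean_lines(sample, change_dict))
# -> 'USERNAME check this URL EMOJI sooo cool!'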
def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of every character that repeats more than twice in a row."""
    occurrence = 0
    idx_to_cut = []
    for idx, letter in enumerate(line):
        if idx > 0:
            occurrence = occurrence + 1 if line[idx-1] == letter else 0
            if occurrence >= 2:
                idx_to_cut.append(idx)
    return idx_to_cut
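# %% [markdown]
### Example: which indices get marked for removal (illustrative)
# %%
print(get_rep_idx_to_cut_out_from_str("aaaah"))
# -> [2, 3]: the third and fourth 'a' would be cut, leaving 'aah'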
def truncate_duplicated_letters_to_two(line):
    """Collapse runs of the same letter down to at most two (e.g. 'sooo' -> 'soo')."""
    idx_to_cut = get_rep_idx_to_cut_out_from_str(line)
    str_out = ''
    for i, s in enumerate(line):
        if i not in idx_to_cut:
            str_out += s
    return str_out
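# %% [markdown]
### Example: truncating repeated letters (illustrative)
# %%
print(truncate_duplicated_letters_to_two("sooo coolllll"))
# -> 'soo cooll'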
# %% [markdown]
### Cleaning
# %%
text = [clean_lines(x, change_dict) for x in data.loc[:, 'text'].values.tolist()]
text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
data['text'] = text
# %%
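# A quick look at a few cleaned rows (illustrative check, assuming the file loaded).
print(data.loc[:4, 'text'])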