Functions to clean, normalize and create tokens in text data. The data source needs to be changed to the proper one.
This commit is contained in:
parent
46736a9370
commit
bc8df9aa86
61
twitter.py
Normal file
61
twitter.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
# %%
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
# %% [markdown]
|
||||||
|
### Reading data - this part needs changing once the proper data source is available
|
||||||
|
# %%
|
||||||
|
# Load the raw CSV from the current working directory.
path = os.getcwd()
filename = 'training_data_clean.csv'
# os.path.join picks the right separator for the platform instead of a
# hard-coded '/'.
filepath = os.path.join(path, filename)
# header=None: the first row of the file is read as data, not as column names.
# encoding_errors='surrogateescape': bytes that cannot be decoded are kept as
# surrogate escapes instead of raising.
data = pd.read_csv(filepath, header=None,
                   delimiter=',', encoding_errors='surrogateescape')
# Name the six columns explicitly because the file is read without a header.
data.columns = ['index', 'id', 'date', 'query', 'user', 'text']
|
||||||
|
# %% [markdown]
|
||||||
|
### Function definitions
|
||||||
|
# %%
|
||||||
|
# Substitution table used by clean_lines: each key is the replacement text and
# each value is a list of regex patterns that get replaced by that key.
# Entries are applied in insertion order, so the token rules run before the
# punctuation/whitespace standardization rules.
# Patterns are raw strings so that regex escapes such as \s or \w are not
# treated as (invalid) Python string escapes.
change_dict = {
    # tokens
    "USERNAME": [r'@\w+|@'],
    "URL": [r'http\S*'],
    "EMOJI": [r"[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
    # standardization
    ', ': [r'\s,'],
    '. ': [r'\s\.'],
    ' ': [r'\s{2,}'],
    # NOTE(review): "<EFBFBD>" looks like a textual rendering of the UTF-8
    # bytes of U+FFFD (replacement character) - confirm the intended literal.
    "'": ["<EFBFBD>"],
    '?': [r"\s\?+|\?+"],
    '!': [r"\s\!+|\!+"],
}
|
||||||
|
|
||||||
|
def clean_lines(line, change_dict):
    """Lowercase *line* and apply every regex substitution in *change_dict*.

    Args:
        line: The string to clean.
        change_dict: Mapping of replacement text to a list of regex patterns;
            every match of a pattern is replaced by its key. Entries are
            applied in the mapping's iteration order, each one operating on
            the output of the previous substitution.

    Returns:
        The lowercased string with all substitutions applied.
    """
    # Flatten the mapping into (pattern, replacement) pairs, keeping order.
    substitutions = (
        (pattern, replacement)
        for replacement, patterns in change_dict.items()
        for pattern in patterns
    )
    cleaned = line.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
||||||
|
|
||||||
|
def get_rep_idx_to_cut_out_from_str(line):
    """Return the indices of characters that extend a run beyond two repeats.

    A character's index is reported when it is the third or later occurrence
    in a run of identical consecutive characters.

    Args:
        line: The string to scan.

    Returns:
        A list of integer indices, in ascending order. Empty when no
        character repeats more than twice in a row.
    """
    idx_to_cut = []
    # Number of immediately preceding characters equal to the current one.
    repeats = 0
    previous = None
    for idx, letter in enumerate(line):
        # The first character has no predecessor, so its count stays at zero.
        repeats = repeats + 1 if idx > 0 and letter == previous else 0
        if repeats >= 2:
            idx_to_cut.append(idx)
        previous = letter
    return idx_to_cut


def truncate_duplicated_letters_to_two(line):
    """Collapse every run of three or more identical characters to two.

    Args:
        line: The string to process.

    Returns:
        A copy of *line* in which no character appears more than twice in a
        row, e.g. "sooooo" becomes "soo".
    """
    # A set gives O(1) membership tests; the list made each lookup linear.
    idx_to_cut = set(get_rep_idx_to_cut_out_from_str(line))
    # join builds the result in one pass instead of repeated concatenation.
    return ''.join(s for i, s in enumerate(line) if i not in idx_to_cut)
|
||||||
|
# %% [markdown]
|
||||||
|
### Cleaning
|
||||||
|
# %%
|
||||||
|
# Clean each entry of the 'text' column: apply the regex substitutions, then
# collapse runs of three or more identical characters to two and strip
# leading/trailing whitespace. The column is overwritten with the result.
text = []
for raw_text in data['text'].tolist():
    normalized = clean_lines(raw_text, change_dict)
    text.append(truncate_duplicated_letters_to_two(normalized).strip())
data['text'] = text
|
||||||
|
# %%
|
Loading…
Reference in New Issue
Block a user