Improved data cleaning - removed more emojis and tokenized numbers

This commit is contained in:
Krzysztof Szubiczuk 2022-01-04 18:02:13 +01:00
parent de6152b9f8
commit eea6f1b259

View File

@ -6,30 +6,35 @@ import re
### Reading data - this part needs changing when the data changes ### Reading data - this part needs changing when the data changes
# %% # %%
path = os.getcwd() path = os.getcwd()
filename = 'training_data_clean.csv' filename = 'BTC_tweets_daily_example.csv'
filepath = path+'/'+filename filepath = path+'/'+filename
data = pd.read_csv(filepath, header=None, data_all = pd.read_csv(filepath, header=0,
delimiter=',', encoding_errors='surrogateescape') delimiter=',',
data.columns = ['index', 'id','date', 'query', 'user', 'text'] # encoding_errors='surrogateescape'
)
# data.columns = ['index', 'id','date', 'query', 'user', 'text']
# %%
data = data_all.loc[:,['Tweet', 'Sentiment']]
# %% [markdown] # %% [markdown]
### Function definitions ### Function definitions
# %% # %%
change_dict = { change_dict = {
# tokens # tokens
"USERNAME": ['@\w+|@'], " username ": ['@\w+|@'],
"URL": ['http\S*'], " url ": ['http\S*'],
"EMOJI": ["[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"], " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
" number ": ["[\+\-\$]?[\d]+[,\.]?[\d]+[%]?"],
# standardization # standardization
', ': ['\s,'], ', ': ['\s,'],
'. ': ['\s\.'], '. ': ['\s\.'],
' ': ['\s{2,}'], ' ': ['\s{2,}', '\n'],
"'": ["<EFBFBD>"], "'": ["<EFBFBD>"],
'?': ["\s\?+|\?+"], '?': ["\s\?"],
'!': ["\s\!+|\!+"] '!': ["\s\!"],
} }
def clean_lines(line, change_dict): def clean_lines(line, change_dict):
line = line.lower() line = str(line).lower()
for change_to, change_regex_list in change_dict.items(): for change_to, change_regex_list in change_dict.items():
for change_regex in change_regex_list: for change_regex in change_regex_list:
line = re.sub(change_regex, change_to, line) line = re.sub(change_regex, change_to, line)
@ -55,7 +60,6 @@ def truncate_duplicated_letters_to_two(line):
# %% [markdown] # %% [markdown]
### Cleaning ### Cleaning
# %% # %%
text = [clean_lines(x, change_dict) for x in data.loc[:, 'text'].values.tolist()] text = [clean_lines(x, change_dict) for x in data.loc[:, 'Tweet'].values.tolist()]
text = [truncate_duplicated_letters_to_two(x).strip() for x in text] text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
data.text = text data.Tweet = text
# %%