Improved data cleaning - removed more emojis and tokenized numbers
parent de6152b9f8
commit eea6f1b259

twitter.py (32 lines changed)
@@ -6,30 +6,35 @@ import re
 ### Reading data - this part need changing when data
 # %%
 path = os.getcwd()
-filename = 'training_data_clean.csv'
+filename = 'BTC_tweets_daily_example.csv'
 filepath = path+'/'+filename
-data = pd.read_csv(filepath, header=None,
-                   delimiter=',', encoding_errors='surrogateescape')
-data.columns = ['index', 'id','date', 'query', 'user', 'text']
+data_all = pd.read_csv(filepath, header=0,
+                       delimiter=',',
+                       # encoding_errors='surrogateescape'
+                       )
+# data.columns = ['index', 'id','date', 'query', 'user', 'text']
+# %%
+data = data_all.loc[:,['Tweet', 'Sentiment']]
 # %% [markdown]
 ### Function definitions
 # %%
 change_dict = {
     # tokens
-    "USERNAME": ['@\w+|@'],
-    "URL": ['http\S*'],
-    "EMOJI": ["[;:][dbop\(\)\[\]]|[dbop\(\)\[\]][;:]|xd+|\S*&\S*"],
+    " username ": ['@\w+|@'],
+    " url ": ['http\S*'],
+    " emoji ": ["[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*", "[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
+    " number ": ["[\+\-\$]?[\d]+[,\.]?[\d]+[%]?"],
     # standardization
     ', ': ['\s,'],
     '. ': ['\s\.'],
-    ' ': ['\s{2,}'],
+    ' ': ['\s{2,}', '\n'],
     "'": ["<EFBFBD>"],
-    '?': ["\s\?+|\?+"],
-    '!': ["\s\!+|\!+"]
+    '?': ["\s\?"],
+    '!': ["\s\!"],
 }

 def clean_lines(line, change_dict):
-    line = line.lower()
+    line = str(line).lower()
     for change_to, change_regex_list in change_dict.items():
         for change_regex in change_regex_list:
             line = re.sub(change_regex, change_to, line)
@@ -55,7 +60,6 @@ def truncate_duplicated_letters_to_two(line):
 # %% [markdown]
 ### Cleaning
 # %%
-text = [clean_lines(x, change_dict) for x in data.loc[:, 'text'].values.tolist()]
+text = [clean_lines(x, change_dict) for x in data.loc[:, 'Tweet'].values.tolist()]
 text = [truncate_duplicated_letters_to_two(x).strip() for x in text]
-data.text = text
-# %%
+data.Tweet = text
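
As a quick illustration of the new token rules (not part of the commit), the sketch below applies a subset of the updated change_dict to a made-up tweet; the sample text, the return statement, and the printout are illustrative assumptions only:

    import re

    # Subset of the updated change_dict; rules run in insertion order.
    change_dict = {
        " username ": [r'@\w+|@'],
        " url ": [r'http\S*'],
        " emoji ": [r"[;:][dbop\(\)\[\]]|[^\w][dbop\(\)\[\]][;:]|xd+|\S*&\S*",
                    # second pattern: any character outside the allowed set (catches most emoji)
                    r"[^\w\s,.?!:;#\'\"\(\)\$\-\+%\[\]\|]"],
        # runs of two or more digits, optionally with $/+/- in front, a separator and a trailing %,
        # e.g. $42,000 or 5.3% (a lone digit like "5" is not matched)
        " number ": [r"[\+\-\$]?[\d]+[,\.]?[\d]+[%]?"],
        ' ': [r'\s{2,}', r'\n'],
    }

    def clean_lines(line, change_dict):
        line = str(line).lower()          # str() also tolerates non-string values such as NaN
        for change_to, change_regex_list in change_dict.items():
            for change_regex in change_regex_list:
                line = re.sub(change_regex, change_to, line)
        return line

    # hypothetical tweet, for illustration only
    print(clean_lines("@someuser BTC is up 5.3% today!! \U0001F680 https://example.com", change_dict))
    # the handle becomes " username ", the link " url ", the rocket " emoji ", and "5.3%" " number "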
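Not part of the diff, but a plausible reason for wrapping the tweet in str() before .lower(): pandas reads empty cells of the CSV as NaN floats, and a float has no .lower(). A minimal sketch with a made-up frame:

    import pandas as pd

    # a missing tweet comes back as float('nan') when the CSV cell is empty
    df = pd.DataFrame({'Tweet': ['To the moon!', float('nan')], 'Sentiment': [1, 0]})

    for x in df.loc[:, 'Tweet'].values.tolist():
        # x.lower() would raise AttributeError on the NaN row;
        # str(x).lower() degrades it to the harmless text 'nan' instead
        print(str(x).lower())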