preprocessing as a separate module
This commit is contained in:
parent
0456ca00ee
commit
14525005fb
71
main.py
71
main.py
@ -1,73 +1,12 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import string
|
from prefect import task, Flow, context
|
||||||
import re
|
from pandas import DataFrame
|
||||||
import nltk
|
|
||||||
from nltk.tokenize import word_tokenize
|
|
||||||
from nltk.corpus import stopwords
|
|
||||||
from nltk.stem import WordNetLemmatizer
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from prefect import task, Flow, context
|
|
||||||
from pandas import DataFrame
|
|
||||||
|
|
||||||
nltk.download('stopwords')
|
|
||||||
nltk.download('wordnet')
|
|
||||||
nltk.download('punkt')
|
|
||||||
|
|
||||||
|
|
||||||
# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
|
from preprocessing import preprocess_text
|
||||||
def remove_urls(text: str) -> str:
    """Strip Twitter handles, URLs and non-alphanumeric characters from *text*.

    Each ``@handle``, each ``scheme://...`` URL and every character other
    than an ASCII letter, digit, space or tab is replaced by a space; runs
    of whitespace are then collapsed to a single space and the ends are
    trimmed.  A hashtag therefore keeps its word but loses the ``#``.

    Args:
        text: Raw text, e.g. the body of a tweet.

    Returns:
        The cleaned text with single-space separators.
    """
    # Raw string: the previous literal relied on "\w", "\/" and "\S" being
    # passed through unchanged, which Python reports as invalid escape
    # sequences.  The pattern matches exactly the same text as before.
    noise = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return " ".join(re.sub(noise, " ", text).split())
|
|
||||||
|
|
||||||
|
|
||||||
# make all text lowercase
|
|
||||||
def text_lowercase(text):
    """Return *text* converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
|
|
||||||
|
|
||||||
|
|
||||||
# remove numbers
|
|
||||||
def remove_numbers(text):
    """Delete every run of digits from *text*.

    Nothing else is touched, so whitespace around a removed number is kept.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
|
|
||||||
|
|
||||||
|
|
||||||
# remove punctuation
|
|
||||||
def remove_punctuation(text):
    """Return *text* with every character of ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_table)
|
|
||||||
|
|
||||||
|
|
||||||
# tokenize
|
|
||||||
def tokenize(text):
    """Split *text* into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)
|
|
||||||
|
|
||||||
|
|
||||||
# remove stopwords
|
|
||||||
# English stop words from NLTK's stopwords corpus, stored as a set so each
# membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop English stop words from a sequence of tokens.

    Args:
        text: Iterable of tokens (the pipeline passes the list produced by
            ``tokenize``).

    Returns:
        A new list holding, in their original order, the tokens that are
        not in the module-level ``stop_words`` set.
    """
    return [token for token in text if token not in stop_words]
|
|
||||||
|
|
||||||
# lemmatize
|
|
||||||
# Single shared lemmatizer instance, created when the module is imported.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return a list with each token of *text* passed through ``lemmatizer.lemmatize``."""
    return list(map(lemmatizer.lemmatize, text))
|
|
||||||
|
|
||||||
|
|
||||||
def preprocessing(text):
    """Run the full cleaning pipeline on one raw string.

    Steps, in order: lowercase, strip handles/URLs/symbols, strip digits,
    strip punctuation, tokenize, drop stop words, lemmatize.  The surviving
    tokens are joined back together with single spaces.
    """
    string_steps = (text_lowercase, remove_urls, remove_numbers, remove_punctuation)
    token_steps = (tokenize, remove_stopwords, lemmatize)

    # Stages that take a string and return a string.
    cleaned = text
    for step in string_steps:
        cleaned = step(cleaned)

    # tokenize turns the string into a token list; later stages keep it a list.
    tokens = cleaned
    for step in token_steps:
        tokens = step(tokens)

    return ' '.join(tokens)
|
|
||||||
|
|
||||||
|
|
||||||
@task
|
@task
|
||||||
@ -91,7 +30,7 @@ def get_test_set() -> DataFrame:
|
|||||||
def preprocess_train(train: DataFrame) -> DataFrame:
|
def preprocess_train(train: DataFrame) -> DataFrame:
|
||||||
pp_text_train = []
|
pp_text_train = []
|
||||||
for text_data in train['text']:
|
for text_data in train['text']:
|
||||||
pp_text_data = preprocessing(text_data)
|
pp_text_data = preprocess_text(text_data)
|
||||||
pp_text_train.append(pp_text_data)
|
pp_text_train.append(pp_text_data)
|
||||||
train['pp_text'] = pp_text_train
|
train['pp_text'] = pp_text_train
|
||||||
return train
|
return train
|
||||||
@ -101,7 +40,7 @@ def preprocess_train(train: DataFrame) -> DataFrame:
|
|||||||
def preprocess_test(test: DataFrame) -> DataFrame:
|
def preprocess_test(test: DataFrame) -> DataFrame:
|
||||||
pp_text_test = []
|
pp_text_test = []
|
||||||
for text_data in test['text']:
|
for text_data in test['text']:
|
||||||
pp_text_data = preprocessing(text_data)
|
pp_text_data = preprocess_text(text_data)
|
||||||
pp_text_test.append(pp_text_data)
|
pp_text_test.append(pp_text_data)
|
||||||
test['pp_text'] = pp_text_test
|
test['pp_text'] = pp_text_test
|
||||||
return test
|
return test
|
||||||
|
64
preprocessing.py
Normal file
64
preprocessing.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import string
|
||||||
|
import re
|
||||||
|
import nltk
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import WordNetLemmatizer
|
||||||
|
|
||||||
|
nltk.download('stopwords')
|
||||||
|
nltk.download('wordnet')
|
||||||
|
nltk.download('punkt')
|
||||||
|
|
||||||
|
|
||||||
|
# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
|
||||||
|
def remove_urls(text: str) -> str:
    """Strip Twitter handles, URLs and non-alphanumeric characters from *text*.

    Each ``@handle``, each ``scheme://...`` URL and every character other
    than an ASCII letter, digit, space or tab is replaced by a space; runs
    of whitespace are then collapsed to a single space and the ends are
    trimmed.  A hashtag therefore keeps its word but loses the ``#``.

    Args:
        text: Raw text, e.g. the body of a tweet.

    Returns:
        The cleaned text with single-space separators.
    """
    # Raw string: the previous literal relied on "\w", "\/" and "\S" being
    # passed through unchanged, which Python reports as invalid escape
    # sequences.  The pattern matches exactly the same text as before.
    noise = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return " ".join(re.sub(noise, " ", text).split())
|
||||||
|
|
||||||
|
|
||||||
|
# make all text lowercase
|
||||||
|
def text_lowercase(text):
    """Return *text* converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
|
||||||
|
|
||||||
|
|
||||||
|
# remove numbers
|
||||||
|
def remove_numbers(text):
    """Delete every run of digits from *text*.

    Nothing else is touched, so whitespace around a removed number is kept.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
|
||||||
|
|
||||||
|
|
||||||
|
# remove punctuation
|
||||||
|
def remove_punctuation(text):
    """Return *text* with every character of ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_table)
|
||||||
|
|
||||||
|
|
||||||
|
# tokenize
|
||||||
|
def tokenize(text):
    """Split *text* into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)
|
||||||
|
|
||||||
|
|
||||||
|
# remove stopwords
|
||||||
|
# English stop words from NLTK's stopwords corpus, stored as a set so each
# membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop English stop words from a sequence of tokens.

    Args:
        text: Iterable of tokens (the pipeline passes the list produced by
            ``tokenize``).

    Returns:
        A new list holding, in their original order, the tokens that are
        not in the module-level ``stop_words`` set.
    """
    return [token for token in text if token not in stop_words]
|
||||||
|
|
||||||
|
# lemmatize
|
||||||
|
# Single shared lemmatizer instance, created when the module is imported.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return a list with each token of *text* passed through ``lemmatizer.lemmatize``."""
    return list(map(lemmatizer.lemmatize, text))
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_text(text):
    """Run the full cleaning pipeline on one raw string.

    Steps, in order: lowercase, strip handles/URLs/symbols, strip digits,
    strip punctuation, tokenize, drop stop words, lemmatize.  The surviving
    tokens are joined back together with single spaces.
    """
    string_steps = (text_lowercase, remove_urls, remove_numbers, remove_punctuation)
    token_steps = (tokenize, remove_stopwords, lemmatize)

    # Stages that take a string and return a string.
    cleaned = text
    for step in string_steps:
        cleaned = step(cleaned)

    # tokenize turns the string into a token list; later stages keep it a list.
    tokens = cleaned
    for step in token_steps:
        tokens = step(tokens)

    return ' '.join(tokens)
|
Loading…
Reference in New Issue
Block a user