From 14525005fbbc0ce2777f0c302689b659ab3ec529 Mon Sep 17 00:00:00 2001
From: Stanislaw-Golebiewski
Date: Sun, 14 Jun 2020 16:48:52 +0200
Subject: [PATCH] preprocessing as separated module

---
 main.py          | 71 ++++--------------------------------------------
 preprocessing.py | 64 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 66 deletions(-)
 create mode 100644 preprocessing.py

diff --git a/main.py b/main.py
index 84a644e..936a288 100644
--- a/main.py
+++ b/main.py
@@ -1,73 +1,12 @@
 import pandas as pd
-import string
-import re
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
+from prefect import task, Flow, context
+from pandas import DataFrame
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
-from prefect import task, Flow, context
-from pandas import DataFrame
-
-nltk.download('stopwords')
-nltk.download('wordnet')
-nltk.download('punkt')
-# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
-def remove_urls(text):
-    new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
-    return new_text
-
-
-# make all text lowercase
-def text_lowercase(text):
-    return text.lower()
-
-
-# remove numbers
-def remove_numbers(text):
-    result = re.sub(r'\d+', '', text)
-    return result
-
-
-# remove punctuation
-def remove_punctuation(text):
-    translator = str.maketrans('', '', string.punctuation)
-    return text.translate(translator)
-
-
-# tokenize
-def tokenize(text):
-    text = word_tokenize(text)
-    return text
-
-
-# remove stopwords
-stop_words = set(stopwords.words('english'))
-def remove_stopwords(text):
-    text = [i for i in text if not i in stop_words]
-    return text
-
-# lemmatize
-lemmatizer = WordNetLemmatizer()
-def lemmatize(text):
-    text = [lemmatizer.lemmatize(token) for token in text]
-    return text
-
-
-def preprocessing(text):
-    text = text_lowercase(text)
-    text = remove_urls(text)
-    text = remove_numbers(text)
-    text = remove_punctuation(text)
-    text = tokenize(text)
-    text = remove_stopwords(text)
-    text = lemmatize(text)
-    text = ' '.join(text)
-    return text
+from preprocessing import preprocess_text
 
 
 @task
@@ -91,7 +30,7 @@ def get_test_set() -> DataFrame:
 def preprocess_train(train: DataFrame) -> DataFrame:
     pp_text_train = []
     for text_data in train['text']:
-        pp_text_data = preprocessing(text_data)
+        pp_text_data = preprocess_text(text_data)
         pp_text_train.append(pp_text_data)
     train['pp_text'] = pp_text_train
     return train
@@ -101,7 +40,7 @@ def preprocess_train(train: DataFrame) -> DataFrame:
 def preprocess_test(test: DataFrame) -> DataFrame:
     pp_text_test = []
     for text_data in test['text']:
-        pp_text_data = preprocessing(text_data)
+        pp_text_data = preprocess_text(text_data)
         pp_text_test.append(pp_text_data)
     test['pp_text'] = pp_text_test
     return test
diff --git a/preprocessing.py b/preprocessing.py
new file mode 100644
index 0000000..0b34b2f
--- /dev/null
+++ b/preprocessing.py
@@ -0,0 +1,64 @@
+import string
+import re
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('punkt')
+
+
+# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
+def remove_urls(text):
+    new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
+    return new_text
+
+
+# make all text lowercase
+def text_lowercase(text):
+    return text.lower()
+
+
+# remove numbers
+def remove_numbers(text):
+    result = re.sub(r'\d+', '', text)
+    return result
+
+
+# remove punctuation
+def remove_punctuation(text):
+    translator = str.maketrans('', '', string.punctuation)
+    return text.translate(translator)
+
+
+# tokenize
+def tokenize(text):
+    text = word_tokenize(text)
+    return text
+
+
+# remove stopwords
+stop_words = set(stopwords.words('english'))
+def remove_stopwords(text):
+    text = [i for i in text if not i in stop_words]
+    return text
+
+# lemmatize
+lemmatizer = WordNetLemmatizer()
+def lemmatize(text):
+    text = [lemmatizer.lemmatize(token) for token in text]
+    return text
+
+
+def preprocess_text(text):
+    text = text_lowercase(text)
+    text = remove_urls(text)
+    text = remove_numbers(text)
+    text = remove_punctuation(text)
+    text = tokenize(text)
+    text = remove_stopwords(text)
+    text = lemmatize(text)
+    text = ' '.join(text)
+    return text
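A quick way to sanity-check the extracted module is to call the new entry point directly. A minimal sketch, assuming preprocessing.py above is on the import path; the sample tweet below is made up for illustration and not taken from the project's data:

    from preprocessing import preprocess_text

    # made-up tweet text: the function lowercases, strips URLs/handles/numbers/
    # punctuation, tokenizes, drops stopwords, lemmatizes, and rejoins with spaces
    sample = "Forest fire near La Ronge!! https://t.co/example @someuser 2020"
    print(preprocess_text(sample))
    # prints something like: forest fire near la ronge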