preprocessing as a separate module

This commit is contained in:
Stanislaw-Golebiewski 2020-06-14 16:48:52 +02:00
parent 0456ca00ee
commit 14525005fb
2 changed files with 69 additions and 66 deletions

71
main.py
View File

@ -1,73 +1,12 @@
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from prefect import task, Flow, context
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from prefect import task, Flow, context
from pandas import DataFrame
# Download the NLTK resources used below: the stopword lists, WordNet (for
# the lemmatizer) and the punkt models (for word_tokenize).
# These calls run at import time, every time this module is imported.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Remove urls, handles, and the '#' from hashtags. Adapted from
# https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
# Alternatives, tried in order at each position:
#   1. @handles (underscores included, so "@user_name" is removed whole)
#   2. any character that is not an ASCII letter, digit, space or tab
#   3. scheme://... URLs
# A raw string is used so the regex escapes are not treated as (invalid)
# string escapes.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")


def remove_urls(text: str) -> str:
    """Strip handles, URLs and non-alphanumeric characters from *text*.

    Every match is replaced by a space, then runs of whitespace are
    collapsed, so the result is single-space separated with no leading or
    trailing whitespace.
    """
    return ' '.join(_TWEET_NOISE_RE.sub(' ', text).split())
# Lowercase the whole text.
def text_lowercase(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered
# Drop numbers.
def remove_numbers(text):
    """Return *text* with every run of digits deleted (nothing is inserted)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)
# Translation table that deletes every character in string.punctuation.
# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return *text* with all characters from ``string.punctuation`` removed."""
    return text.translate(_PUNCTUATION_TABLE)
# Split the text into tokens.
def tokenize(text):
    """Return the result of NLTK's ``word_tokenize`` applied to *text*."""
    return word_tokenize(text)
# English stopwords, held in a set so each membership test is O(1).
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return a list of the tokens in *text* that are not in ``stop_words``."""
    return [token for token in text if token not in stop_words]
# Shared WordNet lemmatizer instance, created once at import time.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token of *text*."""
    return list(map(lemmatizer.lemmatize, text))
def preprocessing(text):
    """Run the full cleaning pipeline on one text and return a string.

    Stages, in order: lowercase, strip urls/handles/symbols, drop digits,
    drop punctuation, tokenize, drop stopwords, lemmatize. The surviving
    tokens are joined back together with single spaces.
    """
    # String -> string stages, applied in the listed order.
    for clean in (text_lowercase, remove_urls, remove_numbers, remove_punctuation):
        text = clean(text)
    # Token-list stages.
    tokens = lemmatize(remove_stopwords(tokenize(text)))
    return ' '.join(tokens)
from preprocessing import preprocess_text
@task
@ -91,7 +30,7 @@ def get_test_set() -> DataFrame:
# Add a 'pp_text' column holding the preprocessed form of every entry in
# train['text']. The input DataFrame is modified in place and also returned.
def preprocess_train(train: DataFrame) -> DataFrame:
pp_text_train = []
for text_data in train['text']:
# NOTE(review): this is a diff rendering, so the next two lines are presumably
# the removed call (preprocessing) followed by its replacement
# (preprocess_text, imported from the preprocessing module) — confirm.
pp_text_data = preprocessing(text_data)
pp_text_data = preprocess_text(text_data)
pp_text_train.append(pp_text_data)
train['pp_text'] = pp_text_train
return train
@ -101,7 +40,7 @@ def preprocess_train(train: DataFrame) -> DataFrame:
# Add a 'pp_text' column holding the preprocessed form of every entry in
# test['text']. The input DataFrame is modified in place and also returned.
def preprocess_test(test: DataFrame) -> DataFrame:
pp_text_test = []
for text_data in test['text']:
# NOTE(review): this is a diff rendering, so the next two lines are presumably
# the removed call (preprocessing) followed by its replacement
# (preprocess_text, imported from the preprocessing module) — confirm.
pp_text_data = preprocessing(text_data)
pp_text_data = preprocess_text(text_data)
pp_text_test.append(pp_text_data)
test['pp_text'] = pp_text_test
return test

64
preprocessing.py Normal file
View File

@ -0,0 +1,64 @@
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources used below: the stopword lists, WordNet (for
# the lemmatizer) and the punkt models (for word_tokenize).
# These calls run at import time, every time this module is imported.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Remove urls, handles, and the '#' from hashtags. Adapted from
# https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
# Alternatives, tried in order at each position:
#   1. @handles (underscores included, so "@user_name" is removed whole)
#   2. any character that is not an ASCII letter, digit, space or tab
#   3. scheme://... URLs
# A raw string is used so the regex escapes are not treated as (invalid)
# string escapes.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")


def remove_urls(text: str) -> str:
    """Strip handles, URLs and non-alphanumeric characters from *text*.

    Every match is replaced by a space, then runs of whitespace are
    collapsed, so the result is single-space separated with no leading or
    trailing whitespace.
    """
    return ' '.join(_TWEET_NOISE_RE.sub(' ', text).split())
# Lowercase the whole text.
def text_lowercase(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered
# remove numbers
def remove_numbers(text):
    """Return *text* with every run of digits deleted (nothing is inserted)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)
# Translation table that deletes every character in string.punctuation.
# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return *text* with all characters from ``string.punctuation`` removed."""
    return text.translate(_PUNCTUATION_TABLE)
# Split the text into tokens.
def tokenize(text):
    """Return the result of NLTK's ``word_tokenize`` applied to *text*."""
    return word_tokenize(text)
# English stopwords, held in a set so each membership test is O(1).
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return a list of the tokens in *text* that are not in ``stop_words``."""
    return [token for token in text if token not in stop_words]
# Shared WordNet lemmatizer instance, created once at import time.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token of *text*."""
    return list(map(lemmatizer.lemmatize, text))
def preprocess_text(text):
    """Run the full cleaning pipeline on one text and return a string.

    Stages, in order: lowercase, strip urls/handles/symbols, drop digits,
    drop punctuation, tokenize, drop stopwords, lemmatize. The surviving
    tokens are joined back together with single spaces.
    """
    # String -> string stages, applied in the listed order.
    for clean in (text_lowercase, remove_urls, remove_numbers, remove_punctuation):
        text = clean(text)
    # Token-list stages.
    tokens = lemmatize(remove_stopwords(tokenize(text)))
    return ' '.join(tokens)