transform into Prefect flow

Stanislaw-Golebiewski 2020-06-14 14:23:49 +02:00
parent 1abd5290cd
commit 7400bc17e1

main.py: 145 lines changed

@@ -6,64 +6,62 @@ import nltk
 import numpy as np
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
+from prefect import task, Flow
+from pandas import DataFrame
+from typing import List
 
 nltk.download('stopwords')
 nltk.download('wordnet')
 nltk.download('punkt')
 
-train = pd.read_csv('train.csv')
-test = pd.read_csv('test.csv')
-train.head()
-train_count=train.count()
-print(train_count)
-print(train_count/train_count[0]*100)
-train = train.drop(['keyword', 'location'], axis = 1)
-test.head()
-test.describe()
 # remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
 def remove_urls(text):
     new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
     return new_text
 
 # make all text lowercase
 def text_lowercase(text):
     return text.lower()
 
 # remove numbers
 def remove_numbers(text):
     result = re.sub(r'\d+', '', text)
     return result
 
 # remove punctuation
 def remove_punctuation(text):
     translator = str.maketrans('', '', string.punctuation)
     return text.translate(translator)
 
 # tokenize
 def tokenize(text):
     text = word_tokenize(text)
     return text
 
 # remove stopwords
 stop_words = set(stopwords.words('english'))
 def remove_stopwords(text):
     text = [i for i in text if not i in stop_words]
     return text
 
 # lemmatize
 lemmatizer = WordNetLemmatizer()
 def lemmatize(text):
     text = [lemmatizer.lemmatize(token) for token in text]
     return text
 
 def preprocessing(text):
     text = text_lowercase(text)
     text = remove_urls(text)
@@ -76,41 +74,98 @@ def preprocessing(text):
     return text
-pp_text_train = [] # our preprocessed text column
-for text_data in train['text']:
-    pp_text_data = preprocessing(text_data)
-    pp_text_train.append(pp_text_data)
-train['pp_text'] = pp_text_train # add the preprocessed text as a column
-
-pp_text_test = [] # our preprocessed text column
-for text_data in test['text']:
-    pp_text_data = preprocessing(text_data)
-    pp_text_test.append(pp_text_data)
-test['pp_text'] = pp_text_test # add the preprocessed text as a column
-
-train_text_data = list(train['pp_text'])
-test_text_data = list(test['pp_text'])
-corpus = train_text_data + test_text_data
-tf=TfidfVectorizer()
-# the vectorizer must be fit onto the entire corpus
-fitted_vectorizer = tf.fit(corpus)
-
-# train
-train_transform = fitted_vectorizer.transform(train['pp_text'])
-y = train['target']
-# test
-test_transform = fitted_vectorizer.transform(test['pp_text'])
-
-X=train_transform
-X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-scikit_log_reg = LogisticRegression()
-model=scikit_log_reg.fit(X_train, y_train)
-
-predictions = model.predict(X_test)
-count = 0
-for guess, answer in zip(predictions, y_test):
-    if guess == answer:
-        count += 1
-print(count/len(y_test))
+@task
+def get_train_set() -> DataFrame:
+    train = pd.read_csv('train.csv')
+    train = train.drop(['keyword', 'location'], axis=1)
+    return train
+
+
+@task
+def get_test_set() -> DataFrame:
+    return pd.read_csv('test.csv')
+
+
+@task
+def preprocess_train(train: DataFrame) -> DataFrame:
+    pp_text_train = []
+    for text_data in train['text']:
+        pp_text_data = preprocessing(text_data)
+        pp_text_train.append(pp_text_data)
+    train['pp_text'] = pp_text_train
+    return train
+
+
+@task
+def preprocess_test(test: DataFrame) -> DataFrame:
+    pp_text_test = []
+    for text_data in test['text']:
+        pp_text_data = preprocessing(text_data)
+        pp_text_test.append(pp_text_data)
+    test['pp_text'] = pp_text_test
+    return test
+
+
+@task
+def prepare_vectorizer(train_data: DataFrame, test_data: DataFrame) -> TfidfVectorizer:
+    train_text_data = list(train_data['pp_text'])
+    test_text_data = list(test_data['pp_text'])
+    corpus = train_text_data + test_text_data
+    tf = TfidfVectorizer()
+    # the vectorizer must be fit onto the entire corpus
+    fitted_vectorizer = tf.fit(corpus)
+    return fitted_vectorizer
+
+
+@task
+def transform_train(vectorizer: TfidfVectorizer, train_set: DataFrame) -> DataFrame:
+    return vectorizer.transform(train_set)
+
+
+@task
+def transform_test(vectorizer: TfidfVectorizer, test_set: DataFrame) -> DataFrame:
+    return vectorizer.transform(test_set)
+
+
+@task
+def split_test_set(X: DataFrame, Y: DataFrame) -> dict:
+    X_train, X_test, y_train, y_test = train_test_split(X, Y)
+    return {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
+
+
+@task
+def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression:
+    scikit_log_reg = LogisticRegression()
+    model = scikit_log_reg.fit(X, Y)
+    return model
+
+
+@task
+def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
+    predictions = model.predict(X)
+    count = 0
+    for guess, answer in zip(predictions, Y):
+        if guess == answer:
+            count += 1
+    print("> model score: ", count/len(Y))
+
+
+if __name__ == "__main__":
+    with Flow("My First Flow!") as flow:
+        train_data = get_train_set()
+        test_data = get_test_set()
+
+        train_data = preprocess_train(train_data)
+        test_data = preprocess_test(test_data)
+
+        vectorizer = prepare_vectorizer(train_data, test_data)
+
+        vectorized_train_data = transform_train(vectorizer, train_data['pp_text'])
+        vectorized_test_data = transform_test(vectorizer, test_data['pp_text'])
+
+        splitted_data = split_test_set(vectorized_train_data, train_data['target'])
+
+        model = train_model(splitted_data['X_train'], splitted_data['y_train'])
+        evaluate(model, splitted_data['X_test'], splitted_data['y_test'])
+
+    flow.validate()
+    # flow.visualize()
+    flow.run()
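
A note on the Flow API used above, with a minimal sketch (assuming Prefect 0.x/1.x, where `with Flow(...)` builds a DAG rather than executing anything): calling a `@task`-decorated function inside the context only registers a node in the graph, and subscripting a task result, as `splitted_data['X_train']` does, inserts a small lookup task that resolves when the flow runs. The task names below are illustrative stand-ins, not part of the commit:

    from prefect import task, Flow

    @task
    def make_split() -> dict:
        # stand-in for split_test_set above
        return {'X_train': [0, 1, 2], 'X_test': [3, 4]}

    @task
    def report(x) -> None:
        print('received', x)

    with Flow("getitem-demo") as flow:
        split = make_split()        # no execution yet; just adds a node to the DAG
        report(split['X_train'])    # indexing adds a downstream lookup task

    flow.run()                      # tasks execute here, in dependency order

This is also why `evaluate(...)` inside the `with` block prints nothing until `flow.run()` is called at the end.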