2020-06-13 23:03:53 +02:00
|
|
|
import pandas as pd
|
2020-06-14 16:48:52 +02:00
|
|
|
from prefect import task, Flow, context
|
|
|
|
from pandas import DataFrame
|
2020-06-13 23:03:53 +02:00
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
2020-06-14 14:23:49 +02:00
|
|
|
from sklearn.linear_model import LogisticRegression
|
2020-06-13 23:03:53 +02:00
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
2020-06-14 14:23:49 +02:00
|
|
|
|
2020-06-14 16:48:52 +02:00
|
|
|
from preprocessing import preprocess_text
|
2020-06-13 23:03:53 +02:00
|
|
|
|
|
|
|
|
2020-06-14 14:23:49 +02:00
|
|
|
@task
def get_train_set() -> DataFrame:
    """Load the training set from train.csv, dropping unused columns.

    Returns the DataFrame with the 'keyword' and 'location' columns
    removed; only 'id', 'text' and 'target' are kept for the pipeline.
    """
    logger = context.get("logger")
    # Read the raw CSV, then discard the columns the model never uses.
    raw = pd.read_csv('train.csv')
    train = raw.drop(columns=['keyword', 'location'])
    logger.info(f"Train set: {len(train)} elements")
    return train
|
|
|
|
|
|
|
|
|
|
|
|
@task
def get_test_set() -> DataFrame:
    """Load the test set from test.csv.

    Unlike the train set, no columns are dropped here.
    """
    logger = context.get("logger")
    frame = pd.read_csv('test.csv')
    logger.info(f"Test set: {len(frame)} elements")
    return frame
|
2020-06-14 14:23:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
@task
def preprocess_train(train: DataFrame) -> DataFrame:
    """Add a 'pp_text' column with the preprocessed tweet text.

    Applies `preprocess_text` to every entry of the 'text' column and
    stores the results in a new 'pp_text' column. Mutates and returns
    the same DataFrame.
    """
    # Idiom fix: build the column with a comprehension instead of a
    # manual append loop (same values, same order).
    train['pp_text'] = [preprocess_text(text_data) for text_data in train['text']]
    return train
|
|
|
|
|
|
|
|
|
|
|
|
@task
def preprocess_test(test: DataFrame) -> DataFrame:
    """Add a 'pp_text' column with the preprocessed tweet text.

    Mirror of `preprocess_train`, applied to the test DataFrame.
    Mutates and returns the same DataFrame.
    """
    # Idiom fix: comprehension replaces the manual append loop.
    test['pp_text'] = [preprocess_text(text_data) for text_data in test['text']]
    return test
|
|
|
|
|
|
|
|
|
|
|
|
@task
def prepare_vectorizer(train_data: DataFrame, test_data: DataFrame) -> TfidfVectorizer:
    """Fit a TF-IDF vectorizer on the combined train + test text.

    The corpus is the concatenation of the preprocessed text from both
    DataFrames ('pp_text' column), so the vocabulary covers every
    document seen anywhere in the pipeline.
    """
    corpus = list(train_data['pp_text']) + list(test_data['pp_text'])
    return TfidfVectorizer().fit(corpus)
|
|
|
|
|
|
|
|
|
|
|
|
@task
def transform_train(vectorizer: TfidfVectorizer, train_set: DataFrame) -> DataFrame:
    """Vectorize the training text with an already-fitted vectorizer.

    Returns whatever `vectorizer.transform` produces (for TfidfVectorizer
    this is a sparse matrix, despite the DataFrame annotation — NOTE(review):
    annotation looks inaccurate, kept as-is for interface stability).
    """
    vectorized = vectorizer.transform(train_set)
    return vectorized
|
|
|
|
|
|
|
|
|
|
|
|
@task
def transform_test(vectorizer: TfidfVectorizer, test_set: DataFrame) -> DataFrame:
    """Vectorize the test text with an already-fitted vectorizer.

    Mirror of `transform_train`; returns the result of
    `vectorizer.transform` unchanged.
    """
    vectorized = vectorizer.transform(test_set)
    return vectorized
|
|
|
|
|
|
|
|
|
|
|
|
@task
def split_test_set(X: DataFrame, Y: DataFrame) -> dict:
    """Split features and labels into train/test partitions.

    Uses sklearn's `train_test_split` with default proportions and
    returns the four pieces keyed by name.
    """
    keys = ('X_train', 'X_test', 'y_train', 'y_test')
    return dict(zip(keys, train_test_split(X, Y)))
|
|
|
|
|
|
|
|
|
|
|
|
@task
def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression:
    """Fit a logistic-regression classifier on (X, Y) and return it.

    Default hyperparameters are used; `fit` returns the estimator
    itself, so the fluent form below is equivalent to the two-step
    construct-then-fit version.
    """
    return LogisticRegression().fit(X, Y)
|
|
|
|
|
|
|
|
|
|
|
|
@task
def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
    """Compute and log the model's accuracy on (X, Y).

    Predicts labels for X, counts exact matches against Y, and logs
    the fraction correct. Raises ZeroDivisionError if Y is empty,
    matching the original behavior.
    """
    logger = context.get("logger")
    predictions = model.predict(X)
    # Idiom fix: count matches with sum() instead of a manual counter.
    count = sum(1 for guess, answer in zip(predictions, Y) if guess == answer)
    # Bug fix: `score` was computed but never used while the log line
    # recomputed the same ratio — log the computed value once.
    score = count / len(Y)
    logger.info(f"model score: {score}")
|
2020-06-14 14:23:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Build the Prefect flow graph: each call below registers a task
    # and its dependencies; nothing executes until flow.run().
    with Flow("My First Prefect Flow!") as flow:
        train_data = get_train_set()
        test_data = get_test_set()

        train_data = preprocess_train(train_data)
        test_data = preprocess_test(test_data)

        # Vocabulary is fitted on the union of both corpora.
        vectorizer = prepare_vectorizer(train_data, test_data)

        vectorized_train_data = transform_train(vectorizer, train_data['pp_text'])
        # Bug fix: the test set must be transformed from test_data,
        # not train_data (the original vectorized the train text twice).
        vectorized_test_data = transform_test(vectorizer, test_data['pp_text'])

        splitted_data = split_test_set(vectorized_train_data, train_data['target'])
        model = train_model(splitted_data['X_train'], splitted_data['y_train'])
        evaluate(model, splitted_data['X_test'], splitted_data['y_test'])

    flow.validate()
    # flow.visualize()
    flow.run()
|