diff --git a/README.md b/README.md index cfb15c8..fb74f9a 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,16 @@ # warsztaty-prefect +### Uruchomienie + + +1. Pobrać pliki `train.csv` oraz `test.csv` z [wyzwania na Kaggle](https://www.kaggle.com/c/nlp-getting-started) i umieścić je w tym samym katalogu co plik `main.py` + +2. Zainstalować potrzebne moduły +``` +> pip3 install -r requirements.txt +``` + +3. Uruchomić skrypt +``` +> python3 main.py +``` diff --git a/main.py b/main.py index cb3fa37..84a644e 100644 --- a/main.py +++ b/main.py @@ -1,19 +1,15 @@ import pandas as pd -import matplotlib.pyplot as plt import string import re import nltk -import numpy as np -import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split -from prefect import task, Flow +from prefect import task, Flow, context from pandas import DataFrame -from typing import List nltk.download('stopwords') nltk.download('wordnet') @@ -76,14 +72,19 @@ def preprocessing(text): @task def get_train_set() -> DataFrame: + logger = context.get("logger") train = pd.read_csv('train.csv') train = train.drop(['keyword', 'location'], axis=1) + logger.info(f"Train set: {len(train)} elements") return train @task def get_test_set() -> DataFrame: - return pd.read_csv('test.csv') + logger = context.get("logger") + test = pd.read_csv('test.csv') + logger.info(f"Test set: {len(test)} elements") + return test @task @@ -141,16 +142,18 @@ def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression: @task def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None: + logger = context.get("logger") predictions = model.predict(X) count = 0 for guess, answer in zip(predictions, Y): if guess == answer: count += 1 - print("> model score: ", count/len(Y)) + score = 
count/len(Y) + logger.info(f"model score: {count/len(Y)}") if __name__ == "__main__": - with Flow("My First Flow!") as flow: + with Flow("My First Prefect Flow!") as flow: train_data = get_train_set() test_data = get_test_set() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7434cf7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +nltk==3.5 +numpy==1.18.5 +pandas==1.0.4 +prefect==0.11.5 +scikit-learn==0.23.1 +scipy==1.4.1 +sklearn==0.0