add requirements.txt and update README

This commit is contained in:
Stanislaw-Golebiewski 2020-06-14 16:41:43 +02:00
parent 7400bc17e1
commit 0456ca00ee
3 changed files with 32 additions and 8 deletions

View File

@ -1,2 +1,16 @@
# warsztaty-prefect # warsztaty-prefect
### Uruchomienie
1. Pobrać pliki `train.csv` oraz `test.csv` z [wyzwania na Kaggle](https://www.kaggle.com/c/nlp-getting-started) i umieścić je w tym samym katalogu co plik `main.py`
2. Zainstalować potrzebne moduły
```
> pip3 install -r requirements.txt
```
3. Uruchomić skrypt
```
> python3 main.py
```

19
main.py
View File

@ -1,19 +1,15 @@
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt
import string import string
import re import re
import nltk import nltk
import numpy as np
import nltk
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from prefect import task, Flow from prefect import task, Flow, context
from pandas import DataFrame from pandas import DataFrame
from typing import List
nltk.download('stopwords') nltk.download('stopwords')
nltk.download('wordnet') nltk.download('wordnet')
@ -76,14 +72,19 @@ def preprocessing(text):
@task @task
def get_train_set() -> DataFrame: def get_train_set() -> DataFrame:
logger = context.get("logger")
train = pd.read_csv('train.csv') train = pd.read_csv('train.csv')
train = train.drop(['keyword', 'location'], axis=1) train = train.drop(['keyword', 'location'], axis=1)
logger.info(f"Train set: {len(train)} elements")
return train return train
@task @task
def get_test_set() -> DataFrame: def get_test_set() -> DataFrame:
return pd.read_csv('test.csv') logger = context.get("logger")
test = pd.read_csv('test.csv')
logger.info(f"Test set: {len(test)} elements")
return test
@task @task
@ -141,16 +142,18 @@ def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression:
@task @task
def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None: def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
logger = context.get("logger")
predictions = model.predict(X) predictions = model.predict(X)
count = 0 count = 0
for guess, answer in zip(predictions, Y): for guess, answer in zip(predictions, Y):
if guess == answer: if guess == answer:
count += 1 count += 1
print("> model score: ", count/len(Y)) score = count/len(Y)
logger.info(f"model score: {count/len(Y)}")
if __name__ == "__main__": if __name__ == "__main__":
with Flow("My First Flow!") as flow: with Flow("My First Prefect Flow!") as flow:
train_data = get_train_set() train_data = get_train_set()
test_data = get_test_set() test_data = get_test_set()

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
nltk==3.5
numpy==1.18.5
pandas==1.0.4
prefect==0.11.5
scikit-learn==0.23.1
scipy==1.4.1
sklearn==0.0