add requirements.txt and update README

This commit is contained in:
Stanislaw-Golebiewski 2020-06-14 16:41:43 +02:00
parent 7400bc17e1
commit 0456ca00ee
3 changed files with 32 additions and 8 deletions

View File

@ -1,2 +1,16 @@
# warsztaty-prefect
### Uruchomienie
1. Pobrać pliki `train.csv` oraz `test.csv` z [wyzwania na Kaggle](https://www.kaggle.com/c/nlp-getting-started) i umieścić je w tym samym katalogu co plik `main.py`
2. Zainstalować potrzebne moduły
```
> pip3 install -r requirements.txt
```
3. Uruchomić skrypt
```
> python3 main.py
```

19
main.py
View File

@ -1,19 +1,15 @@
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
import nltk
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from prefect import task, Flow
from prefect import task, Flow, context
from pandas import DataFrame
from typing import List
# Download the NLTK 'stopwords' and 'wordnet' resources when this module is
# loaded; the file imports nltk.corpus.stopwords and WordNetLemmatizer above.
nltk.download('stopwords')
nltk.download('wordnet')
@ -76,14 +72,19 @@ def preprocessing(text):
@task
def get_train_set() -> DataFrame:
    """Read ``train.csv``, drop the unused columns, and log the row count.

    Returns:
        The contents of ``train.csv`` without the ``keyword`` and
        ``location`` columns.
    """
    log = context.get("logger")
    # Read and trim in one expression; both columns are dropped along axis 1.
    frame = pd.read_csv('train.csv').drop(['keyword', 'location'], axis=1)
    log.info(f"Train set: {len(frame)} elements")
    return frame
@task
def get_test_set() -> DataFrame:
    """Read ``test.csv`` and log how many rows it contains.

    Returns:
        The contents of ``test.csv`` as a DataFrame.
    """
    logger = context.get("logger")
    test = pd.read_csv('test.csv')
    # Report the size before returning so the log line is always emitted.
    logger.info(f"Test set: {len(test)} elements")
    return test
@task
@ -141,16 +142,18 @@ def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression:
@task
def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
    """Log the fraction of predictions for ``X`` that equal the labels in ``Y``.

    Args:
        model: Classifier whose ``predict`` method is called on ``X``.
        X: Input passed to ``model.predict``.
        Y: Expected answers, iterated in lockstep with the predictions.

    Raises:
        ZeroDivisionError: If ``Y`` is empty.
    """
    logger = context.get("logger")
    predictions = model.predict(X)
    # Count positions where the prediction matches the expected answer.
    count = sum(1 for guess, answer in zip(predictions, Y) if guess == answer)
    score = count / len(Y)
    # Report once, through the logger, using the value computed above.
    logger.info(f"model score: {score}")
if __name__ == "__main__":
with Flow("My First Flow!") as flow:
with Flow("My First Prefect Flow!") as flow:
train_data = get_train_set()
test_data = get_test_set()

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
nltk==3.5
numpy==1.18.5
pandas==1.0.4
prefect==0.11.5
scikit-learn==0.23.1
scipy==1.4.1
sklearn==0.0