add requirements.txt and update README
This commit is contained in:
parent
7400bc17e1
commit
0456ca00ee
14
README.md
14
README.md
@ -1,2 +1,16 @@
|
|||||||
# warsztaty-prefect
|
# warsztaty-prefect
|
||||||
|
|
||||||
|
### Uruchomienie
|
||||||
|
|
||||||
|
|
||||||
|
1. Pobrać pliki `train.csv` oraz `test.csv` z [wyzwania na Kaggle](https://www.kaggle.com/c/nlp-getting-started) i umieścić je w tym samym katalogu co plik `main.py`
|
||||||
|
|
||||||
|
2. Zainstalować potrzebne moduły
|
||||||
|
```
|
||||||
|
> pip3 install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Uruchomić skrypt
|
||||||
|
```
|
||||||
|
> python3 main.py
|
||||||
|
```
|
||||||
|
19
main.py
19
main.py
@ -1,19 +1,15 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import string
|
import string
|
||||||
import re
|
import re
|
||||||
import nltk
|
import nltk
|
||||||
import numpy as np
|
|
||||||
import nltk
|
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from nltk.stem import WordNetLemmatizer
|
from nltk.stem import WordNetLemmatizer
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from prefect import task, Flow
|
from prefect import task, Flow, context
|
||||||
from pandas import DataFrame
|
from pandas import DataFrame
|
||||||
from typing import List
|
|
||||||
|
|
||||||
nltk.download('stopwords')
|
nltk.download('stopwords')
|
||||||
nltk.download('wordnet')
|
nltk.download('wordnet')
|
||||||
@ -76,14 +72,19 @@ def preprocessing(text):
|
|||||||
|
|
||||||
@task
|
@task
|
||||||
def get_train_set() -> DataFrame:
|
def get_train_set() -> DataFrame:
|
||||||
|
logger = context.get("logger")
|
||||||
train = pd.read_csv('train.csv')
|
train = pd.read_csv('train.csv')
|
||||||
train = train.drop(['keyword', 'location'], axis=1)
|
train = train.drop(['keyword', 'location'], axis=1)
|
||||||
|
logger.info(f"Train set: {len(train)} elements")
|
||||||
return train
|
return train
|
||||||
|
|
||||||
|
|
||||||
@task
|
@task
|
||||||
def get_test_set() -> DataFrame:
|
def get_test_set() -> DataFrame:
|
||||||
return pd.read_csv('test.csv')
|
logger = context.get("logger")
|
||||||
|
test = pd.read_csv('test.csv')
|
||||||
|
logger.info(f"Test set: {len(test)} elements")
|
||||||
|
return test
|
||||||
|
|
||||||
|
|
||||||
@task
|
@task
|
||||||
@ -141,16 +142,18 @@ def train_model(X: DataFrame, Y: DataFrame) -> LogisticRegression:
|
|||||||
|
|
||||||
@task
|
@task
|
||||||
def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
|
def evaluate(model: LogisticRegression, X: DataFrame, Y: DataFrame) -> None:
|
||||||
|
logger = context.get("logger")
|
||||||
predictions = model.predict(X)
|
predictions = model.predict(X)
|
||||||
count = 0
|
count = 0
|
||||||
for guess, answer in zip(predictions, Y):
|
for guess, answer in zip(predictions, Y):
|
||||||
if guess == answer:
|
if guess == answer:
|
||||||
count += 1
|
count += 1
|
||||||
print("> model score: ", count/len(Y))
|
score = count/len(Y)
|
||||||
|
logger.info(f"model score: {count/len(Y)}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
with Flow("My First Flow!") as flow:
|
with Flow("My First Prefect Flow!") as flow:
|
||||||
train_data = get_train_set()
|
train_data = get_train_set()
|
||||||
test_data = get_test_set()
|
test_data = get_test_set()
|
||||||
|
|
||||||
|
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
nltk==3.5
|
||||||
|
numpy==1.18.5
|
||||||
|
pandas==1.0.4
|
||||||
|
prefect==0.11.5
|
||||||
|
scikit-learn==0.23.1
|
||||||
|
scipy==1.4.1
|
||||||
|
sklearn==0.0
|
Loading…
Reference in New Issue
Block a user