update dockerfile for model training

Filip Patyk 2023-05-07 02:09:22 +02:00
parent 4561c65980
commit 3a98aa107d
5 changed files with 39 additions and 29 deletions

Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM condaforge/mambaforge:latest
 # seting env variables
 ENV KAGGLE_USERNAME=filippatyk
@@ -9,15 +9,16 @@ ENV RUN_TYPE=""
 WORKDIR /app
 ENV KAGGLE_CONFIG_DIR="./"
-# install python dependencies
+# create mamba env and activate it
-COPY requirements.txt ./
+COPY environment.yml /tmp/environment.yml
-RUN pip install --no-cache-dir -r requirements.txt
+RUN mamba env create -f /tmp/environment.yml && mamba clean -ya
+RUN echo "mamba activate ium" >> ~/.bashrc
+ENV PATH /opt/conda/envs/ium/bin:$PATH
-COPY dataset.py ./
+COPY src ./src
 #make dir for data
-RUN mkdir -p ./data
+RUN mkdir -p ./data; mkdir -p ./results
-CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
+CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./src/main.py "--$RUN_TYPE"
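Two details of this Dockerfile are worth unpacking. Prepending /opt/conda/envs/ium/bin to PATH makes the env's interpreter the default python for the non-interactive CMD, where the .bashrc activation alone would not apply. And the CMD passes the RUN_TYPE environment variable through as a flag ("--$RUN_TYPE"), so the entrypoint script selects its mode from a single argparse flag; with this, something like docker run -e RUN_TYPE=train <image> would download the dataset and start training. A hedged sketch of how such a dispatch could look — the flag names --train and --test are assumptions, since the commit only shows that main.py receives one "--<mode>" argument:

import argparse

# hypothetical mode flags matching the "--$RUN_TYPE" convention in the CMD
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--train", action="store_true")
group.add_argument("--test", action="store_true")

if __name__ == "__main__":
    args = parser.parse_args()
    if args.train:
        print("running training")    # e.g. RUN_TYPE=train
    elif args.test:
        print("running evaluation")  # e.g. RUN_TYPE=test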

environment.yml Normal file
@@ -0,0 +1,18 @@
+name: ium
+channels:
+  - conda-forge
+  - pytorch
+  - nvidia
+dependencies:
+  - python=3.10
+  - kaggle
+  - ca-certificates
+  - openssl
+  - pandas
+  - certifi
+  - scikit-learn
+  - pytorch
+  - pytorch-cuda=11.8
+  - transformers
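Since the image no longer installs from requirements.txt, a quick way to confirm the mamba environment is intact inside the container is a small import check. A minimal sketch (this helper script is hypothetical, not part of the commit; the versions mamba resolves from the environment.yml above will vary):

# sanity_check.py -- hypothetical helper, not part of the commit
import pandas
import sklearn
import torch
import transformers

print("pandas:", pandas.__version__)
print("scikit-learn:", sklearn.__version__)
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)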

View File

@@ -8,16 +8,6 @@ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 class Dataset(torch.utils.data.Dataset):
     def __init__(self, data: pd.DataFrame) -> None:
         self.labels = data["label"].to_list()
-        # self.texts = [
-        #     tokenizer(
-        #         text,
-        #         padding="max_length",
-        #         max_length=512,
-        #         truncation=True,
-        #         return_tensors="pt",
-        #     )
-        #     for text in data["text"]
-        # ]
         self.texts = data["text"].to_list()

     def __getitem__(self, idx):
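The removed block precomputed BERT encodings for every text up front in __init__; after this change the dataset stores raw strings, so tokenization presumably happens later in the pipeline. A minimal sketch of the lazy alternative, tokenizing per item in __getitem__ — the class name and return shape here are assumptions, since the commit does not show the rest of the class:

import pandas as pd
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

class LazyDataset(torch.utils.data.Dataset):
    """Hypothetical variant: tokenize on access instead of in __init__."""

    def __init__(self, data: pd.DataFrame) -> None:
        self.labels = data["label"].to_list()
        self.texts = data["text"].to_list()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Tokenize one text at a time; avoids holding all encodings in memory
        # at the cost of re-tokenizing on every epoch.
        encoding = tokenizer(
            self.texts[idx],
            padding="max_length",
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        return encoding, self.labels[idx]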

View File

@@ -5,7 +5,7 @@ from pathlib import Path
 class NewsDataset:
     def __init__(self, data_dir_path: str = "data", data_lenght: int = None) -> None:
-        self.data_dir_path = Path(data_dir_path)
+        self.data_dir_path = Path("./" + data_dir_path)
         self.true_news_path = self.data_dir_path / "True.csv"
         self.fake_news_path = self.data_dir_path / "Fake.csv"
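One pathlib detail here: PurePath parsing drops single-dot components, so a leading "./" normalizes away and both forms resolve to the same location relative to the working directory. A quick check:

from pathlib import Path

# pathlib drops "." components, so these compare equal:
assert Path("./data") == Path("data")
print(Path("./" + "data"))  # prints: data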

View File

@@ -1,7 +1,7 @@
 import random
 from sklearn.model_selection import train_test_split
 import argparse
+import torch
 from models import BertClassifier, utils
 from datasets import NewsDataset
 from train import train
@@ -12,7 +12,7 @@ SEED = 2137
 # Hyperparameters
 INITIAL_LR = 1e-6
-NUM_EPOCHS = 3
+NUM_EPOCHS = 2
 BATCH_SIZE = 2
@@ -31,8 +31,9 @@ parser.add_argument("--results_path", type=str, default="results/results.csv")
 if __name__ == "__main__":
     args = parser.parse_args()
+    print("CUDA: ", torch.cuda.is_available())
     # loading & spliting data
-    news_dataset = NewsDataset(data_dir_path="data", data_lenght=2000)
+    news_dataset = NewsDataset(data_dir_path="data", data_lenght=1000)
     train_val_data, test_data = train_test_split(
         news_dataset.data,
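The commit halves data_lenght and NUM_EPOCHS, presumably to keep container runs short. For context, a minimal sketch of the surrounding train/val/test split pattern, assuming a second train_test_split carves the validation set out of train_val_data — the ratios and the toy DataFrame are illustrative, not taken from the commit:

import pandas as pd
from sklearn.model_selection import train_test_split

SEED = 2137

# toy stand-in for news_dataset.data
data = pd.DataFrame({
    "text": [f"article {i}" for i in range(1000)],
    "label": [i % 2 for i in range(1000)],
})

# hold out a test set, then carve a validation set out of the remainder
train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=SEED)
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=SEED)

print(len(train_data), len(val_data), len(test_data))  # 600 200 200

Fixing random_state to the module-level SEED keeps the split reproducible across runs, which matters here because the container re-downloads and re-splits the dataset on every start.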