Update Dockerfile for model training

Filip Patyk 2023-05-07 02:09:22 +02:00
parent 4561c65980
commit 3a98aa107d
5 changed files with 39 additions and 29 deletions

Dockerfile

@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM condaforge/mambaforge:latest

 # setting env variables
 ENV KAGGLE_USERNAME=filippatyk
@@ -9,15 +9,16 @@ ENV RUN_TYPE=""
 WORKDIR /app
 ENV KAGGLE_CONFIG_DIR="./"

-# install python dependencies
-COPY requirements.txt ./
-RUN pip install --no-cache-dir -r requirements.txt
+# create mamba env and activate it
+COPY environment.yml /tmp/environment.yml
+RUN mamba env create -f /tmp/environment.yml && mamba clean -ya
+RUN echo "mamba activate ium" >> ~/.bashrc
+ENV PATH /opt/conda/envs/ium/bin:$PATH

-COPY dataset.py ./
+COPY src ./src

 # make dir for data
-RUN mkdir -p ./data
+RUN mkdir -p ./data; mkdir -p ./results

-CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
+CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./src/main.py "--$RUN_TYPE"
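The base image switch from python:3.11 to condaforge/mambaforge trades pip for a conda-forge environment, and the ENV PATH line puts /opt/conda/envs/ium/bin ahead of the system interpreter, so the CMD's `python` resolves inside the env without an interactive `mamba activate`. The CMD expands RUN_TYPE into a single CLI flag; below is a minimal sketch of how such a flag could be consumed, assuming main.py defines argparse store_true flags (the real flag names are not visible in this diff):

    # sketch only: the flag names ("train", "evaluate") are assumptions
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--evaluate", action="store_true")
    args = parser.parse_args()  # RUN_TYPE=train -> "--train" -> args.train == True
    # with the default RUN_TYPE="", the container passes a bare "--",
    # which argparse consumes as the end-of-options marker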

environment.yml (new file)

@@ -0,0 +1,18 @@
+name: ium
+channels:
+  - conda-forge
+  - pytorch
+  - nvidia
+dependencies:
+  - python=3.10
+  - kaggle
+  - ca-certificates
+  - openssl
+  - pandas
+  - certifi
+  - scikit-learn
+  - pytorch
+  - pytorch-cuda=11.8
+  - transformers
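The new environment pins python=3.10 and pulls pytorch-cuda=11.8 from the nvidia channel, so the torch build in the image is CUDA-enabled. A quick sanity check inside the env, mirroring the print added to main.py further down (the container still needs a GPU runtime, e.g. `docker run --gpus all`, for this to report True):

    # GPU visibility check inside the ium env
    import torch

    print(torch.__version__)          # CUDA-enabled build from the pytorch channel
    print(torch.cuda.is_available())  # False unless the container sees a GPU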


@@ -8,27 +8,17 @@ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 class Dataset(torch.utils.data.Dataset):
     def __init__(self, data: pd.DataFrame) -> None:
         self.labels = data["label"].to_list()
-        # self.texts = [
-        #     tokenizer(
-        #         text,
-        #         padding="max_length",
-        #         max_length=512,
-        #         truncation=True,
-        #         return_tensors="pt",
-        #     )
-        #     for text in data["text"]
-        # ]
         self.texts = data["text"].to_list()

     def __getitem__(self, idx):
         label = self.labels[idx]
         text = tokenizer(
-                self.texts[idx],
-                padding="max_length",
-                max_length=512,
-                truncation=True,
-                return_tensors="pt",
-            )
+            self.texts[idx],
+            padding="max_length",
+            max_length=512,
+            truncation=True,
+            return_tensors="pt",
+        )
         return text, label
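This hunk deletes the commented-out eager tokenization (which would have encoded every article up front in __init__) and keeps the lazy per-item version, so only raw strings are held in memory and each text is tokenized on access. Because the tokenizer is called with return_tensors="pt", each item carries a leading batch dimension of 1; a hedged sketch of typical consumption, assuming the default DataLoader collation (the training loop itself is not part of this diff):

    # sketch: feeding the lazy-tokenizing Dataset to a DataLoader;
    # the squeeze(1) handling is an assumption about the training loop
    import pandas as pd
    from torch.utils.data import DataLoader

    df = pd.DataFrame({"text": ["real headline", "fake headline"], "label": [1, 0]})
    loader = DataLoader(Dataset(df), batch_size=2)
    for encodings, labels in loader:
        input_ids = encodings["input_ids"].squeeze(1)  # (batch, 1, 512) -> (batch, 512)
        print(input_ids.shape, labels)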


@@ -5,7 +5,7 @@ from pathlib import Path
 class NewsDataset:
     def __init__(self, data_dir_path: str = "data", data_lenght: int = None) -> None:
-        self.data_dir_path = Path(data_dir_path)
+        self.data_dir_path = Path("./" + data_dir_path)
         self.true_news_path = self.data_dir_path / "True.csv"
         self.fake_news_path = self.data_dir_path / "Fake.csv"
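Prefixing the directory with "./" is cosmetic as far as pathlib is concerned: PurePath normalizes a leading "./" away, so both spellings produce the same path object:

    # pathlib drops a leading "./" during normalization
    from pathlib import Path

    assert Path("./data") == Path("data")
    print(Path("./" + "data") / "True.csv")  # data/True.csv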

src/main.py

@@ -1,7 +1,7 @@
 import random
 from sklearn.model_selection import train_test_split
 import argparse
 import torch
 from models import BertClassifier, utils
 from datasets import NewsDataset
 from train import train
@@ -12,7 +12,7 @@ SEED = 2137
 # Hyperparameters
 INITIAL_LR = 1e-6
-NUM_EPOCHS = 3
+NUM_EPOCHS = 2
 BATCH_SIZE = 2
@@ -31,8 +31,9 @@ parser.add_argument("--results_path", type=str, default="results/results.csv")
 if __name__ == "__main__":
     args = parser.parse_args()
+    print("CUDA: ", torch.cuda.is_available())

     # loading & splitting data
-    news_dataset = NewsDataset(data_dir_path="data", data_lenght=2000)
+    news_dataset = NewsDataset(data_dir_path="data", data_lenght=1000)
     train_val_data, test_data = train_test_split(
         news_dataset.data,
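The run is also scaled down here: 2000 -> 1000 sampled articles and 3 -> 2 epochs, consistent with keeping containerized training short. For reference, a hedged sketch of the seeded two-stage split pattern this file suggests, with placeholder data and assumed test_size fractions (the actual fractions are outside this hunk):

    # sketch: seeded train/val/test split; the 0.2/0.25 fractions are
    # assumptions, not values taken from main.py
    from sklearn.model_selection import train_test_split

    SEED = 2137
    data = list(range(1000))  # stand-in for news_dataset.data
    train_val, test = train_test_split(data, test_size=0.2, random_state=SEED)
    train, val = train_test_split(train_val, test_size=0.25, random_state=SEED)
    print(len(train), len(val), len(test))  # 600 200 200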