update dockerfile for model training
This commit is contained in:
parent
4561c65980
commit
3a98aa107d
19
Dockerfile
19
Dockerfile
@ -1,4 +1,4 @@
|
||||
FROM python:3.11
|
||||
FROM condaforge/mambaforge:latest
|
||||
|
||||
# setting env variables
|
||||
ENV KAGGLE_USERNAME=filippatyk
|
||||
@ -7,17 +7,18 @@ ENV RUN_TYPE=""
|
||||
|
||||
# create working directory
|
||||
WORKDIR /app
|
||||
ENV KAGGLE_CONFIG_DIR="./"
|
||||
ENV KAGGLE_CONFIG_DIR="./"
|
||||
|
||||
# install python dependencies
|
||||
COPY requirements.txt ./
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
# create mamba env and activate it
|
||||
COPY environment.yml /tmp/environment.yml
|
||||
RUN mamba env create -f /tmp/environment.yml && mamba clean -ya
|
||||
RUN echo "mamba activate ium" >> ~/.bashrc
|
||||
ENV PATH /opt/conda/envs/ium/bin:$PATH
|
||||
|
||||
COPY dataset.py ./
|
||||
COPY src ./src
|
||||
|
||||
#make dir for data
|
||||
RUN mkdir -p ./data
|
||||
RUN mkdir -p ./data; mkdir -p ./results
|
||||
|
||||
|
||||
|
||||
CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
|
||||
CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./src/main.py "--$RUN_TYPE"
|
18
environment.yml
Normal file
18
environment.yml
Normal file
@ -0,0 +1,18 @@
|
||||
name: ium
|
||||
channels:
|
||||
- conda-forge
|
||||
- pytorch
|
||||
- nvidia
|
||||
dependencies:
|
||||
- python=3.10
|
||||
- kaggle
|
||||
- ca-certificates
|
||||
- openssl
|
||||
- pandas
|
||||
- certifi
|
||||
- scikit-learn
|
||||
- pytorch
|
||||
- pytorch-cuda=11.8
|
||||
- transformers
|
||||
|
||||
|
@ -8,27 +8,17 @@ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
class Dataset(torch.utils.data.Dataset):
|
||||
def __init__(self, data: pd.DataFrame) -> None:
|
||||
self.labels = data["label"].to_list()
|
||||
# self.texts = [
|
||||
# tokenizer(
|
||||
# text,
|
||||
# padding="max_length",
|
||||
# max_length=512,
|
||||
# truncation=True,
|
||||
# return_tensors="pt",
|
||||
# )
|
||||
# for text in data["text"]
|
||||
# ]
|
||||
self.texts = data["text"].to_list()
|
||||
|
||||
def __getitem__(self, idx):
|
||||
label = self.labels[idx]
|
||||
text = tokenizer(
|
||||
self.texts[idx],
|
||||
padding="max_length",
|
||||
max_length=512,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.texts[idx],
|
||||
padding="max_length",
|
||||
max_length=512,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
return text, label
|
||||
|
||||
|
@ -5,7 +5,7 @@ from pathlib import Path
|
||||
|
||||
class NewsDataset:
|
||||
def __init__(self, data_dir_path: str = "data", data_lenght: int = None) -> None:
|
||||
self.data_dir_path = Path(data_dir_path)
|
||||
self.data_dir_path = Path("./" + data_dir_path)
|
||||
self.true_news_path = self.data_dir_path / "True.csv"
|
||||
self.fake_news_path = self.data_dir_path / "Fake.csv"
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
import random
|
||||
from sklearn.model_selection import train_test_split
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
from models import BertClassifier, utils
|
||||
from datasets import NewsDataset
|
||||
from train import train
|
||||
@ -12,7 +12,7 @@ SEED = 2137
|
||||
# Hyperparameters
|
||||
|
||||
INITIAL_LR = 1e-6
|
||||
NUM_EPOCHS = 3
|
||||
NUM_EPOCHS = 2
|
||||
BATCH_SIZE = 2
|
||||
|
||||
|
||||
@ -31,8 +31,9 @@ parser.add_argument("--results_path", type=str, default="results/results.csv")
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
print("CUDA: ", torch.cuda.is_available())
|
||||
# loading & splitting data
|
||||
news_dataset = NewsDataset(data_dir_path="data", data_lenght=2000)
|
||||
news_dataset = NewsDataset(data_dir_path="data", data_lenght=1000)
|
||||
|
||||
train_val_data, test_data = train_test_split(
|
||||
news_dataset.data,
|
||||
|
Loading…
Reference in New Issue
Block a user