update dockerfile for model training
This commit is contained in:
parent
4561c65980
commit
3a98aa107d
19
Dockerfile
19
Dockerfile
@ -1,4 +1,4 @@
|
|||||||
FROM python:3.11
|
FROM condaforge/mambaforge:latest
|
||||||
|
|
||||||
# setting env variables
|
# setting env variables
|
||||||
ENV KAGGLE_USERNAME=filippatyk
|
ENV KAGGLE_USERNAME=filippatyk
|
||||||
@ -7,17 +7,18 @@ ENV RUN_TYPE=""
|
|||||||
|
|
||||||
# create working directory
|
# create working directory
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
ENV KAGGLE_CONFIG_DIR="./"
|
ENV KAGGLE_CONFIG_DIR="./"
|
||||||
|
|
||||||
# install python dependencies
|
# create mamba env and activate it
|
||||||
COPY requirements.txt ./
|
COPY environment.yml /tmp/environment.yml
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN mamba env create -f /tmp/environment.yml && mamba clean -ya
|
||||||
|
RUN echo "mamba activate ium" >> ~/.bashrc
|
||||||
|
ENV PATH /opt/conda/envs/ium/bin:$PATH
|
||||||
|
|
||||||
COPY dataset.py ./
|
COPY src ./src
|
||||||
|
|
||||||
#make dir for data
|
#make dir for data
|
||||||
RUN mkdir -p ./data
|
RUN mkdir -p ./data; mkdir -p ./results
|
||||||
|
|
||||||
|
|
||||||
|
CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./src/main.py "--$RUN_TYPE"
|
||||||
CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
|
|
18
environment.yml
Normal file
18
environment.yml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
name: ium
|
||||||
|
channels:
|
||||||
|
- conda-forge
|
||||||
|
- pytorch
|
||||||
|
- nvidia
|
||||||
|
dependencies:
|
||||||
|
- python=3.10
|
||||||
|
- kaggle
|
||||||
|
- ca-certificates
|
||||||
|
- openssl
|
||||||
|
- pandas
|
||||||
|
- certifi
|
||||||
|
- scikit-learn
|
||||||
|
- pytorch
|
||||||
|
- pytorch-cuda=11.8
|
||||||
|
- transformers
|
||||||
|
|
||||||
|
|
@ -8,27 +8,17 @@ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
|||||||
class Dataset(torch.utils.data.Dataset):
|
class Dataset(torch.utils.data.Dataset):
|
||||||
def __init__(self, data: pd.DataFrame) -> None:
|
def __init__(self, data: pd.DataFrame) -> None:
|
||||||
self.labels = data["label"].to_list()
|
self.labels = data["label"].to_list()
|
||||||
# self.texts = [
|
|
||||||
# tokenizer(
|
|
||||||
# text,
|
|
||||||
# padding="max_length",
|
|
||||||
# max_length=512,
|
|
||||||
# truncation=True,
|
|
||||||
# return_tensors="pt",
|
|
||||||
# )
|
|
||||||
# for text in data["text"]
|
|
||||||
# ]
|
|
||||||
self.texts = data["text"].to_list()
|
self.texts = data["text"].to_list()
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
label = self.labels[idx]
|
label = self.labels[idx]
|
||||||
text = tokenizer(
|
text = tokenizer(
|
||||||
self.texts[idx],
|
self.texts[idx],
|
||||||
padding="max_length",
|
padding="max_length",
|
||||||
max_length=512,
|
max_length=512,
|
||||||
truncation=True,
|
truncation=True,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
|
|
||||||
return text, label
|
return text, label
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
class NewsDataset:
|
class NewsDataset:
|
||||||
def __init__(self, data_dir_path: str = "data", data_lenght: int = None) -> None:
|
def __init__(self, data_dir_path: str = "data", data_lenght: int = None) -> None:
|
||||||
self.data_dir_path = Path(data_dir_path)
|
self.data_dir_path = Path("./" + data_dir_path)
|
||||||
self.true_news_path = self.data_dir_path / "True.csv"
|
self.true_news_path = self.data_dir_path / "True.csv"
|
||||||
self.fake_news_path = self.data_dir_path / "Fake.csv"
|
self.fake_news_path = self.data_dir_path / "Fake.csv"
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import random
|
import random
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
import argparse
|
import argparse
|
||||||
|
import torch
|
||||||
from models import BertClassifier, utils
|
from models import BertClassifier, utils
|
||||||
from datasets import NewsDataset
|
from datasets import NewsDataset
|
||||||
from train import train
|
from train import train
|
||||||
@ -12,7 +12,7 @@ SEED = 2137
|
|||||||
# Hyperparameters
|
# Hyperparameters
|
||||||
|
|
||||||
INITIAL_LR = 1e-6
|
INITIAL_LR = 1e-6
|
||||||
NUM_EPOCHS = 3
|
NUM_EPOCHS = 2
|
||||||
BATCH_SIZE = 2
|
BATCH_SIZE = 2
|
||||||
|
|
||||||
|
|
||||||
@ -31,8 +31,9 @@ parser.add_argument("--results_path", type=str, default="results/results.csv")
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print("CUDA: ", torch.cuda.is_available())
|
||||||
# loading & splitting data
|
# loading & splitting data
|
||||||
news_dataset = NewsDataset(data_dir_path="data", data_lenght=2000)
|
news_dataset = NewsDataset(data_dir_path="data", data_lenght=1000)
|
||||||
|
|
||||||
train_val_data, test_data = train_test_split(
|
train_val_data, test_data = train_test_split(
|
||||||
news_dataset.data,
|
news_dataset.data,
|
||||||
|
Loading…
Reference in New Issue
Block a user