diff --git a/Dockerfile b/Dockerfile
index 8286282..37d69a7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM condaforge/mambaforge:latest
 
 # seting env variables
 ENV KAGGLE_USERNAME=filippatyk
@@ -7,17 +7,18 @@ ENV RUN_TYPE=""
 # create working direcotyry
 WORKDIR /app
 
-ENV KAGGLE_CONFIG_DIR="./"
+ENV KAGGLE_CONFIG_DIR="./"
 
-# install python dependencies
-COPY requirements.txt ./
-RUN pip install --no-cache-dir -r requirements.txt
+# create mamba env and activate it
+COPY environment.yml /tmp/environment.yml
+RUN mamba env create -f /tmp/environment.yml && mamba clean -ya
+RUN echo "mamba activate ium" >> ~/.bashrc
+ENV PATH /opt/conda/envs/ium/bin:$PATH
 
-COPY dataset.py ./
+COPY src ./src
 
 #make dir for data
-RUN mkdir -p ./data
+RUN mkdir -p ./data; mkdir -p ./results
-
-CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
\ No newline at end of file
+CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./src/main.py "--$RUN_TYPE"
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..9dbf082
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,18 @@
+name: ium
+channels:
+  - conda-forge
+  - pytorch
+  - nvidia
+dependencies:
+  - python=3.10
+  - kaggle
+  - ca-certificates
+  - openssl
+  - pandas
+  - certifi
+  - scikit-learn
+  - pytorch
+  - pytorch-cuda=11.8
+  - transformers
+
+
diff --git a/src/datasets/dataset.py b/src/datasets/dataset.py
index 13a0149..9b67596 100644
--- a/src/datasets/dataset.py
+++ b/src/datasets/dataset.py
@@ -8,27 +8,17 @@ tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
 class Dataset(torch.utils.data.Dataset):
     def __init__(self, data: pd.DataFrame) -> None:
        self.labels = data["label"].to_list()
-        # self.texts = [
-        #     tokenizer(
-        #         text,
-        #         padding="max_length",
-        #         max_length=512,
-        #         truncation=True,
-        #         return_tensors="pt",
-        #     )
-        #     for text in data["text"]
-        # ]
        self.texts = data["text"].to_list()
 
    def __getitem__(self, idx):
        label = self.labels[idx]
        text = tokenizer(
-            self.texts[idx],
-            padding="max_length",
-            max_length=512,
-            truncation=True,
-            return_tensors="pt",
-        )
+            self.texts[idx],
+            padding="max_length",
+            max_length=512,
+            truncation=True,
+            return_tensors="pt",
+        )
        return text, label
diff --git a/src/datasets/news_dataset.py b/src/datasets/news_dataset.py
index 2858935..fb6515a 100644
--- a/src/datasets/news_dataset.py
+++ b/src/datasets/news_dataset.py
@@ -5,7 +5,7 @@ from pathlib import Path
 
 class NewsDataset:
     def __init__(self, data_dir_path: str = "data", data_lenght: int = None) -> None:
-        self.data_dir_path = Path(data_dir_path)
+        self.data_dir_path = Path("./" + data_dir_path)
         self.true_news_path = self.data_dir_path / "True.csv"
         self.fake_news_path = self.data_dir_path / "Fake.csv"
diff --git a/src/main.py b/src/main.py
index b48d32c..7ee45fa 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,7 +1,7 @@
 import random
 from sklearn.model_selection import train_test_split
 import argparse
-
+import torch
 from models import BertClassifier, utils
 from datasets import NewsDataset
 from train import train
@@ -12,7 +12,7 @@ SEED = 2137
 
 # Hyperparameters
 INITIAL_LR = 1e-6
-NUM_EPOCHS = 3
+NUM_EPOCHS = 2
 BATCH_SIZE = 2
 
 
@@ -31,8 +31,9 @@ parser.add_argument("--results_path", type=str, default="results/results.csv")
 
 if __name__ == "__main__":
     args = parser.parse_args()
+    print("CUDA: ", torch.cuda.is_available())
     # loading & spliting data
-    news_dataset = NewsDataset(data_dir_path="data", data_lenght=2000)
+    news_dataset = NewsDataset(data_dir_path="data", data_lenght=1000)
     train_val_data, test_data = train_test_split(
         news_dataset.data,