diff --git a/.gitignore b/.gitignore
index adbb97d..a98485a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
+.vscode
+__pycache__
 data/
\ No newline at end of file
diff --git a/README.md b/README.md
index 2e7fd63..b22d46b 100644
--- a/README.md
+++ b/README.md
@@ -4,3 +4,4 @@
 
 ### Filip Patyk
 ### 424714
+[https://git.wmi.amu.edu.pl/AITech/aitech-ium](https://git.wmi.amu.edu.pl/AITech/aitech-ium)
\ No newline at end of file
diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
new file mode 100644
index 0000000..2115777
--- /dev/null
+++ b/src/datasets/__init__.py
@@ -0,0 +1,4 @@
+__all__ = ["Dataset", "NewsDataset"]
+
+from .news_dataset import NewsDataset
+from .dataset import Dataset
diff --git a/src/datasets/dataset.py b/src/datasets/dataset.py
new file mode 100644
index 0000000..13a0149
--- /dev/null
+++ b/src/datasets/dataset.py
@@ -0,0 +1,27 @@
+import torch
+import pandas as pd
+from transformers import BertTokenizer
+
+tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
+
+class Dataset(torch.utils.data.Dataset):
+    def __init__(self, data: pd.DataFrame) -> None:
+        self.labels = data["label"].to_list()
+        # Texts are kept raw here and tokenized lazily, per item, in __getitem__.
+        self.texts = data["text"].to_list()
+
+    def __getitem__(self, idx):
+        label = self.labels[idx]
+        text = tokenizer(
+            self.texts[idx],
+            padding="max_length",
+            max_length=512,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        return text, label
+
+    def __len__(self) -> int:
+        return len(self.labels)
diff --git a/src/datasets/news_dataset.py b/src/datasets/news_dataset.py
new file mode 100644
index 0000000..2858935
--- /dev/null
+++ b/src/datasets/news_dataset.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+
+
+class NewsDataset:
+    def __init__(self, data_dir_path: str = "data", data_length: Optional[int] = None) -> None:
+        self.data_dir_path = Path(data_dir_path)
+        self.true_news_path = self.data_dir_path / "True.csv"
+        self.fake_news_path = self.data_dir_path / "Fake.csv"
+
+        self.true_news = self.load_news(self.true_news_path, data_length)
+        self.fake_news = self.load_news(self.fake_news_path, data_length)
+
+        # Real news is labelled 1, fake news 0.
+        self.true_news["label"] = 1
+        self.fake_news["label"] = 0
+
+    def load_news(self, file_path: Path, trim: Optional[int] = None) -> pd.DataFrame:
+        news = pd.read_csv(file_path)
+        news = news.drop(columns=["title", "subject", "date"])
+
+        return news if not trim else news.head(trim)
+
+    @property
+    def data(self) -> pd.DataFrame:
+        dataset = pd.concat([self.true_news, self.fake_news], axis=0)
+        dataset["text"] = dataset["text"].str.strip()
+        # dropna returns a new frame; assign the result so missing rows are dropped.
+        dataset = dataset.dropna(axis=0, how="any", subset=["text"])
+        return dataset
+
+
+if __name__ == "__main__":
+    dataset = NewsDataset()
+    print(dataset.data.head(5))
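+    # A quick class-balance check (an added sketch, not part of the original
+    # script): value_counts() shows how many real (1) vs fake (0) rows loaded.
+    print(dataset.data["label"].value_counts())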
diff --git a/src/evaluate.py b/src/evaluate.py
new file mode 100644
index 0000000..0877170
--- /dev/null
+++ b/src/evaluate.py
@@ -0,0 +1,48 @@
+import os
+
+import pandas as pd
+import torch
+import torch.nn as nn
+
+from datasets import Dataset
+
+NUM_WORKERS = os.cpu_count()
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+PIN_MEMORY = DEVICE == "cuda"
+
+
+def evaluate(
+    model: nn.Module,
+    test_data: pd.DataFrame,
+    batch_size: int,
+) -> None:
+    test_dataset = Dataset(test_data)
+
+    test_dataloader = torch.utils.data.DataLoader(
+        test_dataset,
+        batch_size=batch_size,
+        num_workers=NUM_WORKERS,
+        pin_memory=PIN_MEMORY,
+        shuffle=False,  # order does not affect accuracy; no need to shuffle
+    )
+
+    model.to(DEVICE)
+    model.eval()  # disable dropout for deterministic evaluation
+    total_acc_test = 0
+
+    with torch.no_grad():
+        for test_input, test_label in test_dataloader:
+            test_label = test_label.to(DEVICE)
+            mask = test_input["attention_mask"].to(DEVICE)
+            input_id = test_input["input_ids"].squeeze(1).to(DEVICE)
+
+            output = model(input_id, mask)
+
+            acc = (output.argmax(dim=1) == test_label).sum().item()
+            total_acc_test += acc
+
+    print(f"Test Accuracy: {total_acc_test / len(test_data): .3f}")
+
+
+if __name__ == "__main__":
+    pass
diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..39ceda9
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,48 @@
+from sklearn.model_selection import train_test_split
+
+from models import BertClassifier
+from datasets import NewsDataset
+from train import train
+from evaluate import evaluate
+
+SEED = 2137
+
+# Hyperparameters
+
+INITIAL_LR = 1e-6
+NUM_EPOCHS = 5
+BATCH_SIZE = 2
+
+if __name__ == "__main__":
+    # loading & splitting data
+    news_dataset = NewsDataset(data_dir_path="data", data_length=2000)
+
+    # note: 80% of the data is held out for the test set
+    train_val_data, test_data = train_test_split(
+        news_dataset.data,
+        test_size=0.8,
+        shuffle=True,
+        random_state=SEED,
+    )
+    train_data, val_data = train_test_split(
+        train_val_data,
+        test_size=0.2,
+        shuffle=True,
+        random_state=SEED,
+    )
+    # training model
+    trained_model = train(
+        model=BertClassifier(),
+        train_data=train_data,
+        val_data=val_data,
+        learning_rate=INITIAL_LR,
+        epochs=NUM_EPOCHS,
+        batch_size=BATCH_SIZE,
+    )
+
+    # evaluating model
+    evaluate(
+        model=trained_model,
+        test_data=test_data,
+        batch_size=BATCH_SIZE,
+    )
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..fed70df
--- /dev/null
+++ b/src/models/__init__.py
@@ -0,0 +1,3 @@
+__all__ = ["BertClassifier"]
+
+from .bert_model import BertClassifier
diff --git a/src/models/bert_model.py b/src/models/bert_model.py
new file mode 100644
index 0000000..8a17999
--- /dev/null
+++ b/src/models/bert_model.py
@@ -0,0 +1,27 @@
+from torch import nn
+from transformers import BertModel
+
+
+class BertClassifier(nn.Module):
+    def __init__(self, dropout: float = 0.5, num_classes: int = 2):
+        super().__init__()
+
+        self.bert = BertModel.from_pretrained("bert-base-cased")
+        self.dropout = nn.Dropout(dropout)
+        self.linear = nn.Linear(768, num_classes)  # 768 = BERT-base hidden size
+        self.relu = nn.ReLU()
+
+    def forward(self, input_id, mask):
+        _, pooled_output = self.bert(
+            input_ids=input_id, attention_mask=mask, return_dict=False
+        )
+        dropout_output = self.dropout(pooled_output)
+        linear_output = self.linear(dropout_output)
+        final_layer = self.relu(linear_output)
+
+        return final_layer
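+
+
+# Shape sketch (an added note; assumes batch size B and the 512-token inputs
+# produced by the dataset's tokenizer): input_id and mask are (B, 512), the
+# pooled [CLS] output is (B, 768), and the head maps it to (B, num_classes).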
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..e9cefa7
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,95 @@
+import os
+
+import pandas as pd
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+
+from datasets import Dataset
+
+NUM_WORKERS = os.cpu_count()
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+PIN_MEMORY = DEVICE == "cuda"
+
+
+def train(
+    model: nn.Module,
+    train_data: pd.DataFrame,
+    val_data: pd.DataFrame,
+    learning_rate: float,
+    epochs: int,
+    batch_size: int,
+) -> nn.Module:
+    train_dataset, val_dataset = Dataset(train_data), Dataset(val_data)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        num_workers=NUM_WORKERS,
+        pin_memory=PIN_MEMORY,
+        shuffle=True,
+    )
+    val_dataloader = torch.utils.data.DataLoader(
+        val_dataset,
+        batch_size=batch_size,
+        num_workers=NUM_WORKERS,
+        pin_memory=PIN_MEMORY,
+        shuffle=False,  # validation order does not affect the metrics
+    )
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    model.to(DEVICE)
+    for epoch in range(epochs):
+        model.train()  # enable dropout for the training phase
+        total_acc_train = 0
+        total_loss_train = 0
+        for train_input, train_label in tqdm(train_dataloader):
+            train_label = train_label.to(DEVICE)
+            mask = train_input["attention_mask"].to(DEVICE)
+            input_id = train_input["input_ids"].squeeze(1).to(DEVICE)
+
+            output = model(input_id, mask)
+
+            batch_loss = criterion(output, train_label.long())
+            total_loss_train += batch_loss.item()
+
+            acc = (output.argmax(dim=1) == train_label).sum().item()
+            total_acc_train += acc
+
+            optimizer.zero_grad()
+            batch_loss.backward()
+            optimizer.step()
+
+        total_acc_val = 0
+        total_loss_val = 0
+
+        model.eval()  # disable dropout for validation
+        with torch.no_grad():
+            for val_input, val_label in val_dataloader:
+                val_label = val_label.to(DEVICE)
+                mask = val_input["attention_mask"].to(DEVICE)
+                input_id = val_input["input_ids"].squeeze(1).to(DEVICE)
+
+                output = model(input_id, mask)
+
+                batch_loss = criterion(output, val_label.long())
+                total_loss_val += batch_loss.item()
+
+                acc = (output.argmax(dim=1) == val_label).sum().item()
+                total_acc_val += acc
+
+        print(
+            f"Epoch: {epoch + 1} "
+            f"| Train Loss: {total_loss_train / len(train_data): .3f} "
+            f"| Train Accuracy: {total_acc_train / len(train_data): .3f} "
+            f"| Val Loss: {total_loss_val / len(val_data): .3f} "
+            f"| Val Accuracy: {total_acc_val / len(val_data): .3f}"
+        )
+
+    return model
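+
+
+# Usage sketch (an added note with hypothetical frames; the real entry point
+# is src/main.py): train(BertClassifier(), train_df, val_df, 1e-6, 5, 2)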