paranormal-or-skeptic-ISI-p.../.ipynb_checkpoints/run-checkpoint.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import lzma\n",
"import gensim.models\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torchvision import datasets, transforms\n",
"from torch.optim.lr_scheduler import StepLR"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
]
},
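{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check of the loaded data (a minimal sketch, not part of the\n",
"# original pipeline): print the dataset sizes, the beginning of the first\n",
"# raw input line, and its label.\n",
"print(len(X_train), len(y_train), len(X_dev0), len(X_test))\n",
"print(X_train[0][:200])\n",
"print(y_train[0])"
]
},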
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = [line.split() for line in X_train]\n",
"X_dev0 = [line.split() for line in X_dev0]\n",
"X_test = [line.split() for line in X_test]\n",
"\n",
"def tagged_document(list_of_list_of_words):\n",
" for i, list_of_words in enumerate(list_of_list_of_words):\n",
" yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])\n",
"\n",
"data_training = list(tagged_document(X_train))\n",
"model = gensim.models.doc2vec.Doc2Vec(vector_size=1000)\n",
"model.build_vocab(data_training)\n",
"\n",
"X_train_d2v = [model.infer_vector(line) for line in X_train]\n",
"X_dev0_d2v = [model.infer_vector(line) for line in X_dev0]\n",
"X_test_d2v = [model.infer_vector(line) for line in X_test]\n",
"\n",
"y_train = np.array([int(i) for i in y_train])\n",
"y_expected_dev0 = np.array([int(i) for i in y_expected_dev0])"
]
},
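{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (a minimal sketch): each document should now be represented by\n",
"# a single 1000-dimensional vector, matching the classifier input size below.\n",
"print(len(X_train_d2v), X_train_d2v[0].shape)\n",
"print(y_train.shape, y_expected_dev0.shape)"
]
},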
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Net(nn.Module):\n",
" \"\"\"W PyTorchu tworzenie sieci neuronowej\n",
" polega na zdefiniowaniu klasy, która dziedziczy z nn.Module.\n",
" \"\"\"\n",
" \n",
" def __init__(self):\n",
" super().__init__()\n",
" \n",
" # Warstwy splotowe\n",
" self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
" self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
" \n",
" # Warstwy dropout\n",
" self.dropout1 = nn.Dropout(0.25)\n",
" self.dropout2 = nn.Dropout(0.5)\n",
" \n",
" # Warstwy liniowe\n",
" self.fc1 = nn.Linear(9216, 128)\n",
" self.fc2 = nn.Linear(128, 10)\n",
"\n",
" def forward(self, x):\n",
" \"\"\"Definiujemy przechodzenie \"do przodu\" jako kolejne przekształcenia wejścia x\"\"\"\n",
" x = self.conv1(x)\n",
" x = F.relu(x)\n",
" x = self.conv2(x)\n",
" x = F.relu(x)\n",
" x = F.max_pool2d(x, 2)\n",
" x = self.dropout1(x)\n",
" x = torch.flatten(x, 1)\n",
" x = self.fc1(x)\n",
" x = F.relu(x)\n",
" x = self.dropout2(x)\n",
" x = self.fc2(x)\n",
" output = F.log_softmax(x, dim=1)\n",
" return output\n",
"\n",
"\n",
"def train(model, device, train_loader, optimizer, epoch, log_interval, dry_run):\n",
" \"\"\"Uczenie modelu\"\"\"\n",
" model.train()\n",
" for batch_idx, (data, target) in enumerate(train_loader):\n",
" data, target = data.to(device), target.to(device) # wrzucenie danych na kartę graficzną (jeśli dotyczy)\n",
" optimizer.zero_grad() # wyzerowanie gradientu\n",
" output = model(data) # przejście \"do przodu\"\n",
" loss = F.nll_loss(output, target) # obliczenie funkcji kosztu\n",
" loss.backward() # propagacja wsteczna\n",
" optimizer.step() # krok optymalizatora\n",
" if batch_idx % log_interval == 0:\n",
" print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
" epoch, batch_idx * len(data), len(train_loader.dataset),\n",
" 100. * batch_idx / len(train_loader), loss.item()))\n",
" if dry_run:\n",
" break\n",
"\n",
"\n",
"def test(model, device, test_loader):\n",
" \"\"\"Testowanie modelu\"\"\"\n",
" model.eval()\n",
" test_loss = 0\n",
" correct = 0\n",
" with torch.no_grad():\n",
" for data, target in test_loader:\n",
" data, target = data.to(device), target.to(device) # wrzucenie danych na kartę graficzną (jeśli dotyczy)\n",
" output = model(data) # przejście \"do przodu\"\n",
" test_loss += F.nll_loss(output, target, reduction='sum').item() # suma kosztów z każdego batcha\n",
" pred = output.argmax(dim=1, keepdim=True) # predykcja na podstawie maks. logarytmu prawdopodobieństwa\n",
" correct += pred.eq(target.view_as(pred)).sum().item()\n",
"\n",
" test_loss /= len(test_loader.dataset) # obliczenie kosztu na zbiorze testowym\n",
"\n",
" print('\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
" test_loss, correct, len(test_loader.dataset),\n",
" 100. * correct / len(test_loader.dataset)))\n",
"\n",
"\n",
"def run(\n",
" batch_size=64,\n",
" test_batch_size=1000,\n",
" epochs=14,\n",
" lr=1.0,\n",
" gamma=0.7,\n",
" no_cuda=False,\n",
" dry_run=False,\n",
" seed=1,\n",
" log_interval=10,\n",
" save_model=False,\n",
" ):\n",
" \"\"\"Main training function.\n",
" \n",
" Arguments:\n",
" batch_size - wielkość batcha podczas uczenia (default: 64),\n",
" test_batch_size - wielkość batcha podczas testowania (default: 1000)\n",
" epochs - liczba epok uczenia (default: 14)\n",
" lr - współczynnik uczenia (learning rate) (default: 1.0)\n",
" gamma - współczynnik gamma (dla optymalizatora) (default: 0.7)\n",
" no_cuda - wyłącza uczenie na karcie graficznej (default: False)\n",
" dry_run - szybko (\"na sucho\") sprawdza pojedyncze przejście (default: False)\n",
" seed - ziarno generatora liczb pseudolosowych (default: 1)\n",
" log_interval - interwał logowania stanu uczenia (default: 10)\n",
" save_model - zapisuje bieżący model (default: False)\n",
" \"\"\"\n",
" use_cuda = no_cuda and torch.cuda.is_available()\n",
"\n",
" torch.manual_seed(seed)\n",
"\n",
" device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
"\n",
" train_kwargs = {'batch_size': batch_size}\n",
" test_kwargs = {'batch_size': test_batch_size}\n",
" if use_cuda:\n",
" cuda_kwargs = {'num_workers': 1,\n",
" 'pin_memory': True,\n",
" 'shuffle': True}\n",
" train_kwargs.update(cuda_kwargs)\n",
" test_kwargs.update(cuda_kwargs)\n",
"\n",
" transform=transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,))\n",
" ])\n",
" dataset1 = datasets.MNIST('../data', train=True, download=True,\n",
" transform=transform)\n",
" dataset2 = datasets.MNIST('../data', train=False,\n",
" transform=transform)\n",
" train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)\n",
" test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)\n",
"\n",
" model = Net().to(device)\n",
" optimizer = optim.Adadelta(model.parameters(), lr=lr)\n",
"\n",
" scheduler = StepLR(optimizer, step_size=1, gamma=gamma)\n",
" for epoch in range(1, epochs + 1):\n",
" train(model, device, train_loader, optimizer, epoch, log_interval, dry_run)\n",
" test(model, device, test_loader)\n",
" scheduler.step()\n",
"\n",
" if save_model:\n",
" torch.save(model.state_dict(), \"mnist_cnn.pt\")"
]
},
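{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal smoke test of the nn.Module pattern described above (a sketch,\n",
"# not part of the original pipeline): instantiate Net and push a dummy batch\n",
"# of MNIST-shaped inputs (1 channel, 28x28) through the forward pass.\n",
"net = Net()\n",
"dummy = torch.zeros(2, 1, 28, 28)\n",
"print(net(dummy).shape)  # expected: torch.Size([2, 10])\n",
"\n",
"# The full MNIST training loop can then be launched via run(); dry_run=True\n",
"# performs a single quick pass, e.g.:\n",
"# run(epochs=1, dry_run=True)"
]
},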
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"FEATURES = 1000\n",
"class NeuralNetworkModel(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(FEATURES,500)\n",
" self.fc2 = torch.nn.Linear(500,1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"nn_model = NeuralNetworkModel()"
]
},
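{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Smoke test (a sketch): the model should map a batch of FEATURES-dimensional\n",
"# vectors to probabilities in (0, 1), thanks to the final sigmoid.\n",
"print(nn_model(torch.zeros(2, FEATURES)))"
]
},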
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"BATCH_SIZE = 5"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"criterion = torch.nn.BCELoss()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" model.eval()\n",
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
" X = np.array(X_dataset[i:i+BATCH_SIZE])\n",
" X = torch.tensor(X)\n",
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" loss = criterion(Y_predictions, Y)\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
" return (loss_score / items_total), (acc_score / items_total)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"for epoch in range(5):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" nn_model.train()\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" X = np.array(X_train_d2v[i:i+BATCH_SIZE])\n",
" X = torch.tensor(X)\n",
" Y = y_train[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = nn_model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions, Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
"\n",
" display(epoch)\n",
" display(get_loss_acc(nn_model, X_train_d2v, y_train))\n",
" display(get_loss_acc(nn_model, X_dev0_d2v, y_expected_dev0))"
]
},
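{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Write out predictions for dev-0 and test-A (a minimal sketch; the out.tsv\n",
"# file names follow the usual convention of this challenge's directory layout\n",
"# and are an assumption, not something stated in the notebook).\n",
"nn_model.eval()\n",
"with torch.no_grad():\n",
"    for X_d2v, path in [(X_dev0_d2v, 'dev-0/out.tsv'), (X_test_d2v, 'test-A/out.tsv')]:\n",
"        X = torch.tensor(np.array(X_d2v))\n",
"        preds = (nn_model(X) > 0.5).int().reshape(-1).tolist()\n",
"        with open(path, 'w') as f:\n",
"            f.writelines(f'{p}\\n' for p in preds)"
]
}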
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}