Compare commits

..

2 Commits

Author SHA1 Message Date
253e75b68b first solution 2022-05-25 22:54:49 +02:00
dc2cadc034 naive bayes 2022-05-08 15:06:12 +02:00
16 changed files with 22223 additions and 0 deletions

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,19 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="W29" />
<option value="E501" />
<option value="W29" />
<option value="E501" />
<option value="W29" />
<option value="E501" />
<option value="W29" />
<option value="E501" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" filepath="$PROJECT_DIR$/.idea/paranormal-or-skeptic-ISI-public.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -0,0 +1,455 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import lzma\n",
"import gensim.models\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torchvision import datasets, transforms\n",
"from torch.optim.lr_scheduler import StepLR"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = [line.split() for line in X_train]\n",
"X_dev0 = [line.split() for line in X_dev0]\n",
"X_test = [line.split() for line in X_test]\n",
"\n",
"def tagged_document(list_of_list_of_words):\n",
" for i, list_of_words in enumerate(list_of_list_of_words):\n",
" yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])\n",
"\n",
"data_training = list(tagged_document(X_train))\n",
"model = gensim.models.doc2vec.Doc2Vec(vector_size=1000)\n",
"model.build_vocab(data_training)\n",
"\n",
"X_train_d2v = [model.infer_vector(line) for line in X_train]\n",
"X_dev0_d2v = [model.infer_vector(line) for line in X_dev0]\n",
"X_test_d2v = [model.infer_vector(line) for line in X_test]\n",
"\n",
"y_train = np.array([int(i) for i in y_train])\n",
"y_expected_dev0 = np.array([int(i) for i in y_expected_dev0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Net(nn.Module):\n",
" \"\"\"W PyTorchu tworzenie sieci neuronowej\n",
" polega na zdefiniowaniu klasy, która dziedziczy z nn.Module.\n",
" \"\"\"\n",
" \n",
" def __init__(self):\n",
" super().__init__()\n",
" \n",
" # Warstwy splotowe\n",
" self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
" self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
" \n",
" # Warstwy dropout\n",
" self.dropout1 = nn.Dropout(0.25)\n",
" self.dropout2 = nn.Dropout(0.5)\n",
" \n",
" # Warstwy liniowe\n",
" self.fc1 = nn.Linear(9216, 128)\n",
" self.fc2 = nn.Linear(128, 10)\n",
"\n",
" def forward(self, x):\n",
" \"\"\"Definiujemy przechodzenie \"do przodu\" jako kolejne przekształcenia wejścia x\"\"\"\n",
" x = self.conv1(x)\n",
" x = F.relu(x)\n",
" x = self.conv2(x)\n",
" x = F.relu(x)\n",
" x = F.max_pool2d(x, 2)\n",
" x = self.dropout1(x)\n",
" x = torch.flatten(x, 1)\n",
" x = self.fc1(x)\n",
" x = F.relu(x)\n",
" x = self.dropout2(x)\n",
" x = self.fc2(x)\n",
" output = F.log_softmax(x, dim=1)\n",
" return output\n",
"\n",
"\n",
"def train(model, device, train_loader, optimizer, epoch, log_interval, dry_run):\n",
" \"\"\"Uczenie modelu\"\"\"\n",
" model.train()\n",
" for batch_idx, (data, target) in enumerate(train_loader):\n",
" data, target = data.to(device), target.to(device) # wrzucenie danych na kartę graficzną (jeśli dotyczy)\n",
" optimizer.zero_grad() # wyzerowanie gradientu\n",
" output = model(data) # przejście \"do przodu\"\n",
" loss = F.nll_loss(output, target) # obliczenie funkcji kosztu\n",
" loss.backward() # propagacja wsteczna\n",
" optimizer.step() # krok optymalizatora\n",
" if batch_idx % log_interval == 0:\n",
" print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
" epoch, batch_idx * len(data), len(train_loader.dataset),\n",
" 100. * batch_idx / len(train_loader), loss.item()))\n",
" if dry_run:\n",
" break\n",
"\n",
"\n",
"def test(model, device, test_loader):\n",
" \"\"\"Testowanie modelu\"\"\"\n",
" model.eval()\n",
" test_loss = 0\n",
" correct = 0\n",
" with torch.no_grad():\n",
" for data, target in test_loader:\n",
" data, target = data.to(device), target.to(device) # wrzucenie danych na kartę graficzną (jeśli dotyczy)\n",
" output = model(data) # przejście \"do przodu\"\n",
" test_loss += F.nll_loss(output, target, reduction='sum').item() # suma kosztów z każdego batcha\n",
" pred = output.argmax(dim=1, keepdim=True) # predykcja na podstawie maks. logarytmu prawdopodobieństwa\n",
" correct += pred.eq(target.view_as(pred)).sum().item()\n",
"\n",
" test_loss /= len(test_loader.dataset) # obliczenie kosztu na zbiorze testowym\n",
"\n",
" print('\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
" test_loss, correct, len(test_loader.dataset),\n",
" 100. * correct / len(test_loader.dataset)))\n",
"\n",
"\n",
"def run(\n",
" batch_size=64,\n",
" test_batch_size=1000,\n",
" epochs=14,\n",
" lr=1.0,\n",
" gamma=0.7,\n",
" no_cuda=False,\n",
" dry_run=False,\n",
" seed=1,\n",
" log_interval=10,\n",
" save_model=False,\n",
" ):\n",
" \"\"\"Main training function.\n",
" \n",
" Arguments:\n",
" batch_size - wielkość batcha podczas uczenia (default: 64),\n",
" test_batch_size - wielkość batcha podczas testowania (default: 1000)\n",
" epochs - liczba epok uczenia (default: 14)\n",
" lr - współczynnik uczenia (learning rate) (default: 1.0)\n",
" gamma - współczynnik gamma (dla optymalizatora) (default: 0.7)\n",
" no_cuda - wyłącza uczenie na karcie graficznej (default: False)\n",
" dry_run - szybko (\"na sucho\") sprawdza pojedyncze przejście (default: False)\n",
" seed - ziarno generatora liczb pseudolosowych (default: 1)\n",
" log_interval - interwał logowania stanu uczenia (default: 10)\n",
" save_model - zapisuje bieżący model (default: False)\n",
" \"\"\"\n",
" use_cuda = no_cuda and torch.cuda.is_available()\n",
"\n",
" torch.manual_seed(seed)\n",
"\n",
" device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
"\n",
" train_kwargs = {'batch_size': batch_size}\n",
" test_kwargs = {'batch_size': test_batch_size}\n",
" if use_cuda:\n",
" cuda_kwargs = {'num_workers': 1,\n",
" 'pin_memory': True,\n",
" 'shuffle': True}\n",
" train_kwargs.update(cuda_kwargs)\n",
" test_kwargs.update(cuda_kwargs)\n",
"\n",
" transform=transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,))\n",
" ])\n",
" dataset1 = datasets.MNIST('../data', train=True, download=True,\n",
" transform=transform)\n",
" dataset2 = datasets.MNIST('../data', train=False,\n",
" transform=transform)\n",
" train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)\n",
" test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)\n",
"\n",
" model = Net().to(device)\n",
" optimizer = optim.Adadelta(model.parameters(), lr=lr)\n",
"\n",
" scheduler = StepLR(optimizer, step_size=1, gamma=gamma)\n",
" for epoch in range(1, epochs + 1):\n",
" train(model, device, train_loader, optimizer, epoch, log_interval, dry_run)\n",
" test(model, device, test_loader)\n",
" scheduler.step()\n",
"\n",
" if save_model:\n",
" torch.save(model.state_dict(), \"mnist_cnn.pt\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.003825023"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"FEATURES = 1000\n",
"class NeuralNetworkModel(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(FEATURES,500)\n",
" self.fc2 = torch.nn.Linear(500,1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"nn_model = NeuralNetworkModel()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"BATCH_SIZE = 5"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"criterion = torch.nn.BCELoss()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" model.eval()\n",
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
" X = np.array(X_dataset[i:i+BATCH_SIZE])\n",
" X = torch.tensor(X)\n",
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" loss = criterion(Y_predictions, Y)\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
" return (loss_score / items_total), (acc_score / items_total)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"for epoch in range(5):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" nn_model.train()\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" X = np.array(X_train_d2v[i:i+BATCH_SIZE])\n",
" X = torch.tensor(X)\n",
" Y = y_train[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = nn_model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions, Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
"\n",
" display(epoch)\n",
" display(get_loss_acc(nn_model, X_train_d2v, y_train))\n",
" display(get_loss_acc(nn_model, X_dev0_d2v, y_expected_dev0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5272
dev-0/outNB.tsv Normal file

File diff suppressed because it is too large Load Diff

588
run.ipynb Normal file
View File

@ -0,0 +1,588 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import lzma\n",
"from gensim.models import Word2Vec\n",
"import gensim.downloader\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = np.array(open('train/expected.tsv').readlines())\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = np.array(open(\"dev-0/expected.tsv\", \"r\").readlines())\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"X_train = [line.split() for line in X_train]\n",
"X_dev0 = [line.split() for line in X_dev0]\n",
"X_test = [line.split() for line in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"model_w2v = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"def vectorize(model, data):\n",
" return np.array([np.mean([model.wv[word] if word in model.wv.key_to_index else np.zeros(100, dtype=float) for word in doc], axis=0) for doc in data])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"X_train_w2v = vectorize(model_w2v, X_train)\n",
"X_dev0_w2v = vectorize(model_w2v, X_dev0)\n",
"X_test_w2v = vectorize(model_w2v, X_test)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"FEATURES = 100\n",
"\n",
"class NeuralNetworkModel(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(FEATURES,500)\n",
" self.fc2 = torch.nn.Linear(500,1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"nn_model = NeuralNetworkModel()"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"BATCH_SIZE = 42"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"criterion = torch.nn.BCELoss()"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" model.eval()\n",
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
" X = np.array(X_dataset[i:i+BATCH_SIZE]).astype(np.float32)\n",
" X = torch.tensor(X)\n",
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" loss = criterion(Y_predictions, Y)\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
" return (loss_score / items_total), (acc_score / items_total)"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"def predict(model, data):\n",
" model.eval()\n",
" predictions = []\n",
" for x in data:\n",
" X = torch.tensor(np.array(x).astype(np.float32))\n",
" Y_predictions = model(X)\n",
" if Y_predictions[0] > 0.5:\n",
" predictions.append(\"1\")\n",
" else:\n",
" predictions.append(\"0\")\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49161445487174543, 0.7499197110287693)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4990149180719994, 0.7420333839150227)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.486242138754709, 0.7533833599812141)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4960476360955079, 0.7448786039453718)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.48170865143118824, 0.7566018254086104)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49339661830880754, 0.7448786039453718)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"3"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.47863767532834156, 0.7587877573995352)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49210414077877457, 0.7503793626707133)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"4"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4755889592268004, 0.7613466446116604)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49055553189223017, 0.753793626707132)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.47395927866325194, 0.7623273787118541)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4905445413022374, 0.7541729893778453)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"6"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4721670034531442, 0.7639055318237855)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4896522785377249, 0.7522761760242792)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"7"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4713666787153674, 0.7644166186083936)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4897225151384003, 0.7532245827010622)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"8"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4687599671611641, 0.7661674361745845)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4882916720620779, 0.7524658573596358)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"9"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4669961705231401, 0.767617817590364)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.48753329053272426, 0.7534142640364189)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for epoch in range(10):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" nn_model.train()\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" X = X_train_w2v[i:i+BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" Y = y_train[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = nn_model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions, Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
"\n",
" display(epoch)\n",
" display(get_loss_acc(nn_model, X_train_w2v, y_train))\n",
" display(get_loss_acc(nn_model, X_dev0_w2v, y_expected_dev0))"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"y_pred_dev0 = predict(nn_model, X_dev0_w2v)\n",
"y_pred_test = predict(nn_model, X_test_w2v)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"open('dev-0/out.tsv', 'w').writelines([i+'\\n' for i in y_pred_dev0])"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"open('test-A/out.tsv', 'w').writelines([i+'\\n' for i in y_pred_test])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

114
run.py Executable file
View File

@ -0,0 +1,114 @@
#!/usr/bin/env python
# coding: utf-8
import lzma
from gensim.models import Word2Vec
import gensim.downloader
import numpy as np
import pandas as pd
import torch
# Load and whitespace-tokenize the three input splits.  Every file is opened
# in a ``with`` block so its handle is closed as soon as it has been read.
with lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_train = [line.split() for line in f]
with lzma.open("dev-0/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_dev0 = [line.split() for line in f]
with lzma.open("test-A/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_test = [line.split() for line in f]

# Labels are kept as the raw text lines (trailing newline included); the
# training/evaluation code converts them with ``astype(np.float32)``.
with open('train/expected.tsv') as f:
    y_train = np.array(f.readlines())
with open("dev-0/expected.tsv", "r") as f:
    y_expected_dev0 = np.array(f.readlines())

# Fit word embeddings on the training documents only.
model_w2v = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)
def vectorize(model, data):
    """Embed every tokenized document as the mean of its word vectors.

    Args:
        model: Embedding model exposing ``wv``; ``wv[word]`` must return the
            vector of a known word, ``wv.key_to_index`` must support ``in``
            and ``wv.vector_size`` must give the embedding width.
        data: Iterable of documents, each an iterable of string tokens.

    Returns:
        ``np.ndarray`` with one row per document.  Tokens missing from the
        vocabulary contribute a zero vector; a document without any token
        becomes an all-zero row instead of NaN.
    """
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index
    # Read the width from the model rather than assuming 100 dimensions.
    dim = keyed_vectors.vector_size
    unknown = np.zeros(dim, dtype=float)

    def embed(doc):
        vectors = [
            keyed_vectors[word] if word in vocabulary else unknown
            for word in doc
        ]
        if not vectors:
            # np.mean of an empty list is NaN and would break the row shape.
            return np.zeros(dim, dtype=float)
        return np.mean(vectors, axis=0)

    return np.array([embed(doc) for doc in data])
# Build one document-embedding matrix per split (train, dev-0, test-A),
# in that order, using the Word2Vec model fitted above.
X_train_w2v, X_dev0_w2v, X_test_w2v = (
    vectorize(model_w2v, documents)
    for documents in (X_train, X_dev0, X_test)
)
# Width of one input vector; equals the vector_size given to Word2Vec above.
FEATURES = 100


class NeuralNetworkModel(torch.nn.Module):
    """Binary classifier: one hidden ReLU layer followed by a sigmoid output.

    Args:
        in_features: Size of each input vector (defaults to ``FEATURES``).
        hidden_size: Number of units in the hidden layer (defaults to 500).
    """

    def __init__(self, in_features=FEATURES, hidden_size=500):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return a value in (0, 1) for every input vector in ``x``."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
# Training setup: mini-batch size, the network, binary cross-entropy on the
# sigmoid outputs, and plain SGD over the network's parameters.
BATCH_SIZE = 42
nn_model = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(params=nn_model.parameters(), lr=0.1)
def get_loss_acc(model, X_dataset, Y_dataset, batch_size=None, loss_fn=None):
    """Return the mean loss and the accuracy of ``model`` over a dataset.

    Args:
        model: Module mapping a float32 batch to one probability per row.
        X_dataset: Sliceable collection of feature vectors.
        Y_dataset: NumPy array of labels convertible with
            ``astype(np.float32)``; its ``shape[0]`` sets the sample count.
        batch_size: Rows per forward pass; defaults to the module-level
            ``BATCH_SIZE``.
        loss_fn: Callable ``(predictions, targets) -> scalar tensor``;
            defaults to the module-level ``criterion``.

    Returns:
        Tuple ``(mean_loss, accuracy)`` where predictions above 0.5 count as
        the positive class.

    Raises:
        ZeroDivisionError: If ``Y_dataset`` is empty.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if loss_fn is None:
        loss_fn = criterion

    loss_total = 0.0
    correct = 0
    items_total = 0
    model.eval()
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for start in range(0, Y_dataset.shape[0], batch_size):
            stop = start + batch_size
            X = torch.tensor(np.array(X_dataset[start:stop]).astype(np.float32))
            Y = torch.tensor(
                Y_dataset[start:stop].astype(np.float32)
            ).reshape(-1, 1)
            Y_predictions = model(X)
            correct += torch.sum((Y_predictions > 0.5) == Y).item()
            batch_items = Y.shape[0]
            items_total += batch_items
            # The loss is a per-batch mean, so weight it by the batch size
            # before averaging over the whole dataset.
            loss_total += loss_fn(Y_predictions, Y).item() * batch_items
    return (loss_total / items_total), (correct / items_total)
def predict(model, data):
    """Classify every feature vector in ``data`` as ``"1"`` or ``"0"``.

    Args:
        model: Module whose output's first element is the probability of the
            positive class for a single input vector.
        data: Iterable of feature vectors convertible to a float32 array.

    Returns:
        List of strings, ``"1"`` where the probability exceeds 0.5 and
        ``"0"`` otherwise, in input order.
    """
    model.eval()
    predictions = []
    # Inference only: disable autograd so no graph is recorded per sample.
    with torch.no_grad():
        for x in data:
            X = torch.tensor(np.array(x).astype(np.float32))
            Y_predictions = model(X)
            predictions.append("1" if Y_predictions[0] > 0.5 else "0")
    return predictions
# Train for 10 epochs of mini-batch SGD, reporting loss/accuracy on the
# training and dev-0 splits after every epoch.
for epoch in range(10):
    nn_model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        # Cast to float32 like get_loss_acc does, so the batch dtype always
        # matches the model parameters.
        X = torch.tensor(np.asarray(X_train_w2v[i:i+BATCH_SIZE]).astype(np.float32))
        Y = torch.tensor(y_train[i:i+BATCH_SIZE].astype(np.float32)).reshape(-1, 1)
        Y_predictions = nn_model(X)

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

    # ``display`` only exists inside IPython; ``print`` also works when this
    # file is executed as a plain script.
    print(epoch)
    print(get_loss_acc(nn_model, X_train_w2v, y_train))
    print(get_loss_acc(nn_model, X_dev0_w2v, y_expected_dev0))
# Predict labels for the dev and test splits and write one label per line.
y_pred_dev0 = predict(nn_model, X_dev0_w2v)
y_pred_test = predict(nn_model, X_test_w2v)

# ``with`` guarantees the output is flushed and the handle closed.
with open('dev-0/out.tsv', 'w') as out_file:
    out_file.writelines(label + '\n' for label in y_pred_dev0)
with open('test-A/out.tsv', 'w') as out_file:
    out_file.writelines(label + '\n' for label in y_pred_test)

135
runNB.ipynb Normal file
View File

@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import lzma\n",
"\n",
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"count_vect = CountVectorizer()\n",
"X_train_counts = count_vect.fit_transform(X_train)\n",
"X_dev0_counts = count_vect.transform(X_dev0)\n",
"X_test_counts = count_vect.transform(X_test)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"clf = MultinomialNB().fit(X_train_counts, y_train)\n",
"\n",
"y_predicted_dev0_MNB = clf.predict(X_dev0_counts)\n",
"y_predicted_test_MNB = clf.predict(X_test_counts)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy dev0: 0.8025417298937785\n"
]
}
],
"source": [
"accuracy_dev0_MNB = accuracy_score(y_expected_dev0, y_predicted_dev0_MNB)\n",
"print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"open(\"dev-0/out.tsv\", mode='w').writelines(y_predicted_dev0_MNB)\n",
"open(\"test-A/out.tsv\", mode='w').writelines(y_predicted_test_MNB)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

24
runNB.py Normal file
View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding: utf-8
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma
# Read the raw text lines of every split.  Each file is opened in a ``with``
# block so its handle is closed once the lines are in memory.
with lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_train = f.readlines()
# Labels keep their trailing newline; they are used as-is as class labels.
with open('train/expected.tsv') as f:
    y_train = f.readlines()
with lzma.open("dev-0/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_dev0 = f.readlines()
with open("dev-0/expected.tsv", "r") as f:
    y_expected_dev0 = f.readlines()
with lzma.open("test-A/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_test = f.readlines()
# Bag-of-words features: the vocabulary is learned from the training split
# and then reused unchanged for the dev and test splits.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_dev0_counts, X_test_counts = (
    count_vect.transform(lines) for lines in (X_dev0, X_test)
)

# Multinomial naive Bayes trained on the count features.
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

y_predicted_dev0_MNB, y_predicted_test_MNB = (
    clf.predict(counts) for counts in (X_dev0_counts, X_test_counts)
)
# Write the predictions out.  No separator is added here: the training labels
# were read with their trailing newline, so the predicted labels are presumably
# newline-terminated already.  ``with`` ensures each file is flushed and closed.
with open("dev-0/out.tsv", mode='w') as out_file:
    out_file.writelines(y_predicted_dev0_MNB)
with open("test-A/out.tsv", mode='w') as out_file:
    out_file.writelines(y_predicted_test_MNB)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5152
test-A/outNB.tsv Normal file

File diff suppressed because it is too large Load Diff