first solution

domzal 2022-05-25 22:54:49 +02:00
parent dc2cadc034
commit 253e75b68b
9 changed files with 13205 additions and 1624 deletions


@ -0,0 +1,455 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import lzma\n",
"import gensim.models\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torchvision import datasets, transforms\n",
"from torch.optim.lr_scheduler import StepLR"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = [line.split() for line in X_train]\n",
"X_dev0 = [line.split() for line in X_dev0]\n",
"X_test = [line.split() for line in X_test]\n",
"\n",
"def tagged_document(list_of_list_of_words):\n",
" for i, list_of_words in enumerate(list_of_list_of_words):\n",
" yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])\n",
"\n",
"data_training = list(tagged_document(X_train))\n",
"model = gensim.models.doc2vec.Doc2Vec(vector_size=1000)\n",
"model.build_vocab(data_training)\n",
"\n",
"X_train_d2v = [model.infer_vector(line) for line in X_train]\n",
"X_dev0_d2v = [model.infer_vector(line) for line in X_dev0]\n",
"X_test_d2v = [model.infer_vector(line) for line in X_test]\n",
"\n",
"y_train = np.array([int(i) for i in y_train])\n",
"y_expected_dev0 = np.array([int(i) for i in y_expected_dev0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Net(nn.Module):\n",
" \"\"\"W PyTorchu tworzenie sieci neuronowej\n",
" polega na zdefiniowaniu klasy, która dziedziczy z nn.Module.\n",
" \"\"\"\n",
" \n",
" def __init__(self):\n",
" super().__init__()\n",
" \n",
" # Warstwy splotowe\n",
" self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
" self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
" \n",
" # Warstwy dropout\n",
" self.dropout1 = nn.Dropout(0.25)\n",
" self.dropout2 = nn.Dropout(0.5)\n",
" \n",
" # Warstwy liniowe\n",
" self.fc1 = nn.Linear(9216, 128)\n",
" self.fc2 = nn.Linear(128, 10)\n",
"\n",
" def forward(self, x):\n",
" \"\"\"Definiujemy przechodzenie \"do przodu\" jako kolejne przekształcenia wejścia x\"\"\"\n",
" x = self.conv1(x)\n",
" x = F.relu(x)\n",
" x = self.conv2(x)\n",
" x = F.relu(x)\n",
" x = F.max_pool2d(x, 2)\n",
" x = self.dropout1(x)\n",
" x = torch.flatten(x, 1)\n",
" x = self.fc1(x)\n",
" x = F.relu(x)\n",
" x = self.dropout2(x)\n",
" x = self.fc2(x)\n",
" output = F.log_softmax(x, dim=1)\n",
" return output\n",
"\n",
"\n",
"def train(model, device, train_loader, optimizer, epoch, log_interval, dry_run):\n",
" \"\"\"Uczenie modelu\"\"\"\n",
" model.train()\n",
" for batch_idx, (data, target) in enumerate(train_loader):\n",
" data, target = data.to(device), target.to(device) # wrzucenie danych na kartę graficzną (jeśli dotyczy)\n",
" optimizer.zero_grad() # wyzerowanie gradientu\n",
" output = model(data) # przejście \"do przodu\"\n",
" loss = F.nll_loss(output, target) # obliczenie funkcji kosztu\n",
" loss.backward() # propagacja wsteczna\n",
" optimizer.step() # krok optymalizatora\n",
" if batch_idx % log_interval == 0:\n",
" print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
" epoch, batch_idx * len(data), len(train_loader.dataset),\n",
" 100. * batch_idx / len(train_loader), loss.item()))\n",
" if dry_run:\n",
" break\n",
"\n",
"\n",
"def test(model, device, test_loader):\n",
" \"\"\"Testowanie modelu\"\"\"\n",
" model.eval()\n",
" test_loss = 0\n",
" correct = 0\n",
" with torch.no_grad():\n",
" for data, target in test_loader:\n",
" data, target = data.to(device), target.to(device) # wrzucenie danych na kartę graficzną (jeśli dotyczy)\n",
" output = model(data) # przejście \"do przodu\"\n",
" test_loss += F.nll_loss(output, target, reduction='sum').item() # suma kosztów z każdego batcha\n",
" pred = output.argmax(dim=1, keepdim=True) # predykcja na podstawie maks. logarytmu prawdopodobieństwa\n",
" correct += pred.eq(target.view_as(pred)).sum().item()\n",
"\n",
" test_loss /= len(test_loader.dataset) # obliczenie kosztu na zbiorze testowym\n",
"\n",
" print('\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
" test_loss, correct, len(test_loader.dataset),\n",
" 100. * correct / len(test_loader.dataset)))\n",
"\n",
"\n",
"def run(\n",
" batch_size=64,\n",
" test_batch_size=1000,\n",
" epochs=14,\n",
" lr=1.0,\n",
" gamma=0.7,\n",
" no_cuda=False,\n",
" dry_run=False,\n",
" seed=1,\n",
" log_interval=10,\n",
" save_model=False,\n",
" ):\n",
" \"\"\"Main training function.\n",
" \n",
" Arguments:\n",
" batch_size - wielkość batcha podczas uczenia (default: 64),\n",
" test_batch_size - wielkość batcha podczas testowania (default: 1000)\n",
" epochs - liczba epok uczenia (default: 14)\n",
" lr - współczynnik uczenia (learning rate) (default: 1.0)\n",
" gamma - współczynnik gamma (dla optymalizatora) (default: 0.7)\n",
" no_cuda - wyłącza uczenie na karcie graficznej (default: False)\n",
" dry_run - szybko (\"na sucho\") sprawdza pojedyncze przejście (default: False)\n",
" seed - ziarno generatora liczb pseudolosowych (default: 1)\n",
" log_interval - interwał logowania stanu uczenia (default: 10)\n",
" save_model - zapisuje bieżący model (default: False)\n",
" \"\"\"\n",
" use_cuda = no_cuda and torch.cuda.is_available()\n",
"\n",
" torch.manual_seed(seed)\n",
"\n",
" device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
"\n",
" train_kwargs = {'batch_size': batch_size}\n",
" test_kwargs = {'batch_size': test_batch_size}\n",
" if use_cuda:\n",
" cuda_kwargs = {'num_workers': 1,\n",
" 'pin_memory': True,\n",
" 'shuffle': True}\n",
" train_kwargs.update(cuda_kwargs)\n",
" test_kwargs.update(cuda_kwargs)\n",
"\n",
" transform=transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,))\n",
" ])\n",
" dataset1 = datasets.MNIST('../data', train=True, download=True,\n",
" transform=transform)\n",
" dataset2 = datasets.MNIST('../data', train=False,\n",
" transform=transform)\n",
" train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)\n",
" test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)\n",
"\n",
" model = Net().to(device)\n",
" optimizer = optim.Adadelta(model.parameters(), lr=lr)\n",
"\n",
" scheduler = StepLR(optimizer, step_size=1, gamma=gamma)\n",
" for epoch in range(1, epochs + 1):\n",
" train(model, device, train_loader, optimizer, epoch, log_interval, dry_run)\n",
" test(model, device, test_loader)\n",
" scheduler.step()\n",
"\n",
" if save_model:\n",
" torch.save(model.state_dict(), \"mnist_cnn.pt\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.003825023"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"FEATURES = 1000\n",
"class NeuralNetworkModel(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(FEATURES,500)\n",
" self.fc2 = torch.nn.Linear(500,1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"nn_model = NeuralNetworkModel()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"BATCH_SIZE = 5"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"criterion = torch.nn.BCELoss()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" model.eval()\n",
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
" X = np.array(X_dataset[i:i+BATCH_SIZE])\n",
" X = torch.tensor(X)\n",
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" loss = criterion(Y_predictions, Y)\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
" return (loss_score / items_total), (acc_score / items_total)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"for epoch in range(5):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" nn_model.train()\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" X = np.array(X_train_d2v[i:i+BATCH_SIZE])\n",
" X = torch.tensor(X)\n",
" Y = y_train[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = nn_model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions, Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
"\n",
" display(epoch)\n",
" display(get_loss_acc(nn_model, X_train_d2v, y_train))\n",
" display(get_loss_acc(nn_model, X_dev0_d2v, y_expected_dev0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because it is too large

dev-0/outNB.tsv (new file, 5272 lines)

File diff suppressed because it is too large

run.ipynb (601 lines changed)

@ -2,134 +2,587 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 27,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"#!/usr/bin/env python\n", "#!/usr/bin/env python\n",
"# coding: utf-8\n", "# coding: utf-8\n",
"\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import lzma\n", "import lzma\n",
"\n", "from gensim.models import Word2Vec\n",
"import gensim.downloader\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n", "X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n", "y_train = np.array(open('train/expected.tsv').readlines())\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n", "X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n", "y_expected_dev0 = np.array(open(\"dev-0/expected.tsv\", \"r\").readlines())\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()" "X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
], ]
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 3,
"metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"count_vect = CountVectorizer()\n", "X_train = [line.split() for line in X_train]\n",
"X_train_counts = count_vect.fit_transform(X_train)\n", "X_dev0 = [line.split() for line in X_dev0]\n",
"X_dev0_counts = count_vect.transform(X_dev0)\n", "X_test = [line.split() for line in X_test]"
"X_test_counts = count_vect.transform(X_test)" ]
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 62,
"metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"clf = MultinomialNB().fit(X_train_counts, y_train)\n", "model_w2v = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"def vectorize(model, data):\n",
" return np.array([np.mean([model.wv[word] if word in model.wv.key_to_index else np.zeros(100, dtype=float) for word in doc], axis=0) for doc in data])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"X_train_w2v = vectorize(model_w2v, X_train)\n",
"X_dev0_w2v = vectorize(model_w2v, X_dev0)\n",
"X_test_w2v = vectorize(model_w2v, X_test)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"FEATURES = 100\n",
"\n", "\n",
"y_predicted_dev0_MNB = clf.predict(X_dev0_counts)\n", "class NeuralNetworkModel(torch.nn.Module):\n",
"y_predicted_test_MNB = clf.predict(X_test_counts)" "\n",
], " def __init__(self):\n",
"metadata": { " super(NeuralNetworkModel, self).__init__()\n",
"collapsed": false, " self.fc1 = torch.nn.Linear(FEATURES,500)\n",
"pycharm": { " self.fc2 = torch.nn.Linear(500,1)\n",
"name": "#%%\n" "\n",
} " def forward(self, x):\n",
} " x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 145,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"nn_model = NeuralNetworkModel()"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"BATCH_SIZE = 42"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"criterion = torch.nn.BCELoss()"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" model.eval()\n",
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
" X = np.array(X_dataset[i:i+BATCH_SIZE]).astype(np.float32)\n",
" X = torch.tensor(X)\n",
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" loss = criterion(Y_predictions, Y)\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
" return (loss_score / items_total), (acc_score / items_total)"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"def predict(model, data):\n",
" model.eval()\n",
" predictions = []\n",
" for x in data:\n",
" X = torch.tensor(np.array(x).astype(np.float32))\n",
" Y_predictions = model(X)\n",
" if Y_predictions[0] > 0.5:\n",
" predictions.append(\"1\")\n",
" else:\n",
" predictions.append(\"0\")\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
"outputs": [ "outputs": [
{ {
"name": "stdout", "data": {
"output_type": "stream", "text/plain": [
"text": [ "0"
"Accuracy dev0: 0.8025417298937785\n" ]
] },
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49161445487174543, 0.7499197110287693)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4990149180719994, 0.7420333839150227)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.486242138754709, 0.7533833599812141)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4960476360955079, 0.7448786039453718)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.48170865143118824, 0.7566018254086104)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49339661830880754, 0.7448786039453718)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"3"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.47863767532834156, 0.7587877573995352)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49210414077877457, 0.7503793626707133)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"4"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4755889592268004, 0.7613466446116604)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.49055553189223017, 0.753793626707132)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.47395927866325194, 0.7623273787118541)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4905445413022374, 0.7541729893778453)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"6"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4721670034531442, 0.7639055318237855)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4896522785377249, 0.7522761760242792)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"7"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4713666787153674, 0.7644166186083936)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4897225151384003, 0.7532245827010622)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"8"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4687599671611641, 0.7661674361745845)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4882916720620779, 0.7524658573596358)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"9"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.4669961705231401, 0.767617817590364)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(0.48753329053272426, 0.7534142640364189)"
]
},
"metadata": {},
"output_type": "display_data"
} }
], ],
"source": [ "source": [
"accuracy_dev0_MNB = accuracy_score(y_expected_dev0, y_predicted_dev0_MNB)\n", "for epoch in range(10):\n",
"print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n" " loss_score = 0\n",
], " acc_score = 0\n",
"metadata": { " items_total = 0\n",
"collapsed": false, " nn_model.train()\n",
"pycharm": { " for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
"name": "#%%\n" " X = X_train_w2v[i:i+BATCH_SIZE]\n",
} " X = torch.tensor(X)\n",
} " Y = y_train[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = nn_model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions, Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
"\n",
" display(epoch)\n",
" display(get_loss_acc(nn_model, X_train_w2v, y_train))\n",
" display(get_loss_acc(nn_model, X_dev0_w2v, y_expected_dev0))"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 152,
"outputs": [],
"source": [
"open(\"dev-0/out.tsv\", mode='w').writelines(y_predicted_dev0_MNB)\n",
"open(\"test-A/out.tsv\", mode='w').writelines(y_predicted_test_MNB)"
],
"metadata": { "metadata": {
"collapsed": false,
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
} }
} },
"outputs": [],
"source": [
"y_pred_dev0 = predict(nn_model, X_dev0_w2v)\n",
"y_pred_test = predict(nn_model, X_test_w2v)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"open('dev-0/out.tsv', 'w').writelines([i+'\\n' for i in y_pred_dev0])"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"open('test-A/out.tsv', 'w').writelines([i+'\\n' for i in y_pred_test])"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {},
"outputs": [], "outputs": [],
"source": [], "source": []
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
} }
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
"name": "ipython", "name": "ipython",
"version": 2 "version": 3
}, },
"file_extension": ".py", "file_extension": ".py",
"mimetype": "text/x-python", "mimetype": "text/x-python",
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython2", "pygments_lexer": "ipython3",
"version": "2.7.6" "version": "3.9.7"
} }
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 0 "nbformat_minor": 1
} }

run.py (118 lines, Normal file → Executable file)

@ -1,24 +1,114 @@
#!/usr/bin/env python
# coding: utf-8
import lzma

from gensim.models import Word2Vec
import gensim.downloader
import numpy as np
import pandas as pd
import torch

X_train = lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_train = np.array(open('train/expected.tsv').readlines())
X_dev0 = lzma.open("dev-0/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_expected_dev0 = np.array(open("dev-0/expected.tsv", "r").readlines())
X_test = lzma.open("test-A/in.tsv.xz", mode='rt', encoding='utf-8').readlines()

# Tokenize on whitespace
X_train = [line.split() for line in X_train]
X_dev0 = [line.split() for line in X_dev0]
X_test = [line.split() for line in X_test]

model_w2v = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)


def vectorize(model, data):
    # Mean-pool the word vectors of each document; out-of-vocabulary words contribute zero vectors
    return np.array([np.mean([model.wv[word] if word in model.wv.key_to_index else np.zeros(100, dtype=float) for word in doc], axis=0) for doc in data])


X_train_w2v = vectorize(model_w2v, X_train)
X_dev0_w2v = vectorize(model_w2v, X_dev0)
X_test_w2v = vectorize(model_w2v, X_test)

FEATURES = 100


class NeuralNetworkModel(torch.nn.Module):

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


nn_model = NeuralNetworkModel()
BATCH_SIZE = 42
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)


def get_loss_acc(model, X_dataset, Y_dataset):
    """Return (average BCE loss, accuracy) over a whole dataset, evaluated in batches."""
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    for i in range(0, Y_dataset.shape[0], BATCH_SIZE):
        X = np.array(X_dataset[i:i+BATCH_SIZE]).astype(np.float32)
        X = torch.tensor(X)
        Y = Y_dataset[i:i+BATCH_SIZE]
        Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
        Y_predictions = model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        loss = criterion(Y_predictions, Y)
        loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)


def predict(model, data):
    """Threshold the sigmoid output at 0.5 and emit '0'/'1' labels."""
    model.eval()
    predictions = []
    for x in data:
        X = torch.tensor(np.array(x).astype(np.float32))
        Y_predictions = model(X)
        if Y_predictions[0] > 0.5:
            predictions.append("1")
        else:
            predictions.append("0")
    return predictions


for epoch in range(10):
    loss_score = 0
    acc_score = 0
    items_total = 0
    nn_model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        X = X_train_w2v[i:i+BATCH_SIZE]
        X = torch.tensor(X)
        Y = y_train[i:i+BATCH_SIZE]
        Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
        Y_predictions = nn_model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    # display() exists only inside a notebook; print() works when this runs as a script
    print(epoch)
    print(get_loss_acc(nn_model, X_train_w2v, y_train))
    print(get_loss_acc(nn_model, X_dev0_w2v, y_expected_dev0))

y_pred_dev0 = predict(nn_model, X_dev0_w2v)
y_pred_test = predict(nn_model, X_test_w2v)

open('dev-0/out.tsv', 'w').writelines([i+'\n' for i in y_pred_dev0])
open('test-A/out.tsv', 'w').writelines([i+'\n' for i in y_pred_test])

runNB.ipynb (new file, 135 lines)

@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import lzma\n",
"\n",
"X_train = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train = open('train/expected.tsv').readlines()\n",
"X_dev0 = lzma.open(\"dev-0/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_expected_dev0 = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_test = lzma.open(\"test-A/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"count_vect = CountVectorizer()\n",
"X_train_counts = count_vect.fit_transform(X_train)\n",
"X_dev0_counts = count_vect.transform(X_dev0)\n",
"X_test_counts = count_vect.transform(X_test)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"clf = MultinomialNB().fit(X_train_counts, y_train)\n",
"\n",
"y_predicted_dev0_MNB = clf.predict(X_dev0_counts)\n",
"y_predicted_test_MNB = clf.predict(X_test_counts)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy dev0: 0.8025417298937785\n"
]
}
],
"source": [
"accuracy_dev0_MNB = accuracy_score(y_expected_dev0, y_predicted_dev0_MNB)\n",
"print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"open(\"dev-0/out.tsv\", mode='w').writelines(y_predicted_dev0_MNB)\n",
"open(\"test-A/out.tsv\", mode='w').writelines(y_predicted_test_MNB)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

runNB.py (new file, 24 lines)

@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding: utf-8
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma

X_train = lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_train = open('train/expected.tsv').readlines()
X_dev0 = lzma.open("dev-0/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_expected_dev0 = open("dev-0/expected.tsv", "r").readlines()
X_test = lzma.open("test-A/in.tsv.xz", mode='rt', encoding='utf-8').readlines()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_dev0_counts = count_vect.transform(X_dev0)
X_test_counts = count_vect.transform(X_test)

clf = MultinomialNB().fit(X_train_counts, y_train)
y_predicted_dev0_MNB = clf.predict(X_dev0_counts)
y_predicted_test_MNB = clf.predict(X_test_counts)

open("dev-0/out.tsv", mode='w').writelines(y_predicted_dev0_MNB)
open("test-A/out.tsv", mode='w').writelines(y_predicted_test_MNB)

File diff suppressed because it is too large

test-A/outNB.tsv (new file, 5152 lines)

File diff suppressed because it is too large