2021-05-20 20:51:25 +02:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": 9,
|
|
|
|
"metadata": {},
|
2021-05-20 20:51:25 +02:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2021-05-25 15:51:03 +02:00
|
|
|
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n",
|
|
|
|
"b'Skipping line 1983: expected 1 fields, saw 2\\nSkipping line 5199: expected 1 fields, saw 2\\n'\n"
|
2021-05-20 20:51:25 +02:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2021-05-25 15:51:03 +02:00
|
|
|
"from sklearn.naive_bayes import GaussianNB\n",
|
2021-05-20 20:51:25 +02:00
|
|
|
"import pandas as pd\n",
|
|
|
|
"import torch\n",
|
2021-05-25 15:51:03 +02:00
|
|
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
|
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
|
|
"import gensim\n",
|
|
|
|
"from gensim.models import KeyedVectors\n",
|
|
|
|
"import gensim.downloader\n",
|
2021-05-20 20:51:25 +02:00
|
|
|
"import nltk\n",
|
2021-05-25 15:51:03 +02:00
|
|
|
"import csv\n",
|
2021-05-20 20:51:25 +02:00
|
|
|
"\n",
|
2021-05-25 15:51:03 +02:00
|
|
|
"import numpy as np\n",
|
2021-05-20 20:51:25 +02:00
|
|
|
"\n",
|
2021-05-25 15:51:03 +02:00
|
|
|
"import numpy as np\n",
def tokenize_data(data):
    """Tokenize every document and keep only lower-cased alphabetic tokens.

    Args:
        data: Iterable of raw text documents (strings).

    Returns:
        A list with one entry per input document. Each entry is the list of
        that document's tokens for which ``str.isalpha()`` is true,
        lower-cased and in their original order.
    """
    # Filter into new lists rather than deleting while indexing: deleting an
    # element and then advancing the index skips the token that follows each
    # removed one, which lets punctuation/digits through and leaves some
    # words in their original case.
    return [
        [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]
        for text in data
    ]
|
2021-05-20 20:51:25 +02:00
|
|
|
"\n",
class NeuralNetwork(torch.nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    The input passes through a linear layer, a ReLU, a second linear layer
    and finally a sigmoid.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers.

        Args:
            input_size: Number of features in each input row.
            hidden_size: Number of units in the hidden layer.
            num_classes: Number of output units.
        """
        super().__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the sigmoid-activated output of the network for ``x``."""
        hidden = torch.relu(self.l1(x))
        return torch.sigmoid(self.l2(hidden))
# Locations of the training and development inputs.
r_in = './train/train.tsv'
r_ind_ev = './dev-0/in.tsv'

# Options shared by all three reads:
#   error_bad_lines=False  -> rows with an unexpected number of fields are
#                             skipped (see the "Skipping line ..." messages
#                             in the cell's stderr output)
#   quoting=csv.QUOTE_NONE -> quote characters are treated as ordinary text
#   header=None            -> no header row; columns are addressed by integer
tsv_options = dict(error_bad_lines=False, quoting=csv.QUOTE_NONE, header=None)

tsv_read = pd.read_table(r_in, sep='\t', **tsv_options)
tsv_read_dev = pd.read_table(r_ind_ev, sep='\t', **tsv_options)
# No explicit separator here: read_table falls back to its default.
tsv_read_test_in = pd.read_table('./test-A/in.tsv', **tsv_options)

# In the training file column 0 is the target and column 1 the text;
# the dev and test inputs are read from column 0.
y_train = tsv_read[0].values
X_train, X_dev, X_test = (
    tokenize_data(column.values)
    for column in (tsv_read[1], tsv_read_dev[0], tsv_read_test_in[0])
)
|
|
|
|
"\n",
|
|
|
|
"\n"
|
2021-05-20 20:51:25 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {},
|
2021-05-20 20:51:25 +02:00
|
|
|
"outputs": [],
|
2021-05-25 15:51:03 +02:00
|
|
|
"source": []
|
2021-05-20 20:51:25 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": 10,
|
|
|
|
"metadata": {},
|
2021-05-20 20:51:25 +02:00
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2021-05-25 15:51:03 +02:00
|
|
|
"\n",
# Length of the zero vector used for documents that contain no known word.
# It must equal the dimensionality of the loaded word vectors.
EMBEDDING_DIM = 100

# Bound to its own name so it is not confused with (or clobbered by) the
# neural network that a later cell stores in `model`.
word_vectors = KeyedVectors.load("./word2vec/word2vec_100_3_polish.bin")


def embed_documents(documents, vectors, dim=EMBEDDING_DIM):
    """Represent every tokenized document by the mean of its word vectors.

    Args:
        documents: Iterable of token lists.
        vectors: Object supporting ``token in vectors`` and ``vectors[token]``.
        dim: Length of the zero vector used when none of a document's tokens
            is present in ``vectors``.

    Returns:
        A float32 NumPy array with one row per document.
    """
    averaged = []
    for tokens in documents:
        known = [vectors[token] for token in tokens if token in vectors]
        averaged.append(np.mean(known, axis=0) if known else np.zeros(dim))
    return np.array(averaged, dtype=np.float32)


# The token lists (X_train / X_dev / X_test) are left untouched so this cell
# can be re-run without first re-running the tokenization cell.
x_train_vectors = embed_documents(X_train, word_vectors)
x_train_tensor = torch.tensor(x_train_vectors)
x_dev_vectors = embed_documents(X_dev, word_vectors)
x_test_vectors = embed_documents(X_test, word_vectors)
|
2021-05-20 20:51:25 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": []
|
2021-05-20 20:51:25 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": 11,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
2021-05-20 20:51:25 +02:00
|
|
|
"source": [
# Seed the RNG so weight initialisation, and with it the whole training run,
# is repeatable across kernel restarts.
SEED = 0
torch.manual_seed(SEED)

model = NeuralNetwork(100, 200, 1)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

batch_size = 12
num_epochs = 6

# Convert the whole training set to tensors once instead of once per batch.
train_features = torch.tensor(x_train_vectors.astype(np.float32))
# BCELoss needs float targets with the same shape as the (N, 1) predictions.
train_labels = torch.tensor(y_train.astype(np.float32)).reshape(-1, 1)

for epoch in range(num_epochs):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.train()
    for i in range(0, train_labels.shape[0], batch_size):
        X = train_features[i:i + batch_size]
        Y = train_labels[i:i + batch_size]

        Y_predictions = model(X)
        # Count predictions that fall on the correct side of the 0.5 threshold.
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch is averaged correctly.
        loss_score += loss.item() * Y.shape[0]

    # Report the metrics accumulated above; skip when there was no data to
    # avoid dividing by zero.
    if items_total:
        print(
            f"epoch {epoch + 1}/{num_epochs}: "
            f"loss={loss_score / items_total:.4f} "
            f"accuracy={acc_score / items_total:.4f}"
        )
|
|
|
|
" "
|
2021-05-20 20:51:25 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
2021-05-25 15:51:03 +02:00
|
|
|
"metadata": {},
|
2021-05-20 20:51:25 +02:00
|
|
|
"outputs": [],
|
2021-05-25 15:51:03 +02:00
|
|
|
"source": []
|
2021-05-20 20:51:25 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": 12,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
2021-05-20 20:51:25 +02:00
|
|
|
"source": [
|
|
|
|
"\n",
|
2021-05-25 15:51:03 +02:00
|
|
|
"\n",
# Run the network on the dev-0 vectors and store one 0/1 label per line.
dev_inputs = torch.tensor(x_dev_vectors.astype(np.float32))
dev_scores = model(dev_inputs).detach().cpu().numpy()
# Scores above 0.5 become 1, everything else 0.
predictions = np.asarray(dev_scores > 0.5, dtype=np.int32)
predictions.tofile('dev-0/out.tsv', sep='\n')
|
2021-05-20 20:51:25 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-05-25 15:51:03 +02:00
|
|
|
"execution_count": 13,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
2021-05-20 20:51:25 +02:00
|
|
|
"source": [
# Run the network on the test-A vectors and store one 0/1 label per line.
# The input must be x_test_vectors: feeding the dev vectors here would write
# dev-set predictions into the test-A output file.
test_inputs = torch.tensor(x_test_vectors.astype(np.float32))
# Inference only, so there is no need to record gradients.
with torch.no_grad():
    test_scores = model(test_inputs)
predictions = test_scores.cpu().numpy()
# Scores above 0.5 become 1, everything else 0.
predictions = (predictions > 0.5)
predictions = np.asarray(predictions, dtype=np.int32)
predictions.tofile('test-A/out.tsv', sep='\n')
|
2021-05-20 20:51:25 +02:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.8.5"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 1
|
2021-05-25 15:51:03 +02:00
|
|
|
}
|