nn
This commit is contained in:
parent
756ef4277a
commit
8a69cabc52
262
.ipynb_checkpoints/run-checkpoint.ipynb
Normal file
262
.ipynb_checkpoints/run-checkpoint.ipynb
Normal file
@ -0,0 +1,262 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "74100403-147c-42cd-8285-e30778c0fb66",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import gensim\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"from sklearn.metrics import accuracy_score"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def predict_year(x, path_out, model):\n",
|
||||||
|
" results = model.predict(x)\n",
|
||||||
|
" with open(path_out, 'wt') as file:\n",
|
||||||
|
" for r in results:\n",
|
||||||
|
" file.write(str(r) + '\\n') "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def read_file(filename):\n",
|
||||||
|
" result = []\n",
|
||||||
|
" with open(filename, 'r', encoding=\"utf-8\") as file:\n",
|
||||||
|
" for line in file:\n",
|
||||||
|
" text = line.split(\"\\t\")[0].strip()\n",
|
||||||
|
" result.append(text)\n",
|
||||||
|
" return result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('train/in.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" train = pd.read_csv(file, sep='\\t', header=None)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" train_y = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"train_y = train_y[0:10000]\n",
|
||||||
|
"train_y = train_y[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "07ae7b22-e95d-4614-9757-15660a9834b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train = train[0:10000]\n",
|
||||||
|
"train_x = train[0]\n",
|
||||||
|
"train_x = [gensim.utils.simple_preprocess(x) for x in train_x]\n",
|
||||||
|
"#train_x"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "fde71cd8-f682-4793-bce9-0f9a9d8c176c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from gensim.test.utils import common_texts\n",
|
||||||
|
"from gensim.models import Word2Vec\n",
|
||||||
|
"\n",
|
||||||
|
"model = Word2Vec(sentences=train_x, vector_size=100, window=5, min_count=1, workers=4)\n",
|
||||||
|
"#data, min_count = 1, vector_size = 100, window = 5, sg = 1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "9a4c8066-f985-478e-8944-dd45b73d9795",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"C:\\Users\\korne\\AppData\\Local\\Temp\\ipykernel_3520\\3800840358.py:2: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
|
||||||
|
" train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"words = set(model.wv.index_to_key)\n",
|
||||||
|
"train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b52269f9-f143-483d-9669-ce8f5972d6bb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"FEATURES = 100\n",
|
||||||
|
"\n",
|
||||||
|
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||||
|
" def __init__(self):\n",
|
||||||
|
" super(NeuralNetworkModel, self).__init__()\n",
|
||||||
|
" self.fc1 = torch.nn.Linear(FEATURES,500)\n",
|
||||||
|
" self.fc2 = torch.nn.Linear(500,1)\n",
|
||||||
|
"\n",
|
||||||
|
" def forward(self, x):\n",
|
||||||
|
" x = self.fc1(x)\n",
|
||||||
|
" x = torch.relu(x)\n",
|
||||||
|
" x = self.fc2(x)\n",
|
||||||
|
" x = torch.sigmoid(x)\n",
|
||||||
|
" return x\n",
|
||||||
|
"\n",
|
||||||
|
"nn_model = NeuralNetworkModel()\n",
|
||||||
|
"BATCH_SIZE = 40\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)\n",
|
||||||
|
"\n",
|
||||||
|
"def get_loss_acc(model, data_x, data_y):\n",
|
||||||
|
" loss_score = 0\n",
|
||||||
|
" acc_score = 0\n",
|
||||||
|
" items_total = 0\n",
|
||||||
|
" model.eval()\n",
|
||||||
|
" for i in range(0, data_y.shape[0], BATCH_SIZE):\n",
|
||||||
|
" X = data_x[i:i+BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X.astype(np.float32))\n",
|
||||||
|
" Y = data_y[i:i+BATCH_SIZE]\n",
|
||||||
|
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
|
||||||
|
" Y_predictions = model(X)\n",
|
||||||
|
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
|
||||||
|
" items_total += Y.shape[0]\n",
|
||||||
|
"\n",
|
||||||
|
" loss = criterion(Y_predictions, Y)\n",
|
||||||
|
"\n",
|
||||||
|
" loss_score += loss.item() * Y.shape[0]\n",
|
||||||
|
" return (loss_score / items_total), (acc_score / items_total)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"for epoch in range(5):\n",
|
||||||
|
" loss_score = 0\n",
|
||||||
|
" acc_score = 0\n",
|
||||||
|
" items_total = 0\n",
|
||||||
|
" nn_model.train()\n",
|
||||||
|
" for i in range(0, train_y.shape[0] - 42, BATCH_SIZE):\n",
|
||||||
|
" X = train_x_vec[i:i+BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X.astype(np.float32))\n",
|
||||||
|
" Y = train_y[i:i+BATCH_SIZE]\n",
|
||||||
|
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
|
||||||
|
" Y_predictions = nn_model(X)\n",
|
||||||
|
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
|
||||||
|
" items_total += Y.shape[0]\n",
|
||||||
|
"\n",
|
||||||
|
" optimizer.zero_grad()\n",
|
||||||
|
" loss = criterion(Y_predictions, Y)\n",
|
||||||
|
" loss.backward()\n",
|
||||||
|
" optimizer.step()\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" loss_score += loss.item() * Y.shape[0]\n",
|
||||||
|
"\n",
|
||||||
|
" display(epoch)\n",
|
||||||
|
"    display(get_loss_acc(nn_model, train_x_vec, train_y))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1482f342-f2ea-4c9d-b221-5ef451e3a6b3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('trenowanie modelu')\n",
|
||||||
|
"model = NeuralNetworkModel()\n",
|
||||||
|
"BATCH_SIZE = 5\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
|
||||||
|
"\n",
|
||||||
|
"for epoch in range(BATCH_SIZE):\n",
|
||||||
|
" model.train()\n",
|
||||||
|
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
|
||||||
|
" X = x_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" y = y_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
|
||||||
|
" optimizer.zero_grad()\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" loss = criterion(outputs, y)\n",
|
||||||
|
" loss.backward()\n",
|
||||||
|
" optimizer.step()\n",
|
||||||
|
"\n",
|
||||||
|
"#print('predykcja wynikow')\n",
|
||||||
|
"y_dev = []\n",
|
||||||
|
"y_test = []\n",
|
||||||
|
"model.eval()\n",
|
||||||
|
"\n",
|
||||||
|
"with torch.no_grad():\n",
|
||||||
|
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
|
||||||
|
" X = x_dev[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" prediction = (outputs > 0.5)\n",
|
||||||
|
" y_dev += prediction.tolist()\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(0, len(x_test), BATCH_SIZE):\n",
|
||||||
|
" X = x_test[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" y = (outputs >= 0.5)\n",
|
||||||
|
"        y_test += y.tolist()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
223
.ipynb_checkpoints/sceptic-checkpoint.ipynb
Normal file
223
.ipynb_checkpoints/sceptic-checkpoint.ipynb
Normal file
@ -0,0 +1,223 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "equal-singles",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/utils/validation.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
|
||||||
|
" LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" dtype=np.int):\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=np.finfo(np.float).eps,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=np.finfo(np.float).eps, positive=False):\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1097: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1344: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1480: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" precompute=False, eps=np.finfo(np.float).eps,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:320: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=np.finfo(np.float).eps, random_state=None,\n",
|
||||||
|
"/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:580: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" eps=4 * np.finfo(np.float).eps, n_jobs=None,\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import csv\n",
|
||||||
|
"import lzma\n",
|
||||||
|
"import gensim.downloader\n",
|
||||||
|
"from nltk import word_tokenize"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "involved-understanding",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"x_train = pd.read_table('in.tsv', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"y_train = pd.read_table('expected.tsv', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"#x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"#x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "collaborative-cincinnati",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "AttributeError",
|
||||||
|
"evalue": "module 'torch' has no attribute 'nn'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[0;32m<ipython-input-5-11c9482004ae>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#print('inicjalizacja modelu')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mclass\u001b[0m \u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mModule\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ml01\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m300\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;31mAttributeError\u001b[0m: module 'torch' has no attribute 'nn'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#print('inicjalizacja modelu')\n",
|
||||||
|
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||||
|
" def __init__(self):\n",
|
||||||
|
" super(NeuralNetworkModel, self).__init__()\n",
|
||||||
|
" self.l01 = torch.nn.Linear(300, 300)\n",
|
||||||
|
" self.l02 = torch.nn.Linear(300, 1)\n",
|
||||||
|
"\n",
|
||||||
|
" def forward(self, x):\n",
|
||||||
|
" x = self.l01(x)\n",
|
||||||
|
" x = torch.relu(x)\n",
|
||||||
|
" x = self.l02(x)\n",
|
||||||
|
" x = torch.sigmoid(x)\n",
|
||||||
|
" return x"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "hydraulic-business",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('przygotowanie danych')\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = x_train[0].str.lower()\n",
|
||||||
|
"y_train = y_train[0]\n",
|
||||||
|
"x_dev = x_dev[0].str.lower()\n",
|
||||||
|
"x_test = x_test[0].str.lower()\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = [word_tokenize(x) for x in x_train]\n",
|
||||||
|
"x_dev = [word_tokenize(x) for x in x_dev]\n",
|
||||||
|
"x_test = [word_tokenize(x) for x in x_test]\n",
|
||||||
|
"\n",
|
||||||
|
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
|
||||||
|
"x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
|
||||||
|
"x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
|
||||||
|
"x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "heavy-sandwich",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('trenowanie modelu')\n",
|
||||||
|
"model = NeuralNetworkModel()\n",
|
||||||
|
"BATCH_SIZE = 5\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
|
||||||
|
"\n",
|
||||||
|
"for epoch in range(BATCH_SIZE):\n",
|
||||||
|
" model.train()\n",
|
||||||
|
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
|
||||||
|
" X = x_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" y = y_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
|
||||||
|
" optimizer.zero_grad()\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" loss = criterion(outputs, y)\n",
|
||||||
|
" loss.backward()\n",
|
||||||
|
" optimizer.step()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "small-pavilion",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('predykcja wynikow')\n",
|
||||||
|
"y_dev = []\n",
|
||||||
|
"y_test = []\n",
|
||||||
|
"model.eval()\n",
|
||||||
|
"\n",
|
||||||
|
"with torch.no_grad():\n",
|
||||||
|
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
|
||||||
|
" X = x_dev[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" prediction = (outputs > 0.5)\n",
|
||||||
|
" y_dev += prediction.tolist()\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(0, len(x_test), BATCH_SIZE):\n",
|
||||||
|
" X = x_test[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" y = (outputs >= 0.5)\n",
|
||||||
|
"        y_test += y.tolist()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "toxic-pendant",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# print('eksportowanie do plików')\n",
|
||||||
|
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
|
||||||
|
"y_test = np.asarray(y_test, dtype=np.int32)\n",
|
||||||
|
"y_dev.tofile('./dev-0/out.tsv', sep='\\n')\n",
|
||||||
|
"y_test.tofile('./test-A/out.tsv', sep='\\n')\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
5272
dev-0/in.tsv
Normal file
5272
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
726
run.ipynb
Normal file
726
run.ipynb
Normal file
@ -0,0 +1,726 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "74100403-147c-42cd-8285-e30778c0fb66",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import csv\n",
|
||||||
|
"import lzma\n",
|
||||||
|
"import gensim.downloader\n",
|
||||||
|
"from nltk import word_tokenize"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "cbe60d7b-850e-4838-b4ce-672f13bf2bb2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def predict_year(x, path_out, model):\n",
|
||||||
|
" results = model.predict(x)\n",
|
||||||
|
" with open(path_out, 'wt') as file:\n",
|
||||||
|
" for r in results:\n",
|
||||||
|
" file.write(str(r) + '\\n') "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def read_file(filename):\n",
|
||||||
|
" result = []\n",
|
||||||
|
" with open(filename, 'r', encoding=\"utf-8\") as file:\n",
|
||||||
|
" for line in file:\n",
|
||||||
|
" text = line.split(\"\\t\")[0].strip()\n",
|
||||||
|
" result.append(text)\n",
|
||||||
|
" return result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>have you had an medical issues recently?</td>\n",
|
||||||
|
" <td>1335187994</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>It's supposedly aluminum, barium, and strontiu...</td>\n",
|
||||||
|
" <td>1346187161</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>Nobel prizes don't make you rich.</td>\n",
|
||||||
|
" <td>1337160218</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>I came for the article, I stayed for the doctor.</td>\n",
|
||||||
|
" <td>1277674344</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>you resorted to insults AND got owned directly...</td>\n",
|
||||||
|
" <td>1348538535</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199995</th>\n",
|
||||||
|
" <td>It's really sad. My sister used to believe tha...</td>\n",
|
||||||
|
" <td>1334111989</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199996</th>\n",
|
||||||
|
" <td>I don't mean it in a dickish way, I'm being se...</td>\n",
|
||||||
|
" <td>1322700456</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199997</th>\n",
|
||||||
|
" <td>Fair enough, I stand corrected.</td>\n",
|
||||||
|
" <td>1354646212</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199998</th>\n",
|
||||||
|
" <td>Right. Scientists tend to think and conclude l...</td>\n",
|
||||||
|
" <td>1348777201</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199999</th>\n",
|
||||||
|
" <td>Because they are illiterate</td>\n",
|
||||||
|
" <td>1249579722</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>200000 rows × 2 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" 0 1\n",
|
||||||
|
"0 have you had an medical issues recently? 1335187994\n",
|
||||||
|
"1 It's supposedly aluminum, barium, and strontiu... 1346187161\n",
|
||||||
|
"2 Nobel prizes don't make you rich. 1337160218\n",
|
||||||
|
"3 I came for the article, I stayed for the doctor. 1277674344\n",
|
||||||
|
"4 you resorted to insults AND got owned directly... 1348538535\n",
|
||||||
|
"... ... ...\n",
|
||||||
|
"199995 It's really sad. My sister used to believe tha... 1334111989\n",
|
||||||
|
"199996 I don't mean it in a dickish way, I'm being se... 1322700456\n",
|
||||||
|
"199997 Fair enough, I stand corrected. 1354646212\n",
|
||||||
|
"199998 Right. Scientists tend to think and conclude l... 1348777201\n",
|
||||||
|
"199999 Because they are illiterate 1249579722\n",
|
||||||
|
"\n",
|
||||||
|
"[200000 rows x 2 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"x_train = pd.read_table('train/in.tsv', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"x_train = x_train[0:200000]\n",
|
||||||
|
"x_train"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199995</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199996</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199997</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199998</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>199999</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>200000 rows × 1 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" 0\n",
|
||||||
|
"0 1\n",
|
||||||
|
"1 0\n",
|
||||||
|
"2 0\n",
|
||||||
|
"3 0\n",
|
||||||
|
"4 0\n",
|
||||||
|
"... ..\n",
|
||||||
|
"199995 0\n",
|
||||||
|
"199996 0\n",
|
||||||
|
"199997 1\n",
|
||||||
|
"199998 1\n",
|
||||||
|
"199999 0\n",
|
||||||
|
"\n",
|
||||||
|
"[200000 rows x 1 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" y_train = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"y_train = y_train[0:200000]\n",
|
||||||
|
"y_train"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "6b27e6ce-e9fd-41a1-aacf-53a5fde0a7c1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>In which case, tell them I'm in work, or dead,...</td>\n",
|
||||||
|
" <td>1328302967</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>Put me down as another for Mysterious Universe...</td>\n",
|
||||||
|
" <td>1347836881</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>The military of any country would never admit ...</td>\n",
|
||||||
|
" <td>1331905826</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>An example would have been more productive tha...</td>\n",
|
||||||
|
" <td>1315584834</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>sorry, but the authors of this article admit t...</td>\n",
|
||||||
|
" <td>1347389166</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5267</th>\n",
|
||||||
|
" <td>Your fault for going at all. That's how we get...</td>\n",
|
||||||
|
" <td>1308176634</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5268</th>\n",
|
||||||
|
" <td>EVP....that's a shot in the GH drinking game.</td>\n",
|
||||||
|
" <td>1354408646</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5269</th>\n",
|
||||||
|
" <td>i think a good hard massage is good for you. t...</td>\n",
|
||||||
|
" <td>1305726318</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5270</th>\n",
|
||||||
|
" <td>Interesting theory. Makes my imagination run w...</td>\n",
|
||||||
|
" <td>1339839088</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5271</th>\n",
|
||||||
|
" <td>Tampering of candy? More like cooking somethin...</td>\n",
|
||||||
|
" <td>1320262659</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>5272 rows × 2 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" 0 1\n",
|
||||||
|
"0 In which case, tell them I'm in work, or dead,... 1328302967\n",
|
||||||
|
"1 Put me down as another for Mysterious Universe... 1347836881\n",
|
||||||
|
"2 The military of any country would never admit ... 1331905826\n",
|
||||||
|
"3 An example would have been more productive tha... 1315584834\n",
|
||||||
|
"4 sorry, but the authors of this article admit t... 1347389166\n",
|
||||||
|
"... ... ...\n",
|
||||||
|
"5267 Your fault for going at all. That's how we get... 1308176634\n",
|
||||||
|
"5268 EVP....that's a shot in the GH drinking game. 1354408646\n",
|
||||||
|
"5269 i think a good hard massage is good for you. t... 1305726318\n",
|
||||||
|
"5270 Interesting theory. Makes my imagination run w... 1339839088\n",
|
||||||
|
"5271 Tampering of candy? More like cooking somethin... 1320262659\n",
|
||||||
|
"\n",
|
||||||
|
"[5272 rows x 2 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('dev-0/in.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" x_dev = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"x_dev"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "99ae526d-9b7c-493f-be4f-f95b1c8f4b81",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>Gentleman, I believe we can agree that this is...</td>\n",
|
||||||
|
" <td>1304170330</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>The problem is that it will just turn it r/nos...</td>\n",
|
||||||
|
" <td>1353763204</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>Well, according to some Christian apologists, ...</td>\n",
|
||||||
|
" <td>1336314173</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Don't know if this is what you are looking for...</td>\n",
|
||||||
|
" <td>1348860314</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>I respect what you're saying completely. I jus...</td>\n",
|
||||||
|
" <td>1341285952</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5147</th>\n",
|
||||||
|
" <td>GAMBIT</td>\n",
|
||||||
|
" <td>1326441107</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5148</th>\n",
|
||||||
|
" <td>&gt;Joe Rogan is no snake oil salesman.\\n\\nHe ...</td>\n",
|
||||||
|
" <td>1319464245</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5149</th>\n",
|
||||||
|
" <td>Reading further, Sagan does seem to agree with...</td>\n",
|
||||||
|
" <td>1322126150</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5150</th>\n",
|
||||||
|
" <td>Notice that they never invoke god, or any othe...</td>\n",
|
||||||
|
" <td>1307679295</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5151</th>\n",
|
||||||
|
" <td>They might co-ordinate an anniversary attack o...</td>\n",
|
||||||
|
" <td>1342409261</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>5152 rows × 2 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" 0 1\n",
|
||||||
|
"0 Gentleman, I believe we can agree that this is... 1304170330\n",
|
||||||
|
"1 The problem is that it will just turn it r/nos... 1353763204\n",
|
||||||
|
"2 Well, according to some Christian apologists, ... 1336314173\n",
|
||||||
|
"3 Don't know if this is what you are looking for... 1348860314\n",
|
||||||
|
"4 I respect what you're saying completely. I jus... 1341285952\n",
|
||||||
|
"... ... ...\n",
|
||||||
|
"5147 GAMBIT 1326441107\n",
|
||||||
|
"5148 >Joe Rogan is no snake oil salesman.\\n\\nHe ... 1319464245\n",
|
||||||
|
"5149 Reading further, Sagan does seem to agree with... 1322126150\n",
|
||||||
|
"5150 Notice that they never invoke god, or any othe... 1307679295\n",
|
||||||
|
"5151 They might co-ordinate an anniversary attack o... 1342409261\n",
|
||||||
|
"\n",
|
||||||
|
"[5152 rows x 2 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('test-A/in.tsv', 'r', encoding='utf8') as file:\n",
|
||||||
|
" x_test = pd.read_csv(file, sep='\\t', header=None)\n",
|
||||||
|
"x_test"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "dba17668-971f-47f8-99ce-fc840b5cb74a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||||
|
" def __init__(self):\n",
|
||||||
|
" super(NeuralNetworkModel, self).__init__()\n",
|
||||||
|
" self.l01 = torch.nn.Linear(300, 300)\n",
|
||||||
|
" self.l02 = torch.nn.Linear(300, 1)\n",
|
||||||
|
"\n",
|
||||||
|
" def forward(self, x):\n",
|
||||||
|
" x = self.l01(x)\n",
|
||||||
|
" x = torch.relu(x)\n",
|
||||||
|
" x = self.l02(x)\n",
|
||||||
|
" x = torch.sigmoid(x)\n",
|
||||||
|
" return x\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "1a275c1d-75bc-4290-9332-56396d16a0f2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"x_train = x_train[0].str.lower()\n",
|
||||||
|
"y_train = y_train[0]\n",
|
||||||
|
"x_dev = x_dev[0].str.lower()\n",
|
||||||
|
"x_test = x_test[0].str.lower()\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = [word_tokenize(x) for x in x_train]\n",
|
||||||
|
"x_dev = [word_tokenize(x) for x in x_dev]\n",
|
||||||
|
"x_test = [word_tokenize(x) for x in x_test]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "031a3670-3be7-4146-97b4-0dacd4f9ae58",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from gensim.test.utils import common_texts\n",
|
||||||
|
"from gensim.models import Word2Vec\n",
|
||||||
|
"\n",
|
||||||
|
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
|
||||||
|
"x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
|
||||||
|
"x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
|
||||||
|
"x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "b7defd18-e281-4cf6-9941-cee560749677",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"C:\\Users\\korne\\AppData\\Local\\Temp\\ipykernel_22024\\3484013121.py:10: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\torch\\csrc\\utils\\tensor_new.cpp:210.)\n",
|
||||||
|
" X = torch.tensor(X)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"model = NeuralNetworkModel()\n",
|
||||||
|
"BATCH_SIZE = 5\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
|
||||||
|
"\n",
|
||||||
|
"for epoch in range(BATCH_SIZE):\n",
|
||||||
|
" model.train()\n",
|
||||||
|
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
|
||||||
|
" X = x_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" y = y_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
|
||||||
|
" optimizer.zero_grad()\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" loss = criterion(outputs, y)\n",
|
||||||
|
" loss.backward()\n",
|
||||||
|
" optimizer.step()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "92c69ddd-fe58-477f-b2c2-06324a983bcc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y_dev = []\n",
|
||||||
|
"y_test = []\n",
|
||||||
|
"model.eval()\n",
|
||||||
|
"\n",
|
||||||
|
"with torch.no_grad():\n",
|
||||||
|
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
|
||||||
|
" X = x_dev[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" prediction = (outputs > 0.5)\n",
|
||||||
|
" y_dev += prediction.tolist()\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(0, len(x_test), BATCH_SIZE):\n",
|
||||||
|
" X = x_test[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" y = (outputs >= 0.5)\n",
|
||||||
|
" y_test += prediction.tolist()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "caff921c-d0ab-4fce-a17f-6610266b404d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
|
||||||
|
"y_test = np.asarray(y_test, dtype=np.int32)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"id": "73076eb2-810f-4f85-aa3f-05ee884c413b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('./dev-0/out.tsv', 'wt') as file:\n",
|
||||||
|
" for r in y_dev:\n",
|
||||||
|
" file.write(str(r) + '\\n') "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"id": "ddda251c-cafa-40f8-a020-48310a9f23b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('./test-A/out.tsv', 'wt') as file:\n",
|
||||||
|
" for r in y_test:\n",
|
||||||
|
" file.write(str(r) + '\\n') "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"id": "5730562a-0200-4c8f-8f73-992fa2b36133",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[NbConvertApp] Converting notebook run.ipynb to script\n",
|
||||||
|
"[NbConvertApp] Writing 3816 bytes to run.py\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!jupyter nbconvert --to script run.ipynb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "07a09298-204c-4905-90a8-5dcb87877368",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
191
run.py
Normal file
191
run.py
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
# In[1]:
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import torch
|
||||||
|
import csv
|
||||||
|
import lzma
|
||||||
|
import gensim.downloader
|
||||||
|
from nltk import word_tokenize
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[2]:
|
||||||
|
|
||||||
|
|
||||||
|
def predict_year(x, path_out, model):
    """Run ``model.predict`` on ``x`` and write one prediction per line.

    Args:
        x: Input handed unchanged to ``model.predict``.
        path_out: Path of the text file to create or overwrite.
        model: Any object exposing ``predict(x)`` that returns an iterable.

    Returns:
        None. The predictions are only written to ``path_out``.
    """
    results = model.predict(x)
    # An explicit encoding keeps the file identical across platforms instead
    # of depending on the locale default.
    with open(path_out, 'wt', encoding='utf-8') as file:
        # One buffered writelines call instead of one small write per row.
        file.writelines(str(r) + '\n' for r in results)
|
||||||
|
|
||||||
|
|
||||||
|
# In[3]:
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(filename):
    """Return the first tab-separated field of every line in ``filename``.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list with one string per line: the text before the first tab,
        with surrounding whitespace removed. Blank lines yield ``""``.
    """
    with open(filename, 'r', encoding="utf-8") as file:
        # maxsplit=1: only the first field is used, so stop at the first tab
        # instead of splitting the whole line.
        return [line.split("\t", 1)[0].strip() for line in file]
|
||||||
|
|
||||||
|
|
||||||
|
# In[4]:
|
||||||
|
|
||||||
|
|
||||||
|
# Number of leading training rows to keep. Features and labels are cut with
# the same constant so that row i of x_train always matches row i of y_train.
TRAIN_LIMIT = 200000

# QUOTE_NONE (the value 3 used previously): quote characters inside the text
# are ordinary characters, not field delimiters.
x_train = pd.read_table('train/in.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
x_train = x_train.iloc[:TRAIN_LIMIT]


# In[5]:


y_train = pd.read_csv('train/expected.tsv', sep='\t', header=None, encoding='utf8')
y_train = y_train.iloc[:TRAIN_LIMIT]
# The bare `x_train` / `y_train` display expressions exported from the
# notebook were dropped: they do nothing when this file runs as a script.
|
||||||
|
|
||||||
|
|
||||||
|
# In[6]:
|
||||||
|
|
||||||
|
|
||||||
|
# dev/test inputs are parsed with the same quoting rule as train/in.tsv.
# With pandas' default quoting a stray double quote in the text opens a quoted
# field that can swallow tabs and newlines, merging several input lines into
# one row; the output files need exactly one prediction per input line.
x_dev = pd.read_csv('dev-0/in.tsv', sep='\t', header=None,
                    quoting=csv.QUOTE_NONE, encoding='utf8')


# In[7]:


x_test = pd.read_csv('test-A/in.tsv', sep='\t', header=None,
                     quoting=csv.QUOTE_NONE, encoding='utf8')
# The bare `x_dev` / `x_test` display expressions exported from the notebook
# were dropped: they do nothing when this file runs as a script.
|
||||||
|
|
||||||
|
|
||||||
|
# In[8]:
|
||||||
|
|
||||||
|
|
||||||
|
class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier: Linear -> ReLU -> Linear -> sigmoid.

    Args:
        input_dim: Size of each input feature vector (default 300).
        hidden_dim: Width of the hidden layer (default 300).

    The defaults reproduce the original fixed 300 -> 300 -> 1 architecture,
    so ``NeuralNetworkModel()`` builds the same network as before.
    """

    def __init__(self, input_dim=300, hidden_dim=300):
        super().__init__()
        self.l01 = torch.nn.Linear(input_dim, hidden_dim)
        self.l02 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a batch of feature vectors to one sigmoid score per row.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with last dimension 1 holding values in [0, 1].
        """
        hidden = torch.relu(self.l01(x))
        return torch.sigmoid(self.l02(hidden))
|
||||||
|
|
||||||
|
|
||||||
|
# In[9]:
|
||||||
|
|
||||||
|
|
||||||
|
def _tokenize_text_column(frame):
    """Lower-case column 0 of ``frame`` and split every entry into tokens.

    Args:
        frame: DataFrame whose column ``0`` holds the raw text.

    Returns:
        A list with one token list per row.
    """
    # Cells that pandas parsed as missing are NaN (a float); word_tokenize
    # only accepts strings, so treat them as empty text instead of crashing.
    texts = frame[0].fillna('').str.lower()
    return [word_tokenize(text) for text in texts]


# Labels: keep only the single label column as a Series.
y_train = y_train[0]

x_train = _tokenize_text_column(x_train)
x_dev = _tokenize_text_column(x_dev)
x_test = _tokenize_text_column(x_test)
|
||||||
|
|
||||||
|
|
||||||
|
# In[11]:
|
||||||
|
|
||||||
|
|
||||||
|
from gensim.test.utils import common_texts
|
||||||
|
from gensim.models import Word2Vec
|
||||||
|
|
||||||
|
word2vec = gensim.downloader.load('word2vec-google-news-300')


def _mean_embeddings(documents, vectors, dim=300):
    """Average the word vectors of every tokenized document.

    Args:
        documents: Iterable of token lists.
        vectors: Mapping-like object supporting ``token in vectors`` and
            ``vectors[token]`` (here the loaded word2vec model).
        dim: Length of each word vector; used for the all-zero fallback.

    Returns:
        A float32 array of shape ``(len(documents), dim)``. Documents with
        no known token get a zero vector.
    """
    zero = np.zeros(dim, dtype=np.float32)
    rows = []
    for tokens in documents:
        known = [vectors[token] for token in tokens if token in vectors]
        rows.append(np.mean(known, axis=0) if known else zero)
    # A single 2-D array (instead of a list of arrays) avoids PyTorch's
    # "creating a tensor from a list of numpy.ndarrays is extremely slow"
    # path when batches are later converted to tensors.
    return np.asarray(rows, dtype=np.float32).reshape(-1, dim)


x_train = _mean_embeddings(x_train, word2vec)
x_dev = _mean_embeddings(x_dev, word2vec)
x_test = _mean_embeddings(x_test, word2vec)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
model = NeuralNetworkModel()
BATCH_SIZE = 5
# Number of passes over the training data. The loop used to iterate over
# range(BATCH_SIZE), which tied the epoch count to the batch size; the value
# is kept at 5 so the training schedule is unchanged.
EPOCHS = 5
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

model.train()
for epoch in range(EPOCHS):
    for start in range(0, y_train.shape[0], BATCH_SIZE):
        stop = start + BATCH_SIZE
        # Go through one float32 ndarray: building a tensor straight from a
        # list of ndarrays is what triggers PyTorch's slow-path warning.
        X = torch.from_numpy(np.asarray(x_train[start:stop], dtype=np.float32))
        # BCELoss needs float targets shaped like the (batch, 1) model output.
        y = torch.from_numpy(
            y_train.iloc[start:stop].to_numpy(dtype=np.float32)
        ).reshape(-1, 1)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def _predict_labels(network, features, batch_size):
    """Return thresholded predictions of ``network`` for every feature row.

    Args:
        network: Callable mapping a float tensor batch to scores in [0, 1].
        features: Sliceable sequence of equal-length feature vectors.
        batch_size: Number of rows evaluated per forward pass.

    Returns:
        A list with one entry per row, in input order; each entry is the
        network output row compared against 0.5 (True means score > 0.5).
    """
    labels = []
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            batch = np.asarray(features[start:start + batch_size], dtype=np.float32)
            scores = network(torch.from_numpy(batch))
            labels += (scores > 0.5).tolist()
    return labels


model.eval()

y_dev = _predict_labels(model, x_dev, BATCH_SIZE)
# The test loop used to append the stale `prediction` left over from the last
# dev batch instead of its own result, so test-A output never depended on the
# test inputs. Both splits now share one code path and one threshold.
y_test = _predict_labels(model, x_test, BATCH_SIZE)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# The model emits one score per row in a trailing dimension of size 1, so the
# collected predictions form an (N, 1) array. Iterating that yields 1-element
# arrays whose str() is "[0]" / "[1]"; flatten so each written line is a bare
# "0" or "1". reshape(-1) is a no-op if the data is already one-dimensional.
y_dev = np.asarray(y_dev, dtype=np.int32).reshape(-1)
y_test = np.asarray(y_test, dtype=np.int32).reshape(-1)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Dump the dev-0 predictions, one per line.
with open('./dev-0/out.tsv', 'wt') as out_file:
    out_file.writelines(str(label) + '\n' for label in y_dev)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Dump the test-A predictions, one per line.
with open('./test-A/out.tsv', 'wt') as out_file:
    out_file.writelines(str(label) + '\n' for label in y_test)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
get_ipython().system('jupyter nbconvert --to script run.ipynb')
|
||||||
|
|
180
sceptic.ipynb
Normal file
180
sceptic.ipynb
Normal file
@ -0,0 +1,180 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "equal-singles",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import csv\n",
|
||||||
|
"import lzma\n",
|
||||||
|
"import gensim.downloader\n",
|
||||||
|
"from nltk import word_tokenize"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "involved-understanding",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"x_train = pd.read_table('in.tsv', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"y_train = pd.read_table('expected.tsv', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"#x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n",
|
||||||
|
"#x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\\t', header=None, quoting=3)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "collaborative-cincinnati",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "AttributeError",
|
||||||
|
"evalue": "module 'torch' has no attribute 'nn'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[0;32m<ipython-input-5-11c9482004ae>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#print('inicjalizacja modelu')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mclass\u001b[0m \u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mModule\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNeuralNetworkModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ml01\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m300\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m300\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;31mAttributeError\u001b[0m: module 'torch' has no attribute 'nn'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#print('inicjalizacja modelu')\n",
|
||||||
|
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||||
|
" def __init__(self):\n",
|
||||||
|
" super(NeuralNetworkModel, self).__init__()\n",
|
||||||
|
" self.l01 = torch.nn.Linear(300, 300)\n",
|
||||||
|
" self.l02 = torch.nn.Linear(300, 1)\n",
|
||||||
|
"\n",
|
||||||
|
" def forward(self, x):\n",
|
||||||
|
" x = self.l01(x)\n",
|
||||||
|
" x = torch.relu(x)\n",
|
||||||
|
" x = self.l02(x)\n",
|
||||||
|
" x = torch.sigmoid(x)\n",
|
||||||
|
" return x"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "hydraulic-business",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('przygotowanie danych')\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = x_train.str.lower()\n",
|
||||||
|
"x_dev = x_dev[0].str.lower()\n",
|
||||||
|
"x_test = x_test[0].str.lower()\n",
|
||||||
|
"\n",
|
||||||
|
"x_train = [word_tokenize(x) for x in x_train]\n",
|
||||||
|
"x_dev = [word_tokenize(x) for x in x_dev]\n",
|
||||||
|
"x_test = [word_tokenize(x) for x in x_test]\n",
|
||||||
|
"\n",
|
||||||
|
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
|
||||||
|
"x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]\n",
|
||||||
|
"x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]\n",
|
||||||
|
"x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "heavy-sandwich",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('trenowanie modelu')\n",
|
||||||
|
"model = NeuralNetworkModel()\n",
|
||||||
|
"BATCH_SIZE = 5\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
|
||||||
|
"\n",
|
||||||
|
"for epoch in range(BATCH_SIZE):\n",
|
||||||
|
" model.train()\n",
|
||||||
|
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
|
||||||
|
" X = x_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" y = y_train[i:i + BATCH_SIZE]\n",
|
||||||
|
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
|
||||||
|
" optimizer.zero_grad()\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" loss = criterion(outputs, y)\n",
|
||||||
|
" loss.backward()\n",
|
||||||
|
" optimizer.step()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "small-pavilion",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#print('predykcja wynikow')\n",
|
||||||
|
"y_dev = []\n",
|
||||||
|
"y_test = []\n",
|
||||||
|
"model.eval()\n",
|
||||||
|
"\n",
|
||||||
|
"with torch.no_grad():\n",
|
||||||
|
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
|
||||||
|
" X = x_dev[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" prediction = (outputs > 0.5)\n",
|
||||||
|
" y_dev += prediction.tolist()\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(0, len(x_test), BATCH_SIZE):\n",
|
||||||
|
" X = x_test[i:i + BATCH_SIZE]\n",
|
||||||
|
" X = torch.tensor(X)\n",
|
||||||
|
" outputs = model(X.float())\n",
|
||||||
|
" y = (outputs >= 0.5)\n",
|
||||||
|
" y_test += prediction.tolist()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "toxic-pendant",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# print('eksportowanie do plików')\n",
|
||||||
|
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
|
||||||
|
"y_test = np.asarray(y_test, dtype=np.int32)\n",
|
||||||
|
"y_dev.tofile('./dev-0/out.tsv', sep='\\n')\n",
|
||||||
|
"y_test.tofile('./test-A/out.tsv', sep='\\n')\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
5152
test-A/in.tsv
Normal file
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
2062
test-A/out.tsv
Normal file
2062
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
289579
train/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
289579
train/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because one or more lines are too long
289579
train/in.tsv
Normal file
289579
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
BIN
word2vec.model
Normal file
BIN
word2vec.model
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user