{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74100403-147c-42cd-8285-e30778c0fb66",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import gensim\n",
    "import torch\n",
    "import pandas as pd\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.test.utils import common_texts\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# Configuration shared by the cells below.\n",
    "SEED = 42\n",
    "TRAIN_LIMIT = 10000  # number of leading training rows that are used\n",
    "FEATURES = 100  # Word2Vec vector size, which is also the network input width\n",
    "\n",
    "# Seed the generators so the network initialisation is repeatable.\n",
    "np.random.seed(SEED)\n",
    "torch.manual_seed(SEED);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_year(x, path_out, model):\n",
    "    \"\"\"Write the results of model.predict(x) to path_out, one per line.\"\"\"\n",
    "    results = model.predict(x)\n",
    "    with open(path_out, 'wt') as file:\n",
    "        file.writelines(str(r) + '\\n' for r in results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(filename):\n",
    "    \"\"\"Return the stripped first tab-separated field of each line of filename.\"\"\"\n",
    "    with open(filename, 'r', encoding='utf-8') as file:\n",
    "        return [line.split('\\t')[0].strip() for line in file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('train/in.tsv', sep='\\t', header=None, encoding='utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_y_raw = pd.read_csv('train/expected.tsv', sep='\\t', header=None, encoding='utf8')\n",
    "train_y = train_y_raw.iloc[:TRAIN_LIMIT, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07ae7b22-e95d-4614-9757-15660a9834b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenise the first TRAIN_LIMIT documents (column 0 of the training frame).\n",
    "train_x = [gensim.utils.simple_preprocess(text) for text in train.iloc[:TRAIN_LIMIT, 0]]\n",
    "assert len(train_x) == len(train_y), 'texts and labels must have the same length'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fde71cd8-f682-4793-bce9-0f9a9d8c176c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train word embeddings on the tokenised training documents.\n",
    "w2v_model = Word2Vec(sentences=train_x, vector_size=FEATURES, window=5, min_count=1, workers=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a4c8066-f985-478e-8944-dd45b73d9795",
   "metadata": {},
   "outputs": [],
   "source": [
    "words = set(w2v_model.wv.index_to_key)\n",
    "\n",
    "\n",
    "def document_vector(tokens):\n",
    "    \"\"\"Average the Word2Vec vectors of the known tokens into one FEATURES-long vector.\n",
    "\n",
    "    Documents differ in length, so stacking their per-word vectors gives a ragged\n",
    "    array that cannot be fed to the network; the mean gives a fixed-size input.\n",
    "    \"\"\"\n",
    "    known = [w2v_model.wv[token] for token in tokens if token in words]\n",
    "    if not known:\n",
    "        # No in-vocabulary token (for example an empty document): use zeros.\n",
    "        return np.zeros(FEATURES, dtype=np.float32)\n",
    "    return np.mean(known, axis=0, dtype=np.float32)\n",
    "\n",
    "\n",
    "train_x_vec = np.array([document_vector(tokens) for tokens in train_x], dtype=np.float32)\n",
    "train_x_vec.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b52269f9-f143-483d-9669-ce8f5972d6bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNetworkModel(torch.nn.Module):\n",
    "    \"\"\"Binary classifier: Linear(FEATURES, 500) -> ReLU -> Linear(500, 1) -> sigmoid.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super(NeuralNetworkModel, self).__init__()\n",
    "        self.fc1 = torch.nn.Linear(FEATURES, 500)\n",
    "        self.fc2 = torch.nn.Linear(500, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.fc1(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.fc2(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x\n",
    "\n",
    "\n",
    "def get_loss_acc(model, data_x, data_y):\n",
    "    \"\"\"Return (mean loss, accuracy) of model on data_x / data_y.\n",
    "\n",
    "    Uses the module-level BATCH_SIZE and criterion. Predictions above 0.5 count\n",
    "    as the positive class.\n",
    "    \"\"\"\n",
    "    # Convert once so batches are sliced by position, whatever index data_y has.\n",
    "    labels = np.asarray(data_y, dtype=np.float32)\n",
    "    loss_score = 0\n",
    "    acc_score = 0\n",
    "    items_total = 0\n",
    "    model.eval()\n",
    "    with torch.no_grad():  # evaluation only, no gradients needed\n",
    "        for i in range(0, labels.shape[0], BATCH_SIZE):\n",
    "            X = torch.tensor(data_x[i:i + BATCH_SIZE].astype(np.float32))\n",
    "            Y = torch.tensor(labels[i:i + BATCH_SIZE]).reshape(-1, 1)\n",
    "            Y_predictions = model(X)\n",
    "            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
    "            items_total += Y.shape[0]\n",
    "            # Weight the batch mean by the batch size so the total is a true mean.\n",
    "            loss_score += criterion(Y_predictions, Y).item() * Y.shape[0]\n",
    "    return (loss_score / items_total), (acc_score / items_total)\n",
    "\n",
    "\n",
    "nn_model = NeuralNetworkModel()\n",
    "BATCH_SIZE = 40\n",
    "EPOCHS = 5\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)\n",
    "\n",
    "train_labels = train_y.to_numpy(dtype=np.float32)\n",
    "\n",
    "for epoch in range(EPOCHS):\n",
    "    nn_model.train()\n",
    "    # Iterate over every row; the final batch may be shorter than BATCH_SIZE.\n",
    "    for i in range(0, train_labels.shape[0], BATCH_SIZE):\n",
    "        X = torch.tensor(train_x_vec[i:i + BATCH_SIZE].astype(np.float32))\n",
    "        Y = torch.tensor(train_labels[i:i + BATCH_SIZE]).reshape(-1, 1)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss = criterion(nn_model(X), Y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "    display(epoch)\n",
    "    display(get_loss_acc(nn_model, train_x_vec, train_y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1482f342-f2ea-4c9d-b221-5ef451e3a6b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the dev/test input paths are not given anywhere in this notebook;\n",
    "# the two names below are assumed from the train/ layout - confirm before running.\n",
    "DEV_IN_PATH = 'dev-0/in.tsv'\n",
    "TEST_IN_PATH = 'test-A/in.tsv'\n",
    "\n",
    "\n",
    "def vectorize_file(path):\n",
    "    \"\"\"Read the documents in path and embed them like the training texts.\"\"\"\n",
    "    docs = [gensim.utils.simple_preprocess(text) for text in read_file(path)]\n",
    "    vectors = np.array([document_vector(tokens) for tokens in docs], dtype=np.float32)\n",
    "    # Keep a 2-D (rows, FEATURES) shape even when the file is empty.\n",
    "    return vectors.reshape(-1, FEATURES)\n",
    "\n",
    "\n",
    "def predict_labels(model, data_x, batch_size):\n",
    "    \"\"\"Return model's thresholded (> 0.5) predictions for data_x.\n",
    "\n",
    "    The result has one single-element list of bool per input row, as produced\n",
    "    by Tensor.tolist() on the (rows, 1) network output.\n",
    "    \"\"\"\n",
    "    labels = []\n",
    "    model.eval()\n",
    "    with torch.no_grad():\n",
    "        for i in range(0, len(data_x), batch_size):\n",
    "            outputs = model(torch.tensor(data_x[i:i + batch_size]).float())\n",
    "            labels += (outputs > 0.5).tolist()\n",
    "    return labels\n",
    "\n",
    "\n",
    "x_train = train_x_vec\n",
    "y_train = train_y.to_numpy(dtype=np.float32)\n",
    "x_dev = vectorize_file(DEV_IN_PATH)\n",
    "x_test = vectorize_file(TEST_IN_PATH)\n",
    "\n",
    "# Train the model.\n",
    "model = NeuralNetworkModel()\n",
    "BATCH_SIZE = 5\n",
    "EPOCHS = 5\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
    "\n",
    "for epoch in range(EPOCHS):\n",
    "    model.train()\n",
    "    for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
    "        X = torch.tensor(x_train[i:i + BATCH_SIZE])\n",
    "        y = torch.tensor(y_train[i:i + BATCH_SIZE]).reshape(-1, 1)\n",
    "        optimizer.zero_grad()\n",
    "        outputs = model(X.float())\n",
    "        loss = criterion(outputs, y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "# Predict the results.\n",
    "y_dev = predict_labels(model, x_dev, BATCH_SIZE)\n",
    "y_test = predict_labels(model, x_test, BATCH_SIZE)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}