{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "74100403-147c-42cd-8285-e30778c0fb66",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import gensim\n",
"import torch\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bf211ece-e27a-4119-a1b9-9a9a610cfb46",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
" results = model.predict(x)\n",
" with open(path_out, 'wt') as file:\n",
" for r in results:\n",
" file.write(str(r) + '\\n') "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1ec57d97-a852-490e-8da4-d1e4c9676cd6",
"metadata": {},
"outputs": [],
"source": [
"def read_file(filename):\n",
" result = []\n",
" with open(filename, 'r', encoding=\"utf-8\") as file:\n",
" for line in file:\n",
" text = line.split(\"\\t\")[0].strip()\n",
" result.append(text)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "86fbfb79-76e7-49f5-b722-2827f93cb03f",
"metadata": {},
"outputs": [],
"source": [
"with open('train/in.tsv', 'r', encoding='utf8') as file:\n",
" train = pd.read_csv(file, sep='\\t', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8960c975-f756-4e36-a1ce-e9fd5fdf8fe3",
"metadata": {},
"outputs": [],
"source": [
"with open('train/expected.tsv', 'r', encoding='utf8') as file:\n",
" train_y = pd.read_csv(file, sep='\\t', header=None)\n",
"train_y = train_y[0:10000]\n",
"train_y = train_y[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "07ae7b22-e95d-4614-9757-15660a9834b6",
"metadata": {},
"outputs": [],
"source": [
"train = train[0:10000]\n",
"train_x = train[0]\n",
"train_x = [gensim.utils.simple_preprocess(x) for x in train_x]\n",
"#train_x"
]
},
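{
"cell_type": "code",
"execution_count": null,
"id": "tokenisation-peek",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (not in the original notebook): simple_preprocess\n",
"# lowercases, strips punctuation and tokenises, so each entry of train_x\n",
"# should now be a list of word tokens.\n",
"print(train_x[0][:10])"
]
},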
{
"cell_type": "code",
"execution_count": 7,
"id": "fde71cd8-f682-4793-bce9-0f9a9d8c176c",
"metadata": {},
"outputs": [],
"source": [
"from gensim.test.utils import common_texts\n",
"from gensim.models import Word2Vec\n",
"\n",
"model = Word2Vec(sentences=train_x, vector_size=100, window=5, min_count=1, workers=4)\n",
"#data, min_count = 1, vector_size = 100, window = 5, sg = 1"
]
},
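{
"cell_type": "code",
"execution_count": null,
"id": "w2v-sanity-check",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (not in the original notebook): index_to_key is\n",
"# sorted by frequency, so [0] is the most frequent token; most_similar\n",
"# should return plausible neighbours for it if the embeddings trained.\n",
"print('vocabulary size:', len(model.wv.index_to_key))\n",
"print(model.wv.most_similar(model.wv.index_to_key[0], topn=5))"
]
},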
{
"cell_type": "code",
"execution_count": 8,
"id": "9a4c8066-f985-478e-8944-dd45b73d9795",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\korne\\AppData\\Local\\Temp\\ipykernel_3520\\3800840358.py:2: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
" train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])\n"
]
}
],
"source": [
"words = set(model.wv.index_to_key)\n",
"train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])"
]
},
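{
"cell_type": "code",
"execution_count": null,
"id": "vector-shape-check",
"metadata": {},
"outputs": [],
"source": [
"# Optional check: after mean pooling, every document is one 100-dim vector,\n",
"# so the shape should be (10000, 100), matching FEATURES in the model below.\n",
"print(train_x_vec.shape, train_x_vec.dtype)"
]
},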
{
"cell_type": "code",
"execution_count": null,
"id": "b52269f9-f143-483d-9669-ce8f5972d6bb",
"metadata": {},
"outputs": [],
"source": [
"FEATURES = 100\n",
"\n",
"class NeuralNetworkModel(torch.nn.Module):\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(FEATURES,500)\n",
" self.fc2 = torch.nn.Linear(500,1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x\n",
"\n",
"nn_model = NeuralNetworkModel()\n",
"BATCH_SIZE = 40\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.1)\n",
"\n",
"def get_loss_acc(model, data_x, data_y):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" model.eval()\n",
" for i in range(0, data_y.shape[0], BATCH_SIZE):\n",
" X = data_x[i:i+BATCH_SIZE]\n",
" X = torch.tensor(X.astype(np.float32))\n",
" Y = data_y[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" loss = criterion(Y_predictions, Y)\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
" return (loss_score / items_total), (acc_score / items_total)\n",
"\n",
"\n",
"for epoch in range(5):\n",
" loss_score = 0\n",
" acc_score = 0\n",
" items_total = 0\n",
" nn_model.train()\n",
" for i in range(0, train_y.shape[0] - 42, BATCH_SIZE):\n",
" X = train_x_vec[i:i+BATCH_SIZE]\n",
" X = torch.tensor(X.astype(np.float32))\n",
" Y = train_y[i:i+BATCH_SIZE]\n",
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
" Y_predictions = nn_model(X)\n",
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
" items_total += Y.shape[0]\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions, Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"\n",
" loss_score += loss.item() * Y.shape[0]\n",
"\n",
" display(epoch)\n",
" display(get_loss_acc(model, train_x_vect, train_y))"
]
},
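{
"cell_type": "code",
"execution_count": null,
"id": "final-train-metrics",
"metadata": {},
"outputs": [],
"source": [
"# Report the final training-set metrics (a sketch reusing the helper\n",
"# defined above; training accuracy only, as no held-out labels are loaded).\n",
"final_loss, final_acc = get_loss_acc(nn_model, train_x_vec, train_y)\n",
"print(f'train loss: {final_loss:.4f}, train accuracy: {final_acc:.4f}')"
]
},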
{
"cell_type": "code",
"execution_count": null,
"id": "1482f342-f2ea-4c9d-b221-5ef451e3a6b3",
"metadata": {},
"outputs": [],
"source": [
"#print('trenowanie modelu')\n",
"model = NeuralNetworkModel()\n",
"BATCH_SIZE = 5\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
"\n",
"for epoch in range(BATCH_SIZE):\n",
" model.train()\n",
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
" X = x_train[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" y = y_train[i:i + BATCH_SIZE]\n",
" y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)\n",
" optimizer.zero_grad()\n",
" outputs = model(X.float())\n",
" loss = criterion(outputs, y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"#print('predykcja wynikow')\n",
"y_dev = []\n",
"y_test = []\n",
"model.eval()\n",
"\n",
"with torch.no_grad():\n",
" for i in range(0, len(x_dev), BATCH_SIZE):\n",
" X = x_dev[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" outputs = model(X.float())\n",
" prediction = (outputs > 0.5)\n",
" y_dev += prediction.tolist()\n",
"\n",
" for i in range(0, len(x_test), BATCH_SIZE):\n",
" X = x_test[i:i + BATCH_SIZE]\n",
" X = torch.tensor(X)\n",
" outputs = model(X.float())\n",
" y = (outputs >= 0.5)\n",
" y_test += prediction.tolist()"
]
}
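,
{
"cell_type": "code",
"execution_count": null,
"id": "write-predictions",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical output step: persist the boolean predictions as 0/1 lines,\n",
"# one per input. The dev-0/test-A paths follow the usual Gonito challenge\n",
"# layout and are an assumption; adjust them to this repository's structure.\n",
"with open('dev-0/out.tsv', 'wt') as file:\n",
"    for p in y_dev:\n",
"        file.write(str(int(p[0])) + '\\n')\n",
"with open('test-A/out.tsv', 'wt') as file:\n",
"    for p in y_test:\n",
"        file.write(str(int(p[0])) + '\\n')"
]
}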
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}