done
This commit is contained in:
parent
9cb2fb2612
commit
54b82a7411
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out2.tsv
Normal file
5452
dev-0/out2.tsv
Normal file
File diff suppressed because it is too large
Load Diff
320
regresja-logistyczna.ipynb
Normal file
320
regresja-logistyczna.ipynb
Normal file
@ -0,0 +1,320 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "d2b899fb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import os\n",
|
||||||
|
"import gensim\n",
|
||||||
|
"from gensim.models import Word2Vec\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import matplotlib.gridspec as gridspec\n",
|
||||||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
||||||
|
"from sklearn.linear_model import LogisticRegression\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import csv"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"id": "39a1f19a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"98129\n",
|
||||||
|
"98129\n",
|
||||||
|
"5452\n",
|
||||||
|
"5452\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"b'Skipping line 1983: expected 1 fields, saw 2\\nSkipping line 5199: expected 1 fields, saw 2\\n'\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# load the data\n",
|
||||||
|
"train = pd.read_table('train/train.tsv', error_bad_lines=False, sep='\\t', quoting=csv.QUOTE_NONE, header=None)\n",
|
||||||
|
"x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
|
||||||
|
"y_dev = pd.read_table('dev-0/expected.tsv', error_bad_lines=False, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
|
||||||
|
"x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
|
||||||
|
"\n",
|
||||||
|
"# split into x and y\n",
|
||||||
|
"x_train = train[1].values\n",
|
||||||
|
"y_train = train[0].values\n",
|
||||||
|
"x_dev = x_dev[0].values\n",
|
||||||
|
"x_test = x_test[0].values\n",
|
||||||
|
"\n",
|
||||||
|
"print(len(x_train))\n",
|
||||||
|
"print(len(y_train))\n",
|
||||||
|
"print(len(x_dev))\n",
|
||||||
|
"print(len(y_dev))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"id": "c637937e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import nltk\n",
|
||||||
|
"#nltk.download('punkt')\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# tokenization\n",
|
||||||
|
"def tokenize_data(data):\n",
|
||||||
|
"    \"\"\"Tokenize each document; keep only alphabetic tokens, lower-cased.\"\"\"\n",
|
||||||
|
"    # Filter with a comprehension: deleting from the list while advancing an\n",
|
||||||
|
"    # index skips the token that follows every removed one.\n",
|
||||||
|
"    return [[token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]\n",
|
||||||
|
"            for text in data]\n",
|
||||||
|
"\n",
|
||||||
|
"x_train_tokenized = tokenize_data(x_train)\n",
|
||||||
|
"x_dev_tokenized = tokenize_data(x_dev)\n",
|
||||||
|
"x_test_tokenized = tokenize_data(x_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 34,
|
||||||
|
"id": "890b3cca",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[('róż', 0.8955456018447876), ('kwiatek', 0.8504886031150818), ('fiołek', 0.831953763961792), ('chryzantema', 0.8315931558609009), ('bukiet', 0.8306410908699036), ('wiśnia', 0.8005671501159668), ('żonkil', 0.8005172610282898), ('liść', 0.7998315095901489), ('lilia', 0.7931062579154968), ('peonia', 0.7918344140052795)]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/usr/local/Cellar/jupyterlab/3.0.14/libexec/lib/python3.9/site-packages/gensim/models/keyedvectors.py:772: RuntimeWarning: invalid value encountered in true_divide\n",
|
||||||
|
" dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from gensim.models import KeyedVectors\n",
|
||||||
|
"\n",
|
||||||
|
"word2vec_model = KeyedVectors.load(\"word2vec.bin\")\n",
|
||||||
|
"\n",
|
||||||
|
"# sanity check that the model works\n",
|
||||||
|
"print(word2vec_model.similar_by_word(\"kwiat\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 62,
|
||||||
|
"id": "6bd92640",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def embed_documents(tokenized_docs):\n",
|
||||||
|
"    \"\"\"Return one float32 row per document: the mean word2vec vector of its tokens.\"\"\"\n",
|
||||||
|
"    # Documents with no in-vocabulary token fall back to a zero vector.\n",
|
||||||
|
"    empty_doc = np.zeros(word2vec_model.vector_size, dtype=np.float32)\n",
|
||||||
|
"    rows = []\n",
|
||||||
|
"    for tokens in tokenized_docs:\n",
|
||||||
|
"        known = [word2vec_model[token] for token in tokens if token in word2vec_model]\n",
|
||||||
|
"        rows.append(np.mean(known, axis=0) if known else empty_doc)\n",
|
||||||
|
"    return np.array(rows, dtype=np.float32)\n",
|
||||||
|
"\n",
|
||||||
|
"# Embed the token lists, not the raw strings: iterating a raw string yields\n",
|
||||||
|
"# single characters, so the averages would be built from character vectors.\n",
|
||||||
|
"# The raw x_train / x_dev / x_test are left untouched so the cell can be re-run.\n",
|
||||||
|
"x_train_vec = embed_documents(x_train_tokenized)\n",
|
||||||
|
"x_train_tensor = torch.tensor(x_train_vec)\n",
|
||||||
|
"x_dev_vec = embed_documents(x_dev_tokenized)\n",
|
||||||
|
"x_test_vec = embed_documents(x_test_tokenized)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"id": "df544bfb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||||
|
"    \"\"\"Feed-forward binary classifier: Linear(100, 200) -> ReLU -> Linear(200, 1) -> sigmoid.\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"    def __init__(self):\n",
|
||||||
|
"        super().__init__()\n",
|
||||||
|
"        self.fc1 = torch.nn.Linear(100, 200)\n",
|
||||||
|
"        self.fc2 = torch.nn.Linear(200, 1)\n",
|
||||||
|
"\n",
|
||||||
|
"    def forward(self, x):\n",
|
||||||
|
"        \"\"\"Apply both layers and return the sigmoid of the single output unit.\"\"\"\n",
|
||||||
|
"        hidden = torch.relu(self.fc1(x))\n",
|
||||||
|
"        return torch.sigmoid(self.fc2(hidden))\n",
|
||||||
|
"\n",
|
||||||
|
"nn_model = NeuralNetworkModel()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 67,
|
||||||
|
"id": "884d80ec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Binary cross-entropy on the sigmoid output of the network.\n",
|
||||||
|
"criterion = torch.nn.BCELoss()\n",
|
||||||
|
"# Optimise the parameters of nn_model, the network instantiated above\n",
|
||||||
|
"# (there is no variable called `model` in this notebook).\n",
|
||||||
|
"optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.01)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 68,
|
||||||
|
"id": "eacc269d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"batch_size = 12\n",
|
||||||
|
"for epoch in range(6):\n",
|
||||||
|
"    # running totals for this epoch\n",
|
||||||
|
"    loss_score = 0\n",
|
||||||
|
"    acc_score = 0\n",
|
||||||
|
"    items_total = 0\n",
|
||||||
|
"    nn_model.train()\n",
|
||||||
|
"\n",
|
||||||
|
"    for i in range(0, y_train.shape[0], batch_size):\n",
|
||||||
|
"        X = torch.tensor(x_train_vec[i:i + batch_size].astype(np.float32))\n",
|
||||||
|
"        Y = torch.tensor(y_train[i:i + batch_size].astype(np.float32)).reshape(-1, 1)\n",
|
||||||
|
"        Y_predictions = nn_model(X)\n",
|
||||||
|
"        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
|
||||||
|
"        items_total += Y.shape[0]\n",
|
||||||
|
"\n",
|
||||||
|
"        optimizer.zero_grad()\n",
|
||||||
|
"        loss = criterion(Y_predictions, Y)\n",
|
||||||
|
"        loss.backward()\n",
|
||||||
|
"        optimizer.step()\n",
|
||||||
|
"\n",
|
||||||
|
"        # BCELoss returns the batch mean by default, so weight it by the batch size\n",
|
||||||
|
"        loss_score += loss.item() * Y.shape[0]\n",
|
||||||
|
"\n",
|
||||||
|
"    # report the epoch averages instead of only the epoch index\n",
|
||||||
|
"    print(f'epoch {epoch}: loss={loss_score / items_total:.4f}, accuracy={acc_score / items_total:.4f}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 70,
|
||||||
|
"id": "daa85677",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def predict_labels(features):\n",
|
||||||
|
"    \"\"\"Return 0/1 int32 predictions of nn_model for a float feature matrix.\"\"\"\n",
|
||||||
|
"    nn_model.eval()\n",
|
||||||
|
"    with torch.no_grad():  # inference only, no autograd graph needed\n",
|
||||||
|
"        probabilities = nn_model(torch.tensor(features.astype(np.float32)))\n",
|
||||||
|
"    return (probabilities.cpu().numpy() > 0.5).astype(np.int32)\n",
|
||||||
|
"\n",
|
||||||
|
"# predictions for the dev set\n",
|
||||||
|
"y_pred_dev = predict_labels(x_dev_vec)\n",
|
||||||
|
"y_pred_dev.tofile('dev-0/out2.tsv', sep='\\n')\n",
|
||||||
|
"\n",
|
||||||
|
"# predictions for the test set (built from x_test_vec, not the dev features)\n",
|
||||||
|
"y_pred_test = predict_labels(x_test_vec)\n",
|
||||||
|
"y_pred_test.tofile('test-A/out2.tsv', sep='\\n')\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
5452
test-A/out.tsv
Normal file
5452
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
test-A/out2.tsv
Normal file
5452
test-A/out2.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user