Compare commits

...

1 Commits

Author SHA1 Message Date
patrycjalazna
54b82a7411 done 2021-05-26 01:35:32 +02:00
6 changed files with 120260 additions and 0 deletions

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/out2.tsv Normal file

File diff suppressed because it is too large Load Diff

320
regresja-logistyczna.ipynb Normal file
View File

@@ -0,0 +1,320 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"id": "d2b899fb",
"metadata": {},
"outputs": [],
"source": [
"# All third-party imports for the notebook in one place.\n",
"# (BUG FIX: `import numpy as np` appeared twice in the original cell.)\n",
"import numpy as np\n",
"import os\n",
"import gensim\n",
"from gensim.models import Word2Vec\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.gridspec as gridspec\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.linear_model import LogisticRegression\n",
"import torch\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "39a1f19a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"98129\n",
"98129\n",
"5452\n",
"5452\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"b'Skipping line 1983: expected 1 fields, saw 2\\nSkipping line 5199: expected 1 fields, saw 2\\n'\n"
]
}
],
"source": [
"# Load the data sets; error_bad_lines=False silently skips malformed rows\n",
"# (NOTE(review): deprecated since pandas 1.3 — newer pandas uses on_bad_lines='skip')\n",
"train = pd.read_table('train/train.tsv', error_bad_lines=False, sep='\\t', quoting=csv.QUOTE_NONE, header=None)\n",
"x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"y_dev = pd.read_table('dev-0/expected.tsv', error_bad_lines=False, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"\n",
"# Split into features (column 1) and labels (column 0)\n",
"x_train = train[1].values\n",
"y_train = train[0].values\n",
"x_dev = x_dev[0].values\n",
"x_test = x_test[0].values\n",
"\n",
"# Sanity check: feature/label counts should line up per split\n",
"print(len(x_train))\n",
"print(len(y_train))\n",
"print(len(x_dev))\n",
"print(len(y_dev))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c637937e",
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"#nltk.download('punkt')\n",
"\n",
"\n",
"# tokenizacja \n",
"def tokenize_data(data):\n",
" data_tokenize = [nltk.word_tokenize(x) for x in data]\n",
" \n",
" for doc in data_tokenize:\n",
" i = 0\n",
" while i < len(doc):\n",
" if doc[i].isalpha():\n",
" doc[i] = doc[i].lower()\n",
" else:\n",
" del doc[i]\n",
" i += 1\n",
" return data_tokenize\n",
"\n",
"x_train_tokenized = tokenize_data(x_train)\n",
"x_dev_tokenized = tokenize_data(x_dev)\n",
"x_test_tokenized = tokenize_data(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "890b3cca",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('róż', 0.8955456018447876), ('kwiatek', 0.8504886031150818), ('fiołek', 0.831953763961792), ('chryzantema', 0.8315931558609009), ('bukiet', 0.8306410908699036), ('wiśnia', 0.8005671501159668), ('żonkil', 0.8005172610282898), ('liść', 0.7998315095901489), ('lilia', 0.7931062579154968), ('peonia', 0.7918344140052795)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/Cellar/jupyterlab/3.0.14/libexec/lib/python3.9/site-packages/gensim/models/keyedvectors.py:772: RuntimeWarning: invalid value encountered in true_divide\n",
" dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]\n"
]
}
],
"source": [
"from gensim.models import KeyedVectors\n",
"\n",
"# Load pre-trained (Polish) word embeddings saved with KeyedVectors.save().\n",
"word2vec_model = KeyedVectors.load(\"word2vec.bin\")\n",
"\n",
"# Sanity check that the embeddings loaded: nearest neighbours of 'kwiat' (flower)\n",
"print(word2vec_model.similar_by_word(\"kwiat\"))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "6bd92640",
"metadata": {},
"outputs": [],
"source": [
"# Average the word2vec vectors of each document's tokens; a document with no\n",
"# in-vocabulary token falls back to a zero vector of the embedding size (100).\n",
"# BUG FIX: the original iterated the RAW strings (x_train/x_dev/x_test), so\n",
"# `for word in content` iterated single characters; the *_tokenized lists\n",
"# produced by the previous cell were computed but never used.\n",
"def doc_vector(tokens):\n",
"    \"\"\"Mean word2vec embedding of `tokens` (zeros(100) if none are known).\"\"\"\n",
"    vectors = [word2vec_model[w] for w in tokens if w in word2vec_model]\n",
"    return np.mean(vectors or [np.zeros(100)], axis=0)\n",
"\n",
"x_train_vec = np.array([doc_vector(doc) for doc in x_train_tokenized], dtype=np.float32)\n",
"x_train_tensor = torch.tensor(x_train_vec)\n",
"\n",
"x_dev_vec = np.array([doc_vector(doc) for doc in x_dev_tokenized], dtype=np.float32)\n",
"x_test_vec = np.array([doc_vector(doc) for doc in x_test_tokenized], dtype=np.float32)\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "df544bfb",
"metadata": {},
"outputs": [],
"source": [
"class NeuralNetworkModel(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(NeuralNetworkModel, self).__init__()\n",
" self.fc1 = torch.nn.Linear(100,200)\n",
" self.fc2 = torch.nn.Linear(200,1)\n",
"\n",
" def forward(self, x):\n",
" x = self.fc1(x)\n",
" x = torch.relu(x)\n",
" x = self.fc2(x)\n",
" x = torch.sigmoid(x)\n",
" return x\n",
" \n",
"nn_model = NeuralNetworkModel()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "884d80ec",
"metadata": {},
"outputs": [],
"source": [
"# Binary cross-entropy on sigmoid outputs, plain SGD.\n",
"criterion = torch.nn.BCELoss()\n",
"# BUG FIX: the original referenced undefined `model`; the network is `nn_model`,\n",
"# so this cell raised NameError on a fresh Restart-and-Run-All.\n",
"optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.01)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "eacc269d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"1"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"3"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"4"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Mini-batch SGD training.\n",
"batch_size = 12\n",
"for epoch in range(6):\n",
"    loss_score = 0\n",
"    acc_score = 0\n",
"    items_total = 0\n",
"    nn_model.train()\n",
"\n",
"    for i in range(0, y_train.shape[0], batch_size):\n",
"        X = torch.tensor(x_train_vec[i:i + batch_size].astype(np.float32))\n",
"        Y = torch.tensor(y_train[i:i + batch_size].astype(np.float32)).reshape(-1, 1)\n",
"        Y_predictions = nn_model(X)\n",
"        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
"        items_total += Y.shape[0]\n",
"\n",
"        optimizer.zero_grad()\n",
"        loss = criterion(Y_predictions, Y)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"\n",
"        loss_score += loss.item() * Y.shape[0]\n",
"    # FIX: the original accumulated loss/accuracy but only displayed the epoch\n",
"    # counter; report the per-epoch mean loss and accuracy as well.\n",
"    display((epoch, loss_score / items_total, acc_score / items_total))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "daa85677",
"metadata": {},
"outputs": [],
"source": [
"# Predictions for dev-0 and test-A.\n",
"def predict_and_save(x_vec, path):\n",
"    \"\"\"Run nn_model on x_vec and write 0/1 labels to `path`, one per line.\"\"\"\n",
"    preds = nn_model(torch.tensor(x_vec.astype(np.float32)))\n",
"    preds = (preds.cpu().detach().numpy() > 0.5).astype(np.int32)\n",
"    preds.tofile(path, sep='\\n')\n",
"\n",
"predict_and_save(x_dev_vec, 'dev-0/out2.tsv')\n",
"# BUG FIX: the original computed the test predictions from x_dev_vec, so\n",
"# test-A/out2.tsv contained dev-set predictions. Use x_test_vec here.\n",
"predict_and_save(x_test_vec, 'test-A/out2.tsv')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

5452
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
test-A/out2.tsv Normal file

File diff suppressed because it is too large Load Diff

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff