init
This commit is contained in:
parent 8cb18767bb
commit 54ddfbc0e2
5452 dev-0/expected.tsv Normal file
File diff suppressed because it is too large
5452 dev-0/in.tsv Normal file
File diff suppressed because it is too large
236 neural.ipynb Normal file
@@ -0,0 +1,236 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
"import gensim\n",
|
||||
"import nltk\n",
|
||||
"import pandas as pd \n",
|
||||
"import numpy as np \n",
|
||||
"import os\n",
|
||||
"import io\n",
|
||||
"import gzip\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"# wget http://publications.it.p.lodz.pl/2016/word_embeddings/pl-embeddings-cbow.txt 900MB\n",
|
||||
"\n",
|
||||
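    "# read a gzipped TSV into a DataFrame; error_bad_lines=False skips malformed rows\n",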
"def read_data_gz(baseUrl):\n",
|
||||
" f = gzip.open(baseUrl,'r')\n",
|
||||
" data_unzip = f.read()\n",
|
||||
" data = pd.read_table(io.StringIO(data_unzip.decode('utf-8')), error_bad_lines=False, header= None) \n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def preprocess(data):\n",
|
||||
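    "    # tokenize each document, lowercase alphabetic tokens, drop all other tokens\n",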
" data_tokenize = [nltk.word_tokenize(x) for x in data]\n",
|
||||
"\n",
|
||||
" for doc in data_tokenize:\n",
|
||||
" i = 0\n",
|
||||
" while i < len(doc):\n",
|
||||
" if doc[i].isalpha():\n",
|
||||
" doc[i] = doc[i].lower()\n",
|
||||
" else:\n",
|
||||
" del doc[i]\n",
|
||||
" i += 1\n",
|
||||
" return data_tokenize\n",
|
||||
"\n",
|
||||
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||
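    "    # feed-forward binary classifier: 100-d document vector -> 200 hidden units -> 1 sigmoid output\n",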
"\n",
|
||||
" def __init__(self):\n",
|
||||
" super(NeuralNetworkModel, self).__init__()\n",
|
||||
" self.fc1 = torch.nn.Linear(100,200)\n",
|
||||
" self.fc2 = torch.nn.Linear(200,1)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.fc1(x)\n",
|
||||
" x = torch.relu(x)\n",
|
||||
" x = self.fc2(x)\n",
|
||||
" x = torch.sigmoid(x)\n",
|
||||
" return x\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
     ]
    }
   ],
   "source": [
    "data_train = read_data_gz('./train/train.tsv.gz')\n",
    "data_dev = pd.read_table('./dev-0/in.tsv', error_bad_lines=False, header=None)\n",
    "\n",
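    "# column 0 holds the labels, column 1 the documents\n",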
"y_train = data_train[0].values\n",
|
||||
"x_train = data_train[1].values\n",
|
||||
"x_dev = data_dev[0].values\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
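    "# load the pretrained Polish CBOW word embeddings (plain-text word2vec format, 100-d vectors)\n",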
"model = gensim.models.KeyedVectors.load_word2vec_format('pl-embeddings-cbow.txt', binary=False)"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train_tokenize = preprocess(x_train)\n",
    "x_dev_tokenize = preprocess(x_dev)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
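    "# represent each document as the mean of its word vectors; documents with no in-vocabulary words fall back to a zero vector\n",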
"x_train_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_train_tokenize]\n",
|
||||
"x_train_vectors = np.array(x_train_vectors)\n",
|
||||
"\n",
|
||||
"x_dev_vectors= [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_dev_tokenize]\n",
|
||||
"x_dev_vectors = np.array(x_dev_vectors, dtype=np.float32)\n",
|
||||
"x_dev_tensor = torch.tensor(x_dev_vectors.astype(np.float32))\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training the model...\n"
     ]
    }
   ],
   "source": [
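    "# train with binary cross-entropy on the sigmoid outputs, plain SGD, mini-batches of 10\n",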
"# -------------------------------------------------------------------------------------------------------------------------------------------\n",
|
||||
"model_nn = NeuralNetworkModel()\n",
|
||||
"criterion = torch.nn.BCELoss()\n",
|
||||
"optimizer = torch.optim.SGD(model_nn.parameters(), lr=0.01)\n",
|
||||
" \n",
|
||||
"batch_size = 10\n",
|
||||
"print('Trenowanie modelu...')\n",
|
||||
" \n",
|
||||
"for epoch in range(6):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" model_nn.train()\n",
|
||||
" for i in range(0, y_train.shape[0], batch_size):\n",
|
||||
" X = x_train_vectors[i:i+batch_size]\n",
|
||||
" X = torch.tensor(X.astype(np.float32))\n",
|
||||
" Y = y_train[i:i+batch_size]\n",
|
||||
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
|
||||
" \n",
|
||||
" Y_predictions = model_nn(X)\n",
|
||||
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
|
||||
" items_total += Y.shape[0] \n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(Y_predictions, Y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * Y.shape[0]\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
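    "# hard predictions: threshold the sigmoid outputs at 0.5\n",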
"# -------------------------------------------------------------------------------------------------------------------------------------------\n",
|
||||
"ypred = model_nn(x_dev_tensor)\n",
|
||||
"ypred = ypred.cpu().detach().numpy() \n",
|
||||
"ypred = (ypred > 0.5)\n",
|
||||
"ypred = np.asarray(ypred, dtype=np.int32)\n",
|
||||
"\n",
|
||||
"y_exptected = pd.read_table('./dev-0/expected.tsv', header= None)\n",
|
||||
"y_exptected = y_exptected.values"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score =  0.973037417461482\n",
      "------------------------------------------------------------\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.97      0.96      0.96      1983\n",
      "           1       0.97      0.98      0.98      3469\n",
      "\n",
      "    accuracy                           0.97      5452\n",
      "   macro avg       0.97      0.97      0.97      5452\n",
      "weighted avg       0.97      0.97      0.97      5452\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "print(\"Score = \", accuracy_score(y_expected, ypred))\n",
    "\n",
    "print('-' * 60)\n",
    "print(classification_report(y_expected, ypred))\n"
   ]
  }
 ],
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN train/train.tsv.gz Normal file
Binary file not shown.