Test 2 Outputs

This commit is contained in:
Dominik Strzako 2021-05-22 16:01:18 +02:00
parent 68a99a2c2d
commit c68b2d0d1a
6 changed files with 579984 additions and 866 deletions

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 38,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -11,21 +11,22 @@
"import torch\n", "import torch\n",
"import csv\n", "import csv\n",
"from nltk.tokenize import word_tokenize\n", "from nltk.tokenize import word_tokenize\n",
"from gensim.models import Word2Vec\n", "#from gensim.models import Word2Vec\n",
"import gensim.downloader" "import gensim.downloader as api"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 39,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"class NeuralNetwork(torch.nn.Module):\n", "#Sieć neuronowa z ćwiczeń 8\n",
" def __init__(self, input_size, hidden_size, num_classes):\n", "class NeuralNetwork(torch.nn.Module): \n",
" def __init__(self, hidden_size):\n",
" super(NeuralNetwork, self).__init__()\n", " super(NeuralNetwork, self).__init__()\n",
" self.l1 = torch.nn.Linear(input_size, hidden_size)\n", " self.l1 = torch.nn.Linear(300, hidden_size) #Korzystamy z Googlowego word2vec-google-news-300 który ma zawsze na wejściu wymiar 300\n",
" self.l2 = torch.nn.Linear(hidden_size, num_classes)\n", " self.l2 = torch.nn.Linear(hidden_size, 1)\n",
"\n", "\n",
" def forward(self, x):\n", " def forward(self, x):\n",
" x = self.l1(x)\n", " x = self.l1(x)\n",
@ -37,68 +38,35 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 40,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"col_names = ['content', 'id', 'label']\n" "# Wczytanie X i Y do Train oraz X do Dev i Test\n",
"X_train = pd.read_table('train/in.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])\n",
"y_train = pd.read_table('train/expected.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['label'])\n",
"X_dev = pd.read_table('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])\n",
"X_test = pd.read_table('test-A/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 41,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wczytanie danych...\n"
]
}
],
"source": [ "source": [
"print('Wczytanie danych...')\n", "# Preprocessing danych\n",
"# loading dataset\n",
"train_set_features = pd.read_table('train/in.tsv.xz', error_bad_lines=False, quoting=csv.QUOTE_NONE, header=None, names=col_names[:2])\n",
"train_set_labels = pd.read_table('train/expected.tsv', error_bad_lines=False, quoting=csv.QUOTE_NONE, header=None, names=col_names[2:])\n",
"dev_set = pd.read_table('dev-0/in.tsv.xz', error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, names=col_names[:2])\n",
"test_set = pd.read_table('test-A/in.tsv.xz', error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, names=col_names[:2])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Preprocessing danych...\n"
]
}
],
"source": [
"print('Preprocessing danych...')\n",
"# lowercase\n", "# lowercase\n",
"X_train = train_set_features['content'].str.lower()\n", "# https://www.datacamp.com/community/tutorials/case-conversion-python\n",
"y_train = train_set_labels['label']" "X_train = X_train.content.str.lower()\n",
"y_train = y_train['label']\n",
"X_dev = X_dev.content.str.lower()\n",
"X_test = X_test.content.str.lower()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"X_dev = dev_set['content'].str.lower()\n",
"X_test = test_set['content'].str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -110,55 +78,40 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[==================================================] 100.0% 1662.8/1662.8MB downloaded\n"
]
}
],
"source": [
"# word2vec\n",
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
"X_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_train]\n",
"X_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_dev]\n",
"X_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"model = NeuralNetwork(300, 600, 1)\n", "# word2vec\n",
"\n", "# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html\n",
"criterion = torch.nn.BCELoss()\n", "w2v = api.load('word2vec-google-news-300')\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)\n", "X_train = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_train]\n",
"\n", "X_dev = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_dev]\n",
"batch_size = 10" "X_test = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_test]"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 45,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trenowanie modelu...\n"
]
}
],
"source": [ "source": [
"print('Trenowanie modelu...')\n", "model = NeuralNetwork(600)\n",
"for epoch in range(6):\n", "\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)\n",
"\n",
"batch_size = 15"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Trening modelu z ćwiczeń 8\n",
"for epoch in range(5):\n",
" model.train()\n", " model.train()\n",
" for i in range(0, y_train.shape[0], batch_size):\n", " for i in range(0, y_train.shape[0], batch_size):\n",
" X = X_train[i:i+batch_size]\n", " X = X_train[i:i+batch_size]\n",
@ -176,7 +129,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 47,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -218,13 +171,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 49,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"dev_prediction.tofile('./dev-0/out.tsv', sep='\\n')\n", "dev_prediction.tofile('./dev-0/out.tsv', sep='\\n')\n",
"test_prediction.tofile('./test-A/out.tsv', sep='\\n')" "test_prediction.tofile('./test-A/out.tsv', sep='\\n')"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long