Test 2 Outputs

This commit is contained in:
Dominik Strzako 2021-05-22 16:01:18 +02:00
parent 68a99a2c2d
commit c68b2d0d1a
6 changed files with 579984 additions and 866 deletions

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -11,21 +11,22 @@
"import torch\n",
"import csv\n",
"from nltk.tokenize import word_tokenize\n",
"from gensim.models import Word2Vec\n",
"import gensim.downloader"
"#from gensim.models import Word2Vec\n",
"import gensim.downloader as api"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"#Sieć neuronowa z ćwiczeń 8\n",
"class NeuralNetwork(torch.nn.Module): \n",
" def __init__(self, input_size, hidden_size, num_classes):\n",
" def __init__(self, hidden_size):\n",
" super(NeuralNetwork, self).__init__()\n",
" self.l1 = torch.nn.Linear(input_size, hidden_size)\n",
" self.l2 = torch.nn.Linear(hidden_size, num_classes)\n",
" self.l1 = torch.nn.Linear(300, hidden_size) #Korzystamy z Googlowego word2vec-google-news-300 który ma zawsze na wejściu wymiar 300\n",
" self.l2 = torch.nn.Linear(hidden_size, 1)\n",
"\n",
" def forward(self, x):\n",
" x = self.l1(x)\n",
@ -37,68 +38,35 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"col_names = ['content', 'id', 'label']\n"
"# Wczytanie X i Y do Train oraz X do Dev i Test\n",
"X_train = pd.read_table('train/in.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['content', 'id'], usecols=['content'])\n",
"y_train = pd.read_table('train/expected.tsv', sep='\\t', error_bad_lines=False, quoting=3, header=None, names=['label'])\n",
"X_dev = pd.read_table('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])\n",
"X_test = pd.read_table('test-A/in.tsv', sep='\\t', error_bad_lines=False, header=None, quoting=3, names=['content', 'id'], usecols=['content'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wczytanie danych...\n"
]
}
],
"outputs": [],
"source": [
"print('Wczytanie danych...')\n",
"# loading dataset\n",
"train_set_features = pd.read_table('train/in.tsv.xz', error_bad_lines=False, quoting=csv.QUOTE_NONE, header=None, names=col_names[:2])\n",
"train_set_labels = pd.read_table('train/expected.tsv', error_bad_lines=False, quoting=csv.QUOTE_NONE, header=None, names=col_names[2:])\n",
"dev_set = pd.read_table('dev-0/in.tsv.xz', error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, names=col_names[:2])\n",
"test_set = pd.read_table('test-A/in.tsv.xz', error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, names=col_names[:2])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Preprocessing danych...\n"
]
}
],
"source": [
"print('Preprocessing danych...')\n",
"# Preprocessing danych\n",
"# lowercase\n",
"X_train = train_set_features['content'].str.lower()\n",
"y_train = train_set_labels['label']"
"# https://www.datacamp.com/community/tutorials/case-conversion-python\n",
"X_train = X_train.content.str.lower()\n",
"y_train = y_train['label']\n",
"X_dev = X_dev.content.str.lower()\n",
"X_test = X_test.content.str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"X_dev = dev_set['content'].str.lower()\n",
"X_test = test_set['content'].str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@ -110,55 +78,40 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[==================================================] 100.0% 1662.8/1662.8MB downloaded\n"
]
}
],
"source": [
"# word2vec\n",
"word2vec = gensim.downloader.load('word2vec-google-news-300')\n",
"X_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_train]\n",
"X_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_dev]\n",
"X_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"model = NeuralNetwork(300, 600, 1)\n",
"\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)\n",
"\n",
"batch_size = 10"
"# word2vec\n",
"# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html\n",
"w2v = api.load('word2vec-google-news-300')\n",
"X_train = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_train]\n",
"X_dev = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_dev]\n",
"X_test = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_test]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trenowanie modelu...\n"
]
}
],
"outputs": [],
"source": [
"print('Trenowanie modelu...')\n",
"for epoch in range(6):\n",
"model = NeuralNetwork(600)\n",
"\n",
"criterion = torch.nn.BCELoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)\n",
"\n",
"batch_size = 15"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Trening modelu z ćwiczeń 8\n",
"for epoch in range(5):\n",
" model.train()\n",
" for i in range(0, y_train.shape[0], batch_size):\n",
" X = X_train[i:i+batch_size]\n",
@ -176,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 47,
"metadata": {},
"outputs": [
{
@ -218,13 +171,20 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"dev_prediction.tofile('./dev-0/out.tsv', sep='\\n')\n",
"test_prediction.tofile('./test-A/out.tsv', sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long