|
|
|
@ -82,8 +82,9 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# word2vec\n",
|
|
|
|
|
"# word2vec zgodnie z poradą Pana Jakuba\n",
|
|
|
|
|
"# https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html\n",
|
|
|
|
|
"# https://www.kaggle.com/kstathou/word-embeddings-logistic-regression\n",
|
|
|
|
|
"w2v = api.load('word2vec-google-news-300')\n",
|
|
|
|
|
"X_train = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_train]\n",
|
|
|
|
|
"X_dev = [np.mean([w2v[w] for w in content if w in w2v] or [np.zeros(300)], axis=0) for content in X_dev]\n",
|
|
|
|
@ -129,32 +130,26 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 47,
|
|
|
|
|
"execution_count": 59,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"Predykcje...\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"print('Predykcje...')\n",
|
|
|
|
|
"dev_prediction = []\n",
|
|
|
|
|
"test_prediction = []\n",
|
|
|
|
|
"y_dev = []\n",
|
|
|
|
|
"y_test = []\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"#model.eval() will notify all your layers that you are in eval mode\n",
|
|
|
|
|
"model.eval()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"#torch.no_grad() impacts the autograd engine and deactivate it. It will reduce memory usage and speed up\n",
|
|
|
|
|
"with torch.no_grad():\n",
|
|
|
|
|
" for i in range(0, len(X_dev), batch_size):\n",
|
|
|
|
|
" X = X_dev[i:i+batch_size]\n",
|
|
|
|
|
" X = torch.tensor(X)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" outputs = model(X.float())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" prediction = (outputs > 0.5)\n",
|
|
|
|
|
" dev_prediction = dev_prediction + prediction.tolist()\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" y = (outputs > 0.5)\n",
|
|
|
|
|
" y_dev.extend(y)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" for i in range(0, len(X_test), batch_size):\n",
|
|
|
|
|
" X = X_test[i:i+batch_size]\n",
|
|
|
|
@ -162,21 +157,24 @@
|
|
|
|
|
"\n",
|
|
|
|
|
" outputs = model(X.float())\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" prediction = (outputs > 0.5)\n",
|
|
|
|
|
" test_prediction = test_prediction + prediction.tolist()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"dev_prediction = np.asarray(dev_prediction, dtype=np.int32)\n",
|
|
|
|
|
"test_prediction = np.asarray(test_prediction, dtype=np.int32)"
|
|
|
|
|
" y = (outputs > 0.5)\n",
|
|
|
|
|
" y_test.extend(y)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 49,
|
|
|
|
|
"execution_count": 60,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"dev_prediction.tofile('./dev-0/out.tsv', sep='\\n')\n",
|
|
|
|
|
"test_prediction.tofile('./test-A/out.tsv', sep='\\n')"
|
|
|
|
|
"y_dev = np.asarray(y_dev, dtype=np.int32)\n",
|
|
|
|
|
"y_test = np.asarray(y_test, dtype=np.int32)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"y_dev_df = pd.DataFrame({'label':y_dev})\n",
|
|
|
|
|
"y_test_df = pd.DataFrame({'label':y_test})\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"y_dev_df.to_csv(r'dev-0/out.tsv', sep='\\t', index=False, header=False)\n",
|
|
|
|
|
"y_test_df.to_csv(r'test-A/out.tsv', sep='\\t', index=False, header=False)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|