Neural network with word2vec
This commit is contained in:
parent
b217d37450
commit
7f75f2e2e2
1470
dev-0/out.tsv
1470
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
219
pytorch_classifier.ipynb
Normal file
219
pytorch_classifier.ipynb
Normal file
@ -0,0 +1,219 @@
|
||||
{
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.5-final"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3.9.5 64-bit",
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "ac59ebe37160ed0dfa835113d9b8498d9f09ceb179beaac4002f036b9467c963"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import lzma\n",
|
||||
"import torch\n",
|
||||
"import numpy as np\n",
|
||||
"from gensim import downloader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BATCH_SIZE = 10\n",
|
||||
"EPOCHS = 10\n",
|
||||
"FEATURES = 200"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self):\n",
|
||||
" super(NeuralNetworkModel, self).__init__()\n",
|
||||
" self.fc1 = torch.nn.Linear(FEATURES, 1000)\n",
|
||||
" self.fc2 = torch.nn.Linear(1000, 500)\n",
|
||||
" self.fc3 = torch.nn.Linear(500, 1)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.fc1(x)\n",
|
||||
" x = torch.relu(x)\n",
|
||||
" x = self.fc2(x)\n",
|
||||
" x = torch.relu(x)\n",
|
||||
" x = self.fc3(x)\n",
|
||||
" x = torch.sigmoid(x)\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Read train files\n",
|
||||
"with lzma.open(\"train/in.tsv.xz\", \"rt\", encoding=\"utf-8\") as train_file:\n",
|
||||
" x_train = [x.strip().lower() for x in train_file.readlines()]\n",
|
||||
"\n",
|
||||
"with open(\"train/expected.tsv\", \"r\", encoding=\"utf-8\") as train_file:\n",
|
||||
" y_train = np.array([int(x.strip()) for x in train_file.readlines()])\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word2vec = downloader.load(\"glove-twitter-200\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
|
||||
" or [np.zeros(FEATURES)], axis=0) for doc in x_train]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = NeuralNetworkModel()\n",
|
||||
"\n",
|
||||
"criterion = torch.nn.BCELoss()\n",
|
||||
"optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"0\n",
|
||||
"0.5444966091123856 0.7128072132302411\n",
|
||||
"1\n",
|
||||
"0.5187017436751196 0.7303153888921503\n",
|
||||
"2\n",
|
||||
"0.5117590330604093 0.7348944502191112\n",
|
||||
"3\n",
|
||||
"0.5075270808198805 0.7376916143781145\n",
|
||||
"4\n",
|
||||
"0.5043017516287736 0.7403230206610286\n",
|
||||
"5\n",
|
||||
"0.5016950109024928 0.7418977204838748\n",
|
||||
"6\n",
|
||||
"0.49942716640870777 0.7432134236253319\n",
|
||||
"7\n",
|
||||
"0.49766424133924386 0.7448606425189672\n",
|
||||
"8\n",
|
||||
"0.49617289846816215 0.745534033890579\n",
|
||||
"9\n",
|
||||
"0.49471875689137873 0.7467116054686286\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for epoch in range(EPOCHS):\n",
|
||||
" print(epoch)\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" for i in range(0, y_train.shape[0], BATCH_SIZE):\n",
|
||||
" x = x_train_w2v[i:i+BATCH_SIZE]\n",
|
||||
" x = torch.tensor(np.array(x).astype(np.float32))\n",
|
||||
" y = y_train[i:i+BATCH_SIZE]\n",
|
||||
" y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)\n",
|
||||
" y_pred = model(x)\n",
|
||||
" acc_score += torch.sum((y_pred > 0.5) == y).item()\n",
|
||||
" items_total += y.shape[0]\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(y_pred, y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * y.shape[0]\n",
|
||||
" \n",
|
||||
" print((loss_score / items_total), (acc_score / items_total))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Read dev files\n",
|
||||
"with lzma.open(\"dev-0/in.tsv.xz\", \"rt\", encoding=\"utf-8\") as dev_file:\n",
|
||||
" x_dev = [x.strip().lower() for x in dev_file.readlines()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]\n",
|
||||
" or [np.zeros(FEATURES)], axis=0) for doc in x_dev]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_dev = []\n",
|
||||
"with torch.no_grad():\n",
|
||||
" for i in range(0, len(x_dev_w2v), BATCH_SIZE):\n",
|
||||
" x = x_dev_w2v[i:i+BATCH_SIZE]\n",
|
||||
" x = torch.tensor(np.array(x).astype(np.float32))\n",
|
||||
" \n",
|
||||
" outputs = model(x)\n",
|
||||
" \n",
|
||||
" y = (outputs > 0.5)\n",
|
||||
" y_dev.extend(y)"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
1412
test-A/out.tsv
1412
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user