import re

import numpy as np
import torch
from gensim import downloader
BATCH_SIZE = 64
EPOCHS = 100
FEATURES = 200
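# FEATURES must match the dimensionality of the embedding model loaded below;
# glove-twitter-200 provides 200-dimensional vectors.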
with open('train/in.tsv', 'r', encoding='utf8') as f:
    X_train = f.readlines()
with open('train/expected.tsv', 'r', encoding='utf8') as f:
    y_train = f.readlines()
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev = f.readlines()
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = f.readlines()
# Drop the trailing tab-separated numeric field from each input line, and the
# trailing newline from each label.
for i, line in enumerate(X_train):
    X_train[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(X_dev):
    X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(X_test):
    X_test[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(y_train):
    y_train[i] = re.sub(r'\n', '', line)
for i, line in enumerate(y_dev):
    y_dev[i] = re.sub(r'\n', '', line)
def readData(fileName):
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:
        X = np.array([x.strip().lower() for x in f.readlines()])
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(x.strip()) for x in f.readlines()])
    return X, y

X_file, y_file = readData('dev-0')
class NeuralNetworkModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
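# A single hidden layer (FEATURES -> 500 -> 1) with a sigmoid output, paired
# with the BCELoss used in train_model below. An alternative (not used here)
# would be to drop the sigmoid and train with torch.nn.BCEWithLogitsLoss,
# which is more numerically stable.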
word2vec = downloader.load("glove-twitter-200")
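# (glove-twitter-200 is a sizeable one-time download; gensim caches it under
# ~/gensim-data.)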
# Represent each document as the mean of its in-vocabulary GloVe vectors;
# documents with no known token fall back to a zero vector.
X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
               or [np.zeros(FEATURES)], axis=0) for doc in X_train]
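# The same mean-pooling recurs for the dev and test sets further down. An
# equivalent helper (a sketch; doc_vector is a name introduced here, and the
# pipeline below keeps its inline comprehensions) would be:
def doc_vector(doc, emb=word2vec, dim=FEATURES):
    """Mean of the GloVe vectors of a doc's in-vocabulary tokens, else zeros."""
    vecs = [emb[w.lower()] for w in doc.split() if w.lower() in emb]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)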
y_train = np.array(y_train)
def train_model(X_train, y_train):
    model = NeuralNetworkModel()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)
    for epoch in range(EPOCHS):
        loss_score = 0
        acc_score = 0
        items_total = 0
        for i in range(0, y_train.shape[0], BATCH_SIZE):
            x = X_train[i:i + BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            y = y_train[i:i + BATCH_SIZE]
            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)
            y_pred = model(x)
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]
            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            loss_score += loss.item() * y.shape[0]
        # Per-epoch average loss and accuracy over the training set.
        print(epoch, loss_score / items_total, acc_score / items_total)
    return model
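# Note: nothing is seeded, so repeated runs will differ slightly; calling
# torch.manual_seed(...) before train_model would make them repeatable
# (an optional addition, not part of the original pipeline).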
def predict(model, x_test):
    preds = []
    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = x_test[i:i + BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            outputs = model(x)
            preds.extend(outputs > 0.5)
    return preds
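# predict yields one single-element boolean tensor per example, which is why
# the comprehensions below go through bool(item) to produce '0'/'1' strings.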
model = train_model(X_train_w2v, y_train)
# Training output (per epoch: average loss, accuracy), abridged:
#   epoch 0:  0.5714 / 0.6967
#   ...
#   epoch 99: 0.4608 / 0.7737
X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
             or [np.zeros(FEATURES)], axis=0) for doc in X_dev]
y_dev = predict(model, X_dev_w2v)
y_dev = ['1' if bool(item) else '0' for item in y_dev]
with open('dev-0/out.tsv', 'wt') as f:
    for pred in y_dev:
        f.write(pred + '\n')
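# Quick sanity check (a sketch, not part of the original script): score the
# dev predictions against the gold labels that readData loaded into y_file.
dev_acc = np.mean([int(p) == g for p, g in zip(y_dev, y_file)])
print(f'dev-0 accuracy: {dev_acc:.4f}')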
X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
              or [np.zeros(FEATURES)], axis=0) for doc in X_test]
y_test = predict(model, X_test_w2v)
y_test = ['1' if bool(item) else '0' for item in y_test]
with open('test-A/out.tsv', 'wt') as f:
    for pred in y_test:
        f.write(pred + '\n')