import numpy
import torch
from gensim import downloader
from nltk.tokenize import word_tokenize  # requires the NLTK "punkt" tokenizer data (nltk.download("punkt"))


class NetworkModel(torch.nn.Module):
    """Feed-forward binary classifier over averaged word2vec document vectors."""

    def __init__(self):
        super().__init__()
        dim = 300  # word2vec-google-news-300 vectors are 300-dimensional
        self.fc1 = torch.nn.Linear(dim, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


word2vec = downloader.load("word2vec-google-news-300")


def word2vecOnDoc(document):
    # Average the word2vec vectors of all in-vocabulary tokens; fall back to a
    # zero vector when a document contains no known tokens.
    return numpy.mean(
        [word2vec[token] for token in document if token in word2vec]
        or [numpy.zeros(300)],
        axis=0,
    )


def prepareData(data):
    # Tokenize each document, then reduce it to a single 300-dimensional vector.
    data = [word_tokenize(row) for row in data]
    data = [word2vecOnDoc(document) for document in data]
    return numpy.array(data, dtype=numpy.float32)


def trainModel(trainFileIn, trainFileExpected):
    with open(trainFileExpected, 'r') as f:
        expectedData = f.readlines()
    with open(trainFileIn, 'r') as f:
        inData = f.readlines()
    # expected.tsv is assumed to hold one binary (0/1) label per line.
    expectedData = numpy.array([float(row.strip()) for row in expectedData], dtype=numpy.float32)
    inData = prepareData(inData)

    networkModel = NetworkModel()
    criterion = torch.nn.BCELoss()
    optim = torch.optim.SGD(networkModel.parameters(), lr=0.02)
    epochs = 1
    batchSize = 2
    for _ in range(epochs):
        networkModel.train()
        for i in range(0, inData.shape[0], batchSize):
            x = torch.tensor(inData[i : i + batchSize])
            y = torch.tensor(expectedData[i : i + batchSize]).reshape(-1, 1)
            outputs = networkModel(x)
            loss = criterion(outputs, y)
            print(loss)
            optim.zero_grad()
            loss.backward()
            optim.step()
    return networkModel


def evaluateModel(model, inFile, outFile):
    with open(inFile, 'r') as f:
        inData = f.readlines()
    inData = prepareData(inData)
    pred = []
    batchSize = 2
    model.eval()
    with torch.no_grad():
        for i in range(0, inData.shape[0], batchSize):
            x = torch.tensor(inData[i : i + batchSize])
            outputs = model(x)
            # Threshold the sigmoid outputs at 0.5 to get 0/1 predictions.
            prediction = (outputs >= 0.5).int().flatten()
            pred += prediction.tolist()
    numpy.asarray(pred, dtype=numpy.int32).tofile(outFile, sep="\n")


model = trainModel("train/in.tsv", "train/expected.tsv")
# evaluateModel(model, "dev-0/in.tsv", "dev-0/out.tsv")
# evaluateModel(model, "test-A/in.tsv", "test-A/out.tsv")
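
# Usage sketch (assumptions, not verified against the dataset): each line of
# train/in.tsv is taken to be one raw text document and the matching line of
# train/expected.tsv a 0/1 label; dev-0/in.tsv and test-A/in.tsv are assumed to
# follow the same input format. After training finishes, uncommenting the two
# evaluateModel calls above writes one 0/1 prediction per line to
# dev-0/out.tsv and test-A/out.tsv.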