import numpy
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from gensim import downloader
from nltk.tokenize import word_tokenize
import pandas as pd

# Vector length this script assumes for every word embedding; it is used both
# for the all-zero fallback vector and for the network's input layer.
EMBEDDING_DIM = 300


class NetworkModel(torch.nn.Module):
    """Two-layer feed-forward classifier with sigmoid outputs.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output units.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return sigmoid(fc2(relu(fc1(x))))."""
        x = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(x))


word2vec = downloader.load("word2vec-google-news-300")


def word2vecOnDoc(document):
    """Return the mean embedding of the tokens of *document* known to word2vec.

    Args:
        document: Iterable of string tokens.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all tokens
        present in ``word2vec``, or ``EMBEDDING_DIM`` zeros when no token is
        known (including an empty document).
    """
    vectors = [word2vec[token] for token in document if token in word2vec]
    if not vectors:
        return numpy.zeros(EMBEDDING_DIM)
    return numpy.mean(vectors, axis=0)


def prepareData(data):
    """Turn the ``content`` column of *data* into a list of document vectors.

    Each row is lower-cased, tokenized with ``word_tokenize`` and averaged
    with ``word2vecOnDoc``.

    Args:
        data: DataFrame with a ``content`` column.

    Returns:
        A list with one 1-D numpy array per row of *data*.
    """
    # Empty / NA-like fields are parsed by pandas as NaN, and word_tokenize
    # cannot handle a float; treat them as empty text instead. astype(str)
    # guards against rows that pandas inferred as numbers.
    texts = data.content.fillna("").astype(str).str.lower()
    return [word2vecOnDoc(word_tokenize(text)) for text in texts]


def _readColumn(path, column, names, nrows=None):
    """Read one column of a header-less, unquoted TSV file, skipping bad lines.

    Args:
        path: File to read.
        column: Name of the single column to keep.
        names: Column names to assign to the file's fields.
        nrows: Optional cap on the number of rows read.

    Returns:
        A DataFrame holding only *column*.
    """
    return pd.read_table(
        path,
        # ``error_bad_lines=False`` was removed in pandas 2.0; this is its
        # replacement (available since pandas 1.3).
        on_bad_lines="skip",
        header=None,
        quoting=3,
        usecols=[column],
        names=names,
        nrows=nrows,
    )


def _toFeatureTensor(frame):
    """Vectorize *frame* with ``prepareData`` and return a float32 tensor."""
    # Converting the whole list to one ndarray first avoids building a tensor
    # from a Python list of arrays for every batch.
    vectors = numpy.asarray(prepareData(frame), dtype=numpy.float32)
    return torch.tensor(vectors)


def trainModel(
    trainFileIn,
    trainFileExpected,
    *,
    maxRows=225000,
    epochs=1,
    batchSize=2,
    learningRate=0.02,
):
    """Train a ``NetworkModel`` on averaged word2vec document vectors.

    Args:
        trainFileIn: TSV file whose first field is the document text.
        trainFileExpected: TSV file with one label per line.
        maxRows: Maximum number of rows read from each file.
        epochs: Number of passes over the training data.
        batchSize: Number of samples per optimizer step.
        learningRate: Learning rate passed to SGD.

    Returns:
        The trained ``NetworkModel``.
    """
    inData = _readColumn(trainFileIn, "content", ["content", "id"], nrows=maxRows)
    expectedData = _readColumn(trainFileExpected, "label", ["label"], nrows=maxRows)

    features = _toFeatureTensor(inData)
    labels = torch.tensor(
        expectedData["label"].astype(numpy.float32).to_numpy()
    ).reshape(-1, 1)

    # The two files are parsed independently, so skipped bad lines can leave
    # them with different row counts. Bounding by the shorter one prevents
    # empty or mismatched batches; it cannot re-align rows after a skip.
    sampleCount = min(len(features), len(labels))

    networkModel = NetworkModel(EMBEDDING_DIM, 300, 1)
    criterion = torch.nn.BCELoss()
    optim = torch.optim.SGD(networkModel.parameters(), lr=learningRate)

    for _ in range(epochs):
        networkModel.train()
        for start in range(0, sampleCount, batchSize):
            stop = min(start + batchSize, sampleCount)
            outputs = networkModel(features[start:stop])
            loss = criterion(outputs, labels[start:stop])
            optim.zero_grad()
            loss.backward()
            optim.step()
    return networkModel


def evaluateModel(model, inFile, outFile, *, batchSize=2):
    """Write 0/1 predictions of *model* for every row of *inFile* to *outFile*.

    Args:
        model: Trained network mapping document vectors to probabilities.
        inFile: TSV file whose first field is the document text.
        outFile: Destination file; receives one integer per line.
        batchSize: Number of samples scored per forward pass.
    """
    features = _toFeatureTensor(_readColumn(inFile, "content", ["content", "id"]))

    # Has no effect on the layers used here, but keeps inference correct if
    # dropout or batch-norm layers are ever added to the model.
    model.eval()
    pred = []
    with torch.no_grad():
        for start in range(0, len(features), batchSize):
            outputs = model(features[start : start + batchSize])
            # Threshold the sigmoid output at 0.5 to get the class decision.
            pred += (outputs >= 0.5).tolist()
    numpy.asarray(pred, dtype=numpy.int32).tofile(outFile, sep="\n")


model = trainModel("train/in.tsv", "train/expected.tsv")
evaluateModel(model, "dev-0/in.tsv", "dev-0/out.tsv")
evaluateModel(model, "test-A/in.tsv", "test-A/out.tsv")