Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

7 changed files with 0 additions and 310534 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

BIN
geval

Binary file not shown.

107
main.py
View File

@@ -1,107 +0,0 @@
import numpy
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from gensim import downloader
from nltk.tokenize import word_tokenize
import pandas as pd
class NetworkModel(torch.nn.Module):
    """Feed-forward network: linear -> ReLU -> linear -> sigmoid.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output units; every output is passed through
            a sigmoid, so each value lies in (0, 1).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # The attribute names fc1/fc2 determine the state_dict keys.
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the sigmoid activations of the network for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
# Word-vector lookup loaded through gensim's downloader when the module is
# executed; word2vecOnDoc indexes it by token and tests membership with `in`.
# NOTE(review): the zero-vector fallback in word2vecOnDoc assumes these
# vectors are 300-dimensional -- confirm against the loaded model.
word2vec = downloader.load("word2vec-google-news-300")
def word2vecOnDoc(document):
    """Return the element-wise mean of the word vectors of ``document``.

    Args:
        document: Iterable of tokens. Tokens absent from the module-level
            ``word2vec`` lookup are ignored.

    Returns:
        The mean vector over all known tokens, or a 300-element zero vector
        when none of the tokens is known (including an empty document).
    """
    known_vectors = [word2vec[word] for word in document if word in word2vec]
    if not known_vectors:
        # Nothing to average: fall back to a single all-zero vector.
        known_vectors = [numpy.zeros(300)]
    return numpy.mean(known_vectors, axis=0)
def prepareData(data):
    """Turn the ``content`` column of ``data`` into document vectors.

    Each text is lower-cased, tokenized with ``word_tokenize`` and reduced to
    a single vector by ``word2vecOnDoc``.

    Args:
        data: Object exposing a ``content`` attribute with a ``.str`` accessor
            (a pandas DataFrame with a ``content`` column in this file).

    Returns:
        A list with one vector per row, in row order.
    """
    lowered_texts = data.content.str.lower()
    return [word2vecOnDoc(word_tokenize(text)) for text in lowered_texts]
def trainModel(trainFileIn, trainFileExpected):
    """Train a ``NetworkModel`` on averaged word-vector document features.

    Args:
        trainFileIn: File accepted by ``pd.read_table``. The first column is
            read as the document text; the second column is ignored.
        trainFileExpected: File accepted by ``pd.read_table`` whose single
            column holds the labels. They are cast to float32 and used as
            targets for binary cross-entropy (presumably 0/1 values --
            confirm against the data set).

    Returns:
        The trained ``NetworkModel`` (300 inputs, 300 hidden units, 1 output).

    Notes:
        Only the first 225000 rows of each file are read. Malformed lines are
        skipped with a warning; ``on_bad_lines`` requires pandas >= 1.3 and
        replaces ``error_bad_lines``, which was removed in pandas 2.0.
    """
    inData = pd.read_table(
        trainFileIn,
        on_bad_lines="warn",
        header=None,
        quoting=3,
        usecols=["content"],
        names=["content", "id"],
        nrows=225000,
    )
    expectedData = pd.read_table(
        trainFileExpected,
        on_bad_lines="warn",
        header=None,
        quoting=3,
        usecols=["label"],
        names=["label"],
        nrows=225000,
    )

    # Convert everything to tensors once. Building a tensor from a list of
    # ndarrays inside the batch loop is very slow.
    features = torch.tensor(
        numpy.asarray(prepareData(inData), dtype=numpy.float32)
    )
    labels = torch.tensor(
        expectedData.astype(numpy.float32).to_numpy()
    ).reshape(-1, 1)

    # If bad lines were skipped in only one file the lengths differ; bound the
    # loop by the shorter one so every batch has matching inputs and targets.
    sampleCount = min(features.shape[0], labels.shape[0])

    networkModel = NetworkModel(300, 300, 1)
    criterion = torch.nn.BCELoss()
    optim = torch.optim.SGD(networkModel.parameters(), lr=0.02)
    epochs = 1
    batchSize = 2

    for _ in range(epochs):
        networkModel.train()
        for start in range(0, sampleCount, batchSize):
            stop = min(start + batchSize, sampleCount)
            outputs = networkModel(features[start:stop])
            loss = criterion(outputs, labels[start:stop])
            optim.zero_grad()
            loss.backward()
            optim.step()
    return networkModel
def evaluateModel(model, inFile, outFile):
    """Write the model's binary predictions for ``inFile`` to ``outFile``.

    Args:
        model: Callable mapping a float32 tensor of document vectors to
            scores; a score >= 0.5 is written as 1, anything else as 0.
        inFile: File accepted by ``pd.read_table``. The first column is read
            as the document text; the second column is ignored.
        outFile: Destination passed to ``numpy.ndarray.tofile``; predictions
            are written as text, one integer per line, with no trailing
            newline.

    Notes:
        Malformed input lines are skipped with a warning; ``on_bad_lines``
        requires pandas >= 1.3 and replaces ``error_bad_lines``, which was
        removed in pandas 2.0. The model's train/eval mode is left exactly as
        the caller set it.
    """
    inData = pd.read_table(
        inFile,
        on_bad_lines="warn",
        header=None,
        quoting=3,
        usecols=["content"],
        names=["content", "id"],
    )

    # Convert once up front. Building a tensor from a list of ndarrays for
    # every batch is very slow.
    features = torch.tensor(
        numpy.asarray(prepareData(inData), dtype=numpy.float32)
    )

    batchSize = 2
    pred = []
    with torch.no_grad():
        for start in range(0, features.shape[0], batchSize):
            outputs = model(features[start : start + batchSize])
            # Flatten so `pred` is a flat list of booleans in row order.
            pred.extend((outputs >= 0.5).reshape(-1).tolist())
    numpy.asarray(pred, dtype=numpy.int32).tofile(outFile, sep="\n")
# Train once on the training split, then write predictions for each
# evaluation split next to its input file.
model = trainModel("train/in.tsv", "train/expected.tsv")
for inPath, outPath in (
    ("dev-0/in.tsv", "dev-0/out.tsv"),
    ("test-A/in.tsv", "test-A/out.tsv"),
):
    evaluateModel(model, inPath, outPath)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

289579
train/in.tsv

File diff suppressed because one or more lines are too long