torch part 1

This commit is contained in:
Mariusz B 2021-05-29 14:30:20 +00:00
parent 19a5c79255
commit cd41e7ed4a
5 changed files with 300071 additions and 13 deletions

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

BIN
geval Executable file

Binary file not shown.

77
main.py
View File

@ -3,7 +3,38 @@ from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from gensim import downloader
from nltk.tokenize import word_tokenize
class NetworkModel(torch.nn.Module):
    """Feed-forward binary classifier: linear -> ReLU -> linear -> sigmoid.

    Args:
        input_dim: Width of each input vector. Defaults to 300 to match the
            300-element document vectors produced elsewhere in this file.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of sigmoid outputs per sample.
    """

    def __init__(self, input_dim=300, hidden_dim=500, output_dim=1):
        # The zero-argument form avoids naming the class; the previous call
        # referenced a non-existent ``NeuralNetworkModel`` and raised NameError.
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-sample sigmoid outputs in [0, 1] for a float tensor ``x``."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
# Embedding lookup obtained through gensim's downloader under the name
# "word2vec-google-news-300"; word2vecOnDoc reads token vectors from it.
# NOTE(review): this runs at import time and presumably fetches the model on
# first use - confirm before importing this module in tests or tooling.
word2vec = downloader.load("word2vec-google-news-300")
def word2vecOnDoc(document):
    """Average the embeddings of the tokens in ``document``.

    Tokens that are not present in the module-level ``word2vec`` lookup are
    skipped. If no token has an embedding, the mean is taken over a single
    300-element zero vector, so the result is all zeros.
    """
    vectors = []
    for token in document:
        if token in word2vec:
            vectors.append(word2vec[token])

    # Fall back to one zero vector so numpy.mean always has something to average.
    if not vectors:
        vectors = [numpy.zeros(300)]

    return numpy.mean(vectors, axis=0)
def prepareData(data):
    """Convert raw text rows into a list of averaged embedding vectors.

    Each row is tokenized with ``word_tokenize`` and the tokens are passed to
    ``word2vecOnDoc``. The result has one vector per input row, in input order.

    The earlier debug ``print`` of the whole tokenized corpus was removed: it
    wrote every token of every row to stdout on each call.
    """
    return [word2vecOnDoc(word_tokenize(row)) for row in data]
def trainModel(trainFileIn, trainFileExpected): def trainModel(trainFileIn, trainFileExpected):
with open(trainFileExpected, 'r') as f: with open(trainFileExpected, 'r') as f:
@ -12,23 +43,47 @@ def trainModel(trainFileIn, trainFileExpected):
with open(trainFileIn, 'r') as f: with open(trainFileIn, 'r') as f:
inData = f.readlines() inData = f.readlines()
expectedDataEncoded = LabelEncoder().fit_transform(expectedData) expectedData = prepareData(expectedData)
inData = prepareData(inData)
# networkModel = NetworkModel(300, 300, 1)
# criterion = torch.nn.BCELoss()
# optim = torch.optim.SGD(network.parameters(), lr=0.02)
# epochs = 1
# batchSize = 2
pipeline = Pipeline(steps=[ # for _ in range(epochs):
('tfidf', TfidfVectorizer()), # network.train()
('naive-bayes', MultinomialNB()) # for i in range(0, inData.shape[0], batchSize):
]) # x = inData[i : i + batchSize]
# x = torch.tensor(x)
return pipeline.fit(inData, expectedDataEncoded) # y = expectedData[i : i + batchSize]
# y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
# outputs = network(x.float())
# loss = criterion(outputs, y)
# print(loss)
# optim.zero_grad()
# loss.backward()
# optim.step()
# return networkModel
def evaluateModel(model, inFile, outFile, batch_size=64):
    """Run ``model`` over the rows of ``inFile`` and write 0/1 labels to ``outFile``.

    Args:
        model: Callable taking a float tensor of document vectors and returning
            scores; a score >= 0.5 is written as 1, anything lower as 0.
        inFile: Path of the text file to read, one document per line.
        outFile: Path that receives the predictions as newline-separated integers.
        batch_size: Number of rows scored per forward pass. Previously this was
            an undefined global name, so the function raised NameError.
    """
    with open(inFile, 'r') as f:
        inData = f.readlines()

    inData = prepareData(inData)
    pred = []
    # Inference only: gradients are not tracked.
    with torch.no_grad():
        for i in range(0, len(inData), batch_size):
            # Stack the per-row vectors into one array before handing it to torch.
            x = torch.tensor(numpy.asarray(inData[i : i + batch_size]))
            outputs = model(x.float())
            pred += (outputs >= 0.5).tolist()

    # ``numpyp`` was a typo for ``numpy``; booleans are stored as int32 0/1.
    numpy.asarray(pred, dtype=numpy.int32).tofile(outFile, sep="\n")
model = trainModel("train/in.tsv", "train/expected.tsv") model = trainModel("train/in.tsv", "train/expected.tsv")
evaluateModel(model, "dev-0/in.tsv", "dev-0/out.tsv") #evaluateModel(model, "dev-0/in.tsv", "dev-0/out.tsv")
evaluateModel(model, "test-A/in.tsv", "test-A/out.tsv") #evaluateModel(model, "test-A/in.tsv", "test-A/out.tsv")

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long