This commit is contained in:
Piotrek96 2021-05-30 21:10:05 +02:00
parent 756ef4277a
commit 8e458b3951
6 changed files with 310517 additions and 0 deletions

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

90
skrypt.py Normal file
View File

@@ -0,0 +1,90 @@
import numpy as np
import torch
from nltk.tokenize import word_tokenize
import nltk
from gensim import models
from pandas import DataFrame
import csv
# Number of full passes over the training data (used as range(eph) by the
# training loop).
eph = 5
# Number of examples taken per slice in the training and prediction loops.
BATCH_SIZE = 1
class MyNeuralnn_model(torch.nn.Module):
    """Two-layer feed-forward binary classifier.

    The network applies ``Linear -> ReLU -> Linear -> Sigmoid`` and therefore
    emits one value in the open interval (0, 1) per input row, which makes it
    directly usable with ``torch.nn.BCELoss``.

    Args:
        input_dim: Size of each input feature vector. Defaults to 300, the
            value originally hard-coded for the first layer.
        hidden_dim: Width of the hidden layer. Defaults to 600, the value
            originally hard-coded between the two layers.
    """

    def __init__(self, input_dim=300, hidden_dim=600):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return sigmoid scores of shape ``(..., 1)`` for input ``x``.

        Args:
            x: Float tensor whose last dimension equals ``input_dim``.
        """
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
# Classifier, its SGD optimizer and the binary cross-entropy loss it is
# trained with.
nn_model = MyNeuralnn_model()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Pre-trained word embeddings; only the first 100000 vectors of the file are
# loaded (the `limit` argument).
word2vec = models.KeyedVectors.load_word2vec_format(
    r"GoogleNews-vectors-negative300.bin.gz",
    binary=True,
    limit=100000,
)
def vectorize(document):
    """Return the mean embedding of the tokens in ``document``.

    Tokens that are not present in the module-level ``word2vec`` model are
    skipped. When no token is known (including an empty document) a
    300-element zero vector is returned instead.
    """
    known_vectors = [word2vec[token] for token in document if token in word2vec]
    if not known_vectors:
        known_vectors = [np.zeros(300)]
    return np.mean(known_vectors, axis=0)
def _read_first_column(path):
    """Return the first tab-separated field of every row in ``path``.

    Quoting is disabled (``csv.QUOTE_NONE``) because the files hold raw text:
    with the csv module's default quoting, a double quote at the start of a
    field opens a quoted section that can swallow the following lines, which
    would silently misalign inputs with their labels. A blank line yields an
    empty string rather than raising ``IndexError``, so the result always has
    one entry per row.
    """
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends.
    with open(path, "r", encoding="utf-8", newline="") as handle:
        reader = csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)
        return [row[0] if row else "" for row in reader]


def _load_documents(path):
    """Read ``path`` and turn each row into one averaged word vector.

    Each text is lower-cased, tokenized with ``word_tokenize`` and passed to
    ``vectorize``.
    """
    return [
        vectorize(word_tokenize(text.lower()))
        for text in _read_first_column(path)
    ]


# Training features and labels. Labels stay in a single-column DataFrame so
# the training loop can slice and cast them.
Xtrain = _load_documents("train/in.tsv")
Ytrain = DataFrame(_read_first_column("train/expected.tsv"))

# Development and test features (no labels are read for these here).
Xdev = _load_documents("dev-0/in.tsv")
Xtest = _load_documents("test-A/in.tsv")
# Convert the training set to tensors once. Doing this per batch (and from a
# Python list of ndarrays) repeats the same work every epoch and is the slow
# path PyTorch warns about.
train_inputs = torch.tensor(np.asarray(Xtrain, dtype=np.float32))
train_targets = torch.tensor(
    Ytrain.astype(np.float32).to_numpy()
).reshape(-1, 1)

# Misaligned features/labels would otherwise fail in the loss with a confusing
# shape error, or train on wrong pairs.
if len(train_inputs) != len(train_targets):
    raise ValueError(
        "Xtrain has %d rows but Ytrain has %d"
        % (len(train_inputs), len(train_targets))
    )

for epoch in range(eph):
    nn_model.train()
    for start in range(0, len(train_targets), BATCH_SIZE):
        batch_inputs = train_inputs[start:start + BATCH_SIZE]
        batch_targets = train_targets[start:start + BATCH_SIZE]

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(nn_model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()
def _write_predictions(vectors, path):
    """Classify ``vectors`` in batches and write one 0/1 label per line.

    Every prediction in a batch is written, so the output has exactly one
    line per input vector for any ``BATCH_SIZE`` (writing only the first
    element of each batch is correct only when ``BATCH_SIZE`` is 1).

    Args:
        vectors: Sequence of equally sized feature vectors.
        path: Destination file; it is overwritten.
    """
    with open(path, "w", encoding="utf-8") as out_file:
        for start in range(0, len(vectors), BATCH_SIZE):
            batch = torch.tensor(
                np.asarray(vectors[start:start + BATCH_SIZE], dtype=np.float32)
            )
            # Threshold the sigmoid output at 0.5 to get hard class labels.
            labels = (nn_model(batch) > 0.5).to(torch.int32).reshape(-1)
            out_file.writelines(f"{label}\n" for label in labels.tolist())


# Inference only: switch to eval mode and skip gradient tracking.
nn_model.eval()
with torch.no_grad():
    # Predictions for the dev set
    _write_predictions(Xdev, "dev-0/out.tsv")
    # Predictions for the test set
    _write_predictions(Xtest, "test-A/out.tsv")

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long