This commit is contained in:
Jakub Kaczmarek 2022-06-08 10:59:39 +02:00
parent 756ef4277a
commit 1f199f4d86
11 changed files with 310583 additions and 0 deletions

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large

130
main.py Normal file

@@ -0,0 +1,130 @@
import numpy as np
from network import LogisticRegressionModel
from gensim.models import Word2Vec
import torch
from sklearn.model_selection import train_test_split
import csv
import re


def vectorize(word2vec, documents):
    # Represent each tokenized document as the mean of its word vectors;
    # words missing from the Word2Vec vocabulary contribute zero vectors.
    vectorized = []
    for d in documents:
        vectorized.append(
            np.mean(
                [
                    word2vec.wv[word]
                    if word in word2vec.wv
                    else np.zeros(100, dtype=float)
                    for word in d
                ],
                axis=0,
            )
        )
    return np.array(vectorized)


def flatten(t):
    # Flatten the nested list returned by tolist() into a flat list of "0"/"1" strings.
    return [str(int(item)) for sublist in t for item in sublist]


def predict(model, testX):
    # Round the sigmoid outputs to hard 0/1 labels, without tracking gradients.
    testX = torch.tensor(testX.astype(np.float32))
    with torch.no_grad():
        return flatten(model(testX).round().tolist())

def trainModel(
model,
trainX,
trainY,
optimizer,
criterion=torch.nn.BCELoss(),
epochs=10,
batchSize=256,
):
for epoch in range(epochs):
loss_score = 0
acc_score = 0
items_total = 0
model.train()
for i in range(0, trainY.shape[0], batchSize):
X = trainX[i : i + batchSize]
X = torch.tensor(X.astype(np.float32))
Y = trainY[i : i + batchSize]
Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
Y_predictions = model(X)
acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
items_total += Y.shape[0]
optimizer.zero_grad()
loss = criterion(Y_predictions, Y)
loss.backward()
optimizer.step()
loss_score += loss.item() * Y.shape[0]
print(f"Epoch {epoch+1}/{epochs}")
loss, accuracy = getMetrics(model, trainX, trainY, criterion, batchSize)
print(f"Train set\nloss = {loss}, accuracy = {accuracy}")


def getMetrics(model, X_dataset, Y_dataset, criterion, batchSize):
    # Compute mean loss and accuracy over a dataset without updating weights.
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], batchSize):
            X = torch.tensor(np.array(X_dataset[i : i + batchSize]).astype(np.float32))
            Y = torch.tensor(np.array(Y_dataset[i : i + batchSize]).astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]
            loss = criterion(Y_predictions, Y)
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)


def write(content, dir):
    # Write one predicted label per row of the output TSV.
    with open(dir, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows([[label] for label in content])


def read_file(path):
    with open(path, 'r') as f:
        return f.readlines()


# Strip the trailing tab-separated numeric column and tokenize on whitespace:
# both Word2Vec and vectorize() expect lists of tokens, not raw strings.
X = [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file("train/in.tsv")]
y = np.array([re.sub(r'\n', '', x) for x in read_file("train/expected.tsv")])
print(X[0])

w2v = Word2Vec(X, vector_size=100, min_count=2)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train = vectorize(w2v, X_train)
X_test = vectorize(w2v, X_test)
print(X_train[0])
model = LogisticRegressionModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainModel(model, X_train, y_train, optimizer)

# Predict labels for both challenge sets and write them out.
X = [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file("test-A/in.tsv")]
X = vectorize(w2v, X)
write(predict(model, X), "test-A/out.tsv")

X = [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file("dev-0/in.tsv")]
X = vectorize(w2v, X)
write(predict(model, X), "dev-0/out.tsv")
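
main.py writes predictions for dev-0 and test-A but never scores them. A minimal sketch of how the dev-0 output could be checked, assuming the gold labels are available as dev-0/expected.tsv (that file is not part of this commit):

def file_accuracy(pred_path, gold_path):
    # Compare two single-column label files line by line.
    with open(pred_path) as p, open(gold_path) as g:
        pairs = list(zip((line.strip() for line in p), (line.strip() for line in g)))
    return sum(a == b for a, b in pairs) / len(pairs)

print(file_accuracy("dev-0/out.tsv", "dev-0/expected.tsv"))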

14
network.py Normal file

@@ -0,0 +1,14 @@
import torch


class LogisticRegressionModel(torch.nn.Module):
    # Despite its name, this is a one-hidden-layer feed-forward classifier:
    # Linear(features -> 400) -> ReLU -> Linear(400 -> 1) -> sigmoid.
    def __init__(self, features=100):
        super(LogisticRegressionModel, self).__init__()
        self.fc = torch.nn.Linear(features, 400)
        self.fc2 = torch.nn.Linear(400, 1)

    def forward(self, x):
        x = self.fc(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
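
A quick sanity check for the module above (a sketch, not part of the commit): push a dummy batch through LogisticRegressionModel and confirm the output matches what main.py's predict expects, i.e. one probability per row in (0, 1).

import torch
from network import LogisticRegressionModel

model = LogisticRegressionModel(features=100)
dummy = torch.randn(4, 100)      # a batch of 4 averaged 100-dim word vectors
with torch.no_grad():
    probs = model(dummy)
print(probs.shape)               # torch.Size([4, 1])
print(float(probs.min()), float(probs.max()))  # sigmoid keeps outputs in (0, 1)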

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long

BIN
train/expected.tsv Normal file

Binary file not shown.

12
train_word2vec.py Normal file

@@ -0,0 +1,12 @@
import csv
from gensim.models import Word2Vec

# Read the training sentences (the text column of the TSV) and tokenize on whitespace,
# so the model is trained on the actual corpus rather than gensim's toy common_texts.
with open("train/in.tsv") as f:
    texts = [row[0].split() for row in csv.reader(f, delimiter="\t")]

model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
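
The saved artifacts can be reloaded later without retraining. A minimal sketch using gensim's standard load calls; the query token "dress" is an arbitrary example and may not be in the vocabulary:

from gensim.models import Word2Vec, KeyedVectors

w2v = Word2Vec.load("word2vec.model")             # full model, training can continue
wv = KeyedVectors.load("word2vec.wordvectors")    # vectors only, cheaper to load
token = "dress"                                   # hypothetical query token
print(wv[token] if token in wv else f"'{token}' not in vocabulary")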

BIN
word2vec.model Normal file

Binary file not shown.

BIN
word2vec.wordvectors Normal file

Binary file not shown.