Add solution for full dataset

This commit is contained in:
Jakub Kolasiński 2021-05-14 00:04:00 +02:00
parent 142f0ca72e
commit 48d4451077
3 changed files with 5745 additions and 301 deletions

File diff suppressed because it is too large. Load Diff

View File

@@ -20,22 +20,23 @@ model = NeuralNetworkModel()
criterion = torch.nn.BCELoss() criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1) optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
def train(): def train():
with open('train/train.tsv') as f: with open('train/train.tsv') as f:
docs = [line.rstrip() for line in f] docs = [line.rstrip() for line in f]
docs_preprocessed = [] docs_preprocessed = []
y = [] y = []
for doc in docs[:10000]: for doc in docs:
y_with_doc = doc.split('\t') y_with_doc = doc.split('\t')
y.append(y_with_doc[0]) y.append(y_with_doc[0])
doc = y_with_doc[1] docs_preprocessed.append(preprocess(y_with_doc[1]))
doc = preprocess(doc)
docs_preprocessed.append(doc)
y = [int(value) for value in y] y = [int(value) for value in y]
y = np.reshape(y, (len(y), 1)) y = np.reshape(y, (len(y), 1))
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)] tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
print("Fitting vectorizer...")
global vectorizer global vectorizer
vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features) vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
print("DONE!")
x = vectorizer.dv.vectors x = vectorizer.dv.vectors
for epoch in range(5): for epoch in range(5):
loss_score = 0 loss_score = 0
@@ -58,25 +59,21 @@ def train():
print(epoch) print(epoch)
print(get_loss_acc(model, x, y, criterion, optimizer)) print(get_loss_acc(model, x, y, criterion, optimizer))
def classify(path): def classify(path):
print("Predicting for", path)
with open(path + 'in.tsv') as f: with open(path + 'in.tsv') as f:
docs = [line.rstrip() for line in f] docs = [line.rstrip() for line in f]
docs = [preprocess(doc) for doc in docs] docs = [preprocess(doc) for doc in docs]
test_x = [vectorizer.infer_vector(doc) for doc in docs] test_x = [vectorizer.infer_vector(doc) for doc in docs]
test_x = np.array(test_x, np.float32) test_x = np.array(test_x, np.float32)
test_x = torch.tensor(test_x) predictions = model(torch.tensor(test_x)).detach().numpy()[:, 0]
predictions = model(test_x).detach().numpy()[:, 0] predictions = [1 if value >= 0.5 else 0 for value in predictions]
p = []
for prediction in predictions:
if prediction >= 0.5:
p.append(1)
else:
p.append(0)
with open(path + 'out.tsv', 'w') as file: with open(path + 'out.tsv', 'w') as file:
for prediction in p: for prediction in predictions:
file.write("%i\n" % prediction) file.write("%i\n" % prediction)
train() train()
classify('dev-0/') classify('dev-0/')
# classify('test-A/') classify('test-A/')

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large. Load Diff