Compare commits

...

1 Commit

Author SHA1 Message Date
Wiktor Tylman
6dea101d21 first applictation 2021-05-30 23:44:55 +02:00
3 changed files with 11039 additions and 0 deletions

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large

135
main.py Normal file

@@ -0,0 +1,135 @@
from gensim.models import KeyedVectors
import nltk
import numpy as np
import os
import gensim
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import torch
import csv
# Assigning data from files to variables
train = pd.read_table('train/train.tsv', error_bad_lines=False,
                      sep='\t', quoting=csv.QUOTE_NONE, header=None)
x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False,
                      sep='\t', header=None, quoting=csv.QUOTE_NONE)
y_dev = pd.read_table('dev-0/expected.tsv', error_bad_lines=False,
                      sep='\t', header=None, quoting=csv.QUOTE_NONE)
x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
                       sep='\t', header=None, quoting=csv.QUOTE_NONE)
# Split the data into x and y
x_train = train[1].values
y_train = train[0].values
x_dev = x_dev[0].values
x_test = x_test[0].values
# I needed this only once
# nltk.download('punkt')
# Tokenization
def tokenize(data):
    new_data = [nltk.word_tokenize(x) for x in data]
    for doc in new_data:
        i = 0
        while i < len(doc):
            if doc[i].isalpha():
                doc[i] = doc[i].lower()
                i += 1
            else:
                # deleting shifts the next token into position i, so do not advance
                del doc[i]
    return new_data
x_train_tokenized = tokenize(x_train)
x_dev_tokenized = tokenize(x_dev)
x_test_tokenized = tokenize(x_test)
# pre-trained word2vec model from wiki-forms-all-100-skipg-ns
# run the download and conversion below only on the first run
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
# word2vec = KeyedVectors.load_word2vec_format(
# 'wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
# word2vec.save("word2vec.bin")
word2vec = KeyedVectors.load("word2vec.bin")
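# Represent each document as the mean of its word vectors (100 dimensions);
# documents with no in-vocabulary words fall back to a zero vector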
x_train = [np.mean([word2vec[word] for word in content if word in word2vec]
                   or [np.zeros(100)], axis=0) for content in x_train]
x_train_vec = np.array(x_train, dtype=np.float32)
x_train_tensor = torch.tensor(x_train_vec)
x_dev = [np.mean([word2vec[word] for word in content if word in word2vec]
                 or [np.zeros(100)], axis=0) for content in x_dev]
x_dev_vec = np.array(x_dev, dtype=np.float32)
x_test = [np.mean([word2vec[word] for word in content if word in word2vec]
                  or [np.zeros(100)], axis=0) for content in x_test]
x_test_vec = np.array(x_test, dtype=np.float32)
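# Feed-forward binary classifier: 100-dim document vector -> 200 hidden units (ReLU) -> single sigmoid output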
class NNModel(torch.nn.Module):
    def __init__(self):
        super(NNModel, self).__init__()
        self.fc1 = torch.nn.Linear(100, 200)
        self.fc2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
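# Binary cross-entropy loss optimised with plain SGD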
model = NNModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
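# Mini-batch training for 10 epochs; loss and accuracy are accumulated per epoch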
batch_size = 12
for epoch in range(10):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = x_train_vec[i:i + batch_size]
        X = torch.tensor(X.astype(np.float32))
        Y = y_train[i:i + batch_size]
        Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
        Y_predictions = model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    print(f'epoch {epoch}: loss {loss_score / items_total:.4f}, '
          f'acc {acc_score / items_total:.4f}')
# Generate predictions for dev-0
y_pred_dev = model(torch.tensor(x_dev_vec.astype(np.float32)))
y_pred_dev = y_pred_dev.cpu().detach().numpy()
y_pred_dev = np.asarray(y_pred_dev > 0.5, dtype=np.int32)
y_pred_dev.tofile('dev-0/out.tsv', sep='\n')
# Generate predictions for test-A (use the test vectors, not the dev ones)
y_pred_test = model(torch.tensor(x_test_vec.astype(np.float32)))
y_pred_test = y_pred_test.cpu().detach().numpy()
y_pred_test = np.asarray(y_pred_test > 0.5, dtype=np.int32)
y_pred_test.tofile('test-A/out.tsv', sep='\n')

5452
test-A/out.tsv Normal file

File diff suppressed because it is too large