Compare commits

...

1 Commit

Author SHA1 Message Date
Ufnow
672870ba84 done 2021-06-01 03:07:50 +02:00
7 changed files with 109157 additions and 0 deletions

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large.

BIN
geval Normal file

Binary file not shown.

121
main.py Normal file

@@ -0,0 +1,121 @@
from gensim.models import KeyedVectors
import nltk
import numpy as np
import pandas as pd
import torch
import csv
# nltk.download('punkt')
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
# word2vec = KeyedVectors.load_word2vec_format(
# 'wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
# word2vec.save("word2vec.bin")
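# Read a tab-separated file into a DataFrame (no header row, quotes kept
# literal, malformed lines skipped).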
def ReadFile(path):
    variable = pd.read_table(path, error_bad_lines=False,
                             sep='\t', quoting=csv.QUOTE_NONE, header=None)
    return variable
train = ReadFile('train/train.tsv')
x_dev = ReadFile('dev-0/in.tsv')
y_dev = ReadFile('dev-0/expected.tsv')
x_test = ReadFile('test-A/in.tsv')
x_train = train[1].values
y_train = train[0].values
x_dev = x_dev[0].values
x_test = x_test[0].values
batch_size = 12
word2vec = KeyedVectors.load("word2vec.bin")
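# Tokenize each document with NLTK, lowercase alphabetic tokens and drop the rest.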
def Tokenize(data):
    new_data = [nltk.word_tokenize(x) for x in data]
    for doc in new_data:
        i = 0
        while i < len(doc):
            if doc[i].isalpha():
                doc[i] = doc[i].lower()
                i += 1
            else:
                # deletion shifts the next token into position i, so do not advance
                del doc[i]
    return new_data
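# Run the trained model on a feature matrix and write 0/1 predictions, one per line.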
def Generate(path, vectors):
    model.eval()
    with torch.no_grad():
        predict = model(torch.tensor(vectors.astype(np.float32)))
    predict = predict.cpu().numpy()
    predict = (predict > 0.5)
    predict = np.asarray(predict, dtype=np.int32)
    predict.tofile(path, sep='\n')
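# Tokenize the raw texts, then represent each document as the mean of its
# tokens' word2vec vectors (a zero vector when no token is in the vocabulary).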
x_train_tokenized = Tokenize(x_train)
x_dev_tokenized = Tokenize(x_dev)
x_test_tokenized = Tokenize(x_test)
x_train = [np.mean([word2vec[word] for word in content if word in word2vec]
                   or [np.zeros(100)], axis=0) for content in x_train_tokenized]
x_train_vec = np.array(x_train, dtype=np.float32)
x_train_tensor = torch.tensor(x_train_vec)
x_dev = [np.mean([word2vec[word] for word in content if word in word2vec]
                 or [np.zeros(100)], axis=0) for content in x_dev_tokenized]
x_dev_vec = np.array(x_dev, dtype=np.float32)
x_test = [np.mean([word2vec[word] for word in content if word in word2vec]
                  or [np.zeros(100)], axis=0) for content in x_test_tokenized]
x_test_vec = np.array(x_test, dtype=np.float32)
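# Simple feed-forward classifier: 100 -> 200 -> 1 with ReLU and a sigmoid output.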
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.var1 = torch.nn.Linear(100, 200)
        self.var2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        x = self.var1(x)
        x = torch.relu(x)
        x = self.var2(x)
        x = torch.sigmoid(x)
        return x
model = Model()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
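# Mini-batch training with binary cross-entropy; loss and accuracy are
# accumulated per epoch for the progress printout.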
for epoch in range(7):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = x_train_vec[i:i + batch_size]
        X = torch.tensor(X.astype(np.float32))
        Y = y_train[i:i + batch_size]
        Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
        Y_predictions = model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    print(f'epoch {epoch}: loss {loss_score / items_total:.4f}, '
          f'accuracy {acc_score / items_total:.4f}')
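# Write predictions for the dev and test sets.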
Generate('dev-0/out.tsv', x_dev_vec)
Generate('test-A/out.tsv', x_test_vec)

5452
test-A/out.tsv Normal file

File diff suppressed because it is too large.

98132
train/train.tsv Normal file

File diff suppressed because it is too large.

BIN
word2vec.bin Normal file

Binary file not shown.

BIN
word2vec.bin.vectors.npy Normal file

Binary file not shown.