done
This commit is contained in:
parent
9cb2fb2612
commit
672870ba84
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
121
main.py
Normal file
121
main.py
Normal file
@ -0,0 +1,121 @@
|
||||
from gensim.models import KeyedVectors
|
||||
import nltk
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import csv
|
||||
|
||||
# nltk.download('punkt')
|
||||
|
||||
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
|
||||
# word2vec = KeyedVectors.load_word2vec_format(
|
||||
# 'wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
|
||||
# word2vec.save("word2vec.bin")
|
||||
|
||||
def ReadFile(path):
    """Load a headerless, tab-separated file into a DataFrame.

    Quote characters are kept as ordinary text (``csv.QUOTE_NONE``) and
    malformed rows (rows with too many fields) are skipped instead of
    aborting the read.

    Args:
        path: Location of the TSV file.

    Returns:
        pandas.DataFrame whose columns are labelled 0, 1, ...
    """
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0,
    # where passing it raises TypeError; `on_bad_lines='skip'` is the
    # supported way to request the same "drop bad rows" behaviour.
    return pd.read_table(path, sep='\t', quoting=csv.QUOTE_NONE,
                         header=None, on_bad_lines='skip')
|
||||
|
||||
|
||||
# Training split: column 0 is used as the target, column 1 as the input text.
train = ReadFile('train/train.tsv')
y_train, x_train = train[0].values, train[1].values

# Dev and test inputs: only column 0 of each file is used.
x_dev = ReadFile('dev-0/in.tsv')[0].values
y_dev = ReadFile('dev-0/expected.tsv')
x_test = ReadFile('test-A/in.tsv')[0].values

# Number of training examples consumed per optimizer step.
batch_size = 12

# Embeddings previously saved with KeyedVectors.save("word2vec.bin").
word2vec = KeyedVectors.load("word2vec.bin")
|
||||
|
||||
|
||||
|
||||
def Tokenize(data, tokenizer=None):
    """Split each text into lower-cased, purely alphabetic tokens.

    Args:
        data: Iterable of strings, one per document.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.word_tokenize``.

    Returns:
        A list with one list of tokens per input document. Tokens for which
        ``str.isalpha()`` is false (numbers, punctuation, mixed tokens) are
        dropped; the remaining ones are lower-cased.
    """
    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    # Build fresh lists instead of deleting while indexing: the previous
    # `del doc[i]` followed by `i += 1` skipped the token right after every
    # removed one, leaving it unfiltered and not lower-cased.
    return [
        [token.lower() for token in tokenizer(text) if token.isalpha()]
        for text in data
    ]
|
||||
|
||||
|
||||
def Generate(path, features=None):
    """Write the model's binary predictions to ``path``, one per line.

    Args:
        path: Output file, e.g. ``'dev-0/out.tsv'`` or ``'test-A/out.tsv'``.
        features: Optional 2-D array of document vectors to score. When
            omitted, the split is chosen from the first directory component
            of ``path``: ``test-A`` uses ``x_test_vec``, anything else uses
            ``x_dev_vec``.

    The model outputs are thresholded at 0.5 and written as 0/1 integers
    separated by newlines.
    """
    if features is None:
        # Previously x_dev_vec was scored unconditionally, so the file under
        # test-A/ silently received the dev-set predictions.
        split = path.replace('\\', '/').split('/')[0]
        features = x_test_vec if split == 'test-A' else x_dev_vec

    inputs = torch.tensor(np.asarray(features, dtype=np.float32))
    # No gradients are needed for inference.
    with torch.no_grad():
        scores = model(inputs)

    labels = (scores.cpu().numpy() > 0.5).astype(np.int32)
    labels.tofile(path, sep='\n')
|
||||
|
||||
|
||||
def _average_vectors(documents):
    """Return one mean word2vec embedding per tokenized document.

    Tokens missing from ``word2vec`` are ignored; a document with no known
    token is represented by a zero vector of the embedding size.
    """
    fallback = [np.zeros(word2vec.vector_size)]
    return [
        np.mean([word2vec[token] for token in tokens if token in word2vec]
                or fallback, axis=0)
        for tokens in documents
    ]


x_train_tokenized = Tokenize(x_train)
x_dev_tokenized = Tokenize(x_dev)
x_test_tokenized = Tokenize(x_test)

# Embed the *tokenized* documents. Iterating the raw strings, as before,
# walks them character by character, so the averages were built from
# single-character lookups and the tokenization above was never used.
x_train = _average_vectors(x_train_tokenized)
x_train_vec = np.array(x_train, dtype=np.float32)
x_train_tensor = torch.tensor(x_train_vec)

x_dev = _average_vectors(x_dev_tokenized)
x_dev_vec = np.array(x_dev, dtype=np.float32)

x_test = _average_vectors(x_test_tokenized)
x_test_vec = np.array(x_test, dtype=np.float32)
|
||||
|
||||
|
||||
class Model(torch.nn.Module):
    """Two-layer feed-forward binary classifier.

    Maps a 100-dimensional input to a single value in (0, 1) through a
    200-unit ReLU hidden layer followed by a sigmoid output.
    """

    def __init__(self):
        super().__init__()
        # Input -> hidden projection.
        self.var1 = torch.nn.Linear(100, 200)
        # Hidden -> single output score.
        self.var2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        """Return the sigmoid-activated score for each row of ``x``."""
        hidden = torch.relu(self.var1(x))
        return torch.sigmoid(self.var2(hidden))
|
||||
|
||||
|
||||
|
||||
# Classifier, binary cross-entropy loss and plain SGD optimizer.
model = Model()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Seven passes over the training data in consecutive mini-batches.
for element in range(7):
    loss_score = acc_score = items_total = 0
    model.train()

    for start in range(0, y_train.shape[0], batch_size):
        stop = start + batch_size
        inputs = torch.tensor(x_train_vec[start:stop].astype(np.float32))
        # Targets as a column vector so they line up with the model output.
        targets = torch.tensor(
            y_train[start:stop].astype(np.float32)).reshape(-1, 1)
        batch_len = targets.shape[0]

        outputs = model(inputs)

        # Running totals for this epoch: correct predictions and item count.
        acc_score += ((outputs > 0.5) == targets).sum().item()
        items_total += batch_len

        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # BCELoss returns the batch mean; weight it by the batch size.
        loss_score += loss.item() * batch_len

    print(element)

# Write predictions for both evaluation splits.
Generate('dev-0/out.tsv')
Generate('test-A/out.tsv')
|
5452
test-A/out.tsv
Normal file
5452
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
word2vec.bin
Normal file
BIN
word2vec.bin
Normal file
Binary file not shown.
BIN
word2vec.bin.vectors.npy
Normal file
BIN
word2vec.bin.vectors.npy
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user