wangobango 2021-05-25 14:23:00 +02:00
parent 756ef4277a
commit 323ab17708
7 changed files with 295039 additions and 0 deletions

5
.gitignore vendored

@@ -6,3 +6,8 @@
*.o
.DS_Store
.token
venv/*
*.pickle
.idea/*
.vscode/*
in.tsv

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large

54
main.py Normal file

@@ -0,0 +1,54 @@
"""
Zadanie domowe
wybrać jedno z poniższych repozytoriów i je sforkować:
https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public
https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public
stworzyć klasyfikator bazujący na prostej sieci neuronowej feed forward w pytorchu (można bazować na tym jupyterze).
Zamiast tfidf proszę skorzystać z jakieś reprezentacji gęstej (np. word2vec).
stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić conajmniej 0.67
proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do
swojego repo termin 25.05, 70 punktów
"""
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

from net import FFN
# from utils import create_embeddings_file, load_embeddings_file  # for the cached-embeddings path below

# An earlier experiment used spaCy vectors instead of the pretrained word2vec model:
# import spacy
# sp = spacy.load('en_core_web_sm')
# def word2vec(word):
#     return sp(word).vector  # earlier random baseline: np.random.uniform(low=0.0, high=1.0, size=(384,))
# The challenge TSV files have no header row, hence header=None.
train_data = pd.read_csv("train/in.tsv", sep='\t', header=None)
train_data.columns = ['PostText', 'Timestamp']
train_expected = pd.read_csv("train/expected.tsv", sep='\t', header=None)
train_expected.columns = ['Label']
# test_data = pd.read_csv("test-A/in.tsv", sep='\t', header=None)
# test_data.columns = ['PostText', 'Timestamp']
# dev_data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
# dev_data.columns = ['PostText', 'Timestamp']
# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
# dev_expected.columns = ['Label']
# One-off caching of embeddings (uses the spaCy word2vec above):
# create_embeddings_file(dev_data['PostText'], 'dev-0/embeddings.csv', word2vec)
# create_embeddings_file(test_data['PostText'], 'test-A/embeddings.csv', word2vec)
# create_embeddings_file(train_data['PostText'], 'train/embeddings.csv', word2vec)
# train_data = load_embeddings_file('train/embeddings.csv').to_numpy()
# dev_data = load_embeddings_file('dev-0/embeddings.csv').to_numpy()
# test_data = load_embeddings_file('test-A/embeddings.csv').to_numpy()
model = FFN(input_dim=300, output_dim=1, hidden1_size=300, hidden2_size=300,
            lr=0.01, epochs=4, batch_size=100)
# model.train([np.asarray(word_tokenize(x)) for x in train_data['PostText']], train_expected['Label'])
model = model.load()  # FFN.load returns the unpickled model (see net.py)
model.double()
model.test([np.asarray(word_tokenize(x)) for x in train_data['PostText']], train_expected['Label'], "train/out.tsv")
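
The assignment also asks for dev-0/out.tsv and test-A/out.tsv, which this script does not yet produce. A minimal follow-up sketch for dev-0, assuming the same FFN.test interface as above (test-A ships no public expected.tsv, so it would additionally need a label-free predict variant):

# Hypothetical follow-up, not part of this commit:
dev_data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
dev_data.columns = ['PostText', 'Timestamp']
dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
dev_expected.columns = ['Label']
model.test([np.asarray(word_tokenize(x)) for x in dev_data['PostText']],
           dev_expected['Label'], "dev-0/out.tsv")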

103
net.py Normal file

@@ -0,0 +1,103 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
import numpy as np
import pandas as pd

from word2vec import Word2Vec


class FFN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden1_size, hidden2_size, lr, epochs, batch_size):
        super(FFN, self).__init__()
        self.path = 'model1.pickle'
        self.lr = lr
        self.epochs = epochs
        self.output_dim = output_dim
        self.word2vec = Word2Vec()
        self.word2vec.load()
        self.batch_size = batch_size
        self.input_dim = input_dim
        # Each example is one input_dim-dimensional sentence embedding; the
        # batch is the leading tensor dimension, so layer sizes must not
        # depend on batch_size.
        self.fc1 = nn.Linear(input_dim, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, hidden2_size)
        self.fc4 = nn.Linear(hidden2_size, hidden2_size)
        self.fc5 = nn.Linear(hidden2_size, output_dim)
    def forward(self, data):
        data = F.relu(self.fc1(data))
        data = F.relu(self.fc2(data))
        data = F.relu(self.fc3(data))
        data = F.relu(self.fc4(data))
        data = torch.sigmoid(self.fc5(data))  # F.sigmoid is deprecated
        return data
    def serialize(self):
        with open(self.path, 'wb') as file:
            pickle.dump(self, file)

    def load(self):
        # Rebinding `self` inside a method has no effect, so return the
        # unpickled model instead; callers use `model = model.load()`.
        with open(self.path, 'rb') as file:
            return pickle.load(file)
    def batch(self, iterable, n=1):
        # Currently unused helper that yields consecutive chunks of size n.
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]
    def train(self, data, expected):
        """Train on a list of tokenized sentences and a Series of 0/1 labels.

        Note: this shadows nn.Module.train(); the module is never switched
        between train/eval modes here, so that is harmless in this script.
        """
        self.double()  # keep weights and double-precision inputs in the same dtype
        criterion = torch.nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        batch_size = self.batch_size
        for epoch in range(self.epochs):
            epoch_loss = 0.0
            idx = 0
            # Iterate over full batches only; the remainder is dropped.
            for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
                inputs = data[i:i + batch_size]
                labels = expected[i:i + batch_size]
                optimizer.zero_grad()
                embeddings = torch.tensor(
                    np.stack(self.word2vec.list_of_sentences2vec(inputs)),
                    dtype=torch.double)
                outputs = self.forward(embeddings)
                target = torch.tensor(labels.values).double()
                loss = criterion(outputs.view(batch_size), target.view(-1,))
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                if idx % 1000 == 0:
                    print('epoch: {}, idx: {}, loss: {}'.format(epoch, idx, epoch_loss / 1000))
                    epoch_loss = 0
                idx += 1
        self.serialize()
    def test(self, data, expected, path):
        correct = 0
        incorrect = 0
        predictions = []
        batch_size = self.batch_size
        with torch.no_grad():
            for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
                inputs = data[i:i + batch_size]
                labels = expected[i:i + batch_size]
                embeddings = torch.tensor(
                    np.stack(self.word2vec.list_of_sentences2vec(inputs)),
                    dtype=torch.double)
                predicted = self.forward(embeddings)
                score = [1 if x > 0.5 else 0 for x in predicted]
                for x, y in zip(score, labels):
                    if x == y:
                        correct += 1
                    else:
                        incorrect += 1
                predictions.append(score)
        print(correct)
        print(incorrect)
        print(correct / (incorrect + correct))
        df = pd.DataFrame(np.asarray(predictions).reshape(int(len(data) / batch_size) * batch_size))
        df.reset_index(drop=True, inplace=True)
        # geval expects one label per line with no header row.
        df.to_csv(path, sep="\t", index=False, header=False)
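
serialize/load pickle the whole module object, which ties the saved file to this exact class definition. For comparison, a sketch of the conventional PyTorch state_dict round trip; this is only an alternative, not what the commit does:

# Alternative (not used in this commit): persist parameters only.
torch.save(model.state_dict(), 'model1.pt')

restored = FFN(300, 1, 300, 300, 0.01, 4, 100)
restored.load_state_dict(torch.load('model1.pt'))
restored.double()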

289579
train/out.tsv Normal file

File diff suppressed because it is too large

11
utils.py Normal file

@@ -0,0 +1,11 @@
import pandas as pd


def create_embeddings_file(data, path, func):
    """Apply the embedding function func to every line and cache the result."""
    out = []
    for line in data:
        out.append(func(line))
    df = pd.DataFrame(out)
    # index=False so load_embeddings_file round-trips without a spurious index column.
    df.to_csv(path, index=False)


def load_embeddings_file(path):
    return pd.read_csv(path)
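
For illustration, a hypothetical round trip through these helpers; the embed function here is just a stand-in for any line-to-vector callable:

# Hypothetical usage of the helpers above.
import numpy as np

texts = ["first post", "second post"]
embed = lambda line: np.zeros(3)  # stand-in embedding function
create_embeddings_file(texts, 'example_embeddings.csv', embed)
cached = load_embeddings_file('example_embeddings.csv')  # DataFrame of vectors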

15
word2vec.py Normal file

@@ -0,0 +1,15 @@
import gensim.downloader
import numpy as np


class Word2Vec():
    def __init__(self) -> None:
        self.model = None

    def load(self):
        # Pretrained 300-dimensional Google News vectors (large download on first use).
        self.model = gensim.downloader.load('word2vec-google-news-300')

    def sentence2vec(self, sentence):
        # Average along axis 0 to get one 300-dimensional sentence embedding;
        # without axis=0, np.mean would collapse everything to a single scalar.
        # Out-of-vocabulary words contribute zero vectors, and an empty
        # sentence falls back to the zero vector.
        vectors = [self.model[word] if word in self.model else np.zeros(300)
                   for word in sentence]
        return np.mean(vectors or [np.zeros(300)], axis=0)

    def list_of_sentences2vec(self, sentences):
        return [self.sentence2vec(x) for x in sentences]
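
A quick sanity check of the averaging, assuming gensim has already downloaded the Google News model:

w2v = Word2Vec()
w2v.load()
vec = w2v.sentence2vec(['the', 'ball', 'xyzunknownword'])
print(vec.shape)  # (300,): unknown words contribute zero vectors to the mean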