Compare commits


4 Commits

Author      SHA1        Message   Date
wangobango  962ca45b2c  final     2021-06-22 17:16:57 +02:00
wangobango  43dbf81d83  change    2021-06-22 14:03:36 +02:00
wangobango  023a4e4361  progress  2021-06-20 22:05:07 +02:00
wangobango  dbadedfc1c  w123      2021-06-20 19:42:14 +02:00
9 changed files with 6429 additions and 290862 deletions

5
.gitignore vendored

@@ -6,8 +6,3 @@
*.o
.DS_Store
.token
venv/*
*.pickle
.idea/*
.vscode/*
in.tsv

File diff suppressed because it is too large

92
generate.py Normal file

@@ -0,0 +1,92 @@
import pandas as pd
from transformers import BertTokenizer, AdamW, AutoModelForSequenceClassification
import torch
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import csv
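# Torch Dataset wrapping tokenizer encodings and labels in the form the HF Trainer
# expects; mirrors the class in main.py, though inference below batches raw tensors directly.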
class Dataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
item["labels"] = torch.tensor([self.labels[idx]])
return item
def __len__(self):
return len(self.labels)
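# Write one predicted label per line to a TSV file (used for dev-0/out.tsv and test-A/out.tsv).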
def save_tsv_result(path, data):
with open(path, "w") as save:
writer = csv.writer(save, delimiter='\t', lineterminator='\n')
for value in [str(x) for x in data]:
writer.writerow([value])
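# Batched inference with gradients disabled: feed input ids and attention masks
# to the model in chunks of 60 and take the argmax of the logits as the predicted label.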
def predictions_for_set(inputs, masks):
predictions = []
with torch.no_grad():
batch_size = 60
for i in range(0, len(inputs), batch_size):
preds = model(inputs[i: i + batch_size].to(device),
masks[i: i + batch_size].to(device))
preds = preds.logits.detach().cpu().numpy()
preds = np.argmax(preds, axis=1)
predictions += preds.tolist()
return predictions
device = torch.device('cuda')
# train_texts = \
# pd.read_csv('train/in.tsv.xz', compression='xz', sep='\t',
# header=None, error_bad_lines=False, quoting=3)[0].tolist()
# train_labels = pd.read_csv(
# 'train/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()
dev_texts = pd.read_csv('dev-0/in.tsv.xz', compression='xz',
sep='\t', header=None, quoting=3)[0].tolist()
dev_labels = pd.read_csv('dev-0/expected.tsv', sep='\t',
header=None, quoting=3)[0].tolist()
test_texts = pd.read_csv('test-A/in.tsv.xz', compression='xz', sep='\t',
header=None, error_bad_lines=False, quoting=3)[0].tolist()
model_name = "bert-base-uncased-pretrained"
model = BertForSequenceClassification.from_pretrained(
model_name, num_labels=len(pd.unique(dev_labels))).to(device)
max_length = 512
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
# model.load_pretrained(model_path)
# tokenizer.load_pretrainded(model_path)
# train_encodings = tokenizer(
# train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(
dev_texts, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(
test_texts, truncation=True, padding=True, max_length=max_length)
input_ids_val = torch.tensor(valid_encodings.data['input_ids'])
attention_mask_val = torch.tensor(valid_encodings.data['attention_mask'])
input_ids_test = torch.tensor(test_encodings.data['input_ids'])
attention_mask_test = torch.tensor(test_encodings.data['attention_mask'])
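# Score the dev set (classification report, accuracy, F1), then dump predictions for both sets.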
predictions = predictions_for_set(input_ids_val, attention_mask_val)
print("Predictions for dev set:")
print(classification_report(dev_labels, predictions))
print(accuracy_score(dev_labels, predictions))
print(f1_score(dev_labels, predictions))
save_tsv_result("dev-0/out.tsv", predictions)
predictions = predictions_for_set(input_ids_test, attention_mask_test)
save_tsv_result("test-A/out.tsv", predictions)

128
main.py

@@ -1,54 +1,90 @@
"""
Homework assignment
Pick one of the repositories below and fork it:
https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public
https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public
Build a classifier based on a simple feed-forward neural network in PyTorch (you can base it on this Jupyter notebook).
Instead of tf-idf, please use some dense representation (e.g. word2vec).
Generate predictions in the files dev-0/out.tsv and test-A/out.tsv.
The accuracy score, checked with the geval tool (see the previous assignment), should be at least 0.67.
Please put the predictions and the generating scripts (as plain text, not Jupyter) in the repo, and post a link to
your repo in MS Teams. Deadline: 25.05, 70 points.
"""
import pandas as pd
import spacy
from net import FFN
import numpy as np
import torch
from utils import create_embeddings_file, load_embeddings_file
from nltk.tokenize import word_tokenize
# sp = spacy.load('en_core_web_sm')
from transformers.file_utils import is_torch_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.metrics import accuracy_score
import pandas as pd
# def word2vec(word):
# return sp(word).vector
# return np.random.uniform(low=0.0, high=1.0, size=(384,))
train_data = pd.read_csv("train/in.tsv", sep='\t')
train_data.columns = ['PostText', 'Timestamp']
train_expected = pd.read_csv("train/expected.tsv", sep='\t')
train_expected.columns = ['Label']
class Dataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
# test_data = pd.read_csv("test-A/in.tsv", sep='\t')
# test_data.columns = ['PostText', 'Timestamp']
def __getitem__(self, idx):
item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
item["labels"] = torch.tensor([self.labels[idx]])
return item
# dev_data = pd.read_csv('dev-0/in.tsv', sep='\t')
# dev_data.columns = ['PostText', 'Timestamp']
# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t')
# dev_expected.columns = ['Label']
def __len__(self):
return len(self.labels)
# create_embeddings_file(dev_data['PostText'], 'dev-0/embeddings.csv', word2vec)
# create_embeddings_file(test_data['PostText'], 'test-A/embeddings.csv', word2vec)
# create_embeddings_file(train_data['PostText'], 'train/embeddings.csv', word2vec)
# train_data = load_embeddings_file('train/embeddings.csv').to_numpy()
# dev_data = load_embeddings_file('dev-0/embeddings.csv').to_numpy()
# test_data = load_embeddings_file('test-A/embeddings.csv').to_numpy()
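# Seed python, numpy and torch (CPU and CUDA) so runs are reproducible.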
def set_seed(seed: int):
random.seed(seed)
np.random.seed(seed)
if is_torch_available():
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
model = FFN(300, 1, 300, 300, 0.01, 4, 100)
# model.double()
# model.train([np.asarray(word_tokenize(x)) for x in train_data['PostText']], train_expected['Label'])
model.load()
model.double()
model.test([np.asarray(word_tokenize(x)) for x in train_data['PostText']], train_expected['Label'], "train/out.tsv")
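# Metric callback for the HF Trainer: report plain accuracy on the eval set.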
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
acc = accuracy_score(labels, preds)
return {
'accuracy': acc,
}
set_seed(1)
train_texts = \
pd.read_csv('train/in.tsv.xz', compression='xz', sep='\t', header=None, error_bad_lines=False, quoting=3)[0].tolist()[:25000]
train_labels = pd.read_csv('train/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()[:25000]
dev_texts = pd.read_csv('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)[0].tolist()[:1000]
dev_labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()[:1000]
# test_texts = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
model_name = "bert-base-uncased"
max_length = 25
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=max_length)
train_dataset = Dataset(train_encodings, train_labels)
valid_dataset = Dataset(valid_encodings, dev_labels)
model = BertForSequenceClassification.from_pretrained(
model_name, num_labels=len(pd.unique(train_labels))).to("cuda")
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=1, # total number of training epochs
per_device_train_batch_size=60, # batch size per device during training
per_device_eval_batch_size=60, # batch size for evaluation
warmup_steps=100, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
load_best_model_at_end=True, # load the best model when finished training (default metric is loss)
# but you can specify `metric_for_best_model` argument to change to accuracy or other metric
logging_steps=200, # log & save weights each logging_steps
evaluation_strategy="steps", # evaluate each `logging_steps`
)
trainer = Trainer(
model=model, # the instantiated Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=valid_dataset, # evaluation dataset
compute_metrics=compute_metrics, # the callback that computes metrics of interest
)
trainer.train()
trainer.evaluate()
model_path = "bert-base-uncased-pretrained"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

103
net.py

@@ -1,103 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
import pickle
import numpy as np
import pandas as pd
from word2vec import Word2Vec
class FFN(nn.Module):
def __init__(self, input_dim, output_dim, hidden1_size, hidden2_size, lr, epochs, batch_size):
super(FFN, self).__init__()
self.path = 'model1.pickle'
self.lr = lr
self.epochs = epochs
self.output_dim = output_dim
self.word2vec = Word2Vec()
self.word2vec.load()
self.batch_size = batch_size
self.input_dim = input_dim
self.fc1 = nn.Linear(batch_size, hidden1_size)
self.fc2 = nn.Linear(hidden1_size, hidden2_size)
self.fc3 = nn.Linear(hidden2_size, hidden2_size)
self.fc4 = nn.Linear(hidden2_size, hidden2_size)
self.fc5 = nn.Linear(hidden2_size, batch_size)
def forward(self, data):
data = F.relu(self.fc1(data))
data = F.relu(self.fc2(data))
data = F.relu(self.fc3(data))
data = F.relu(self.fc4(data))
data = F.sigmoid(self.fc5(data))
return data
def serialize(self):
with open(self.path, 'wb') as file:
pickle.dump(self, file)
def load(self):
with open(self.path, 'rb') as file:
self = pickle.load(file)
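# note: this rebinds only the local name `self`, so load() does not actually restore
# the pickled model for the caller; returning pickle.load(file) (or copying its state
# into self) would be needed for the loaded weights to take effect.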
def batch(self, iterable, n=1):
l = len(iterable)
for ndx in range(0, l, n):
yield iterable[ndx:min(ndx + n, l)]
"""
data is a tuple of embedding vector and a label of 0/1
"""
def train(self, data, expected):
self.zero_grad()
criterion = torch.nn.BCELoss()
optimizer = optim.Adam(self.parameters(), lr=self.lr)
batch_size = self.batch_size
num_of_classes = self.output_dim
for epoch in range(self.epochs):
epoch_loss = 0.0
idx = 0
for i in range(0, int(len(data)/batch_size)*batch_size, batch_size):
inputs = data[i:i + batch_size]
labels = expected[i:i+ batch_size]
optimizer.zero_grad()
outputs = self.forward(torch.tensor(self.word2vec.list_of_sentences2vec(inputs)))
target = torch.tensor(labels.values).double()
loss = criterion(outputs.view(batch_size), target.view(-1,))
loss.backward()
optimizer.step()
epoch_loss += loss.item()
if(idx % 1000 == 0):
print('epoch: {}, idx: {}, loss: {}'.format(epoch, idx, epoch_loss/1000))
epoch_loss = 0
idx += 1
self.serialize()
def test(self, data, expected, path):
correct = 0
incorrect = 0
total = 0
predictions = []
batch_size = self.batch_size
for i in range(0, int(len(data)/batch_size)*batch_size, batch_size):
inputs = data[i:i + batch_size]
labels = expected[i:i+ batch_size]
predicted = self.forward(torch.tensor(self.word2vec.list_of_sentences2vec(inputs)))
score = [1 if x > 0.5 else 0 for x in predicted]
for x, y in zip(score, labels):
if(x == y):
correct += 1
else:
incorrect += 1
predictions.append(score)
print(correct)
print(incorrect)
print(correct/(incorrect + correct))
df = pd.DataFrame(np.asarray(predictions).reshape(int(len(data)/batch_size)*batch_size))
df.reset_index(drop=True, inplace=True)
df.to_csv(path, sep="\t", index=False)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large

289579
train/out.tsv

File diff suppressed because it is too large

11
utils.py

@@ -1,11 +0,0 @@
import pandas as pd
def create_embeddings_file(data, path, func):
out = []
for line in data:
out.append(func(line))
df = pd.DataFrame(out)
df.to_csv(path)
def load_embeddings_file(path):
return pd.read_csv(path)

15
word2vec.py

@@ -1,15 +0,0 @@
import gensim.downloader
import numpy as np
class Word2Vec():
def __init__(self) -> None:
pass
def load(self):
self.model = gensim.downloader.load('word2vec-google-news-300')
def sentence2vec(self, sentence):
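# note: np.mean without axis=0 averages over words *and* embedding dimensions,
# so this returns a single scalar per sentence rather than a 300-dim vector.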
return np.mean([self.model[word] if word in self.model else np.zeros(300) for word in sentence])
def list_of_sentences2vec(self, sentences):
return [self.sentence2vec(x) for x in sentences]