Compare commits

No commits in common. "master" and "master" have entirely different histories.

11 changed files with 0 additions and 600169 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

@@ -1,59 +0,0 @@
import random

import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
)


class DataWrapper(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def read_data(file_path):
    with open(file_path) as f:
        return f.readlines()


def write_output(file_path, texts):
    # Trainer.predict expects a Dataset, not raw strings: tokenize first,
    # wrap with dummy labels, then take the argmax over the two logits.
    encodings = tokenizer(texts, truncation=True, padding=True)
    dataset = DataWrapper(encodings, [0] * len(texts))
    predictions = trainer.predict(dataset).predictions.argmax(axis=-1)
    with open(file_path, 'w') as writer:
        for label in predictions:
            writer.write(f"{label}\n")


print("STEP 1 - READ DATA")
X_train = read_data('train/in.tsv')
y_train = read_data('train/expected.tsv')
X_dev = read_data('dev-0/in.tsv')
X_test = read_data('test-A/in.tsv')

print("STEP 2 - SHUFFLE")
# Shuffle and subsample 15000 training examples to keep fine-tuning tractable.
data_train = list(zip(X_train, y_train))
data_train = random.sample(data_train, 15000)

print("STEP 3 - FINE TUNING")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
train_encodings = tokenizer([text[0] for text in data_train], truncation=True, padding=True)
train_dataset = DataWrapper(train_encodings, [int(text[1]) for text in data_train])
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
args = TrainingArguments("model")
device = torch.device("cpu")
# device = torch.device("cuda")
model.to(device)
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
trainer.train()

print("STEP 4 - WRITE OUTPUT")
write_output('train/out.tsv', X_train)
write_output('dev-0/out.tsv', X_dev)
write_output('test-A/out.tsv', X_test)
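
A side note on the device lines above: Trainer moves the model to its own args.device when training starts, so the manual model.to(device) call is effectively redundant. A minimal sketch of pinning training to CPU through TrainingArguments instead, assuming a reasonably recent transformers release (no_cuda is the long-standing flag; newer versions also accept use_cpu=True):

# Let Trainer handle device placement; request CPU explicitly.
args = TrainingArguments("model", no_cuda=True)
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)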

geval (binary file not shown)

@@ -1,5 +0,0 @@
Likelihood 0.0000
Accuracy 0.7517
F1.0 0.6119
Precision 0.6848
Recall 0.5531
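
These scores were presumably produced by the geval binary checked in above. As a rough cross-check, here is a minimal sketch that recomputes the classification metrics with scikit-learn; it assumes a dev-0/expected.tsv gold file exists alongside dev-0/out.tsv (standard in this challenge layout), and the geval-specific Likelihood metric is not reproduced:

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Assumed paths: gold labels and the predictions written by the scripts above.
y_true = np.loadtxt('dev-0/expected.tsv', dtype=np.int32)
y_pred = np.loadtxt('dev-0/out.tsv', dtype=np.int32)
print(f"Accuracy  {accuracy_score(y_true, y_pred):.4f}")
print(f"F1.0      {f1_score(y_true, y_pred):.4f}")
print(f"Precision {precision_score(y_true, y_pred):.4f}")
print(f"Recall    {recall_score(y_true, y_pred):.4f}")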

@@ -1,94 +0,0 @@
import csv

import gensim.downloader
import numpy as np
import pandas as pd
import torch
from nltk import word_tokenize


class NeuralNetwork(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.sigmoid(x)
        return x


print('STEP 1 - LOAD DATA')
names = ['content', 'id', 'label']
# on_bad_lines='skip' replaces the deprecated error_bad_lines=False (pandas >= 1.3).
train_data_content = pd.read_table('train/in.tsv', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, names=names[:2])
train_data_labels = pd.read_table('train/expected.tsv', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, names=names[2:])
dev_data = pd.read_table('dev-0/in.tsv', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, names=names[:2])
test_data = pd.read_table('test-A/in.tsv', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, names=names[:2])

print('STEP 2 - SET PARAMS')
hidden_size = int(input('Hidden units size: ') or '600')
epochs = int(input("Epochs: ") or '5')
batch_size = int(input("Batch size: ") or '15')

print('STEP 3 - PREPROCESSING')
# Lowercase all content.
X_train = train_data_content['content'].str.lower()
y_train = train_data_labels['label']
X_dev = dev_data['content'].str.lower()
X_test = test_data['content'].str.lower()
# Tokenize the datasets.
X_train = [word_tokenize(content) for content in X_train]
X_dev = [word_tokenize(content) for content in X_dev]
X_test = [word_tokenize(content) for content in X_test]
# Embed each document as the mean of its Google News word2vec vectors
# (an all-zero vector for documents with no in-vocabulary words).
word2vec = gensim.downloader.load('word2vec-google-news-300')
X_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_train]
X_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_dev]
X_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in X_test]

print('STEP 4 - MODEL TRAINING')
# Prepare the neural model.
model = NeuralNetwork(300, hidden_size, 1)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for epoch in range(epochs):
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        # np.array first avoids the slow tensor-from-list-of-arrays path.
        X = torch.tensor(np.array(X_train[i:i+batch_size]))
        y = y_train[i:i+batch_size]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        outputs = model(X.float())
        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print('STEP 5 - PREDICTION')
y_dev, y_test = [], []
model.eval()
with torch.no_grad():
    for i in range(0, len(X_dev), batch_size):
        X = torch.tensor(np.array(X_dev[i:i+batch_size]))
        outputs = model(X.float())
        prediction = (outputs > 0.5)
        y_dev += prediction.tolist()
    for i in range(0, len(X_test), batch_size):
        X = torch.tensor(np.array(X_test[i:i+batch_size]))
        outputs = model(X.float())
        # Threshold this batch's outputs; the original reused the stale
        # `prediction` left over from the dev loop here.
        prediction = (outputs > 0.5)
        y_test += prediction.tolist()

print('STEP 6 - EXPORT RESULTS')
# Export results to TSV, one label per line.
y_dev = np.asarray(y_dev, dtype=np.int32)
y_test = np.asarray(y_test, dtype=np.int32)
y_dev.tofile('./dev-0/out.tsv', sep='\n')
y_test.tofile('./test-A/out.tsv', sep='\n')
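
The sigmoid-plus-BCELoss pairing above works, but folding the sigmoid into the loss is the numerically safer idiom. A hedged alternative sketch (a variant for illustration, not what this commit contained): the network returns raw logits, torch.nn.BCEWithLogitsLoss applies the log-sigmoid internally, and the 0.5 probability threshold becomes a 0.0 logit threshold at prediction time.

import torch

class NeuralNetworkLogits(torch.nn.Module):
    # Hypothetical variant of the NeuralNetwork class above.
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Return raw logits; the loss applies the sigmoid internally.
        return self.l2(torch.relu(self.l1(x)))

criterion = torch.nn.BCEWithLogitsLoss()
# At prediction time: prediction = (outputs > 0.0) instead of (outputs > 0.5)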

@@ -1,5 +0,0 @@
Likelihood 0.0000
Accuracy 0.8253
F1.0 0.7472
Precision 0.7659
Recall 0.7294

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

train/in.tsv (289579 deletions)

File diff suppressed because one or more lines are too long

train/out.tsv (289579 deletions)

File diff suppressed because it is too large