Compare commits

...

9 Commits

Author SHA1 Message Date
nlitkowski
6e057483a7 Add output 2021-06-22 22:58:14 +02:00
nlitkowski
a9ba5d5728 add model 2021-06-22 22:46:19 +02:00
nlitkowski
9531e6c569 Restore unpacked files 2021-06-22 20:27:41 +02:00
nlitkowski
567f498ee2 Fix script 2021-06-22 20:27:03 +02:00
nlitkowski
4b000457df fix script 2021-06-22 18:13:51 +02:00
nlitkowski
c36ba1d489 Add GPU 2021-06-22 16:41:34 +02:00
nlitkowski
0249514499 Add decompressed files 2021-06-22 15:59:06 +02:00
nlitkowski
16771b5293 Fix training args 2021-06-22 15:55:04 +02:00
nlitkowski
415cad97e2 Add script 2021-06-21 21:46:10 +02:00
9 changed files with 310559 additions and 0 deletions

2
.gitignore vendored
View File

@ -6,3 +6,5 @@
*.o
.DS_Store
.token
**/.vscode/*

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

103
main.py Normal file
View File

@ -0,0 +1,103 @@
import os
import sys
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import random
# Name of the feature file read from the train directory and from every
# working directory given on the command line.
IN_FILE_NAME = "in.tsv"
# Name of the predictions file written into every working directory.
OUT_FILE_NAME = "out.tsv"
# Directory that holds the training data.
TRAIN_PATH = "train"
# Name of the training labels file inside TRAIN_PATH.
EXP_FILE_NAME = "expected.tsv"
# Field separator; not referenced by the code visible in this file.
FILE_SEP = "\t"
# Hugging Face checkpoint name handed to from_pretrained() as the starting
# point. The commented-out line is an alternative checkpoint.
# PT_MODEL_NAME = "bert-base-cased"
PT_MODEL_NAME = "roberta-base"
# Location the fine-tuned model is saved to. When this path already exists,
# main() loads the model from it and does not train again.
MODEL_OUT_NAME = "./model.tr"
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to a per-sample sequence of
    values, and ``labels`` is a sequence with one entry per sample. The
    dataset length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, in the mapping's own order,
        # followed by the sample's label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
def _predict_labels(trainer, tokenizer, texts):
    """Return the predicted class id, as a string, for each entry of *texts*."""
    if not texts:
        return []
    encodings = tokenizer(texts, truncation=True, padding=True)
    # CustomDataset always emits a 'labels' entry, so placeholder labels are
    # supplied; they influence only the loss metric that predict() reports,
    # never the logits themselves.
    dataset = CustomDataset(encodings, [0] * len(texts))
    logits = trainer.predict(dataset).predictions
    return [str(label) for label in logits.argmax(axis=-1)]


def main(dirnames):
    """Fine-tune (or reload) the classifier and write predictions per directory.

    If ``MODEL_OUT_NAME`` does not exist, a random sample of the training
    data is used to fine-tune ``PT_MODEL_NAME`` and the result is saved to
    ``MODEL_OUT_NAME``. Otherwise the saved model is loaded and training
    (including reading the training files) is skipped.

    For every directory in *dirnames*, ``IN_FILE_NAME`` is read and one
    predicted class id per input line is written to ``OUT_FILE_NAME`` in the
    same directory.

    Args:
        dirnames: Directories that each contain an ``IN_FILE_NAME`` file.

    Raises:
        Exception: If a required input file does not exist (raised by
            ``get_tsv_data``).
    """
    max_train_samples = 15000
    model_exists = os.path.exists(MODEL_OUT_NAME)

    train_data = []
    if not model_exists:
        print("Reading train data...")
        train_set_features = get_tsv_data(
            os.path.join(TRAIN_PATH, IN_FILE_NAME), compressed=True)
        train_set_labels = get_tsv_data(
            os.path.join(TRAIN_PATH, EXP_FILE_NAME), compressed=True)
        train_data = list(zip(train_set_features, train_set_labels))
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the sample size at the amount of data available.
        train_data = random.sample(
            train_data, min(max_train_samples, len(train_data)))

    print("Reading input data...")
    in_sets = []
    for d in dirnames:
        print(f"\tReading dir: {d}...")
        in_sets.append(get_tsv_data(
            os.path.join(d, IN_FILE_NAME), compressed=True))

    # The tokenizer is never handed to the Trainer, so save_model() does not
    # write tokenizer files into MODEL_OUT_NAME; always load it from the
    # base checkpoint instead.
    tokenizer = AutoTokenizer.from_pretrained(PT_MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_OUT_NAME if model_exists else PT_MODEL_NAME, num_labels=2)

    train_ds = None
    if not model_exists:
        train_set_enc = tokenizer(
            [text for text, _ in train_data], truncation=True, padding=True)
        train_ds = CustomDataset(
            train_set_enc, [int(label) for _, label in train_data])

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    trainer = Trainer(
        model=model,
        args=TrainingArguments("model"),
        train_dataset=train_ds
    )

    if not model_exists:
        print("Starting training...")
        trainer.train()
        trainer.save_model(MODEL_OUT_NAME)

    print("Predicting outputs...")
    for dirname, texts in zip(dirnames, in_sets):
        p_in = os.path.join(dirname, IN_FILE_NAME)
        p_out = os.path.join(dirname, OUT_FILE_NAME)
        print(
            f"\tPredicting for: {p_in}...")
        # Predict before opening the output file so that a failure does not
        # leave an empty or truncated predictions file behind.
        predictions = _predict_labels(trainer, tokenizer, texts)
        with open(p_out, "w") as f:
            f.write('\n'.join(predictions))
        print(f"Saved predictions to file: {p_out}")
def get_tsv_data(filename: str, compressed=False):
    """Return every line of *filename* as a list of strings.

    Line terminators are kept. The ``compressed`` argument is accepted but
    is not used by this function; the file is always opened as plain text.

    Raises:
        Exception: If *filename* does not exist (raised by ``check_path``).
    """
    check_path(filename=filename)
    with open(filename) as handle:
        lines = handle.readlines()
    return lines
def check_path(filename: str):
    """Raise an ``Exception`` if *filename* does not exist on disk.

    Returns ``None`` when the path exists.
    """
    if os.path.exists(filename):
        return
    raise Exception(f"Path {filename} does not exist!")
if __name__ == "__main__":
    # Every argument after the script name is treated as a working directory.
    working_dirs = sys.argv[1:]
    if not working_dirs:
        raise Exception("Name of working dir not specified!")
    main(working_dirs)

27
model.tr/config.json Normal file
View File

@ -0,0 +1,27 @@
{
"_name_or_path": "roberta-base",
"architectures": [
"RobertaForSequenceClassification"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"eos_token_id": 2,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"problem_type": "single_label_classification",
"transformers_version": "4.7.0",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50265
}

BIN
model.tr/training_args.bin Normal file

Binary file not shown.

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long