Compare commits
9 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
6e057483a7 | ||
|
a9ba5d5728 | ||
|
9531e6c569 | ||
|
567f498ee2 | ||
|
4b000457df | ||
|
c36ba1d489 | ||
|
0249514499 | ||
|
16771b5293 | ||
|
415cad97e2 |
2
.gitignore
vendored
2
.gitignore
vendored
@ -6,3 +6,5 @@
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
||||
|
||||
**/.vscode/*
|
||||
|
5272
dev-0/in.tsv
Normal file
5272
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
103
main.py
Normal file
103
main.py
Normal file
@ -0,0 +1,103 @@
|
||||
import os
|
||||
import sys
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
|
||||
import torch
|
||||
import random
|
||||
|
||||
# --- Challenge directory layout -------------------------------------------
# Each data directory holds an input file; predictions are written next to it.
IN_FILE_NAME = "in.tsv"
OUT_FILE_NAME = "out.tsv"
# Directory holding the training inputs and their expected labels.
TRAIN_PATH = "train"
EXP_FILE_NAME = "expected.tsv"
FILE_SEP = "\t"

# --- Model selection --------------------------------------------------------
# Alternative checkpoint that was tried previously:
# PT_MODEL_NAME = "bert-base-cased"
PT_MODEL_NAME = "roberta-base"
# Location where the fine-tuned model is saved to / reloaded from.
MODEL_OUT_NAME = "./model.tr"
|
||||
|
||||
|
||||
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a sequence that holds one
            entry per example (for instance the result of calling a
            tokenizer on a list of texts).
        labels: One class label per example, or ``None`` for unlabeled data
            such as prediction inputs.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding key, plus a ``'labels'`` tensor
        when labels were supplied.
        """
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples in the dataset."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, every encoding column has one entry per example,
        # so any column gives the dataset size (0 when there are no columns).
        return len(next(iter(self.encodings.values()), ()))
|
||||
|
||||
|
||||
def _predict_labels(trainer, tokenizer, texts):
    """Return the predicted class index, as a string, for each text."""
    if not texts:
        return []
    encodings = tokenizer(texts, truncation=True, padding=True)
    # CustomDataset needs one label per example; placeholders are supplied
    # here and the label-dependent parts of the prediction output are ignored.
    dataset = CustomDataset(encodings, [0] * len(texts))
    logits = trainer.predict(dataset).predictions
    if isinstance(logits, tuple):
        # Some models return several prediction arrays; logits come first.
        logits = logits[0]
    return [str(label) for label in logits.argmax(axis=-1)]


def main(dirnames):
    """Fine-tune (or reload) the classifier and write predictions.

    Reads the training inputs and expected labels from ``TRAIN_PATH``. If no
    model exists at ``MODEL_OUT_NAME`` yet, it fine-tunes ``PT_MODEL_NAME``
    on a random sample of at most 15000 training rows and saves the result
    there. For every directory in ``dirnames`` it then reads
    ``IN_FILE_NAME`` and writes one predicted label per line to
    ``OUT_FILE_NAME`` in the same directory.

    Args:
        dirnames: Directories that each contain an ``IN_FILE_NAME`` file.

    Raises:
        Exception: Propagated from ``get_tsv_data`` when a required input
            file is missing.
    """
    print("Reading train data...")
    train_set_features = get_tsv_data(
        os.path.join(TRAIN_PATH, IN_FILE_NAME), compressed=True)
    train_set_labels = get_tsv_data(
        os.path.join(TRAIN_PATH, EXP_FILE_NAME), compressed=True)

    print("Reading input data...")
    in_sets = []
    for d in dirnames:
        print(f"\tReading dir: {d}...")
        in_sets.append(get_tsv_data(
            os.path.join(d, IN_FILE_NAME), compressed=True))

    train_data = list(zip(train_set_features, train_set_labels))
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the amount of data actually available.
    train_data = random.sample(train_data, min(15000, len(train_data)))

    pt = os.path.exists(MODEL_OUT_NAME)
    mname = MODEL_OUT_NAME if pt else PT_MODEL_NAME
    # The tokenizer always comes from the base checkpoint: the Trainer below
    # is built without a tokenizer, so save_model() does not write tokenizer
    # files into MODEL_OUT_NAME.
    tokenizer = AutoTokenizer.from_pretrained(PT_MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        mname, num_labels=2)
    train_set_enc = tokenizer(
        [text[0] for text in train_data], truncation=True, padding=True)
    ds = CustomDataset(
        train_set_enc, [int(text[1]) for text in train_data])

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    trainer = Trainer(
        model=model,
        args=TrainingArguments("model"),
        train_dataset=ds
    )

    if not pt:
        print("Starting training...")
        trainer.train()
        trainer.save_model(MODEL_OUT_NAME)

    print("Predicting outputs...")

    for dirname, texts in zip(dirnames, in_sets):
        p_in = os.path.join(dirname, IN_FILE_NAME)
        p_out = os.path.join(dirname, OUT_FILE_NAME)
        print(f"\tPredicting for: {p_in}...")
        # Predict before opening the output file so a failed prediction does
        # not leave a truncated file behind.
        predicted = _predict_labels(trainer, tokenizer, texts)
        with open(p_out, "w", encoding="utf-8") as f:
            f.write('\n'.join(predicted))
        print(f"Saved predictions to file: {p_out}")
|
||||
|
||||
|
||||
def get_tsv_data(filename: str, compressed=False):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read.
        compressed: Accepted for compatibility with existing callers but
            currently unused; the file is always read as plain text.

    Returns:
        A list with one string per line, each keeping its line terminator.

    Raises:
        Exception: Whatever ``check_path`` raises when ``filename`` does not
            exist.
    """
    check_path(filename=filename)
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, encoding="utf-8") as f:
        return f.readlines()
|
||||
|
||||
|
||||
def check_path(filename: str) -> None:
    """Ensure that ``filename`` exists on disk.

    Args:
        filename: Path of a file or directory to check.

    Raises:
        FileNotFoundError: If nothing exists at ``filename``. This is a
            subclass of ``Exception``, so callers catching ``Exception``
            keep working.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Path {filename} does not exist!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Everything after the script name is a working directory to process.
    working_dirs = sys.argv[1:]
    if not working_dirs:
        raise Exception("Name of working dir not specified!")
    main(working_dirs)
|
27
model.tr/config.json
Normal file
27
model.tr/config.json
Normal file
@ -0,0 +1,27 @@
|
||||
{
|
||||
"_name_or_path": "roberta-base",
|
||||
"architectures": [
|
||||
"RobertaForSequenceClassification"
|
||||
],
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"bos_token_id": 0,
|
||||
"eos_token_id": 2,
|
||||
"gradient_checkpointing": false,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"max_position_embeddings": 514,
|
||||
"model_type": "roberta",
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"pad_token_id": 1,
|
||||
"position_embedding_type": "absolute",
|
||||
"problem_type": "single_label_classification",
|
||||
"transformers_version": "4.7.0",
|
||||
"type_vocab_size": 1,
|
||||
"use_cache": true,
|
||||
"vocab_size": 50265
|
||||
}
|
BIN
model.tr/training_args.bin
Normal file
BIN
model.tr/training_args.bin
Normal file
Binary file not shown.
5152
test-A/in.tsv
Normal file
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/in.tsv
Normal file
289579
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user