Add output

add model
Restore unpacked files
2021-06-22 22:58:14 +02:00 · 2021-06-22 22:46:19 +02:00 · 2021-06-22 20:27:41 +02:00 · 2021-06-22 20:27:03 +02:00 · 2021-06-22 18:13:51 +02:00 · 2021-06-22 16:41:34 +02:00
9 changed files with 310559 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@
 *.o
 .DS_Store
 .token
+
+**/.vscode/*
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/main.py
+++ b/main.py
@@ -0,0 +1,103 @@
+import os
+import sys
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+import torch
+import random
+
+# Directory holding the training split.
+TRAIN_PATH = "train"
+
+# File names looked up inside the training directory and each working directory.
+IN_FILE_NAME = "in.tsv"
+EXP_FILE_NAME = "expected.tsv"
+OUT_FILE_NAME = "out.tsv"
+
+# Tab character; presumably the column separator of the .tsv files
+# (not referenced by the code visible in this file).
+FILE_SEP = "\t"
+
+# Checkpoint name passed to from_pretrained(); "bert-base-cased" was the
+# commented-out alternative.
+PT_MODEL_NAME = "roberta-base"
+
+# Where the fine-tuned model is saved, and reloaded from when it already exists.
+MODEL_OUT_NAME = "./model.tr"
+
+
+class CustomDataset(torch.utils.data.Dataset):
+    """Map-style dataset over tokenizer encodings with optional labels.
+
+    Args:
+        encodings: Mapping from feature name to a per-example sequence; each
+            value is indexed with the example index in ``__getitem__``.
+        labels: Optional sequence with one label per example. Pass ``None``
+            (the default) for unlabeled data, e.g. inputs to predict on.
+    """
+
+    def __init__(self, encodings, labels=None):
+        self.encodings = encodings
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        """Return example ``idx`` as a dict of tensors.
+
+        The dict has one tensor per encoding key, plus ``'labels'`` when
+        labels were supplied.
+        """
+        item = {key: torch.tensor(val[idx])
+                for key, val in self.encodings.items()}
+        if self.labels is not None:
+            item['labels'] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        """Return the number of examples."""
+        if self.labels is not None:
+            return len(self.labels)
+        # Without labels, every encoding column has one entry per example,
+        # so the length of any column is the dataset size.
+        first_column = next(iter(self.encodings.values()), ())
+        return len(first_column)
+
+
+def main(dirnames):
+    """Fine-tune (or reload) the classifier and write predictions per directory.
+
+    If ``MODEL_OUT_NAME`` already exists, the model is loaded from it and
+    training is skipped. Otherwise the ``PT_MODEL_NAME`` checkpoint is
+    fine-tuned on a random sample of the training split and saved to
+    ``MODEL_OUT_NAME``. For every directory in ``dirnames`` the lines of its
+    ``IN_FILE_NAME`` are classified and the predicted class ids are written,
+    one per line, to ``OUT_FILE_NAME`` in the same directory.
+
+    Args:
+        dirnames: Sequence of directory paths, each containing ``IN_FILE_NAME``.
+    """
+    pt = os.path.exists(MODEL_OUT_NAME)
+
+    # The training split is only needed when a model has to be trained.
+    train_data = []
+    if not pt:
+        print("Reading train data...")
+        train_set_features = get_tsv_data(os.path.join(
+            TRAIN_PATH, IN_FILE_NAME), compressed=True)
+        train_set_labels = get_tsv_data(os.path.join(
+            TRAIN_PATH, EXP_FILE_NAME), compressed=True)
+        train_data = list(zip(train_set_features, train_set_labels))
+        # random.sample raises ValueError when asked for more items than
+        # exist, so cap the sample size at the amount of available data.
+        max_train_samples = 15000
+        train_data = random.sample(
+            train_data, min(max_train_samples, len(train_data)))
+
+    print("Reading input data...")
+    in_sets = []
+    for d in dirnames:
+        print(f"\tReading dir: {d}...")
+        in_sets.append(get_tsv_data(
+            os.path.join(d, IN_FILE_NAME), compressed=True))
+
+    # The Trainer below is created without a tokenizer, so save_model() does
+    # not write tokenizer files into MODEL_OUT_NAME. Always load the
+    # tokenizer from the base checkpoint.
+    tokenizer = AutoTokenizer.from_pretrained(PT_MODEL_NAME)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        MODEL_OUT_NAME if pt else PT_MODEL_NAME, num_labels=2)
+
+    device = torch.device(
+        'cuda') if torch.cuda.is_available() else torch.device('cpu')
+    model.to(device)
+
+    train_ds = None
+    if not pt:
+        train_set_enc = tokenizer(
+            [text for text, _ in train_data], truncation=True, padding=True)
+        train_ds = CustomDataset(
+            train_set_enc, [int(label) for _, label in train_data])
+
+    trainer = Trainer(
+        model=model,
+        args=TrainingArguments("model"),
+        train_dataset=train_ds
+    )
+
+    if not pt:
+        print("Starting training...")
+        trainer.train()
+        trainer.save_model(MODEL_OUT_NAME)
+
+    print("Predicting outputs...")
+
+    for dirname, in_lines in zip(dirnames, in_sets):
+        p_in = os.path.join(dirname, IN_FILE_NAME)
+        p_out = os.path.join(dirname, OUT_FILE_NAME)
+        print(
+            f"\tPredicting for: {p_in}...")
+        # Predict before opening the output file so that a failure does not
+        # leave a truncated, empty out file behind.
+        predicted = _predict_labels(trainer, tokenizer, in_lines)
+        with open(p_out, "w") as f:
+            f.write('\n'.join(predicted))
+        print(f"Saved predictions to file: {p_out}")
+
+
+def _predict_labels(trainer, tokenizer, lines):
+    """Return the predicted class id, as a string, for every line in ``lines``."""
+    if not lines:
+        return []
+    encodings = tokenizer(list(lines), truncation=True, padding=True)
+    # Trainer.predict expects a Dataset, not raw strings. Placeholder labels
+    # are passed because CustomDataset takes a labels sequence; they do not
+    # influence the returned logits.
+    dataset = CustomDataset(encodings, [0] * len(lines))
+    logits = trainer.predict(dataset).predictions
+    return [str(label) for label in logits.argmax(axis=-1)]
+
+
+def get_tsv_data(filename: str, compressed=False) -> list:
+    """Return the lines of ``filename`` as a list of strings.
+
+    Line terminators are kept (``readlines`` semantics).
+
+    Args:
+        filename: Path of the text file to read.
+        compressed: Accepted for compatibility with existing callers but not
+            used; the file is always read as plain text.
+
+    Raises:
+        Whatever ``check_path`` raises when ``filename`` does not exist.
+    """
+    check_path(filename=filename)
+    # Decode explicitly so the result does not depend on the platform locale.
+    with open(filename, encoding="utf-8") as f:
+        return f.readlines()
+
+
+def check_path(filename: str) -> None:
+    """Raise if ``filename`` does not exist on disk.
+
+    Args:
+        filename: Path to check with ``os.path.exists``.
+
+    Raises:
+        FileNotFoundError: If the path does not exist. This is a subclass of
+            ``Exception``, so callers catching ``Exception`` still work.
+    """
+    if not os.path.exists(filename):
+        raise FileNotFoundError(f"Path {filename} does not exist!")
+
+
+if __name__ == "__main__":
+    # Everything after the script name is treated as a working directory.
+    working_dirs = sys.argv[1:]
+    if not working_dirs:
+        raise Exception("Name of working dir not specified!")
+    main(working_dirs)
--- a/model.tr/config.json
+++ b/model.tr/config.json
@@ -0,0 +1,27 @@
+{
+  "_name_or_path": "roberta-base",
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "transformers_version": "4.7.0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}
--- a/model.tr/training_args.bin
+++ b/model.tr/training_args.bin
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/in.tsv
+++ b/train/in.tsv
Author	SHA1	Message	Date
nlitkowski	6e057483a7	Add output	2021-06-22 22:58:14 +02:00
nlitkowski	a9ba5d5728	add model	2021-06-22 22:46:19 +02:00
nlitkowski	9531e6c569	Restore unpacked files	2021-06-22 20:27:41 +02:00
nlitkowski	567f498ee2	Fix script	2021-06-22 20:27:03 +02:00
nlitkowski	4b000457df	fix script	2021-06-22 18:13:51 +02:00
nlitkowski	c36ba1d489	Add GPU	2021-06-22 16:41:34 +02:00
nlitkowski	0249514499	Add decompressed files	2021-06-22 15:59:06 +02:00
nlitkowski	16771b5293	Fix training args	2021-06-22 15:55:04 +02:00
nlitkowski	415cad97e2	Add script	2021-06-21 21:46:10 +02:00