commit 5308358ad7
parent 9d5d178d3d
Author: Jakub Pokrywka
Date:   2021-11-04 12:06:29 +01:00

6 changed files with 8 additions and 34 deletions


@@ -1,5 +1,6 @@
 install pytorch
 install transformers from source: pip install git+https://github.com/huggingface/transformers
 instal datasets: pip install datasets
+install tokenizers: pip install tokenizers
 to run: go do a solution directory and: bash run.sh


@@ -1,7 +1,7 @@
 from pathlib import Path
 from tokenizers import ByteLevelBPETokenizer
-paths = ['../train/in_text_only.csv']
+paths = ['./train_in.csv']
 # Initialize a tokenizer
 tokenizer = ByteLevelBPETokenizer()
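Only the training-file path changes in this hunk; the rest of the tokenizer script lies outside it. A minimal sketch of how a ByteLevelBPETokenizer is typically trained and saved on such a file — the train() arguments and output directory below are assumptions, not taken from this commit:

    from tokenizers import ByteLevelBPETokenizer

    paths = ['./train_in.csv']
    tokenizer = ByteLevelBPETokenizer()
    # Learn a byte-level BPE vocabulary from the raw training text
    # (vocab_size matching roberta-base is an assumed choice here).
    tokenizer.train(files=paths, vocab_size=50265, min_frequency=2,
                    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])
    # Writes vocab.json and merges.txt to the current directory.
    tokenizer.save_model('.')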


@@ -8,8 +8,8 @@ python run_mlm.py \
 --validation_file ./dev-0_in.csv \
 --do_train \
 --do_eval \
---per_device_train_batch_size=64 \
---per_device_eval_batch_size=64 \
+--per_device_train_batch_size=1 \
+--per_device_eval_batch_size=1 \
 --gradient_accumulation_steps=4 \
 --fp16 False \
 --save_steps 1000 \
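With per_device_train_batch_size=1 and gradient_accumulation_steps=4, the effective batch size drops to 1 × 4 = 4 examples per device (previously 64 × 4 = 256), assuming a single GPU: gradients from 4 forward/backward passes are accumulated before each optimizer step.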


@@ -0,0 +1 @@
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}


@@ -1,16 +1,15 @@
 #if [ -z ${CUDA_VISIBLE_DEVICES+x} ]; then echo "CUDA_VISIBLE_DEVICES NOT SET"; exit 1 ; else echo "using cuda devices '$CUDA_VISIBLE_DEVICES'"; fi
 python run_mlm.py \
---model_type roberta \
---config_name roberta_small_config.json \
+--model_name_or_path roberta-base \
 --max_seq_length 64 \
 --output_dir ./robertamodel \
 --train_file ./train_in.csv \
 --validation_file ./dev-0_in.csv \
 --do_train \
 --do_eval \
---per_device_train_batch_size=64 \
---per_device_eval_batch_size=64 \
+--per_device_train_batch_size=1 \
+--per_device_eval_batch_size=1 \
 --gradient_accumulation_steps=4 \
 --fp16 False \
 --save_steps 1000 \
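The script now fine-tunes the pretrained roberta-base checkpoint via --model_name_or_path instead of training a small model from scratch via --model_type and --config_name. After run.sh completes, the checkpoint in ./robertamodel could be probed for masked-token predictions roughly like this (illustrative only, not part of the repo):

    from transformers import pipeline

    # Assumes training finished and ./robertamodel contains the saved
    # model and tokenizer; the input sentence is purely illustrative.
    fill = pipeline('fill-mask', model='./robertamodel')
    for pred in fill('The weather is <mask> today.'):
        print(pred['token_str'], round(pred['score'], 3))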


@@ -1,27 +0,0 @@
-{
-  "architectures": [
-    "RobertaForMaskedLM"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "classifier_dropout": null,
-  "eos_token_id": 2,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 66,
-  "model_type": "roberta",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 6,
-  "pad_token_id": 1,
-  "position_embedding_type": "absolute",
-  "torch_dtype": "float32",
-  "transformers_version": "4.11.0.dev0",
-  "type_vocab_size": 1,
-  "use_cache": true,
-  "vocab_size": 50265
-}
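The deleted JSON appears to be the from-scratch config that the old run.sh referenced as --config_name roberta_small_config.json: 6 hidden layers and max_position_embeddings of 66, which lines up with --max_seq_length 64 plus RoBERTa's position-id offset of 2. Roughly the same model could be built programmatically — a sketch, not part of the commit:

    from transformers import RobertaConfig, RobertaForMaskedLM

    # Approximates the deleted config: a 6-layer RoBERTa trained from scratch.
    config = RobertaConfig(
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=6,
        num_attention_heads=12,
        intermediate_size=3072,
        max_position_embeddings=66,
        type_vocab_size=1,
    )
    model = RobertaForMaskedLM(config)  # randomly initialized weights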