diff --git a/how_to_run b/how_to_run
index d04290f..2efa665 100644
--- a/how_to_run
+++ b/how_to_run
@@ -1,5 +1,6 @@
 install pytorch
 install transformers from source: pip install git+https://github.com/huggingface/transformers
 instal datasets: pip install datasets
+install tokenizers: pip install tokenizers
 to run: go do a solution directory and:
 bash run.sh
diff --git a/regular_roberta_from_scratch/1_train_tokenizer.py b/regular_roberta_from_scratch/1_train_tokenizer.py
index e0f3fe3..5b8c980 100644
--- a/regular_roberta_from_scratch/1_train_tokenizer.py
+++ b/regular_roberta_from_scratch/1_train_tokenizer.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from tokenizers import ByteLevelBPETokenizer
 
-paths = ['../train/in_text_only.csv']
+paths = ['./train_in.csv']
 
 # Initialize a tokenizer
 tokenizer = ByteLevelBPETokenizer()
diff --git a/regular_roberta_from_scratch/2_run.sh b/regular_roberta_from_scratch/2_run.sh
index 045eab8..ecb336c 100644
--- a/regular_roberta_from_scratch/2_run.sh
+++ b/regular_roberta_from_scratch/2_run.sh
@@ -8,8 +8,8 @@ python run_mlm.py \
     --validation_file ./dev-0_in.csv \
     --do_train \
     --do_eval \
-    --per_device_train_batch_size=64 \
-    --per_device_eval_batch_size=64 \
+    --per_device_train_batch_size=1 \
+    --per_device_eval_batch_size=1 \
     --gradient_accumulation_steps=4 \
     --fp16 False \
     --save_steps 1000 \
diff --git a/regular_roberta_from_scratch/tokenizer_model/tokenizer_config.json b/regular_roberta_from_scratch/tokenizer_model/tokenizer_config.json
new file mode 100644
index 0000000..cbc6ef8
--- /dev/null
+++ b/regular_roberta_from_scratch/tokenizer_model/tokenizer_config.json
@@ -0,0 +1 @@
+{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}
\ No newline at end of file
diff --git a/roberta_with_year_from_scratch/2_run.sh b/roberta_with_year_from_scratch/2_run.sh
index 1d94bf6..ecb336c 100644
--- a/roberta_with_year_from_scratch/2_run.sh
+++ b/roberta_with_year_from_scratch/2_run.sh
@@ -1,16 +1,15 @@
 #if [ -z ${CUDA_VISIBLE_DEVICES+x} ]; then echo "CUDA_VISIBLE_DEVICES NOT SET"; exit 1 ; else echo "using cuda devices '$CUDA_VISIBLE_DEVICES'"; fi
 python run_mlm.py \
-    --model_type roberta \
-    --config_name roberta_small_config.json \
+    --model_name_or_path roberta-base \
     --max_seq_length 64 \
     --output_dir ./robertamodel \
     --train_file ./train_in.csv \
     --validation_file ./dev-0_in.csv \
     --do_train \
     --do_eval \
-    --per_device_train_batch_size=64 \
-    --per_device_eval_batch_size=64 \
+    --per_device_train_batch_size=1 \
+    --per_device_eval_batch_size=1 \
    --gradient_accumulation_steps=4 \
     --fp16 False \
     --save_steps 1000 \
diff --git a/roberta_with_year_from_scratch/roberta_small_config.json b/roberta_with_year_from_scratch/roberta_small_config.json
deleted file mode 100644
index 6866bbc..0000000
--- a/roberta_with_year_from_scratch/roberta_small_config.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "architectures": [
-    "RobertaForMaskedLM"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "classifier_dropout": null,
-  "eos_token_id": 2,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 66,
-  "model_type": "roberta",
-  "num_attention_heads": 12,
- "num_hidden_layers": 6, - "pad_token_id": 1, - "position_embedding_type": "absolute", - "torch_dtype": "float32", - "transformers_version": "4.11.0.dev0", - "type_vocab_size": 1, - "use_cache": true, - "vocab_size": 50265 -}