!pip install transformers
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: tokenizers, safetensors, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2
!pip install transformers[torch]
Collecting accelerate>=0.20.2 (from transformers[torch])
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3
import torch
from transformers import (GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments,
                          TextDataset, DataCollatorForLanguageModeling)
import lzma
import sys
from tqdm import tqdm
from google.colab import drive
from torch.utils.data import DataLoader
# Challenge data directories on the mounted Google Drive.
DEV_0 = '/content/gdrive/MyDrive/dev-0'
TEST_A = '/content/gdrive/MyDrive/test-A'
TRAIN = '/content/gdrive/MyDrive/train'
drive.mount("/content/gdrive")
Mounted at /content/gdrive
!chmod +rwx './'
!chmod +x '/content/gdrive/MyDrive/dev-0'
!chmod +x '/content/gdrive/MyDrive/test-A'
def read_xz_file(fname):
    # Read an xz-compressed text file and return its lines, stripped of whitespace.
    with lzma.open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]
def get_contexts(input_text):
    # Columns 6 and 7 of the tab-separated line hold the left and right context.
    all_fields = input_text.replace(r'\n', ' ').split('\t')
    return {'left': all_fields[6], 'right': all_fields[7]}
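A quick, made-up example of what get_contexts expects: the dev-0/test-A lines are tab-separated with the left and right contexts in columns 6 and 7. The values below are invented, only to illustrate the column layout.

# Hypothetical line, not real challenge data.
sample_line = "id\tc1\tc2\tc3\tc4\tc5\tThe quick brown\tover the lazy dog"
ctx = get_contexts(sample_line)
print(ctx['left'])   # -> The quick brown
print(ctx['right'])  # -> over the lazy dog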
def predict_words(dataset, tokenizer, model, device):
    # For each entry, feed the left context to GPT-2 and sample a single extra token.
    preds = []
    for entry in tqdm(dataset):
        text = entry['left']
        src = tokenizer.encode(text, return_tensors="pt", truncation=True).to(device)
        output = model.generate(src, max_length=len(src[0]) + 1, do_sample=True, top_k=0,
                                temperature=0.8, num_return_sequences=1, no_repeat_ngram_size=2)
        generated_word = tokenizer.decode(output[0], skip_special_tokens=True).split(' ')[-1]
        preds.append(f'{generated_word.strip()}:0.99 :0.01')
    return preds
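predict_words is not actually called later in this notebook; a minimal sketch of how it could be wired up with read_xz_file and get_contexts (the output filename is made up):

# Sketch only: sample one next word per dev-0 line and write the two-bucket predictions.
dev_lines = read_xz_file(f"{DEV_0}/in.tsv.xz")
dev_dataset = [get_contexts(line) for line in dev_lines]
sampled_preds = predict_words(dev_dataset, tokenizer, model, device)
with open("out-sampled.tsv", "w") as out_f:  # hypothetical output path
    out_f.write("\n".join(sampled_preds) + "\n")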
def load_dataset(train_path, tokenizer):
    # Wrap the plain-text training file in a TextDataset with a causal-LM collator.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, data_collator
def write_to_text_parsed_file(fname):
    # Extract the second-to-last tab-separated column from each line of the
    # xz-compressed training file and dump it, one per line, to train_parsed.txt.
    with lzma.open(fname, mode='rt', encoding='utf-8') as f:
        result = [line.strip().split("\t")[-2] + "\n" for line in f.readlines()]
    with open("train_parsed.txt", "w") as file:
        file.writelines(result)
device = torch.device('cuda')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model: GPT2LMHeadModel = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)
model.to(device)
Downloading gpt2 files: vocab.json (1.04 MB), merges.txt (456 kB), config.json (665 B), model.safetensors (548 MB), generation_config.json (124 B)
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
SPECIAL_TOKENS = {
"mask_token": "[MASK]"
}
tokenizer.add_special_tokens(SPECIAL_TOKENS)
1
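Adding [MASK] grows the tokenizer's vocabulary to 50258 entries (the returned 1 is the number of tokens added), but the model's embedding matrix stays at 50257 and the token is never fed to the model below, so this is harmless here. If the token were actually used, the embeddings would normally be resized first, roughly like this:

# Only needed if the new [MASK] token is actually passed through the model.
model.resize_token_embeddings(len(tokenizer))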
write_to_text_parsed_file(f'{TRAIN}/in.tsv.xz')
!head -25000 train_parsed.txt > train_set.txt
train_dataset, data_collator = load_dataset("train_set.txt", tokenizer)
data = DataLoader(train_dataset, batch_size=1_000)  # not used below; the Trainer builds its own DataLoader
/usr/local/lib/python3.10/dist-packages/transformers/data/datasets/language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py warnings.warn(
training_args = TrainingArguments(
    output_dir="./gpt2_fine_tunes",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_steps=200,
    save_steps=400,
    prediction_loss_only=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn(
[3272/3272 32:34, Epoch 1/1]
| Step | Training Loss |
|------|---------------|
| 500  | 4.501800 |
| 1000 | 4.328500 |
| 1500 | 4.243500 |
| 2000 | 4.214600 |
| 2500 | 4.186200 |
| 3000 | 4.164800 |
TrainOutput(global_step=3272, training_loss=4.263899040688512, metrics={'train_runtime': 1955.7673, 'train_samples_per_second': 26.767, 'train_steps_per_second': 1.673, 'total_flos': 3419724791808000.0, 'train_loss': 4.263899040688512, 'epoch': 1.0})
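For reference, the final training loss of about 4.26 nats per token corresponds to a perplexity of roughly exp(4.26) ≈ 71:

import math
print(math.exp(4.263899))  # ≈ 71.1, perplexity implied by the reported training loss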
trainer.save_model()
model.eval()
with lzma.open(f"{DEV_0}/in.tsv.xz", encoding='utf8', mode="rt") as f:
with open("out-tuned.tsv", "w") as file:
for line in f:
line = line.strip("\n")
fields = line.split("\t")
left_context = fields[6]
left_context = left_context.replace("\\\\n", " ")
inputs = tokenizer.encode(left_context, return_tensors="pt").to(device)
outputs = model(inputs)
z_dist = outputs[0][0][-1]
prob_dist = torch.softmax(z_dist, dim=0)
topk_values, topk_indices = prob_dist.topk(20)
unk_bonus = 1 - sum(topk_values)
result =r""
for v, idx in zip(topk_values, topk_indices):
token = tokenizer.decode([idx])
token =str(token).strip(" ")
if token.isalnum():
result = result + token + ":"+str(v.item())+" "
else:
unk_bonus+=v.item()
result+=f":{unk_bonus}"
file.write(result+"\n")
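Each written line holds up to 20 word:probability pairs plus a trailing bare :mass entry for the remaining probability, so the values on a line should sum to roughly 1. A small sanity check, as a sketch that reads back the file just written:

# Sketch: verify that the probability mass on the first output line adds up to ~1.
with open("out-tuned.tsv") as check_f:
    tokens = check_f.readline().split()
total = sum(float(tok.split(":")[-1]) for tok in tokens)
print(total)  # expected to be close to 1.0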
with lzma.open(f"{TEST_A}/in.tsv.xz", encoding='utf8', mode="rt") as f:
with open("out-tuned_to_test.tsv", "w") as file:
for line in f:
line = line.strip("\n")
fields = line.split("\t")
left_context = fields[6]
left_context = left_context.replace("\\\\n", " ")
inputs = tokenizer.encode(left_context, return_tensors="pt").to(device)
outputs = model(inputs)
z_dist = outputs[0][0][-1]
prob_dist = torch.softmax(z_dist, dim=0)
topk_values, topk_indices = prob_dist.topk(20)
unk_bonus = 1 - sum(topk_values)
result =r""
for v, idx in zip(topk_values, topk_indices):
token = tokenizer.decode([idx])
token =str(token).strip(" ")
if token.isalnum():
result = result + token + ":"+str(v.item())+" "
else:
unk_bonus+=v.item()
result+=f":{unk_bonus}"
file.write(result+"\n")