16 KiB
16 KiB
!pip install transformers
!pip install torch
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting transformers Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0) Collecting huggingface-hub<1.0,>=0.14.1 (from transformers) Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1) Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0) Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31) Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1) Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers) Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.6 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0) Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0) Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.5.0) Requirement already satisfied: urllib3<1.27,>=1.21.1 in 
/usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7) Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4) Installing collected packages: tokenizers, huggingface-hub, transformers Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2 Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118) Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0) Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1) Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1) Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2) Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0) Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2) Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2) Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)
!git clone --single-branch git://gonito.net/challenging-america-word-gap-prediction -b master
Cloning into 'challenging-america-word-gap-prediction'... remote: Wymienianie obiektów: 27, gotowe.[K remote: Zliczanie obiektów: 100% (27/27), gotowe.[K remote: Kompresowanie obiektów: 100% (23/23), gotowe.[K remote: Razem 27 (delty 2), użyte ponownie 17 (delty 0), paczki użyte ponownie 0[K Receiving objects: 100% (27/27), 278.33 MiB | 8.52 MiB/s, done. Resolving deltas: 100% (2/2), done.
import torch
import sys
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import lzma
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset
import itertools
%cd /content/challenging-america-word-gap-prediction
/content/challenging-america-word-gap-prediction
# Run on GPU when available; the model and every input tensor must live on
# the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Pretrained GPT-2 (small) tokenizer and language-modeling head from the
# Hugging Face hub.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
def prediction(word: str) -> str:
    """Predict the next token after *word* with GPT-2.

    Encodes *word*, runs the language model, and returns the top-5
    next-token candidates for the last position, formatted as the
    challenge expects: ``token:prob token:prob ... :remainder`` where the
    trailing ``:remainder`` entry carries the probability mass not covered
    by the listed tokens.

    Fixes over the previous version:
    - output pairs were emitted as ``prob:token`` (reversed) and printed
      raw ``tensor(...)`` reprs instead of plain floats;
    - inference now runs under ``torch.no_grad()``;
    - the ``'<unk>'`` special-case was dead code (GPT-2's vocabulary has
      no ``<unk>`` token) and has been replaced by the remainder entry.
    """
    input_ids = tokenizer.encode(word, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only — gradients are never needed here
        logits = model(input_ids)[0]
    # Probability distribution over the vocabulary for the *last* position.
    prob_dist = torch.softmax(logits[0, -1], dim=0)
    values, indices = prob_dist.topk(5)
    pairs = []
    for prob, idx in zip(values, indices):
        tok = tokenizer.decode(idx).strip()
        if tok:  # drop whitespace-only tokens, which would corrupt the format
            pairs.append((tok, prob.item()))
    # Probability mass left for "any other token".
    remainder = max(0.0, 1.0 - sum(p for _, p in pairs))
    formatted = ' '.join(f'{tok}:{p}' for tok, p in pairs)
    return f'{formatted} :{remainder}'
def create_outputs(folder_name):
    """Write next-word predictions for one data split.

    Reads ``{folder_name}/in.tsv.xz``, takes the last word of the left
    context (tab-separated column 6) of each row and writes exactly one
    prediction line per input row to ``{folder_name}/out.tsv``.

    Fix: an empty left context used to raise IndexError on ``split()[-1]``
    and abort the whole run; it now falls back to an empty prefix so the
    output stays aligned line-for-line with the input.
    """
    print(f'Creating outputs in {folder_name}')
    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid, \
            open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        for line in fid:
            fields = line.split('\t')
            # Column 6 holds the left context; '\n' occurs as a literal
            # two-character escape in the data, so treat it as a space.
            words = fields[6].replace(r'\n', ' ').split()
            prefix = words[-1] if words else ''
            f.write(prediction(prefix) + '\n')
# Generate the prediction files for both evaluation splits.
create_outputs('dev-0')
create_outputs('test-A')
Creating outputs in dev-0 Creating outputs in test-A
# Re-load the tokenizer and model for the exploratory cells below.
# NOTE(review): this copy stays on the CPU, unlike the earlier one which
# was moved to `device` — confirm that is intentional.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
def get_words_from_line(line):
    """Yield the whitespace tokens of *line*, wrapped in the sentence
    boundary markers ``<s>`` and ``</s>``."""
    yield '<s>'
    yield from line.rstrip().split()
    yield '</s>'
def get_word_lines_from_file(file_name):
    """Stream lines from the xz-compressed text file *file_name*.

    Each line is yielded with markup noise removed: ``^^`` markers,
    newline characters, literal double backslashes, and angle/round
    brackets.
    """
    noise = r'\^\^|\n|\\\\|[<>]|[()]'
    with lzma.open(file_name, mode='rt', encoding='utf8') as handle:
        for raw_line in handle:
            yield re.sub(noise, '', raw_line)
# Exploratory cell: run GPT-2 on the first training line and inspect the
# top-20 next-token candidates at every position.  The `break` stops after
# a single line; the second loop below relies on the `values`/`index`
# tensors that iteration leaves behind.
for line in get_word_lines_from_file("train/in.tsv.xz"):
    # line = line.strip('\n')
    # fields = line.split("\t")
    # print(line)
    left_context = str(line)
    input_ids = tokenizer.encode(left_context, return_tensors="pt")
    # print(input_ids)
    output = model(input_ids)
    # print(output[0].shape())
    # Softmax over the vocabulary axis of the last batch element's logits.
    prob_dist=torch.softmax(output[0][-1],dim=1)
    values,index =prob_dist.topk(20)
    print(left_context[-100:])
    print(values.size())
    print(index.size())
    break
# Print the top-20 (probability, token-id, decoded token) triples for each
# position, one separator line per position.
for x,indx in zip(values,index):
    for i in range(20):
        token = tokenizer.decode(indx[i])
        print(f'{x[i]} {indx[i]} {token}')
    print('-------------------------')
# Exploratory cell: top-5 next-token candidates after the prompt "he".
# line = line.strip('\n')
# fields = line.split("\t")
# print(line)
left_context = "he"
input_ids = tokenizer.encode(left_context, return_tensors="pt")
# print(input_ids)
output = model(input_ids)
# print(output[0].shape())
# Softmax over the vocabulary axis; topk over the last position below.
prob_dist=torch.softmax(output[0][-1],dim=1)
values,index =prob_dist.topk(5)
token = []
for x in index[-1]:
    token.append(tokenizer.decode(x))
# print(token)
# Pair each top-5 probability of the final position with its decoded token.
for x,token in zip(values[-1],token):
    # token = tokenizer.decode(indx)
    print(f'{x} {token}')
# Exploratory cell: for each dev-0 line, print the last 100 characters of
# the context followed by the top-20 (probability, token-id, token)
# candidates for the final position.
for line in get_word_lines_from_file("dev-0/in.tsv.xz"):
    # line = line.strip('\n')
    # fields = line.split("\t")
    # print(line)
    left_context = str(line)
    input_ids = tokenizer.encode(left_context, return_tensors="pt")
    # print(input_ids)
    output = model(input_ids)
    # print(output[0].shape())
    prob_dist=torch.softmax(output[0][-1],dim=1)
    values,index =prob_dist.topk(20)
    print(left_context[-100:])
    # print(values.size())
    # print(index.size())
    # print(values[])
    # break
    # Last position only: decode and print each of the 20 candidates.
    for x,indx in zip(values[-1],index[-1]):
        token = tokenizer.decode(indx)
        print(f'{x} {indx} {token}')
    print('-------------------------')
# Scratch cell: decode a single token id (256) and poke at topk results.
token = tokenizer.decode(256 )
print(token)
# NOTE(review): `top_indices` / `top_probs` are referenced here *before*
# the lines that define them — this only works with out-of-order notebook
# execution; confirm cell order before re-running top to bottom.
top_indices[0]
top_probs[0]
# Top-20 of the most recently computed `prob_dist`, as plain Python lists.
top =prob_dist.topk(20)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = tokenizer.decode(top_indices)
print(top_words,'\n',top_indices,'\n',top_probs)
print(index[1])
print(prob_dist.topk(2)[0].size())