challenging-america-word-ga.../lab12.ipynb
JulianZablonski 3b0cab7eef zad12
2023-06-08 17:34:01 +02:00


!pip install transformers
!pip install torch
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 105.1 MB/s eta 0:00:00
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 236.8/236.8 kB 33.8 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 117.6 MB/s eta 0:00:00
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)
Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.5.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)
Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)
Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)
Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)
Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)
Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)
Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)
Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)
!git clone --single-branch git://gonito.net/challenging-america-word-gap-prediction -b master
Cloning into 'challenging-america-word-gap-prediction'...
remote: Enumerating objects: 27, done.
remote: Counting objects: 100% (27/27), done.
remote: Compressing objects: 100% (23/23), done.
remote: Total 27 (delta 2), reused 17 (delta 0), pack-reused 0
Receiving objects: 100% (27/27), 278.33 MiB | 8.52 MiB/s, done.
Resolving deltas: 100% (2/2), done.
import torch
import sys
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import lzma
from itertools import islice
import regex as re
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset
import itertools
%cd /content/challenging-america-word-gap-prediction
/content/challenging-america-word-gap-prediction

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
def prediction(word: str) -> str:
    # Encode the left context and run GPT-2 without tracking gradients.
    left_context = tokenizer.encode(word, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(left_context)
    # Per-position distributions over the vocabulary; keep the top 5 for the last position.
    prob_dist = torch.softmax(out[0][-1], dim=1)
    values, index = prob_dist.topk(5)
    # Strip the leading space that GPT-2's byte-pair tokens usually carry.
    tokens = [tokenizer.decode(x).strip() for x in index[-1]]
    zipped = list(zip(tokens, values[-1]))
    # If '<unk>' is among the candidates, hand its probability to the empty word;
    # otherwise sacrifice the last candidate for the same purpose.
    unk = None
    for i, element in enumerate(zipped):
        if element[0] == '<unk>':
            unk = zipped.pop(i)
            zipped.append(('', unk[1]))
            break
    if unk is None:
        zipped[-1] = ('', zipped[-1][1])
    # Output format: space-separated 'word:prob' pairs, with ':prob' covering the rest.
    return ' '.join(f'{token}:{prob.item()}' for token, prob in zipped)
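A quick, illustrative sanity check (not part of the original run): one prediction line should contain five space-separated pairs, with the last pair's word emptied so its probability mass stands in for everything outside the top 5.
# Illustrative check of the output format produced by prediction().
sample = prediction('the')
print(sample)
print(len(sample.split(' ')))  # expected: 5 pairs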
def create_outputs(folder_name):
    print(f'Creating outputs in {folder_name}')
    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
        with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
            for line in fid:
                separated = line.split('\t')
                # Column 7 holds the left context; feed only its last word to the model.
                prefix = separated[6].replace(r'\n', ' ').split()[-1]
                output_line = prediction(prefix)
                f.write(output_line + '\n')
create_outputs('dev-0')
create_outputs('test-A')
Creating outputs in dev-0
Creating outputs in test-A
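As a small added sketch (not part of the original run), the line counts of the generated files can be compared against the compressed inputs:
# Sketch: each out.tsv should contain exactly one prediction line per input line.
for folder in ('dev-0', 'test-A'):
    with lzma.open(f'{folder}/in.tsv.xz', mode='rt', encoding='utf-8') as fin:
        n_in = sum(1 for _ in fin)
    with open(f'{folder}/out.tsv', encoding='utf-8') as fout:
        n_out = sum(1 for _ in fout)
    print(folder, n_in, n_out, n_in == n_out)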
# Reload the tokenizer and model (this time on the CPU) for the exploratory cells below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
def get_words_from_line(line):
  line = line.rstrip()
  yield '<s>'
  for t in line.split():
    yield t
  yield '</s>'


def get_word_lines_from_file(file_name):
  with lzma.open(file_name, encoding='utf8', mode="rt") as fh:
    for line in fh:
      # Drop corpus artefacts: '^^' markers, raw newlines, escaped backslashes
      # and angle/round brackets.
      pattern = r'\^\^|\n|\\\\|[<>]|[()]'
      line = re.sub(pattern, '', line)
      yield line
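To see what the cleaning regex leaves behind, a small illustrative peek (islice is already imported above):
# Illustrative peek (not in the original run): print the start of the first cleaned training line.
for cleaned in islice(get_word_lines_from_file("train/in.tsv.xz"), 1):
    print(cleaned[:200])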

# Exploration: run GPT-2 on the first cleaned training line and inspect the shapes
# of the top-20 candidates per position (the break stops after the first line, so
# the detailed printout below it does not run).
for line in get_word_lines_from_file("train/in.tsv.xz"):
    left_context = str(line)
    input_ids = tokenizer.encode(left_context, return_tensors="pt")
    output = model(input_ids)
    prob_dist = torch.softmax(output[0][-1], dim=1)
    values, index = prob_dist.topk(20)
    print(left_context[-100:])
    print(values.size())
    print(index.size())
    break
    for x, indx in zip(values, index):
        for i in range(20):
            token = tokenizer.decode(indx[i])
            print(f'{x[i]} {indx[i]} {token}')
        print('-------------------------')
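The corpus lines above can be longer than GPT-2's 1024-position window, so a truncation step may be needed before the forward pass. A minimal sketch, assuming one simply keeps the most recent tokens (the helper name and the 1000-token cutoff are illustrative, not from the notebook):
# Sketch: keep only the last tokens of a long context so it fits into GPT-2.
def encode_truncated(text, max_tokens=1000):
    ids = tokenizer.encode(text, return_tensors="pt")
    return ids[:, -max_tokens:]  # shape (1, <=max_tokens)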

# Exploration: top-5 continuations of the single word "he".
left_context = "he"
input_ids = tokenizer.encode(left_context, return_tensors="pt")
output = model(input_ids)
prob_dist = torch.softmax(output[0][-1], dim=1)
values, index = prob_dist.topk(5)
tokens = [tokenizer.decode(x) for x in index[-1]]
for x, token in zip(values[-1], tokens):
    print(f'{x}  {token}')
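For comparison, continuations can also be eyeballed with greedy decoding; this call is an added sketch, not part of the original notebook:
# Sketch: greedily generate five more tokens after the same "he" context.
gen_ids = model.generate(input_ids, max_new_tokens=5, do_sample=False,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(gen_ids[0]))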
# Exploration: for each dev-0 context, print its last 100 characters and the
# top-20 candidate continuations with their probabilities (contexts longer than
# GPT-2's 1024-token window are not truncated here).
for line in get_word_lines_from_file("dev-0/in.tsv.xz"):
    left_context = str(line)
    input_ids = tokenizer.encode(left_context, return_tensors="pt")
    output = model(input_ids)
    prob_dist = torch.softmax(output[0][-1], dim=1)
    values, index = prob_dist.topk(20)
    print(left_context[-100:])
    for x, indx in zip(values[-1], index[-1]):
        token = tokenizer.decode(indx)
        print(f'{x} {indx} {token}')
    print('-------------------------')
# Scratch: poking at the tokenizer and the probability tensors from the cells above.
token = tokenizer.decode(256)
print(token)
top = prob_dist.topk(20)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = tokenizer.decode(top_indices)
top_indices[0]
top_probs[0]
print(top_words, '\n', top_indices, '\n', top_probs)
print(index[1])
print(prob_dist.topk(2)[0].size())