aitech-eks-pub/cw/15_similarity_search.ipynb

9.3 KiB

Logo 1

Ekstrakcja informacji

15. Similarity search [ćwiczenia]

Jakub Pokrywka (2021)

Logo 2

from transformers import T5Tokenizer, T5ForConditionalGeneration
# Demo prompts for the pretrained T5 checkpoint. The first assignment is
# overwritten right away, so only the "summarize:" prompt reaches the model;
# the translation prompt is kept as an example of another prefix.
text = "translate English to French: My name is Azeem and I live in India"
text = "summarize: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks. For simple tasks assigned to computers, it is possible to program algorithms telling the machine how to execute all steps required to solve the problem at hand; on the computer's part, no learning is needed. For more advanced tasks, it can be challenging for a human to manually create the needed algorithms. In practice, it can turn out to be more effective to help the machine develop its own algorithm, rather than having human programmers specify every needed step."
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing inside .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = T5Tokenizer.from_pretrained('t5-small')

model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True).to(device)


# You can also use "translate English to French" and "translate English to Romanian"
# The input tensor is moved to wherever the model lives so both always match.
input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)  # Batch size 1

outputs = model.generate(input_ids)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded)
# Bare expression: shows the model when this is run as a notebook cell.
model
KLEISTER_PATH = '/media/kuba/ssdsam/Syncthing/Syncthing/przedmioty/2020-02/IE/applica/kleister-nda/'


def _read_jurisdiction_targets(path):
    """Return one 'jurisdiction: <value>' target string per line of *path*.

    Every line is split on single spaces into fields. The first field that
    starts with 'jurisdiction=' supplies the value; lines without such a
    field yield 'jurisdiction: NONE'.
    """
    targets = []
    # The context manager closes the file even if parsing raises.
    with open(path, encoding='utf-8') as expected_f:
        for line in expected_f:
            value = 'NONE'
            for elem in line.strip('\n').split(' '):
                if elem.startswith('jurisdiction='):
                    # maxsplit=1 keeps any '=' that is part of the value.
                    value = elem.split('=', 1)[1]
                    break
            targets.append('jurisdiction: ' + value)
    return targets


train_exp = _read_jurisdiction_targets(KLEISTER_PATH + 'train/expected.tsv')
dev_exp = _read_jurisdiction_targets(KLEISTER_PATH + 'dev-0/expected.tsv')
# Bare expression: shows the training targets when run as a notebook cell.
train_exp
def _read_lines(path):
    """Return the lines of *path*, each with its trailing newline removed."""
    # The context manager closes the file as soon as all lines are read.
    with open(path, encoding='utf-8') as in_f:
        return [line.rstrip('\n') for line in in_f]


train_in = _read_lines(KLEISTER_PATH + 'train/in.tsv')
dev_in = _read_lines(KLEISTER_PATH + 'dev-0/in.tsv')
# Bare expression: shows the first training input when run as a notebook cell.
train_in[0]
# Bare expression: shows the device that holds the model in a notebook cell.
model.device
# Named `document` rather than `input` so the builtin input() stays usable.
document = train_in[0]

# Let the tokenizer truncate to 512 tokens: slicing the id tensor afterwards
# would cut off the end-of-sequence token for long documents.
input_ids = tokenizer(
    document, return_tensors="pt", truncation=True, max_length=512
).input_ids.to(model.device)  # Batch size 1

outputs = model.generate(input_ids)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded)
# Single-example loss computation: encode a source sentence and its target,
# placing both tensors on the model's own device rather than a hard-coded one.
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids.to(model.device)
labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids.to(model.device)
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
# Bare expression: shows the loss tensor when run as a notebook cell.
loss
# transformers.AdamW is deprecated in favour of the PyTorch implementation and
# is no longer importable from recent transformers releases.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW (1e-6 and 0.0) so the optimisation behaves as before;
# torch.optim.AdamW would otherwise use 1e-8 and 0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
model.train()
# One pass over the training set, one (document, target) pair per update step.
for line_in, line_exp in zip(train_in, train_exp):
    # Truncate inside the tokenizer so the end-of-sequence token is kept even
    # when the document is longer than 512 tokens.
    input_ids = tokenizer(
        line_in, return_tensors='pt', truncation=True, max_length=512
    ).input_ids.to(model.device)
    labels = tokenizer(line_exp, return_tensors='pt').input_ids.to(model.device)
    # the forward function automatically creates the correct decoder_input_ids
    loss = model(input_ids=input_ids, labels=labels).loss
    loss.backward()
    optimizer.step()
    # Clear the gradients so they do not accumulate into the next step.
    optimizer.zero_grad()
    print(loss.item())
# Switch the model to evaluation mode before generating predictions.
model.eval()
# Named `document` rather than `input` so the builtin input() stays usable.
document = dev_in[0]

# Let the tokenizer truncate to 512 tokens: slicing the id tensor afterwards
# would cut off the end-of-sequence token for long documents.
input_ids = tokenizer(
    document, return_tensors="pt", truncation=True, max_length=512
).input_ids.to(model.device)  # Batch size 1

outputs = model.generate(input_ids)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded)
# Bare expression: shows the expected target for comparison in a notebook cell.
dev_exp[0]
# Named `document` rather than `input` so the builtin input() stays usable.
document = dev_in[2]

# Let the tokenizer truncate to 512 tokens: slicing the id tensor afterwards
# would cut off the end-of-sequence token for long documents.
input_ids = tokenizer(
    document, return_tensors="pt", truncation=True, max_length=512
).input_ids.to(model.device)  # Batch size 1

outputs = model.generate(input_ids)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded)
# Bare expression: shows the expected target for comparison in a notebook cell.
dev_exp[2]

pytanie:

  • co można poprawić w istniejącym rozwiązaniu?