190 KiB
190 KiB
Importy i sprawdzenie GPU
import datasets
import torch
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import json
import torch
is_gpu_available = torch.cuda.is_available()
print("Available GPU", is_gpu_available)
Available GPU True
Zbiór danych
Sprawdź dostępne config dla zbioru
datasets.get_dataset_config_names("subjqa")
['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']
# Select the TripAdvisor domain of SubjQA and print its metadata.
selected_config = 'tripadvisor'
# The builder's `info` carries the description, feature schema and split sizes printed below.
ds_builder = datasets.load_dataset_builder("subjqa", selected_config)
print(ds_builder.info.description, "\n")
print("FEATURES", ds_builder.info.features, "\n")
print("SPLITS", ds_builder.info.splits, "\n")
SubjQA is a question answering dataset that focuses on subjective questions and answers. The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery, electronics, TripAdvisor (i.e. hotels), and restaurants. FEATURES {'domain': Value(dtype='string', id=None), 'nn_mod': Value(dtype='string', id=None), 'nn_asp': Value(dtype='string', id=None), 'query_mod': Value(dtype='string', id=None), 'query_asp': Value(dtype='string', id=None), 'q_reviews_id': Value(dtype='string', id=None), 'question_subj_level': Value(dtype='int64', id=None), 'ques_subj_score': Value(dtype='float32', id=None), 'is_ques_subjective': Value(dtype='bool', id=None), 'review_id': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None), 'answer_subj_level': Value(dtype='int64', id=None), 'ans_subj_score': Value(dtype='float32', id=None), 'is_ans_subjective': Value(dtype='bool', id=None)}, length=-1, id=None)} SPLITS {'train': SplitInfo(name='train', num_bytes=1574953, num_examples=1165, dataset_name='subjqa'), 'test': SplitInfo(name='test', num_bytes=689440, num_examples=512, dataset_name='subjqa'), 'validation': SplitInfo(name='validation', num_bytes=312577, num_examples=230, dataset_name='subjqa')}
subjqa_tripadv = datasets.load_dataset("subjqa", selected_config)
Reusing dataset subjqa (/home/karo/.cache/huggingface/datasets/subjqa/tripadvisor/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd)
0%| | 0/3 [00:00<?, ?it/s]
# Repeat the metadata inspection for the restaurants domain.
# NOTE: this rebinds `selected_config` and `ds_builder` from the TripAdvisor cell.
selected_config = 'restaurants'
ds_builder = datasets.load_dataset_builder("subjqa", selected_config)
print(ds_builder.info.description, "\n")
print("FEATURES", ds_builder.info.features, "\n")
print("SPLITS", ds_builder.info.splits, "\n")
SubjQA is a question answering dataset that focuses on subjective questions and answers. The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery, electronics, TripAdvisor (i.e. hotels), and restaurants. FEATURES {'domain': Value(dtype='string', id=None), 'nn_mod': Value(dtype='string', id=None), 'nn_asp': Value(dtype='string', id=None), 'query_mod': Value(dtype='string', id=None), 'query_asp': Value(dtype='string', id=None), 'q_reviews_id': Value(dtype='string', id=None), 'question_subj_level': Value(dtype='int64', id=None), 'ques_subj_score': Value(dtype='float32', id=None), 'is_ques_subjective': Value(dtype='bool', id=None), 'review_id': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None), 'answer_subj_level': Value(dtype='int64', id=None), 'ans_subj_score': Value(dtype='float32', id=None), 'is_ans_subjective': Value(dtype='bool', id=None)}, length=-1, id=None)} SPLITS {'train': SplitInfo(name='train', num_bytes=1823263, num_examples=1400, dataset_name='subjqa'), 'test': SplitInfo(name='test', num_bytes=335385, num_examples=266, dataset_name='subjqa'), 'validation': SplitInfo(name='validation', num_bytes=349286, num_examples=267, dataset_name='subjqa')}
subjqa_restaurant = datasets.load_dataset("subjqa", selected_config)
Reusing dataset subjqa (/home/karo/.cache/huggingface/datasets/subjqa/restaurants/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd)
0%| | 0/3 [00:00<?, ?it/s]
Połącz zbiory restaurants oraz tripadvisor; w każdym z nich split test dołącz do train, ponieważ osobny zbiór testowy nie jest potrzebny
def concat_test2train(dataset):
    """Fold the ``test`` split into ``train`` and remove ``test``.

    The mapping passed in is modified in place and is also returned.
    Rows from ``test`` are placed before the original ``train`` rows.
    """
    merged_train = datasets.concatenate_datasets(
        [dataset['test'], dataset['train']]
    )
    dataset['train'] = merged_train
    del dataset['test']
    return dataset
# Fold each domain's test split into its train split.
subjqa_restaurant = concat_test2train(subjqa_restaurant)
subjqa_tripadv = concat_test2train(subjqa_tripadv)
# Merge both domains into a single DatasetDict; TripAdvisor rows come first in each split.
subjqa = datasets.DatasetDict({
    'train': datasets.concatenate_datasets([subjqa_tripadv['train'], subjqa_restaurant['train']]),
    'validation': datasets.concatenate_datasets([subjqa_tripadv['validation'], subjqa_restaurant['validation']]),
})
# Notebook display of the combined dataset.
subjqa
DatasetDict({ train: Dataset({ features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'], num_rows: 3343 }) validation: Dataset({ features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'], num_rows: 497 }) })
print(subjqa["train"]['question'][0])
print(subjqa["train"]["answers"][0])
Is a cramped room? {'text': ['Hotel rooms were very small', 'rooms were very small'], 'answer_start': [12, 18], 'answer_subj_level': [2, 2], 'ans_subj_score': [0.5199999809265137, 0.5199999809265137], 'is_ans_subjective': [True, True]}
Liczba unikalnych pytań w każdym splicie
# Flatten nested columns (e.g. `answers.text`) and convert every split to a DataFrame.
dfs = {split: dset.to_pandas() for split, dset in subjqa.flatten().items()}
# Report the number of distinct `id` values per split.
for split, df in dfs.items():
    print(f"Number of questions in {split}: {df['id'].nunique()}")
Number of questions in train: 3343 Number of questions in validation: 497
# Columns relevant to extractive QA, using the flattened `answers.*` names.
qa_cols = ["id", "title", "question", "answers.text",
           "answers.answer_start", "context"]
# Draw two reproducible sample rows (fixed random_state) for inspection.
sample_df = dfs["train"][qa_cols].sample(2, random_state=7)
sample_df
id | title | question | answers.text | answers.answer_start | context | |
---|---|---|---|---|---|---|
2544 | 0d78196c65a3bec3913e2291cd9b771f | Yl2TN9c23ZGLUBSD9ks5Uw | Is it present ? | [so it's a bit of a splurge meal] | [177] | Byblos has a really beautiful decor, that I wo... |
884 | 57b688d836d9502a076fe6a50a9cde81 | usa_san francisco_hotel_adagio | Where can I locate the hotel staff? | [] | [] | Stayed at the Adagio for 4 nights end of June-... |
Sprawdzenie generowania odpowiedzi z kontekstu
# Sanity check: slicing the context at answer_start with the answer's length
# should reproduce the answer text of the first sampled row.
start_idx = sample_df["answers.answer_start"].iloc[0][0]
end_idx = start_idx + len(sample_df["answers.text"].iloc[0][0])
sample_df["context"].iloc[0][start_idx:end_idx]
"so it's a bit of a splurge meal"
Usuń przykłady, które nie posiadają odpowiedzi
# Keep only examples that have at least one answer text.
subjqa['train'] = subjqa['train'].filter(lambda example: len(example["answers"]['text']) > 0)
subjqa['validation'] = subjqa['validation'].filter(lambda example: len(example["answers"]['text']) > 0)
# Rebuild the DataFrames so later analysis uses the filtered data.
dfs = {split: dset.to_pandas() for split, dset in subjqa.flatten().items()}
subjqa
0%| | 0/2 [00:00<?, ?ba/s]
0%| | 0/1 [00:00<?, ?ba/s]
DatasetDict({ train: Dataset({ features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'], num_rows: 1666 }) validation: Dataset({ features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'], num_rows: 265 }) })
dfs['validation'].head()
domain | nn_mod | nn_asp | query_mod | query_asp | q_reviews_id | question_subj_level | ques_subj_score | is_ques_subjective | review_id | id | title | context | question | answers.text | answers.answer_start | answers.answer_subj_level | answers.ans_subj_score | answers.is_ans_subjective | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | tripadvisor | excellent | hotel | friendly | hotel | b96b8478f5202ac9534eaf75167016f7 | 1 | 0.0 | False | tripadvisor_review_1509 | d1c352b70d1225245569a0a1acbf5e04 | usa_san francisco_argonaut_hotel_a_kimpton_hotel | Great setting at the end of the wharf (so your... | How was the hotel? | [excellent hotels] | [527] | [1] | [1.0] | [True] |
1 | tripadvisor | neat | hotel | cozy | hotel | ca174c824baba906d30eed207350e37e | 1 | 0.0 | False | tripadvisor_review_6133 | b6fbf58ca273ad8f9b47a9be6a36e707 | usa_san francisco_best_western_tuscan_inn_fish... | My wife and I took two trips to San Fran in 20... | How is the hotel? | [The hotel location was great] | [129] | [1] | [0.75] | [True] |
2 | tripadvisor | excellent | value for money | good | value for money | 5d41bfd0e2166e14eeae1c1be9085555 | 3 | 0.0 | False | tripadvisor_review_4872 | ea95bccdd762284ad7040be8d016da4f | usa_san francisco_castle_inn | Yep, I have to agree with all those folks who ... | Is it value for money? | [And very reasonably priced. Overall, excellen... | [501] | [3] | [0.58] | [True] |
3 | tripadvisor | convenient | place | safe | hotel | a5880d95aa1161cfaac48584ee58934d | 1 | 0.6 | True | tripadvisor_review_4715 | 413df1095d03e4f967d349f0490a2514 | usa_san francisco_castle_inn | On first sight the Castle hotel is not great, ... | Does the hotel offer good service? | [On first sight the Castle hotel is not great] | [0] | [1] | [0.5416667] | [True] |
4 | tripadvisor | helpful | staff | helpfull | staff | 902b8e2a5c10f1796abdc830c4a4acd2 | 2 | 0.0 | False | tripadvisor_review_1093 | d7fdc86b464f2a797dca33b058b68078 | usa_san francisco_chancellor_hotel_on_union_sq... | Stayed at the Chancellor recently for 3 nights... | How do you rate the staff? | [Staff very helpful] | [86] | [2] | [0.3] | [False] |
Sprawdzenie typów pytań
def show_question_types_counts(dfs_split):
    """Plot how many questions in a split start with each common question word.

    Args:
        dfs_split: Key of the split in the module-level ``dfs`` mapping
            (e.g. ``"train"`` or ``"validation"``).

    Raises:
        KeyError: If ``dfs_split`` is not a key of ``dfs`` or the frame has
            no ``question`` column. (Previously a bare ``except`` swallowed
            this and silently plotted zeros.)
    """
    question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]
    questions = dfs[dfs_split]["question"]
    # Summing the boolean mask gives 0 when nothing matches, so no exception
    # handling is needed; na=False keeps missing questions out of the count.
    # NOTE: prefixes are not mutually exclusive ("Do" also matches "Does...").
    counts = {
        q: int(questions.str.startswith(q, na=False).sum())
        for q in question_types
    }
    pd.Series(counts).sort_values().plot.barh()
    plt.title("Frequency of Question Types " + dfs_split)
    plt.show()
show_question_types_counts("train")
show_question_types_counts("validation")
Pytania i ilości odpowiedzi
def show_answers_counts(dfs_split):
    """Plot, for one split, how many questions have each number of answers.

    ``dfs_split`` is a key of the module-level ``dfs`` mapping.
    """
    answer_texts = dfs[dfs_split]["answers.text"]
    # Group the rows by the number of answers each one has.
    answers_per_question = dfs[dfs_split]["answers.text"].str.len()
    counts = answer_texts.groupby(answers_per_question).count().to_dict()
    pd.Series(counts).sort_values().plot.barh()
    plt.title("Number of Answers " + dfs_split)
    plt.show()
show_answers_counts("train")
show_answers_counts("validation")
Przykładowe pytania
# Print three reproducibly sampled training questions for each of a few question words.
for question_type in ["How", "What", "Is"]:
    for question in (
        dfs["train"][dfs["train"].question.str.startswith(question_type)]
        .sample(n=3, random_state=42)['question']):
        print(question)
How was the service staff? How is the stay? How tastefully decorated was the room? What is the customer service? What do you think about dinner? What is the most expensive price of food? Is it service ? Is it location ? Is it a good place to stay?
Preprocessing
Tokenizacja
model_ckpt = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer.is_fast
True
def compute_input_length(row):
    """Return the number of token ids for a row's question/context pair.

    Uses the module-level ``tokenizer`` with its default settings.
    """
    encoded = tokenizer(row["question"], row["context"])
    return len(encoded["input_ids"])
# Token count of every question-context pair in the training split.
dfs["train"]["n_tokens"] = dfs["train"].apply(compute_input_length, axis=1)
fig, ax = plt.subplots()
dfs["train"]["n_tokens"].hist(bins=100, grid=False, ec="C0", ax=ax)
plt.xlabel("Number of tokens in question-context pair")
# Mark 384, the default max_length of the preprocess_* functions in this notebook.
ax.axvline(x=384, ymin=0, ymax=1, linestyle="--", color="C1",
           label="Maximum sequence length")
plt.legend()
plt.ylabel("Count")
plt.show()
Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
Użycie stride w celu pozyskania okna z kontekstem i okrojenia wielkości tekstu, prezentacja działania
# Demonstrate windowed tokenization on four training examples.
context = subjqa["train"][2:6]["context"]
question = subjqa["train"][2:6]["question"]
# Only the context (second sequence) is truncated; overflowing tokens are
# returned as extra features, with `stride` set to 50.
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
# Decode every resulting feature to show what each window contains.
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))
[CLS] How many employees does the company have? [SEP] We had a great two night stay in Dec 2008. Hotel staff were very accomodating with our 18 month old - providing a cot, toy and stroller for him. The staff were friendly and upbeat. Our room was also upgraded. Our only dissappointment was the food at the downstairs restaurant - way too expensive and did not meet expectations. [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] My wife and I stayed here on our honeymoon for 2 nights in early March and had a great stay. I had booked a room at another hotel on hotels. com on Jan. 6th though. Then 4 days before we were to start our honeymoon in San Fran., hotels. com send me an e - mail to say they had to change my reservation from a hotel downtown to a hotel at the [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP]. com on Jan. 6th though. Then 4 days before we were to start our honeymoon in San Fran., hotels. com send me an e - mail to say they had to change my reservation from a hotel downtown to a hotel at the airport. I rufused and was given a refund and no hotel. There was a huge 30, 000 person convention in town at the same [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] - mail to say they had to change my reservation from a hotel downtown to a hotel at the airport. I rufused and was given a refund and no hotel. There was a huge 30, 000 person convention in town at the same time so all the hotels were full. I got lucky though and was able to book a bay view room at The Argonaut. This was the best [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP]nd and no hotel. There was a huge 30, 000 person convention in town at the same time so all the hotels were full. I got lucky though and was able to book a bay view room at The Argonaut. This was the best thing that could of happened to us! This place was amazing! 
We arrived at the hotel around 12 : 30pm and were able to check in right away [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] and was able to book a bay view room at The Argonaut. This was the best thing that could of happened to us! This place was amazing! We arrived at the hotel around 12 : 30pm and were able to check in right away! The staff were all so nice and called us by name whenever they saw us. The room was amazing! We had a beautiful view of Alcatraz [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP]! We arrived at the hotel around 12 : 30pm and were able to check in right away! The staff were all so nice and called us by name whenever they saw us. The room was amazing! We had a beautiful view of Alcatraz Island and could even see the golden gate. Internet worked great and the TV was fine. It's not a big fancy TV, but we didn'[SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] whenever they saw us. The room was amazing! We had a beautiful view of Alcatraz Island and could even see the golden gate. Internet worked great and the TV was fine. It's not a big fancy TV, but we didn't fly all the way to San Fran. to watch TV anyway. What made our stay so great was the concierge. There was a wonderful [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] and the TV was fine. It's not a big fancy TV, but we didn't fly all the way to San Fran. to watch TV anyway. What made our stay so great was the concierge. There was a wonderful young lady there that recommended 2 wonderful resturants for dinners and an amazing dim sum resturant for lunch. We enjoyed San Fran. way [SEP] [CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] TV anyway. What made our stay so great was the concierge. 
There was a wonderful young lady there that recommended 2 wonderful resturants for dinners and an amazing dim sum resturant for lunch. We enjoyed San Fran. way more then we ever thought we would and we owe that to The Argonaut Hotel and there staff. We can't wait to come visit again. [SEP] [CLS] How is the employee service on this hotel? [SEP] Spent one night at The Argonaut and wish that we could have stayed longer. We even missed the free wine hour but what the heck, the staff was so pleasant and not in a superficial,'I don't mean it'kind of way. Valet was on top of their game and was very helpful with directions or anything we might need. The roofm was super clean and I was looking for dirt in the areas [SEP] [CLS] How is the employee service on this hotel? [SEP] a superficial,'I don't mean it'kind of way. Valet was on top of their game and was very helpful with directions or anything we might need. The roofm was super clean and I was looking for dirt in the areas that sometimes don't get the best housekeeping job. I couldn't find as much as a crumb! We were on the fourth floor in a two queen bedded room [SEP] [CLS] How is the employee service on this hotel? [SEP] was super clean and I was looking for dirt in the areas that sometimes don't get the best housekeeping job. I couldn't find as much as a crumb! We were on the fourth floor in a two queen bedded room as we were traveling with our two children. We were upgraded to a Cannery / Alcatraz view because of our membership in Kimpton's In Touch program but it was really just [SEP] [CLS] How is the employee service on this hotel? [SEP] were on the fourth floor in a two queen bedded room as we were traveling with our two children. We were upgraded to a Cannery / Alcatraz view because of our membership in Kimpton's In Touch program but it was really just a view of the Mexican restaurant below unless you really, really cranked your head. Still nice to receive. 
We also had a note welcoming us and a big bottle of water but didn [SEP] [CLS] How is the employee service on this hotel? [SEP] Kimpton's In Touch program but it was really just a view of the Mexican restaurant below unless you really, really cranked your head. Still nice to receive. We also had a note welcoming us and a big bottle of water but didn't drink it because it didn't say it was complementary. Overall a great experience and I wouldn't hesitate to return whenever I am in the San Francisco area. Great decor [SEP] [CLS] How is the employee service on this hotel? [SEP] a note welcoming us and a big bottle of water but didn't drink it because it didn't say it was complementary. Overall a great experience and I wouldn't hesitate to return whenever I am in the San Francisco area. Great decor. This was our first stay in a Kimtpon property but I am going to seek them out from now on. [SEP] [CLS] How is attraction? [SEP] This is a great hotel! I loved the nautical decoration! The room was immaculate, very comfortable and we didn't have any issues with noise. The location is great, especially for a quick stay as you are close to the tourist attractions. There are countless kiosks in the area for different tours and attractions. We took the cable car tour around San Fran and over the Golden Gate Bridge. The ferry to Alcatraz is a 10 - [SEP] [CLS] How is attraction? [SEP] you are close to the tourist attractions. There are countless kiosks in the area for different tours and attractions. We took the cable car tour around San Fran and over the Golden Gate Bridge. The ferry to Alcatraz is a 10 - 15 minute walk away from the hotel. There are loads of restaurants nearby. The restaurant attached to the hotel was great for breakfast. I would definitely recommend this hotel. [SEP]
print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")
The 4 examples gave 18 features. Here is where each comes from: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3].
# Demo: derive start/end token labels for each window produced above.
answers = subjqa["train"][2:6]["answers"]
start_positions = []
end_positions = []
for i, offset in enumerate(inputs["offset_mapping"]):
    # Map the window back to the example it was cut from.
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    sequence_ids = inputs.sequence_ids(i)
    answer = answers[sample_idx]
    # Find start and end of context
    context_start = sequence_ids.index(1)
    # No padding is requested in this demo, so the final token is the closing
    # special token and the one before it is the last context token.
    context_end = len(sequence_ids) - 2
    # Despite the names, these hold token indices; (0, 0) means "no answer in this window".
    start_char = 0
    end_char = 0
    if len(answer['answer_start']) > 0:
        for idx, answer_start in enumerate(answer['answer_start']):
            tmp_start_char = answer_start
            tmp_end_char = answer_start + len(answer['text'][idx])
            # Answer inside context
            if offset[context_start][0] <= tmp_start_char and offset[context_end][1] >= tmp_end_char:
                # Walk forward to the last token starting at or before the answer start.
                idx_c = context_start
                while idx_c <= context_end and offset[idx_c][0] <= tmp_start_char:
                    idx_c += 1
                start_char = idx_c - 1
                # Walk backward to the first token ending at or after the answer end.
                idx_c = context_end
                while idx_c >= context_start and offset[idx_c][1] >= tmp_end_char:
                    idx_c -= 1
                end_char = idx_c + 1
                # Use the first answer that fits entirely in this window.
                break
    start_positions.append(start_char)
    end_positions.append(end_char)
print(start_positions)
print(end_positions)
[31, 0, 0, 0, 0, 0, 0, 0, 0, 61, 41, 0, 0, 0, 0, 0, 70, 27] [31, 0, 0, 0, 0, 0, 0, 0, 0, 72, 45, 0, 0, 0, 0, 0, 70, 27]
# Verify the labels of the first feature by decoding the labelled token span.
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]
start = start_positions[idx]
end = end_positions[idx]
# The end label is inclusive, hence `end + 1` in the slice.
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start: end + 1])
print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
# Drop the demo encoding; the preprocessing functions build their own.
del inputs
Theoretical answer: 18, labels give: 18
Przygotowanie do stride
def preprocess_train_data(examples, max_length=384, stride=128):
    """Tokenize a batch of QA examples into padded windows with span labels.

    Each question/context pair is split into one or more features of
    ``max_length`` tokens (only the context is truncated; overflowing tokens
    become additional features). For every feature, the token indices of the
    first answer that lies entirely inside that window are stored as
    ``start_positions`` / ``end_positions``; features containing no complete
    answer are labelled ``(0, 0)``.

    Args:
        examples: Batch with ``"question"``, ``"context"`` and ``"answers"``
            columns, where each answers entry has ``"text"`` and
            ``"answer_start"`` lists.
        max_length: Number of tokens per feature (features are padded to it).
        stride: Passed to the tokenizer as ``stride`` for overflowing windows.

    Returns:
        The tokenizer output, extended with ``start_positions`` and
        ``end_positions`` (one entry per feature).
    """
    questions = [q.strip() for q in examples["question"]]
    # Debug aid: shows which tokenizer the module-level name is bound to.
    print(tokenizer.__class__.__name__)
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    for i, offset in enumerate(inputs["offset_mapping"]):
        sample_idx = inputs["overflow_to_sample_mapping"][i]
        sequence_ids = inputs.sequence_ids(i)
        answer = answers[sample_idx]

        # Locate the context tokens (sequence id 1). Features are padded to
        # max_length, so the last context token is NOT at a fixed distance
        # from the end; `len(sequence_ids) - 2` would land on a pad token
        # (offset (0, 0)) and make every padded feature look answerless.
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # (0, 0) marks "no complete answer inside this window".
        start_token = 0
        end_token = 0
        for answer_start, answer_text in zip(answer["answer_start"], answer["text"]):
            answer_end = answer_start + len(answer_text)
            # Skip answers that are not fully contained in this window.
            if offset[context_start][0] > answer_start or offset[context_end][1] < answer_end:
                continue
            # Last token that starts at or before the answer's first character.
            idx_c = context_start
            while idx_c <= context_end and offset[idx_c][0] <= answer_start:
                idx_c += 1
            start_token = idx_c - 1
            # First token that ends at or after the answer's last character.
            idx_c = context_end
            while idx_c >= context_start and offset[idx_c][1] >= answer_end:
                idx_c -= 1
            end_token = idx_c + 1
            # Use the first answer that fits.
            break
        start_positions.append(start_token)
        end_positions.append(end_token)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
# Build the training features; the original columns are dropped because one
# example may expand into several features.
train_dataset = subjqa["train"].map(
    preprocess_train_data,
    batched=True,
    remove_columns=subjqa["train"].column_names,
)
# Compare the number of examples with the number of features.
len(subjqa["train"]), len(train_dataset)
0%| | 0/2 [00:00<?, ?ba/s]
(1666, 2030)
def preprocess_validation_examples(examples, max_length=384, stride=128):
    """Tokenize a batch of QA examples into padded windows for evaluation.

    Besides the tokenizer output, every feature gets an ``example_id``
    pointing back to the source example, and its ``offset_mapping`` keeps
    character spans only for context tokens (all other positions become
    ``None``). ``overflow_to_sample_mapping`` is removed from the result.
    """
    stripped_questions = [text.strip() for text in examples["question"]]
    print(tokenizer.__class__.__name__)
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    inputs["example_id"] = [source_ids[example_idx] for example_idx in feature_to_example]

    for feature_idx in range(len(inputs["input_ids"])):
        seq_ids = inputs.sequence_ids(feature_idx)
        spans = inputs["offset_mapping"][feature_idx]
        # Keep spans only where the token belongs to the context (sequence id 1).
        inputs["offset_mapping"][feature_idx] = [
            span if seq_id == 1 else None
            for seq_id, span in zip(seq_ids, spans)
        ]

    return inputs
# Build the validation features (with example ids and masked offsets).
validation_dataset = subjqa["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=subjqa["validation"].column_names,
)
# Compare the number of examples with the number of features.
len(subjqa["validation"]), len(validation_dataset)
0%| | 0/1 [00:00<?, ?ba/s]
(265, 327)
Eksperymenty
from tqdm.auto import tqdm
import evaluate
import collections
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# SQuAD metric from `evaluate`; used by compute_metrics.
metric = evaluate.load("squad")
# Number of top start/end logits considered per feature in compute_metrics.
n_best = 3
# Maximum answer span length, in tokens, accepted by compute_metrics.
max_answer_length = 50
# NOTE(review): compute_metrics builds its own local list with this name;
# this global appears unused in the visible code.
predicted_answers = []
def compute_metrics(start_logits, end_logits, features, examples):
    """Turn per-feature logits into text answers and score them.

    For every example, the ``n_best`` highest start and end logits of each of
    its features are combined; spans that touch non-context tokens, run
    backwards, or exceed ``max_answer_length`` tokens are discarded. The span
    with the highest summed logit becomes the prediction (empty string when
    no span qualifies). Predictions are scored with the module-level
    ``metric``.

    Args:
        start_logits: Indexable by feature; start logits per token.
        end_logits: Indexable by feature; end logits per token.
        features: Tokenized features with ``"example_id"`` and
            ``"offset_mapping"`` (``None`` outside the context).
        examples: Original examples with ``"id"``, ``"context"``,
            ``"question"`` and ``"answers"``.

    Returns:
        Tuple ``(predicted_answers, theoretical_answers, metrics)``.
    """
    # Group feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best largest logits, best first.
            start_indexes = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
            end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    answers.append({
                        "text": context[offsets[start_index][0]: offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    })

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    # Examples without answers get a single empty reference at position 0.
    theoretical_answers = [
        {"id": ex["id"], "answers": {
            'text': ex["answers"]['text']
            if len(ex["answers"]['text']) != 0
            else [""],
            'answer_start': ex["answers"]["answer_start"]
            if len(ex["answers"]["answer_start"]) != 0
            else [0]
        }} for ex in examples
    ]
    metrics = metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Preview up to three predictions; bounded so that fewer than three
    # examples no longer raises IndexError.
    for i in range(min(3, len(predicted_answers))):
        print("QUESTION:\t", examples[i]['question'])
        print("PREDICTED:", predicted_answers[i]['prediction_text'])
        print("ACTUAL:", theoretical_answers[i]['answers']['text'])
    print(metrics)
    return predicted_answers, theoretical_answers, metrics
import numpy as np
def predict_from_trained(batch_size=8):
    """Run the module-level ``trained_model`` on the validation split and score it.

    The validation split is tokenized with ``preprocess_validation_examples``,
    fed through the model in batches without gradient tracking, and the
    collected logits are passed to ``compute_metrics`` (which prints a
    preview and the metric values).

    Args:
        batch_size: Number of features per forward pass.
    """
    eval_set = subjqa["validation"].map(
        preprocess_validation_examples,
        batched=True,
        remove_columns=subjqa["validation"].column_names,
    )
    # The model only accepts tensor inputs; the bookkeeping columns stay in
    # `eval_set` for post-processing.
    eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
    eval_set_for_model.set_format("torch")

    # Read every column once instead of once per batch.
    columns = {k: eval_set_for_model[k] for k in eval_set_for_model.column_names}

    all_start_logits = []
    all_end_logits = []
    # Stepping by batch_size covers every row exactly once and never yields an
    # empty trailing batch (the previous count did when num_rows was a
    # multiple of the batch size).
    for first_row in range(0, eval_set_for_model.num_rows, batch_size):
        batch = {
            k: column[first_row:first_row + batch_size].to(device)
            for k, column in columns.items()
        }
        with torch.no_grad():
            outputs = trained_model(**batch)
        all_start_logits.append(outputs.start_logits.cpu().numpy())
        all_end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(all_start_logits, axis=0)
    end_logits = np.concatenate(all_end_logits, axis=0)
    _ = compute_metrics(start_logits, end_logits, eval_set, subjqa["validation"])
distilbert-base-cased-distilled-squad trained
# Baseline: evaluate a checkpoint already fine-tuned for QA, without further training.
trained_checkpoint = "distilbert-base-cased-distilled-squad"
# NOTE: this rebinds the module-level `tokenizer` used by the preprocess_* functions.
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)
predict_from_trained()
Loading cached processed dataset at /home/karo/.cache/huggingface/datasets/subjqa/tripadvisor/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd/cache-bfb7935e995d8aee.arrow
0%| | 0/265 [00:00<?, ?it/s]
QUESTION: How was the hotel? PREDICTED: it is a quality 4-star hotel ACTUAL: ['excellent hotels'] QUESTION: How is the hotel? PREDICTED: The hotel is clean and neat ACTUAL: ['The hotel location was great'] QUESTION: Is it value for money? PREDICTED: excellent value for money ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.'] {'exact_match': 10.943396226415095, 'f1': 33.948637559395756}
BERT Cased (bert-base-cased) – fine-tuning
# Load the base checkpoint (`model_ckpt`) with a question-answering head for fine-tuning.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
# Rebind the module-level tokenizer to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] - This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
from transformers import TrainingArguments
from transformers import Trainer
# Fine-tuning configuration: 3 epochs, one checkpoint saved per epoch under
# `output_dir`, and no evaluation scheduled during training.
args = TrainingArguments(
    output_dir="data/bert-finetuned-subjqa",
    overwrite_output_dir=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision requires a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
)

# `train_dataset` / `validation_dataset` are the tokenized splits prepared
# earlier in the notebook; the tokenizer is passed so it is saved with each
# checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

# Kept as the last expression so the notebook displays the TrainOutput.
trainer.train()
Using cuda_amp half precision backend The following columns in the training set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, overflow_to_sample_mapping. If offset_mapping, overflow_to_sample_mapping are not expected by `BertForQuestionAnswering.forward`, you can safely ignore this message. /home/karo/nlp-project/venv/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn( ***** Running training ***** Num examples = 2030 Num Epochs = 3 Instantaneous batch size per device = 8 Total train batch size (w. parallel, distributed & accumulation) = 8 Gradient Accumulation steps = 1 Total optimization steps = 762
[ 2/762 : < :, Epoch 0.00/3]
Step | Training Loss |
---|
Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-254 Configuration saved in data/bert-finetuned-subjqa/checkpoint-254/config.json Model weights saved in data/bert-finetuned-subjqa/checkpoint-254/pytorch_model.bin tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-254/tokenizer_config.json Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-254/special_tokens_map.json Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-508 Configuration saved in data/bert-finetuned-subjqa/checkpoint-508/config.json Model weights saved in data/bert-finetuned-subjqa/checkpoint-508/pytorch_model.bin tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-508/tokenizer_config.json Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-508/special_tokens_map.json Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-762 Configuration saved in data/bert-finetuned-subjqa/checkpoint-762/config.json Model weights saved in data/bert-finetuned-subjqa/checkpoint-762/pytorch_model.bin tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-762/tokenizer_config.json Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-762/special_tokens_map.json Training completed. Do not forget to share your model on huggingface.co/models =)
TrainOutput(global_step=762, training_loss=0.7420042443463183, metrics={'train_runtime': 336.3364, 'train_samples_per_second': 18.107, 'train_steps_per_second': 2.266, 'total_flos': 1193472936391680.0, 'train_loss': 0.7420042443463183, 'epoch': 3.0})
# Run the fine-tuned model over the tokenized validation features. Only the
# raw predictions are needed; they unpack into start and end logits.
predictions = trainer.predict(validation_dataset).predictions
start_logits, end_logits = predictions

# `compute_metrics` is defined earlier in the notebook. Its return value is
# bound to `_` (presumably to keep the cell from echoing it).
_ = compute_metrics(
    start_logits,
    end_logits,
    validation_dataset,
    subjqa["validation"],
)
The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `BertForQuestionAnswering.forward`, you can safely ignore this message. ***** Running Prediction ***** Num examples = 327 Batch size = 8
[ 1/41 : < :]
0%| | 0/265 [00:00<?, ?it/s]
QUESTION: How was the hotel? PREDICTED: Great setting at the end of the wharf ACTUAL: ['excellent hotels'] QUESTION: How is the hotel? PREDICTED: My wife and I took two trips to San Fran in 2004 and stayed at the best western both times ACTUAL: ['The hotel location was great'] QUESTION: Is it value for money? PREDICTED: excellent value for money ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.'] {'exact_match': 13.584905660377359, 'f1': 30.577012885313252}
# Baseline: a RoBERTa checkpoint already fine-tuned on SQuAD 2.0, used as-is.
trained_checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)

# Load the weights first, then move the model to `device` (set earlier in the
# notebook).
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)
trained_model = trained_model.to(device)

# NOTE(review): `predict_from_trained` takes no arguments, so it presumably
# reads `tokenizer` and `trained_model` from the notebook globals - confirm.
predict_from_trained()
loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/vocab.json from cache at /home/karo/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05 loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/merges.txt from cache at /home/karo/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer.json from cache at None loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/added_tokens.json from cache at None loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/special_tokens_map.json from cache at 
/home/karo/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0 loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer_config.json from cache at /home/karo/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513 loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, 
"gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading weights file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/pytorch_model.bin from cache at /home/karo/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e All model checkpoint weights were used when initializing RobertaForQuestionAnswering. All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at deepset/roberta-base-squad2. 
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.
0%| | 0/1 [00:00<?, ?ba/s]
RobertaTokenizerFast
0%| | 0/265 [00:00<?, ?it/s]
QUESTION: How was the hotel? PREDICTED: No complaints - really happy with what this hotel offered for the price ACTUAL: ['excellent hotels'] QUESTION: How is the hotel? PREDICTED: clean and neat ACTUAL: ['The hotel location was great'] QUESTION: Is it value for money? PREDICTED: excellent value for money ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.'] {'exact_match': 11.69811320754717, 'f1': 37.040671268633204}
# Switch the notebook-level model and tokenizer to the RoBERTa SQuAD2
# checkpoint before re-tokenizing the data for fine-tuning.
model_ckpt = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize the training split, dropping the raw columns so that only the
# features produced by `preprocess_train_data` remain.
raw_train_split = subjqa["train"]
roberta_train_dataset = raw_train_split.map(
    preprocess_train_data,
    batched=True,
    remove_columns=raw_train_split.column_names,
)
# Number of raw examples vs. number of tokenized features.
print(len(raw_train_split), len(roberta_train_dataset))

# Same for the validation split, using the validation-specific preprocessing.
raw_validation_split = subjqa["validation"]
roberta_validation_dataset = raw_validation_split.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_validation_split.column_names,
)
print(len(raw_validation_split), len(roberta_validation_dataset))
loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading weights file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/pytorch_model.bin from cache at /home/karo/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e All model checkpoint weights were used when initializing RobertaForQuestionAnswering. All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at deepset/roberta-base-squad2. If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training. 
loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/vocab.json from cache at /home/karo/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05 loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/merges.txt from cache at /home/karo/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer.json from cache at None loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/added_tokens.json from cache at None loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/special_tokens_map.json from cache at 
/home/karo/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0 loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer_config.json from cache at /home/karo/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513 loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673 Model config RobertaConfig { "_name_or_path": "deepset/roberta-base-squad2", "architectures": [ "RobertaForQuestionAnswering" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, 
"gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "language": "english", "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "name": "Roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.21.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 }
0%| | 0/2 [00:00<?, ?ba/s]
RobertaTokenizerFast RobertaTokenizerFast 1666 1995
0%| | 0/1 [00:00<?, ?ba/s]
RobertaTokenizerFast 265 321
# Fine-tuning configuration for RoBERTa: 3 epochs, one checkpoint saved per
# epoch under `output_dir`, and no evaluation scheduled during training.
args = TrainingArguments(
    output_dir="data/roberta-finetuned-subjqa",
    overwrite_output_dir=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision requires a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
)

# Train on the RoBERTa-tokenized splits; the tokenizer is passed so it is
# saved with each checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_validation_dataset,
    tokenizer=tokenizer,
)

# Kept as the last expression so the notebook displays the TrainOutput.
trainer.train()
PyTorch: setting up devices The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). Using cuda_amp half precision backend The following columns in the training set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: offset_mapping, overflow_to_sample_mapping. If offset_mapping, overflow_to_sample_mapping are not expected by `RobertaForQuestionAnswering.forward`, you can safely ignore this message. /home/karo/nlp-project/venv/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn( ***** Running training ***** Num examples = 1995 Num Epochs = 3 Instantaneous batch size per device = 8 Total train batch size (w. parallel, distributed & accumulation) = 8 Gradient Accumulation steps = 1 Total optimization steps = 750
[ 2/750 : < :, Epoch 0.00/3]
Step | Training Loss |
---|
Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-250 Configuration saved in data/roberta-finetuned-subjqa/checkpoint-250/config.json Model weights saved in data/roberta-finetuned-subjqa/checkpoint-250/pytorch_model.bin tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-250/tokenizer_config.json Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-250/special_tokens_map.json Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-500 Configuration saved in data/roberta-finetuned-subjqa/checkpoint-500/config.json Model weights saved in data/roberta-finetuned-subjqa/checkpoint-500/pytorch_model.bin tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-500/tokenizer_config.json Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-500/special_tokens_map.json Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-750 Configuration saved in data/roberta-finetuned-subjqa/checkpoint-750/config.json Model weights saved in data/roberta-finetuned-subjqa/checkpoint-750/pytorch_model.bin tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-750/tokenizer_config.json Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-750/special_tokens_map.json Training completed. Do not forget to share your model on huggingface.co/models =)
TrainOutput(global_step=750, training_loss=0.4630465749104818, metrics={'train_runtime': 337.0904, 'train_samples_per_second': 17.755, 'train_steps_per_second': 2.225, 'total_flos': 1172895816798720.0, 'train_loss': 0.4630465749104818, 'epoch': 3.0})
# Run the fine-tuned RoBERTa model over the tokenized validation features.
# Only the raw predictions are needed; they unpack into start and end logits.
predictions = trainer.predict(roberta_validation_dataset).predictions
start_logits, end_logits = predictions

# `compute_metrics` is defined earlier in the notebook. Its return value is
# bound to `_` (presumably to keep the cell from echoing it).
_ = compute_metrics(
    start_logits,
    end_logits,
    roberta_validation_dataset,
    subjqa["validation"],
)
The following columns in the test set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`, you can safely ignore this message. ***** Running Prediction ***** Num examples = 321 Batch size = 8
[ 1/41 : < :]
0%| | 0/265 [00:00<?, ?it/s]
QUESTION: How was the hotel? PREDICTED: No complaints ACTUAL: ['excellent hotels'] QUESTION: How is the hotel? PREDICTED: The hotel location was great ACTUAL: ['The hotel location was great'] QUESTION: Is it value for money? PREDICTED: excellent value for money ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.'] {'exact_match': 29.81132075471698, 'f1': 46.72268395172683}