Karolina b3caefa329 Change cell to polish

2022-08-29 01:17:15 +02:00

190 KiB

Raw Permalink Blame History

Importy i sprawdzenie GPU

import datasets
import torch
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import json
import torch

is_gpu_available = torch.cuda.is_available()
print("Available GPU", is_gpu_available)

Available GPU True

Zbiór danych

Sprawdź dostępne konfiguracje (config) zbioru

datasets.get_dataset_config_names("subjqa")

['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

selected_config = 'tripadvisor'
ds_builder = datasets.load_dataset_builder("subjqa", selected_config)
print(ds_builder.info.description, "\n")
print("FEATURES", ds_builder.info.features, "\n")
print("SPLITS", ds_builder.info.splits, "\n")

SubjQA is a question answering dataset that focuses on subjective questions and answers.
The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,
electronics, TripAdvisor (i.e. hotels), and restaurants. 

FEATURES {'domain': Value(dtype='string', id=None), 'nn_mod': Value(dtype='string', id=None), 'nn_asp': Value(dtype='string', id=None), 'query_mod': Value(dtype='string', id=None), 'query_asp': Value(dtype='string', id=None), 'q_reviews_id': Value(dtype='string', id=None), 'question_subj_level': Value(dtype='int64', id=None), 'ques_subj_score': Value(dtype='float32', id=None), 'is_ques_subjective': Value(dtype='bool', id=None), 'review_id': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None), 'answer_subj_level': Value(dtype='int64', id=None), 'ans_subj_score': Value(dtype='float32', id=None), 'is_ans_subjective': Value(dtype='bool', id=None)}, length=-1, id=None)} 

SPLITS {'train': SplitInfo(name='train', num_bytes=1574953, num_examples=1165, dataset_name='subjqa'), 'test': SplitInfo(name='test', num_bytes=689440, num_examples=512, dataset_name='subjqa'), 'validation': SplitInfo(name='validation', num_bytes=312577, num_examples=230, dataset_name='subjqa')}

subjqa_tripadv = datasets.load_dataset("subjqa", selected_config)

Reusing dataset subjqa (/home/karo/.cache/huggingface/datasets/subjqa/tripadvisor/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd)

  0%|          | 0/3 [00:00<?, ?it/s]

selected_config = 'restaurants'
ds_builder = datasets.load_dataset_builder("subjqa", selected_config)
print(ds_builder.info.description, "\n")
print("FEATURES", ds_builder.info.features, "\n")
print("SPLITS", ds_builder.info.splits, "\n")

SubjQA is a question answering dataset that focuses on subjective questions and answers.
The dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,
electronics, TripAdvisor (i.e. hotels), and restaurants. 

FEATURES {'domain': Value(dtype='string', id=None), 'nn_mod': Value(dtype='string', id=None), 'nn_asp': Value(dtype='string', id=None), 'query_mod': Value(dtype='string', id=None), 'query_asp': Value(dtype='string', id=None), 'q_reviews_id': Value(dtype='string', id=None), 'question_subj_level': Value(dtype='int64', id=None), 'ques_subj_score': Value(dtype='float32', id=None), 'is_ques_subjective': Value(dtype='bool', id=None), 'review_id': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None), 'answer_subj_level': Value(dtype='int64', id=None), 'ans_subj_score': Value(dtype='float32', id=None), 'is_ans_subjective': Value(dtype='bool', id=None)}, length=-1, id=None)} 

SPLITS {'train': SplitInfo(name='train', num_bytes=1823263, num_examples=1400, dataset_name='subjqa'), 'test': SplitInfo(name='test', num_bytes=335385, num_examples=266, dataset_name='subjqa'), 'validation': SplitInfo(name='validation', num_bytes=349286, num_examples=267, dataset_name='subjqa')}

subjqa_restaurant = datasets.load_dataset("subjqa", selected_config)

Reusing dataset subjqa (/home/karo/.cache/huggingface/datasets/subjqa/restaurants/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd)

  0%|          | 0/3 [00:00<?, ?it/s]

Połącz zbiory restaurants oraz tripadvisor; w każdym z nich dołącz split test do train i usuń go, ponieważ osobny split testowy nie jest potrzebny

def concat_test2train(dataset):
    """Fold the ``test`` split into ``train`` and drop ``test``.

    The mapping passed in is modified in place and is also returned. In the
    merged ``train`` split the former ``test`` rows come first.
    """
    merged_train = datasets.concatenate_datasets(
        [dataset['test'], dataset['train']]
    )
    dataset['train'] = merged_train
    del dataset['test']
    return dataset


# Fold each domain's test split into its train split first.
subjqa_restaurant = concat_test2train(subjqa_restaurant)
subjqa_tripadv = concat_test2train(subjqa_tripadv)

# Stack the two domains split by split, TripAdvisor rows first.
subjqa = datasets.DatasetDict({
    split: datasets.concatenate_datasets(
        [subjqa_tripadv[split], subjqa_restaurant[split]]
    )
    for split in ('train', 'validation')
})

subjqa

DatasetDict({
    train: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 3343
    })
    validation: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 497
    })
})

print(subjqa["train"]['question'][0])
print(subjqa["train"]["answers"][0])

Is a cramped  room?
{'text': ['Hotel rooms were very small', 'rooms were very small'], 'answer_start': [12, 18], 'answer_subj_level': [2, 2], 'ans_subj_score': [0.5199999809265137, 0.5199999809265137], 'is_ans_subjective': [True, True]}

Liczba unikalnych pytań w każdym splicie

dfs = {split: dset.to_pandas() for split, dset in subjqa.flatten().items()}

for split, df in dfs.items():
    print(f"Number of questions in {split}: {df['id'].nunique()}")

Number of questions in train: 3343
Number of questions in validation: 497

qa_cols = ["id", "title", "question", "answers.text",
           "answers.answer_start", "context"]
sample_df = dfs["train"][qa_cols].sample(2, random_state=7)
sample_df

	id	title	question	answers.text	answers.answer_start	context
2544	0d78196c65a3bec3913e2291cd9b771f	Yl2TN9c23ZGLUBSD9ks5Uw	Is it present ?	[so it's a bit of a splurge meal]	[177]	Byblos has a really beautiful decor, that I wo...
884	57b688d836d9502a076fe6a50a9cde81	usa_san francisco_hotel_adagio	Where can I locate the hotel staff?	[]	[]	Stayed at the Adagio for 4 nights end of June-...

Sprawdzenie generowania odpowiedzi z kontekstu

start_idx = sample_df["answers.answer_start"].iloc[0][0]
end_idx = start_idx + len(sample_df["answers.text"].iloc[0][0])
sample_df["context"].iloc[0][start_idx:end_idx]

"so it's a bit of a splurge meal"

Usuń przykłady, które nie posiadają odpowiedzi

# Keep only the examples that carry at least one annotated answer text.
subjqa['train'] = subjqa['train'].filter(lambda example: len(example["answers"]['text']) > 0)
subjqa['validation'] = subjqa['validation'].filter(lambda example: len(example["answers"]['text']) > 0)
# Rebuild the per-split DataFrames so they reflect the filtered datasets.
dfs = {split: dset.to_pandas() for split, dset in subjqa.flatten().items()}
subjqa

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 1666
    })
    validation: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 265
    })
})

dfs['validation'].head()

	domain	nn_mod	nn_asp	query_mod	query_asp	q_reviews_id	question_subj_level	ques_subj_score	is_ques_subjective	review_id	id	title	context	question	answers.text	answers.answer_start	answers.answer_subj_level	answers.ans_subj_score	answers.is_ans_subjective
0	tripadvisor	excellent	hotel	friendly	hotel	b96b8478f5202ac9534eaf75167016f7	1	0.0	False	tripadvisor_review_1509	d1c352b70d1225245569a0a1acbf5e04	usa_san francisco_argonaut_hotel_a_kimpton_hotel	Great setting at the end of the wharf (so your...	How was the hotel?	[excellent hotels]	[527]	[1]	[1.0]	[True]
1	tripadvisor	neat	hotel	cozy	hotel	ca174c824baba906d30eed207350e37e	1	0.0	False	tripadvisor_review_6133	b6fbf58ca273ad8f9b47a9be6a36e707	usa_san francisco_best_western_tuscan_inn_fish...	My wife and I took two trips to San Fran in 20...	How is the hotel?	[The hotel location was great]	[129]	[1]	[0.75]	[True]
2	tripadvisor	excellent	value for money	good	value for money	5d41bfd0e2166e14eeae1c1be9085555	3	0.0	False	tripadvisor_review_4872	ea95bccdd762284ad7040be8d016da4f	usa_san francisco_castle_inn	Yep, I have to agree with all those folks who ...	Is it value for money?	[And very reasonably priced. Overall, excellen...	[501]	[3]	[0.58]	[True]
3	tripadvisor	convenient	place	safe	hotel	a5880d95aa1161cfaac48584ee58934d	1	0.6	True	tripadvisor_review_4715	413df1095d03e4f967d349f0490a2514	usa_san francisco_castle_inn	On first sight the Castle hotel is not great, ...	Does the hotel offer good service?	[On first sight the Castle hotel is not great]	[0]	[1]	[0.5416667]	[True]
4	tripadvisor	helpful	staff	helpfull	staff	902b8e2a5c10f1796abdc830c4a4acd2	2	0.0	False	tripadvisor_review_1093	d7fdc86b464f2a797dca33b058b68078	usa_san francisco_chancellor_hotel_on_union_sq...	Stayed at the Chancellor recently for 3 nights...	How do you rate the staff?	[Staff very helpful]	[86]	[2]	[0.3]	[False]

Sprawdzenie typów pytań

def show_question_types_counts(dfs_split):
    """Plot how many questions of a split start with each question word.

    Reads the module-level ``dfs`` mapping (split name -> DataFrame) and
    draws a horizontal bar chart of the counts, smallest first.

    Args:
        dfs_split: Key of the split in ``dfs``, e.g. ``"train"``.

    Raises:
        KeyError: If ``dfs_split`` is not a key of ``dfs`` or the frame has
            no ``"question"`` column. These used to be swallowed and shown
            as an all-zero chart.
    """
    question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]
    questions = dfs[dfs_split]["question"]

    # Summing the boolean mask gives 0 when nothing matches, so no exception
    # handling is needed for a prefix that never occurs.
    # NOTE: this is a plain prefix test, so "Do" also counts questions that
    # start with "Does".
    counts = {
        q: int(questions.str.startswith(q, na=False).sum())
        for q in question_types
    }
    pd.Series(counts).sort_values().plot.barh()
    plt.title("Frequency of Question Types " + dfs_split)
    plt.show()


show_question_types_counts("train")
show_question_types_counts("validation")

Liczba odpowiedzi przypadających na pytanie

def show_answers_counts(dfs_split):
    """Draw a horizontal bar chart of how many questions have N answers.

    Reads the module-level ``dfs`` mapping; ``dfs_split`` selects the split.
    Rows are grouped by the length of their ``"answers.text"`` entry.
    """
    answer_texts = dfs[dfs_split]["answers.text"]
    answers_per_question = dfs[dfs_split]["answers.text"].str.len()
    counts = answer_texts.groupby(answers_per_question).count().to_dict()

    ordered_counts = pd.Series(counts).sort_values()
    ordered_counts.plot.barh()
    plt.title("Number of Answers " + dfs_split)
    plt.show()


show_answers_counts("train")
show_answers_counts("validation")

Przykładowe pytania

# Print three reproducibly sampled training questions per question word.
for question_type in ["How", "What", "Is"]:
    starts_with_type = dfs["train"].question.str.startswith(question_type)
    sampled_rows = dfs["train"][starts_with_type].sample(n=3, random_state=42)
    for question in sampled_rows['question']:
        print(question)

How was the service staff?
How is the stay?
How tastefully decorated was the room?
What is the customer service?
What do you think about dinner?
What is the most expensive price of food?
Is it service ?
Is it location ?
Is it a good place to stay?

Preprocessing

Tokenizacja

model_ckpt = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer.is_fast

True

def compute_input_length(row):
    """Return the token count of a row's tokenized question/context pair.

    ``row`` must expose ``"question"`` and ``"context"``. The module-level
    ``tokenizer`` is called with no truncation or padding arguments.
    """
    encoded_pair = tokenizer(row["question"], row["context"])
    return len(encoded_pair["input_ids"])


# Token count of every question/context pair in the training split.
dfs["train"]["n_tokens"] = dfs["train"].apply(compute_input_length, axis=1)

# Histogram of pair lengths with a marker at 384 tokens, the default
# max_length used by the preprocessing functions in this notebook.
fig, ax = plt.subplots()
dfs["train"]["n_tokens"].hist(bins=100, grid=False, ec="C0", ax=ax)
plt.xlabel("Number of tokens in question-context pair")
ax.axvline(x=384, ymin=0, ymax=1, linestyle="--", color="C1",
           label="Maximum sequence length")
plt.legend()
plt.ylabel("Count")
plt.show()

Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors

Użycie stride w celu podziału kontekstu na nakładające się okna i ograniczenia długości tekstu – prezentacja działania

# Four training examples (rows 2..5) used to demonstrate windowing.
context = subjqa["train"][2:6]["context"]
question = subjqa["train"][2:6]["question"]

# Small max_length/stride values so the windowing is visible in the output.
# No padding is requested here, so features keep their natural length.
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",  # truncate the context, never the question
    stride=50,  # tokens shared by consecutive windows
    return_overflowing_tokens=True,  # emit one feature per window
    return_offsets_mapping=True,  # character spans for every token
)
# Decode each feature to show which slice of the context it holds.
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))

[CLS] How many employees does the company have? [SEP] We had a great two night stay in Dec 2008. Hotel staff were very accomodating with our 18 month old - providing a cot, toy and stroller for him. The staff were friendly and upbeat. Our room was also upgraded. Our only dissappointment was the food at the downstairs restaurant - way too expensive and did not meet expectations. [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] My wife and I stayed here on our honeymoon for 2 nights in early March and had a great stay. I had booked a room at another hotel on hotels. com on Jan. 6th though. Then 4 days before we were to start our honeymoon in San Fran., hotels. com send me an e - mail to say they had to change my reservation from a hotel downtown to a hotel at the [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP]. com on Jan. 6th though. Then 4 days before we were to start our honeymoon in San Fran., hotels. com send me an e - mail to say they had to change my reservation from a hotel downtown to a hotel at the airport. I rufused and was given a refund and no hotel. There was a huge 30, 000 person convention in town at the same [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] - mail to say they had to change my reservation from a hotel downtown to a hotel at the airport. I rufused and was given a refund and no hotel. There was a huge 30, 000 person convention in town at the same time so all the hotels were full. I got lucky though and was able to book a bay view room at The Argonaut. This was the best [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP]nd and no hotel. There was a huge 30, 000 person convention in town at the same time so all the hotels were full. I got lucky though and was able to book a bay view room at The Argonaut. This was the best thing that could of happened to us! This place was amazing! We arrived at the hotel around 12 : 30pm and were able to check in right away [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] and was able to book a bay view room at The Argonaut. This was the best thing that could of happened to us! This place was amazing! We arrived at the hotel around 12 : 30pm and were able to check in right away! The staff were all so nice and called us by name whenever they saw us. The room was amazing! We had a beautiful view of Alcatraz [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP]! We arrived at the hotel around 12 : 30pm and were able to check in right away! The staff were all so nice and called us by name whenever they saw us. The room was amazing! We had a beautiful view of Alcatraz Island and could even see the golden gate. Internet worked great and the TV was fine. It's not a big fancy TV, but we didn'[SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] whenever they saw us. The room was amazing! We had a beautiful view of Alcatraz Island and could even see the golden gate. Internet worked great and the TV was fine. It's not a big fancy TV, but we didn't fly all the way to San Fran. to watch TV anyway. What made our stay so great was the concierge. There was a wonderful [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] and the TV was fine. It's not a big fancy TV, but we didn't fly all the way to San Fran. to watch TV anyway. What made our stay so great was the concierge. There was a wonderful young lady there that recommended 2 wonderful resturants for dinners and an amazing dim sum resturant for lunch. We enjoyed San Fran. way [SEP]
[CLS] Was the atmosphere of the tourist areas of san fransisco peaceful? [SEP] TV anyway. What made our stay so great was the concierge. There was a wonderful young lady there that recommended 2 wonderful resturants for dinners and an amazing dim sum resturant for lunch. We enjoyed San Fran. way more then we ever thought we would and we owe that to The Argonaut Hotel and there staff. We can't wait to come visit again. [SEP]
[CLS] How is the employee service on this hotel? [SEP] Spent one night at The Argonaut and wish that we could have stayed longer. We even missed the free wine hour but what the heck, the staff was so pleasant and not in a superficial,'I don't mean it'kind of way. Valet was on top of their game and was very helpful with directions or anything we might need. The roofm was super clean and I was looking for dirt in the areas [SEP]
[CLS] How is the employee service on this hotel? [SEP] a superficial,'I don't mean it'kind of way. Valet was on top of their game and was very helpful with directions or anything we might need. The roofm was super clean and I was looking for dirt in the areas that sometimes don't get the best housekeeping job. I couldn't find as much as a crumb! We were on the fourth floor in a two queen bedded room [SEP]
[CLS] How is the employee service on this hotel? [SEP] was super clean and I was looking for dirt in the areas that sometimes don't get the best housekeeping job. I couldn't find as much as a crumb! We were on the fourth floor in a two queen bedded room as we were traveling with our two children. We were upgraded to a Cannery / Alcatraz view because of our membership in Kimpton's In Touch program but it was really just [SEP]
[CLS] How is the employee service on this hotel? [SEP] were on the fourth floor in a two queen bedded room as we were traveling with our two children. We were upgraded to a Cannery / Alcatraz view because of our membership in Kimpton's In Touch program but it was really just a view of the Mexican restaurant below unless you really, really cranked your head. Still nice to receive. We also had a note welcoming us and a big bottle of water but didn [SEP]
[CLS] How is the employee service on this hotel? [SEP] Kimpton's In Touch program but it was really just a view of the Mexican restaurant below unless you really, really cranked your head. Still nice to receive. We also had a note welcoming us and a big bottle of water but didn't drink it because it didn't say it was complementary. Overall a great experience and I wouldn't hesitate to return whenever I am in the San Francisco area. Great decor [SEP]
[CLS] How is the employee service on this hotel? [SEP] a note welcoming us and a big bottle of water but didn't drink it because it didn't say it was complementary. Overall a great experience and I wouldn't hesitate to return whenever I am in the San Francisco area. Great decor. This was our first stay in a Kimtpon property but I am going to seek them out from now on. [SEP]
[CLS] How is attraction? [SEP] This is a great hotel! I loved the nautical decoration! The room was immaculate, very comfortable and we didn't have any issues with noise. The location is great, especially for a quick stay as you are close to the tourist attractions. There are countless kiosks in the area for different tours and attractions. We took the cable car tour around San Fran and over the Golden Gate Bridge. The ferry to Alcatraz is a 10 - [SEP]
[CLS] How is attraction? [SEP] you are close to the tourist attractions. There are countless kiosks in the area for different tours and attractions. We took the cable car tour around San Fran and over the Golden Gate Bridge. The ferry to Alcatraz is a 10 - 15 minute walk away from the hotel. There are loads of restaurants nearby. The restaurant attached to the hotel was great for breakfast. I would definitely recommend this hotel. [SEP]

print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

The 4 examples gave 18 features.
Here is where each comes from: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3].

# Compute start/end token labels for every feature in `inputs`.
# A feature whose window contains no annotated answer keeps the label (0, 0).
answers = subjqa["train"][2:6]["answers"]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    # Map the feature back to the example it was cut from.
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    sequence_ids = inputs.sequence_ids(i)
    answer = answers[sample_idx]

    # Find start and end of context (token indices within this feature).
    context_start = sequence_ids.index(1)
    # NOTE: `len - 2` is the last context token only when the feature ends
    # with a single special token and has no padding; `inputs` was built
    # without a padding argument, so that holds here.
    context_end = len(sequence_ids) - 2
    # Despite the names, start_char/end_char hold token indices.
    start_char = 0
    end_char = 0
    if len(answer['answer_start']) > 0:
        for idx, answer_start in enumerate(answer['answer_start']):
            # Character span of this candidate answer in the full context.
            tmp_start_char = answer_start
            tmp_end_char = answer_start + len(answer['text'][idx])
            # Answer lies fully inside this feature's context window
            if offset[context_start][0] <= tmp_start_char and offset[context_end][1] >= tmp_end_char:
                # Walk forward to the last token starting at or before the
                # answer's first character.
                idx_c = context_start
                while idx_c <= context_end and offset[idx_c][0] <= tmp_start_char:
                    idx_c += 1
                start_char = idx_c - 1

                # Walk backward to the first token ending at or after the
                # answer's last character.
                idx_c = context_end
                while idx_c >= context_start and offset[idx_c][1] >= tmp_end_char:
                    idx_c -= 1
                end_char = idx_c + 1
                # Only the first answer that fits the window is used.
                break

    start_positions.append(start_char)
    end_positions.append(end_char)

print(start_positions)
print(end_positions)

[31, 0, 0, 0, 0, 0, 0, 0, 0, 61, 41, 0, 0, 0, 0, 0, 70, 27]
[31, 0, 0, 0, 0, 0, 0, 0, 0, 72, 45, 0, 0, 0, 0, 0, 70, 27]

# Sanity check: decode the labelled token span of feature 0 and compare it
# with the first annotated answer text of the example it came from.
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

start = start_positions[idx]
end = end_positions[idx]
# The end label is inclusive, hence `end + 1` in the slice.
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start: end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
# The demo encoding is not needed past this point.
del inputs

Theoretical answer: 18, labels give: 18

Przygotowanie do stride


def preprocess_train_data(examples, max_length=384, stride=128):
    """Tokenize a batch of QA examples into padded windows with span labels.

    Each question/context pair is split into overlapping windows of at most
    ``max_length`` tokens (only the context is truncated). For every window
    the token indices of the first annotated answer that lies completely
    inside it are stored; windows without such an answer are labelled
    ``(0, 0)``.

    Args:
        examples: Batch with the columns ``"question"``, ``"context"`` and
            ``"answers"`` (each answer entry providing ``"text"`` and
            ``"answer_start"`` lists).
        max_length: Maximum number of tokens per window, padding included.
        stride: Number of context tokens shared by consecutive windows.

    Returns:
        The tokenizer output extended with ``"start_positions"`` and
        ``"end_positions"``, one entry per window.
    """
    questions = [q.strip() for q in examples["question"]]
    # Debug aid: shows which tokenizer is currently bound to the
    # module-level name `tokenizer`.
    print(tokenizer.__class__.__name__)
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(inputs["offset_mapping"]):
        sample_idx = inputs["overflow_to_sample_mapping"][i]
        sequence_ids = inputs.sequence_ids(i)

        # First and last token that belong to the context (sequence id 1).
        # The last one has to be searched for: with padding="max_length" the
        # tail of a short window is padding whose offsets are (0, 0), so a
        # fixed `len(sequence_ids) - 2` would make every padded window look
        # as if it contained no answer.
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        start_token, end_token = _find_answer_token_span(
            offset, context_start, context_end, answers[sample_idx]
        )
        start_positions.append(start_token)
        end_positions.append(end_token)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def _find_answer_token_span(offsets, context_start, context_end, answer):
    """Return (start, end) token indices of the first answer inside a window.

    ``offsets`` holds one (char_start, char_end) pair per token and
    ``context_start``/``context_end`` delimit the context tokens (inclusive).
    ``(0, 0)`` is returned when no answer fits the window completely.
    """
    window_first_char = offsets[context_start][0]
    window_last_char = offsets[context_end][1]

    for answer_start, answer_text in zip(answer['answer_start'], answer['text']):
        answer_end = answer_start + len(answer_text)
        if window_first_char > answer_start or window_last_char < answer_end:
            continue  # answer is not fully contained in this window

        # Last token starting at or before the answer's first character.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= answer_start:
            token_start += 1

        # First token ending at or after the answer's last character.
        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= answer_end:
            token_end -= 1

        return token_start - 1, token_end + 1

    return 0, 0

train_dataset = subjqa["train"].map(
    preprocess_train_data,
    batched=True,
    remove_columns=subjqa["train"].column_names,
)
len(subjqa["train"]), len(train_dataset)

  0%|          | 0/2 [00:00<?, ?ba/s]

(1666, 2030)

def preprocess_validation_examples(examples, max_length=384, stride=128):
    """Tokenize a batch for evaluation, keeping what post-processing needs.

    Every question/context pair is split into padded, overlapping windows.
    For each window the id of its source example is recorded under
    ``"example_id"``, and the offsets of all tokens outside the context are
    replaced by ``None`` so they can later be rejected as answer boundaries.
    ``"overflow_to_sample_mapping"`` is removed from the returned encoding.
    """
    stripped_questions = [text.strip() for text in examples["question"]]
    print(tokenizer.__class__.__name__)
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for feature_idx, sample_idx in enumerate(sample_map):
        example_ids.append(examples["id"][sample_idx])

        segment_ids = inputs.sequence_ids(feature_idx)
        token_spans = inputs["offset_mapping"][feature_idx]
        # Keep character spans for context tokens only (sequence id 1).
        inputs["offset_mapping"][feature_idx] = [
            span if segment == 1 else None
            for segment, span in zip(segment_ids, token_spans)
        ]

    inputs["example_id"] = example_ids
    return inputs

validation_dataset = subjqa["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=subjqa["validation"].column_names,
)
len(subjqa["validation"]), len(validation_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

(265, 327)

Eksperymenty

from tqdm.auto import tqdm
import evaluate
import collections

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Metric object loaded by the name "squad"; compute_metrics reports its
# output, which contains 'exact_match' and 'f1' in the run shown below.
metric = evaluate.load("squad")
# Number of highest-scoring start and end logits tried per feature.
n_best = 3
# Longest accepted answer span, counted in tokens.
max_answer_length = 50
# NOTE(review): compute_metrics builds its own local list with this name, so
# this module-level list appears to be unused — confirm before removing.
predicted_answers = []


def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into answer strings and score them with ``metric``.

    For every example, all of its features are scanned: the ``n_best``
    highest start and end logits are combined, spans that touch a
    non-context token, run backwards or exceed ``max_answer_length`` tokens
    are discarded, and the span with the highest summed logit wins. An
    example without any valid span gets an empty prediction.

    Relies on the module-level ``metric``, ``n_best`` and
    ``max_answer_length``.

    Args:
        start_logits: Per-feature start logits, indexable by feature index.
        end_logits: Per-feature end logits, indexable by feature index.
        features: Tokenized features providing ``"example_id"`` and
            ``"offset_mapping"`` (``None`` for non-context tokens).
        examples: Original examples providing ``"id"``, ``"context"``,
            ``"question"`` and ``"answers"``.

    Returns:
        Tuple ``(predicted_answers, theoretical_answers, metrics)``.
    """
    # Group feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best largest logits, best first.
            start_indexes = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
            end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append({
                        "text": context[offsets[start_index][0]: offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    })

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    # References for the metric; examples without answers get one empty
    # answer placed at character 0.
    theoretical_answers = [
        {"id": ex["id"], "answers": {
            'text': ex["answers"]['text']
            if len(ex["answers"]['text']) != 0
            else [""],
            'answer_start': ex["answers"]["answer_start"]
            if len(ex["answers"]["answer_start"]) != 0
            else [0]
        }} for ex in examples
    ]
    metrics = metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Preview at most three predictions; capping by the number of
    # predictions avoids an IndexError on very small evaluation sets.
    for i in range(min(3, len(predicted_answers))):
        print("QUESTION:\t", examples[i]['question'])
        print("PREDICTED:", predicted_answers[i]['prediction_text'])
        print("ACTUAL:", theoretical_answers[i]['answers']['text'])
    print(metrics)

    return predicted_answers, theoretical_answers, metrics

import numpy as np

def predict_from_trained():
    """Evaluate the module-level ``trained_model`` on the validation split.

    The validation split of ``subjqa`` is tokenized with
    ``preprocess_validation_examples`` (and therefore with whatever the
    module-level ``tokenizer`` currently is), run through the model in
    batches on ``device`` without gradient tracking, and the collected
    logits are passed to ``compute_metrics``, which prints the results.
    Nothing is returned.
    """
    eval_set = subjqa["validation"].map(
        preprocess_validation_examples,
        batched=True,
        remove_columns=subjqa["validation"].column_names,
    )
    # The model's forward pass does not accept these bookkeeping columns.
    eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
    eval_set_for_model.set_format("torch")
    batch_size = 8
    all_start_logits = []
    all_end_logits = []
    # Stepping by batch_size visits every row exactly once and never yields
    # an empty trailing batch, even when num_rows is a multiple of it.
    for batch_start in range(0, eval_set_for_model.num_rows, batch_size):
        # Slicing rows fetches only this batch instead of loading each whole
        # column and slicing it afterwards.
        rows = eval_set_for_model[batch_start:batch_start + batch_size]
        batch = {name: tensor.to(device) for name, tensor in rows.items()}
        with torch.no_grad():
            outputs = trained_model(**batch)
        all_start_logits.append(outputs.start_logits.cpu().numpy())
        all_end_logits.append(outputs.end_logits.cpu().numpy())
    start_logits = np.concatenate(all_start_logits, axis=0)
    end_logits = np.concatenate(all_end_logits, axis=0)
    compute_metrics(start_logits, end_logits, eval_set, subjqa["validation"])

distilbert-base-cased-distilled-squad trained

# Baseline: evaluate a checkpoint loaded as-is, with no training in this
# notebook.
trained_checkpoint = "distilbert-base-cased-distilled-squad"

# NOTE: this rebinds the module-level `tokenizer`, which the preprocessing
# functions read, so the validation data is tokenized with this checkpoint's
# tokenizer from here on.
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)
predict_from_trained()

Loading cached processed dataset at /home/karo/.cache/huggingface/datasets/subjqa/tripadvisor/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd/cache-bfb7935e995d8aee.arrow

  0%|          | 0/265 [00:00<?, ?it/s]

QUESTION:	 How was the hotel?
PREDICTED: it is a quality 4-star hotel
ACTUAL: ['excellent hotels']
QUESTION:	 How is the hotel?
PREDICTED: The hotel is clean and neat
ACTUAL: ['The hotel location was great']
QUESTION:	 Is it value for money?
PREDICTED: excellent value for money
ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.']
{'exact_match': 10.943396226415095, 'f1': 33.948637559395756}

BERT Cased – fine-tuning

# Load the model to fine-tune from `model_ckpt` and rebind the module-level
# `tokenizer` to the tokenizer of that same checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

from transformers import TrainingArguments
from transformers import Trainer


# Fine-tune the QA model for 3 epochs. A checkpoint is written to `output_dir`
# after every epoch; `evaluation_strategy="no"` means nothing is evaluated
# during `train()` even though `eval_dataset` is registered with the Trainer.
# `train_dataset` / `validation_dataset` are prepared earlier in the notebook.
args = TrainingArguments(
    output_dir="data/bert-finetuned-subjqa",
    overwrite_output_dir=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision requires CUDA: a hard-coded True makes TrainingArguments
    # raise on a CPU-only machine, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, overflow_to_sample_mapping. If offset_mapping, overflow_to_sample_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
/home/karo/nlp-project/venv/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 2030
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 762

[ 2/762 : < :, Epoch 0.00/3]

Step	Training Loss

Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-254
Configuration saved in data/bert-finetuned-subjqa/checkpoint-254/config.json
Model weights saved in data/bert-finetuned-subjqa/checkpoint-254/pytorch_model.bin
tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-254/tokenizer_config.json
Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-254/special_tokens_map.json
Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-508
Configuration saved in data/bert-finetuned-subjqa/checkpoint-508/config.json
Model weights saved in data/bert-finetuned-subjqa/checkpoint-508/pytorch_model.bin
tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-508/tokenizer_config.json
Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-508/special_tokens_map.json
Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-762
Configuration saved in data/bert-finetuned-subjqa/checkpoint-762/config.json
Model weights saved in data/bert-finetuned-subjqa/checkpoint-762/pytorch_model.bin
tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-762/tokenizer_config.json
Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-762/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)

TrainOutput(global_step=762, training_loss=0.7420042443463183, metrics={'train_runtime': 336.3364, 'train_samples_per_second': 18.107, 'train_steps_per_second': 2.266, 'total_flos': 1193472936391680.0, 'train_loss': 0.7420042443463183, 'epoch': 3.0})

# Run the fine-tuned model over the tokenized validation features and keep only
# the raw model outputs (label ids and metrics of the result are not needed).
prediction_output = trainer.predict(validation_dataset)
predictions = prediction_output.predictions
start_logits, end_logits = predictions

# Assigning to `_` stops the notebook from echoing the return value; the sample
# answers and the metrics dict shown below are presumably printed inside
# `compute_metrics` itself (defined earlier in the notebook).
_ = compute_metrics(
    start_logits,
    end_logits,
    validation_dataset,
    subjqa["validation"],
)

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 327
  Batch size = 8

[ 1/41 : < :]

  0%|          | 0/265 [00:00<?, ?it/s]

QUESTION:	 How was the hotel?
PREDICTED: Great setting at the end of the wharf
ACTUAL: ['excellent hotels']
QUESTION:	 How is the hotel?
PREDICTED: My wife and I took two trips to San Fran in 2004 and stayed at the best western both times
ACTUAL: ['The hotel location was great']
QUESTION:	 Is it value for money?
PREDICTED: excellent value for money
ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.']
{'exact_match': 13.584905660377359, 'f1': 30.577012885313252}

# Baseline without any fine-tuning on SubjQA: a RoBERTa-base checkpoint whose
# name indicates it was trained on SQuAD 2.0. The load log below confirms that
# all QA weights come from the checkpoint, so it can predict as-is.
trained_checkpoint = "deepset/roberta-base-squad2"

# `predict_from_trained()` is called without arguments, so it presumably reads
# the `tokenizer` and `trained_model` globals rebound here — verify against its
# definition earlier in the notebook. `device` is also defined elsewhere.
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)
predict_from_trained()

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/vocab.json from cache at /home/karo/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/merges.txt from cache at /home/karo/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/special_tokens_map.json from cache at /home/karo/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer_config.json from cache at /home/karo/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513
loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/pytorch_model.bin from cache at /home/karo/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e
All model checkpoint weights were used when initializing RobertaForQuestionAnswering.

All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at deepset/roberta-base-squad2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.

  0%|          | 0/1 [00:00<?, ?ba/s]

RobertaTokenizerFast

  0%|          | 0/265 [00:00<?, ?it/s]

QUESTION:	 How was the hotel?
PREDICTED: No complaints - really happy with what this hotel offered for the price
ACTUAL: ['excellent hotels']
QUESTION:	 How is the hotel?
PREDICTED: clean and neat
ACTUAL: ['The hotel location was great']
QUESTION:	 Is it value for money?
PREDICTED: excellent value for money
ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.']
{'exact_match': 11.69811320754717, 'f1': 37.040671268633204}

# Switch the fine-tuning pipeline over to the RoBERTa SQuAD2 checkpoint.
model_ckpt = "deepset/roberta-base-squad2"

# Model first, then tokenizer (same load order as before). The tokenizer is
# rebound before the `map` calls because the preprocessing helpers presumably
# read the global `tokenizer` — verify against their definitions.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Re-tokenize the training split; the raw columns are dropped so that only the
# features produced by the preprocessing function remain.
raw_train_split = subjqa["train"]
roberta_train_dataset = raw_train_split.map(
    preprocess_train_data,
    batched=True,
    remove_columns=raw_train_split.column_names,
)

# Raw example count vs. resulting feature count (the printed numbers below show
# more features than examples).
print(len(raw_train_split), len(roberta_train_dataset))

# Same treatment for the validation split, using the validation preprocessor.
raw_validation_split = subjqa["validation"]
roberta_validation_dataset = raw_validation_split.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_validation_split.column_names,
)
print(len(raw_validation_split), len(roberta_validation_dataset))

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/pytorch_model.bin from cache at /home/karo/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e
All model checkpoint weights were used when initializing RobertaForQuestionAnswering.

All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at deepset/roberta-base-squad2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.
loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/vocab.json from cache at /home/karo/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/merges.txt from cache at /home/karo/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/special_tokens_map.json from cache at /home/karo/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0
loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer_config.json from cache at /home/karo/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513
loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

  0%|          | 0/2 [00:00<?, ?ba/s]

RobertaTokenizerFast
RobertaTokenizerFast
1666 1995

  0%|          | 0/1 [00:00<?, ?ba/s]

RobertaTokenizerFast
265 321

# Fine-tune the RoBERTa QA model for 3 epochs on the RoBERTa-tokenized features.
# A checkpoint is written to `output_dir` after every epoch;
# `evaluation_strategy="no"` means nothing is evaluated during `train()` even
# though `eval_dataset` is registered with the Trainer.
args = TrainingArguments(
    output_dir="data/roberta-finetuned-subjqa",
    overwrite_output_dir=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision requires CUDA: a hard-coded True makes TrainingArguments
    # raise on a CPU-only machine, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: offset_mapping, overflow_to_sample_mapping. If offset_mapping, overflow_to_sample_mapping are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.
/home/karo/nlp-project/venv/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 1995
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750

[ 2/750 : < :, Epoch 0.00/3]

Step	Training Loss

Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-250
Configuration saved in data/roberta-finetuned-subjqa/checkpoint-250/config.json
Model weights saved in data/roberta-finetuned-subjqa/checkpoint-250/pytorch_model.bin
tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-250/tokenizer_config.json
Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-250/special_tokens_map.json
Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-500
Configuration saved in data/roberta-finetuned-subjqa/checkpoint-500/config.json
Model weights saved in data/roberta-finetuned-subjqa/checkpoint-500/pytorch_model.bin
tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-500/tokenizer_config.json
Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-500/special_tokens_map.json
Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-750
Configuration saved in data/roberta-finetuned-subjqa/checkpoint-750/config.json
Model weights saved in data/roberta-finetuned-subjqa/checkpoint-750/pytorch_model.bin
tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-750/tokenizer_config.json
Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-750/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)

TrainOutput(global_step=750, training_loss=0.4630465749104818, metrics={'train_runtime': 337.0904, 'train_samples_per_second': 17.755, 'train_steps_per_second': 2.225, 'total_flos': 1172895816798720.0, 'train_loss': 0.4630465749104818, 'epoch': 3.0})

# Run the fine-tuned RoBERTa model over its tokenized validation features and
# keep only the raw model outputs (label ids and metrics are not needed).
prediction_output = trainer.predict(roberta_validation_dataset)
predictions = prediction_output.predictions
start_logits, end_logits = predictions

# Assigning to `_` stops the notebook from echoing the return value; any
# reporting is presumably done inside `compute_metrics` itself (defined earlier
# in the notebook).
_ = compute_metrics(
    start_logits,
    end_logits,
    roberta_validation_dataset,
    subjqa["validation"],
)

The following columns in the test set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 321
  Batch size = 8