No loss from inputs ValueError when fine-tuning Bart model

I'm fine-tuning a Bart model for a sequence-to-sequence task and getting an error on the train step. Note that the example below uses dummy English-to-Portuguese sentences as the fine-tuning dataset, but in my actual use case the inputs and outputs are different string formats of the same sequence (not language translation).

from functools import partial

import datasets
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)


def tokenize_function(tok, example):
    inp = tok(example['input_seq'],
              padding="max_length",
              max_length=500,
              truncation=True)
    outp = tok(example['outp_seq'],
               padding="max_length",
               max_length=500,
               truncation=True)

    res = {
        'input_ids': inp['input_ids'],
        'attention_mask': inp['attention_mask'],
        'decoder_input_ids': outp['input_ids'],
        'decoder_attention_mask': outp['attention_mask']
    }
    return res


def main():
    inp = [
        "When I went to the cabin up north, I had to bring a lot of board games to entertain myself.",
        "I don't know why he bought instant coffee when there was a free espresso machine around the corner which was constantly stocked",
        "Why do you keep asking such obvious questions?"
    ]

    outp = [
        "Quando fui para a cabana no norte, tive que trazer muitos jogos de tabuleiro para me entreter.",
        "Não sei por que ele comprou café solúvel quando havia uma máquina de café expresso grátis na esquina, que estava constantemente abastecida.",
        "Por que você continua fazendo perguntas tão óbvias?"
    ]

    ds = datasets.Dataset.from_dict({"input_seq": inp, "outp_seq": outp})

    checkpoint = "facebook/bart-large"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    tokenized_dataset = ds.map(partial(tokenize_function, tokenizer),
                               batched=True,
                               batch_size=1,
                               remove_columns=['input_seq', 'outp_seq'])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(output_dir="./test_dir",
                                      evaluation_strategy="epoch",
                                      gradient_checkpointing=True,
                                      label_names=['decoder_input_ids'],
                                      num_train_epochs=3)

    trainer = Trainer(model,
                      training_args,
                      train_dataset=tokenized_dataset,
                      data_collator=data_collator,
                      tokenizer=tokenizer)

    trainer.train()
    trainer.save_model()


if __name__ == '__main__':
    main()

The full error

Exception has occurred: ValueError
The model did not return a loss from the inputs, only the following keys: logits,encoder_last_hidden_state. For reference, the inputs it received are input_ids,attention_mask,decoder_input_ids,decoder_attention_mask.

I'm naming the fields to match the arguments of Bart's forward method. Aside from incorrectly named input fields being the cause, maybe there's an issue with special tokens: I haven't been able to figure out whether the decoder_input_ids need special tokens at the start and end of each sequence.
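For reference, here is a quick check of what the tokenizer produces for one of the target sentences, since I wasn't sure about the special tokens (nothing assumed here beyond the facebook/bart-large defaults):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-large")
enc = tok("Por que você continua fazendo perguntas tão óbvias?")

# The tokenizer adds <s> ... </s> by default; I'm not sure whether that's
# what the decoder expects for decoder_input_ids.
print(tok.convert_ids_to_tokens(enc["input_ids"]))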

1 Answer

Answer by matsuo_basho:

The error occurs in the Trainer.compute_loss method because there is no field titled 'labels':

def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            if is_peft_available() and isinstance(model, PeftModel):
                model_name = unwrap_model(model.base_model)._get_name()
            else:
                model_name = unwrap_model(model)._get_name()
            if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

So it appears that even though Bart's forward method accepts ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'], compute_loss requires a field titled 'labels'.
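You can also see this outside the Trainer: passing labels directly to the model's forward is what makes it return a loss. A minimal sketch, assuming the same facebook/bart-large checkpoint and single unpadded sentences:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "facebook/bart-large"
tok = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

src = tok("Why do you keep asking such obvious questions?", return_tensors="pt")
tgt = tok("Por que você continua fazendo perguntas tão óbvias?", return_tensors="pt")

# Without labels the model only returns logits, which is exactly what
# triggers the ValueError inside compute_loss.
out = model(input_ids=src["input_ids"], attention_mask=src["attention_mask"])
print(out.loss)  # None

# With labels, Bart builds decoder_input_ids internally (by shifting the
# labels right) and returns a cross-entropy loss.
out = model(input_ids=src["input_ids"],
            attention_mask=src["attention_mask"],
            labels=tgt["input_ids"])
print(out.loss)  # scalar tensor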

So, duplicate 'decoder_input_ids' to create a 'labels' field. In the code below, I added 3 more sentences to the dataset and then split it 50/50 to get a train and a test set.

from functools import partial

import datasets
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)


def tokenize_function(tok, example):
    inp = tok(example['input_seq'],
              padding="max_length",
              max_length=500,
              truncation=True)
    outp = tok(example['outp_seq'],
               padding="max_length",
               max_length=500,
               truncation=True)

    res = {
        'input_ids': inp['input_ids'],
        'attention_mask': inp['attention_mask'],
        'decoder_input_ids': outp['input_ids'],
        'labels': outp['input_ids'],
        'decoder_attention_mask': outp['attention_mask']
    }
    return res


def main():
    inp = [
        "When I went to the cabin up north, I had to bring a lot of board games to entertain myself.",
        "I don't know why he bought instant coffee when there was a free espresso machine around the corner which was constantly stocked",
        "Why do you keep asking such obvious questions?",
        "She was dying to get breakfast out, but she couldn't afford it.",
        "We had a snow storm last weekend.", "She’s as sick as a dog"
    ]

    outp = [
        "Quando fui para a cabana no norte, tive que trazer muitos jogos de tabuleiro para me entreter.",
        "Não sei por que ele comprou café solúvel quando havia uma máquina de café expresso grátis na esquina, que estava constantemente abastecida.",
        "Por que você continua fazendo perguntas tão óbvias?",
        "Ela estava louca para tomar café da manhã, mas não tinha condições de pagar.",
        "Tivemos uma tempestade de neve no último fim de semana.",
        "Ela está tão doente quanto um cachorro."
    ]

    ds = datasets.Dataset.from_dict({"input_seq": inp, "outp_seq": outp})

    data_prepped = ds.train_test_split(test_size=0.5)

    checkpoint = "facebook/bart-large"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    tokenized_dataset = data_prepped.map(
        partial(tokenize_function, tokenizer),
        batched=True,
        batch_size=1,
        remove_columns=['input_seq', 'outp_seq'])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(output_dir="./surface_realization",
                                      evaluation_strategy="epoch",
                                      gradient_checkpointing=True,
                                      label_names=['decoder_input_ids'],
                                      num_train_epochs=3)

    trainer = Trainer(model,
                      training_args,
                      train_dataset=tokenized_dataset['train'],
                      eval_dataset=tokenized_dataset['test'],
                      data_collator=data_collator,
                      tokenizer=tokenizer)

    trainer.train()
    trainer.save_model()


if __name__ == '__main__':
    main()
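One side note on the collator (not required to fix the error): because the labels above are padded to max_length with the regular pad token, the padded positions also contribute to the loss. If you want them ignored, the usual approach is to drop padding="max_length" from the tokenization, drop the explicit decoder_input_ids (the model builds them from labels), and let DataCollatorForSeq2Seq pad dynamically, since it pads 'labels' with -100, which the cross-entropy loss ignores. A sketch of the change, not tested against the exact setup above:

from transformers import DataCollatorForSeq2Seq

# Pads input_ids/attention_mask with the tokenizer's pad token and
# pads 'labels' with -100 so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)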