I am trying to translate my code from TensorFlow to PyTorch. The code tokenizes question/answer pairs that are then fed as input to my transformer model. This is the original TensorFlow code:
import tensorflow_datasets as tfds

# Load conversations
questions, answers = load_conversations()
# Build tokenizer using tfds for both questions and answers
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13
)
# Define start and end token to indicate the start and end of a sentence
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
# Vocabulary size plus start and end token
VOCAB_SIZE = tokenizer.vocab_size + 2
# Tokenize and filter questions and answers
questions, answers = tokenize_and_filter(questions, answers)
# Print vocabulary size and number of samples
print(f"Vocab size: {VOCAB_SIZE}")
print(f"Number of samples: {len(questions)}")
This is my PyTorch code:
from tokenizers import Tokenizer, models, trainers
import os

# Load conversations
questions, answers = load_conversations()
# Combine questions and answers into a single list
corpus = questions + answers
# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE())
# Set up a BPE trainer with the special tokens I need
trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
# Create the directory if it doesn't exist
save_directory = "tokenizer_directory"
os.makedirs(save_directory, exist_ok=True)
tokenizer.train(corpus, trainer)
# Save the trained tokenizer
current_directory = os.getcwd()
tokenizer.save(os.path.join(current_directory, save_directory, "tokenizer.json"))
# Load the tokenizer later if needed
loaded_tokenizer = Tokenizer.from_file(os.path.join(current_directory, save_directory, "tokenizer.json"))
# Define start and end token to indicate the start and end of a sentence
START_TOKEN, END_TOKEN = [loaded_tokenizer.get_vocab_size()], [loaded_tokenizer.get_vocab_size() + 1]
# Vocabulary size plus start and end token
VOCAB_SIZE = loaded_tokenizer.get_vocab_size() + 2
# Tokenize and filter questions and answers
# tokenize_and_filter is my own helper, adapted to take the loaded tokenizer
questions, answers = tokenize_and_filter(loaded_tokenizer, questions, answers)
# Print vocabulary size and number of samples
print(f"Vocab size: {VOCAB_SIZE}")
print(f"Number of samples: {len(questions)}")
And this is the error it throws:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
Cell In[12], line 17
14 save_directory = "tokenizer_directory"
15 os.makedirs(save_directory, exist_ok=True)
---> 17 tokenizer.train(corpus, trainer)
19 # Save the trained tokenizer
20 current_directory = os.getcwd()
Exception: No such file or directory (os error 2)
How can I solve this error?
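My suspicion is that Tokenizer.train expects a list of file paths, so it tries to open every sentence in corpus as a file, which would explain the "No such file or directory" message. Would switching to train_from_iterator, roughly as sketched below, be the right fix? (Only the tokenizers calls are from the library; the rest reuses names from my code above.)
# Hypothetical fix: train on the in-memory corpus instead of file paths
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
# train_from_iterator accepts any iterator over raw strings
tokenizer.train_from_iterator(corpus, trainer=trainer)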