I am using TFCLIPTextModel and TFCLIPVisionModel to get embeddings of texts and images that I need for some downstream tasks. I want the embeddings to share the same dimensional space, as they do in CLIP. However, as the documentation of these two models suggests, the hidden_size of TFCLIPTextModel is 512 while for TFCLIPVisionModel it is 768, so when I extract the last hidden state from these two models, I get embeddings of different dimensions. I am also aware of projection_dim, which is 512 for both models, but I don't know how to extract the projected features.
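To make the mismatch concrete, here is a quick sketch of the config values I am talking about (the attribute names are what I believe the CLIP configs expose; the printed values match what the documentation states):

from transformers import CLIPTextConfig, CLIPVisionConfig

checkpoint = "openai/clip-vit-base-patch32"
text_cfg = CLIPTextConfig.from_pretrained(checkpoint)
vision_cfg = CLIPVisionConfig.from_pretrained(checkpoint)

# hidden_size is the width of last_hidden_state / pooler_output,
# projection_dim is the size of the shared CLIP embedding space
print(text_cfg.hidden_size, text_cfg.projection_dim)      # 512, 512
print(vision_cfg.hidden_size, vision_cfg.projection_dim)  # 768, 512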
Here is my code for extracting the image and text embeddings (imports included for completeness).
import numpy as np
import tensorflow as tf
from collections.abc import Iterable
from tqdm import tqdm, trange
from transformers import TFCLIPTextModel, TFCLIPVisionModel, CLIPProcessor, CLIPTokenizer

def Image_Embedding_Generator(images, batch_size=32):
    model_name = "openai/clip-vit-base-patch32"
    model = TFCLIPVisionModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    if isinstance(images, (np.ndarray, tf.Tensor)):
        images = tf.unstack(images) if len(images.shape) == 4 else [images]
    elif isinstance(images, dict):
        images = [image for _, image in images.items()]
    # inputs = processor(images=images, return_tensors="tf", padding=True, rescaling=False)
    # dataset = tf.data.Dataset.from_tensor_slices((inputs['pixel_values'], inputs['attention_mask']))
    # dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    image_embeddings = []
    cls_embeddings = []
    for i in trange(0, len(images), batch_size, desc="Generating Image Embeddings"):
        image_batch = images[i:i + batch_size]
        inputs = processor(images=image_batch, return_tensors="tf", do_rescale=False)
        outputs = model(**inputs, output_hidden_states=True)
        # last_hidden_state has shape (batch_size, num_patches + 1, 768)
        batch_embeddings = outputs.last_hidden_state
        # Mean-pool over the patch dimension -> (batch_size, 768)
        pooled_embeddings = tf.reduce_mean(batch_embeddings, axis=1)
        image_embeddings.append(pooled_embeddings.numpy())
        # pooler_output is the layer-normalised CLS token -> (batch_size, 768)
        cls_embeddings.append(outputs.pooler_output.numpy())
    image_embeddings = np.concatenate(image_embeddings, axis=0)
    cls_embeddings = np.concatenate(cls_embeddings, axis=0)
    return cls_embeddings, image_embeddings
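For reference, this is roughly how I call it; the random array below is only a placeholder standing in for my real images, which are already scaled to [0, 1]:

# hypothetical batch of 8 RGB images, already scaled to [0, 1]
dummy_images = np.random.rand(8, 224, 224, 3).astype("float32")
img_cls, img_mean = Image_Embedding_Generator(dummy_images, batch_size=4)
print(img_cls.shape, img_mean.shape)  # (8, 768) (8, 768)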
def Text_Embedding_Generator(texts, batch_size=32):
    model_name = "openai/clip-vit-base-patch32"
    model = TFCLIPTextModel.from_pretrained(model_name)
    tokenizer = CLIPTokenizer.from_pretrained(model_name)
    if isinstance(texts, str):
        texts = [texts]
    elif isinstance(texts, dict):
        texts = [text for _, text in texts.items()]
    elif isinstance(texts, Iterable):
        texts = list(texts)
    # CLIP's text encoder only has 77 position embeddings, so max_length must not exceed 77
    inputs = tokenizer(text=texts, return_tensors="tf", padding="max_length", truncation=True, max_length=77)
    dataset = tf.data.Dataset.from_tensor_slices((inputs['input_ids'], inputs['attention_mask']))
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    text_embeddings = []
    cls_embeddings = []
    for batch in tqdm(dataset, desc="Generating Text Embeddings"):
        batch_inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**batch_inputs)
        # last_hidden_state has shape (batch_size, seq_len, 512)
        batch_embeddings = outputs.last_hidden_state
        # Masked mean-pooling over the token dimension -> (batch_size, 512)
        attention_mask_expanded = tf.cast(batch_inputs['attention_mask'], dtype=tf.float32)[:, :, None]
        sum_embeddings = tf.reduce_sum(batch_embeddings * attention_mask_expanded, axis=1)
        sum_mask = tf.reduce_sum(attention_mask_expanded, axis=1)
        pooled_embeddings = sum_embeddings / sum_mask
        text_embeddings.append(pooled_embeddings.numpy())
        # pooler_output is the EOS-token embedding -> (batch_size, 512)
        cls_embeddings.append(outputs.pooler_output.numpy())
    text_embeddings = np.concatenate(text_embeddings, axis=0)
    cls_embeddings = np.concatenate(cls_embeddings, axis=0)
    return cls_embeddings, text_embeddings
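And the text side, again with placeholder inputs just to show the shapes I get:

dummy_texts = ["a photo of a cat", "a photo of a dog", "a diagram of a circuit"]
txt_cls, txt_mean = Text_Embedding_Generator(dummy_texts, batch_size=2)
print(txt_cls.shape, txt_mean.shape)  # (3, 512) (3, 512)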
As the shapes above show, the cls_embeddings and text_embeddings returned by Text_Embedding_Generator have shape (batch_size, 512), while those returned by Image_Embedding_Generator have shape (batch_size, 768).
Is there a way to get these two embeddings into the same dimensional space without needing to train an extra layer on top of them?