I'm trying to match event names using BERT embeddings (via the Hugging Face transformers library).
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from scipy.spatial.distance import cosine
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Encode a word into its vector representation using BERT
def get_word_embedding(word):
    tokens = tokenizer(word, return_tensors="pt")
    # Run BERT in inference mode without tracking gradients
    with torch.no_grad():
        outputs = model(**tokens)
    # Mean-pool over the token dimension and flatten to a 1-D vector for scipy
    word_embedding = np.mean(outputs.last_hidden_state.numpy(), axis=1).squeeze(0)
    return word_embedding
# Calculate the cosine similarity between two word embeddings
def calculate_similarity(word1, word2):
    embedding1 = get_word_embedding(word1)
    embedding2 = get_word_embedding(word2)
    # scipy's cosine() returns a distance, so similarity = 1 - distance
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity
word1 = "17 May Constitution Day Off"
word2 = "Constitution Day Off"
similarity_score = calculate_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")
Similarity between '17 May Constitution Day Off' and 'Constitution Day Off': 0.8122310638427734
word1 = "Ascension Day Off"
word2 = "Constitution Day Off"
similarity_score = calculate_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")
Similarity between 'Ascension Day Off' and 'Constitution Day Off': 0.8288278579711914
From a human perspective, the two Constitution Day entries should have higher contextual similarity than the Ascension Day / Constitution Day pair, yet the scores come out the other way around. Any ideas on how I can transform the input to get more realistic results?
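To make "transform the input" concrete, here is a rough sketch of the kind of preprocessing I mean: stripping date fragments like "17 May" before embedding, so only the descriptive part of the holiday name is compared. The strip_date_tokens helper and its regex are just illustrative placeholders, not something I have validated:

import re

def strip_date_tokens(name):
    # Drop a leading "<day> <month> " prefix, e.g. "17 May Constitution Day Off"
    # becomes "Constitution Day Off" (regex is a rough illustration, not exhaustive)
    pattern = (r"\b\d{1,2}\s+(January|February|March|April|May|June|July|"
               r"August|September|October|November|December)\s+")
    return re.sub(pattern, "", name, flags=re.IGNORECASE).strip()

similarity_score = calculate_similarity(strip_date_tokens("17 May Constitution Day Off"),
                                         strip_date_tokens("Constitution Day Off"))
print(f"Similarity after stripping dates: {similarity_score}")

Is this kind of normalization the right direction, or is there a better way to make the embeddings reflect the semantic overlap?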