I am working through Transformer_Captioning.ipynb from cs231n assignment 3. The notebook asks me to run the cell that tests my MultiHeadAttention implementation, and the relative errors should be less than e-3. After finishing my code, I got incorrect results:
self_attn_output error: 0.449382070034207
masked_self_attn_output error: 1.0
attn_output error: 1.0
I even copied someone else's code from GitHub that is reported to produce the correct result, and I still get the same output:
self_attn_output error: 0.449382070034207
masked_self_attn_output error: 1.0
attn_output error: 1.0
Is there anything else I missed?
My Class:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
class MultiHeadAttention(nn.Module):
    """
    A model layer which implements a simplified version of masked attention, as
    introduced by "Attention Is All You Need" (https://arxiv.org/abs/1706.03762).

    Usage:
      attn = MultiHeadAttention(embed_dim, num_heads=2)

      # self-attention
      data = torch.randn(batch_size, sequence_length, embed_dim)
      self_attn_output = attn(query=data, key=data, value=data)

      # attention using two inputs
      other_data = torch.randn(batch_size, sequence_length, embed_dim)
      attn_output = attn(query=data, key=other_data, value=other_data)
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        """
        Construct a new MultiHeadAttention layer.

        Inputs:
        - embed_dim: Dimension of the token embedding
        - num_heads: Number of attention heads
        - dropout: Dropout probability
        """
        super().__init__()
        assert embed_dim % num_heads == 0

        # We will initialize these layers for you, since swapping the ordering
        # would affect the random number generation (and therefore your exact
        # outputs relative to the autograder). Note that the layers use a bias
        # term, but this isn't strictly necessary (and varies by
        # implementation).
        self.key = nn.Linear(embed_dim, embed_dim)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.attn_drop = nn.Dropout(dropout)

        self.n_head = num_heads
        self.emd_dim = embed_dim
        self.head_dim = self.emd_dim // self.n_head
    def forward(self, query, key, value, attn_mask=None):
        """
        Calculate the masked attention output for the provided data, computing
        all attention heads in parallel.

        In the shape definitions below, N is the batch size, S is the source
        sequence length, T is the target sequence length, and E is the embedding
        dimension.

        Inputs:
        - query: Input data to be used as the query, of shape (N, S, E)
        - key: Input data to be used as the key, of shape (N, T, E)
        - value: Input data to be used as the value, of shape (N, T, E)
        - attn_mask: Array of shape (S, T) where mask[i,j] == 0 indicates token
          i in the source should not influence token j in the target.

        Returns:
        - output: Tensor of shape (N, S, E) giving the weighted combination of
          data in value according to the attention weights calculated using key
          and query.
        """
        N, S, E = query.shape
        N, T, E = value.shape
        # Create a placeholder, to be overwritten by your code below.
        output = torch.empty((N, S, E))
        ############################################################################
        # TODO: Implement multiheaded attention using the equations given in       #
        # Transformer_Captioning.ipynb.                                            #
        # A few hints:                                                             #
        #  1) You'll want to split your shape from (N, T, E) into (N, T, H, E/H),  #
        #     where H is the number of heads.                                      #
        #  2) The function torch.matmul allows you to do a batched matrix multiply.#
        #     For example, you can do (N, H, T, E/H) by (N, H, E/H, T) to yield a  #
        #     shape (N, H, T, T). For more examples, see                           #
        #     https://pytorch.org/docs/stable/generated/torch.matmul.html          #
        #  3) For applying attn_mask, think how the scores should be modified to   #
        #     prevent a value from influencing output. Specifically, the PyTorch   #
        #     function masked_fill may come in handy.                              #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        H = self.n_head

        # Project the inputs and split each projection into H heads:
        # (N, T, E) -> (N, T, H, E/H) -> (N, H, T, E/H)
        K = self.key(key).view(N, T, H, E // H).moveaxis(1, 2)
        Q = self.query(query).view(N, S, H, E // H).moveaxis(1, 2)
        V = self.value(value).view(N, T, H, E // H).moveaxis(1, 2)

        # Scaled dot-product scores, shape (N, H, S, T)
        Y = Q @ K.transpose(2, 3) / math.sqrt(self.head_dim)
        if attn_mask is not None:
            Y = Y.masked_fill(attn_mask == 0, float("-inf"))

        # Softmax over the key dimension, dropout, then weight the values.
        Y = self.attn_drop(F.softmax(Y, dim=-1)) @ V

        # Merge the heads back together and apply the output projection.
        output = self.proj(Y.moveaxis(1, 2).reshape(N, S, E))
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################
        return output
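As an aside (not part of the assignment): since a torch >= 2.0 build is installed here, one way to sanity-check the attention math itself, independently of the notebook's hard-coded expected values, is to compare the same computation against torch.nn.functional.scaled_dot_product_attention with dropout disabled. This is only a minimal sketch, assuming a causal boolean mask where True means "may attend":

# Sanity check (sketch): compare the manual attention math against PyTorch's
# built-in scaled_dot_product_attention (torch >= 2.0), with dropout disabled.
import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
N, H, S, T, Dh = 2, 2, 3, 3, 4
q = torch.randn(N, H, S, Dh)
k = torch.randn(N, H, T, Dh)
v = torch.randn(N, H, T, Dh)
mask = torch.tril(torch.ones(S, T, dtype=torch.bool))  # True = may attend

# Manual computation, mirroring the forward() above (without dropout).
scores = q @ k.transpose(-2, -1) / math.sqrt(Dh)
scores = scores.masked_fill(mask == 0, float("-inf"))
manual = F.softmax(scores, dim=-1) @ v

# Built-in reference (boolean attn_mask: True means the position may attend).
reference = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
print(torch.allclose(manual, reference, atol=1e-5))  # expected: True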
My test code:
import torch
import numpy as np

# rel_error is defined elsewhere in the assignment notebook; the standard
# cs231n helper is reproduced here so the snippet is self-contained.
def rel_error(x, y):
    """Returns relative error."""
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)
else:
    print("MPS device not found.")

torch.manual_seed(231)
# Choose dimensions such that they are all unique for easier debugging:
# Specifically, the following values correspond to N=1, H=2, T=3, E//H=4, and E=8.
batch_size = 1
sequence_length = 3
embed_dim = 8
attn = MultiHeadAttention(embed_dim, num_heads=2)
# Self-attention.
data = torch.randn(batch_size, sequence_length, embed_dim)
self_attn_output = attn(query=data, key=data, value=data)
# Masked self-attention.
mask = torch.randn(sequence_length, sequence_length) < 0.5
masked_self_attn_output = attn(query=data, key=data, value=data, attn_mask=mask)
# Attention using two inputs.
other_data = torch.randn(batch_size, sequence_length, embed_dim)
attn_output = attn(query=data, key=other_data, value=other_data)
expected_self_attn_output = np.asarray([[
    [-0.2494,  0.1396,  0.4323, -0.2411, -0.1547,  0.2329, -0.1936, -0.1444],
    [-0.1997,  0.1746,  0.7377, -0.3549, -0.2657,  0.2693, -0.2541, -0.2476],
    [-0.0625,  0.1503,  0.7572, -0.3974, -0.1681,  0.2168, -0.2478, -0.3038]]])
expected_masked_self_attn_output = np.asarray([[
    [-0.1347,  0.1934,  0.8628, -0.4903, -0.2614,  0.2798, -0.2586, -0.3019],
    [-0.1013,  0.3111,  0.5783, -0.3248, -0.3842,  0.1482, -0.3628, -0.1496],
    [-0.2071,  0.1669,  0.7097, -0.3152, -0.3136,  0.2520, -0.2774, -0.2208]]])
expected_attn_output = np.asarray([[
    [-0.1980,  0.4083,  0.1968, -0.3477,  0.0321,  0.4258, -0.8972, -0.2744],
    [-0.1603,  0.4155,  0.2295, -0.3485, -0.0341,  0.3929, -0.8248, -0.2767],
    [-0.0908,  0.4113,  0.3017, -0.3539, -0.1020,  0.3784, -0.7189, -0.2912]]])
print('self_attn_output error: ', rel_error(expected_self_attn_output, self_attn_output.detach().numpy()))
print('masked_self_attn_output error: ', rel_error(expected_masked_self_attn_output, masked_self_attn_output.detach().numpy()))
print('attn_output error: ', rel_error(expected_attn_output, attn_output.detach().numpy()))
My configuration: MacBook Pro M1, Python 3.8.12, conda 4.11.0, torch 2.1.0.dev20230415, macOS Monterey 12.5.
I ran into the same issue.
If you run this code in Colab, you will find that your answer is correct.
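One thing worth checking (a guess, since the implementation above looks correct): the notebook's hard-coded expected arrays depend on the exact random number stream produced after torch.manual_seed(231), both for the nn.Linear initialization and for the dropout mask (the module is never put into eval mode). If a local torch build such as a 2.1.0 nightly on an M1 Mac produces a different seeded stream than the one used to generate those expected values, the relative error will be large even for a correct implementation, which would explain why copied known-good code also fails locally but passes in Colab. A minimal sketch of an environment comparison, printing a few seeded values locally and in Colab:

# Environment probe (sketch): if these numbers differ between the local
# machine and Colab, the notebook's hard-coded expected outputs cannot be
# reproduced locally, regardless of the MultiHeadAttention implementation.
import torch

print(torch.__version__)
torch.manual_seed(231)
probe = MultiHeadAttention(embed_dim=8, num_heads=2)  # class defined above
print(probe.key.weight[0, :4])   # first few initialized weights
print(torch.randn(3))            # next values from the seeded generator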