Causal Self-Attention and Multi-Head Attention
As covered in the previous two articles, the core of the Transformer model is the self-attention mechanism. Self-attention lets the model dynamically attend to different parts of the input sequence as it processes it, which is how it captures long-range dependencies. This article implements two PyTorch modules: causal self-attention (Causal Attention) and multi-head attention (Multi-Head Attention).

I plan to integrate these modules into a complete Transformer model later, so the implementation here is kept as simple and readable as possible. I have also created a new repo on GitHub, DeepText, as a small record of what I have been learning about large language models recently.

GitHub: DeepText
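Both implementations below compute the same scaled dot-product attention, just restricted by a causal mask. As a reminder, for queries $Q$, keys $K$, and values $V$:

$$\text{Attention}(Q, K, V) = \text{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}} + M\right)V$$

where $d_k$ is the key dimension and $M$ is an upper-triangular mask whose entries above the diagonal are $-\infty$, so each position can only attend to itself and earlier positions. This is exactly what the code below does with `masked_fill_` followed by `softmax`.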
```python
import torch
import torch.nn as nn


class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.dropout = nn.Dropout(dropout)
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # register_buffer tells the model: "this tensor (our mask) is part of the
        # model's state, but it is not a trainable parameter."
        # Why do this? Two main benefits:
        # 1. State saving and loading: all registered buffers are included in
        #    model.state_dict() and restored correctly on load. A plain
        #    `self.mask = ...` would not be part of the model state, so saving
        #    and loading would be incomplete.
        # 2. Automatic device transfer: when the model is moved to the GPU
        #    (model.to('cuda')), all registered buffers move with it, with no
        #    manual handling. This avoids many common device-mismatch errors.
        # (A quick check of both points follows the usage example below.)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.transpose(1, 2)
        attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ values
        return context_vec
```
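Before using the module, it may help to see what the registered mask actually contains. A small illustrative snippet (not part of the class above): `torch.triu(..., diagonal=1)` keeps only the entries above the diagonal, which are exactly the future positions we want to block.

```python
# Illustration only: the causal mask for a context length of 4.
# The 1s mark the future positions that masked_fill_ will set to -inf.
print(torch.triu(torch.ones(4, 4), diagonal=1))
# tensor([[0., 1., 1., 1.],
#         [0., 0., 1., 1.],
#         [0., 0., 0., 1.],
#         [0., 0., 0., 0.]])
```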
```python
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your
     [0.55, 0.87, 0.66],  # journey
     [0.57, 0.85, 0.64],  # starts
     [0.22, 0.58, 0.33],  # with
     [0.77, 0.25, 0.10],  # one
     [0.05, 0.80, 0.55]]  # step
)

# The input embedding size, d_in=3 (the last dimension holds the features)
d_in = inputs.shape[-1]
# The output embedding size, d_out=2
d_out = 2

batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)  # torch.Size([2, 6, 3])

torch.manual_seed(123)
context_length = batch.shape[1]
ca = CausalAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
print("context_vecs.shape:", context_vecs.shape)
```
```python
# multi-head attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout=0.0, num_heads=2, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList([
            CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
            for _ in range(num_heads)
        ])

    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)


torch.manual_seed(123)
context_length = batch.shape[1]  # the number of tokens
d_in, d_out = 3, 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)
```
The wrapper above computes each head sequentially, and each head keeps its own query/key/value projections, so the concatenated output has num_heads * d_out features. The implementation below is the more common fused variant: it uses a single set of projections, splits d_out across the heads, and computes all heads with one batched matrix multiplication.
```python
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # 1. The output dimension must be divisible by the number of heads
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # 2. Dimension of each head
        self.head_dim = d_out // num_heads

        # 3. Single query/key/value projections shared by all heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # 4. Final output projection
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # 5. Register the causal mask as a (non-trainable) buffer
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # .view() reshapes the (b, num_tokens, d_out) tensors into
        # (b, num_tokens, num_heads, head_dim). No data is moved; it only changes
        # how PyTorch "sees" the tensor, splitting the last dimension into
        # "number of heads" and "dimension per head".
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose from (b, num_tokens, num_heads, head_dim) to
        # (b, num_heads, num_tokens, head_dim). Swapping num_tokens and num_heads
        # turns the per-head computation into one larger batched operation:
        # the matrix multiplication below runs over the leading b * num_heads
        # dimensions in parallel.
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # The transpose does not perform the parallel computation itself; it
        # restructures the data so that PyTorch's batched matrix multiplication (@)
        # can compute the attention scores for all heads simultaneously rather
        # than sequentially.
        attn_scores = queries @ keys.transpose(2, 3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        # The trailing underscore marks an in-place operation.
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)
        # Combine the heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)
        return context_vec
```
```python
torch.manual_seed(123)
batch_size, context_length, d_in = batch.shape
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)
```
THE END