import torch
import torch.nn as nn
from transformers.models.sam.modeling_sam import SamVisionAttention

class SamVisionAttentionSplit(SamVisionAttention, nn.Module):
    """SAM vision attention with separate ``q``, ``k`` and ``v`` projections.

    The parent constructor runs first; its fused ``qkv`` attribute is then
    deleted and replaced by three independent ``nn.Linear`` layers, so each
    projection can be addressed by name (for example as a LoRA target).
    State dicts that still contain fused ``qkv`` entries are converted while
    loading by :meth:`split_q_k_v_load_hook`.
    """

    def __init__(self, config, window_size):
        """Build the parent module, then swap ``qkv`` for ``q``/``k``/``v``.

        Args:
            config: Configuration object; ``hidden_size`` and ``qkv_bias``
                are read here, the rest is consumed by the parent class.
            window_size: Forwarded unchanged to the parent constructor.
        """
        super().__init__(config, window_size)
        # Remove the fused projection set up by the parent constructor.
        del self.qkv
        # Create separate q, k, v projections.
        hidden_size = config.hidden_size
        self.q = nn.Linear(hidden_size, hidden_size, bias=config.qkv_bias)
        self.k = nn.Linear(hidden_size, hidden_size, bias=config.qkv_bias)
        self.v = nn.Linear(hidden_size, hidden_size, bias=config.qkv_bias)
        # Convert fused checkpoints on the fly when a state dict is loaded.
        self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook)

    def split_q_k_v_load_hook(self, state_dict, prefix, *args):
        """Split this module's fused ``qkv`` entries into ``q``/``k``/``v``.

        Only keys that start with ``prefix + "qkv."`` are touched, so entries
        belonging to other modules (which may legitimately keep a fused
        ``qkv`` layer) are left alone.

        Args:
            state_dict: Mapping being loaded; modified in place.
            prefix: Key prefix of this module inside ``state_dict``.
            *args: Remaining hook arguments; unused.
        """
        fused_prefix = prefix + "qkv."
        # Snapshot the matching keys first: the mapping is mutated below.
        fused_keys = [name for name in state_dict if name.startswith(fused_prefix)]
        for fused_key in fused_keys:
            suffix = fused_key[len(fused_prefix):]
            # The fused tensor is split into three equal chunks along dim 0,
            # taken in q, k, v order; the old entry is removed by ``pop``.
            q_part, k_part, v_part = state_dict.pop(fused_key).chunk(3, dim=0)
            state_dict[f"{prefix}q.{suffix}"] = q_part
            state_dict[f"{prefix}k.{suffix}"] = k_part
            state_dict[f"{prefix}v.{suffix}"] = v_part

    def _project_to_heads(self, projection, hidden_states):
        """Apply ``projection`` and fold the heads into the batch dimension.

        Returns a tensor of shape
        ``(batch_size * num_attention_heads, height * width, head_dim)``.
        """
        batch_size, height, width, _ = hidden_states.shape
        num_tokens = height * width
        projected = projection(hidden_states)
        projected = projected.reshape(batch_size, num_tokens, self.num_attention_heads, -1)
        return projected.permute(0, 2, 1, 3).reshape(
            batch_size * self.num_attention_heads, num_tokens, -1
        )

    def forward(
        self, hidden_states: torch.Tensor, output_attentions=False
    ) -> "tuple[torch.Tensor, torch.Tensor | None]":
        """Run attention over a ``(batch, height, width, channels)`` input.

        Args:
            hidden_states: 4-D input tensor, unpacked as
                ``(batch_size, height, width, channels)``.
            output_attentions: When true, the post-softmax (pre-dropout)
                attention weights are returned as the second element.

        Returns:
            A 2-tuple ``(attn_output, attn_weights_or_None)``.
        """
        batch_size, height, width, _ = hidden_states.shape
        query = self._project_to_heads(self.q, hidden_states)
        key = self._project_to_heads(self.k, hidden_states)
        value = self._project_to_heads(self.v, hidden_states)

        attn_weights = (query * self.scale) @ key.transpose(-2, -1)

        if self.use_rel_pos:
            attn_weights = self.add_decomposed_rel_pos(
                attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        # Softmax is computed in float32 and cast back to the query dtype.
        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Unfold the heads from the batch dimension and merge them back into
        # the channel dimension before the output projection.
        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
        attn_output = self.proj(attn_output)

        return attn_output, (attn_weights if output_attentions else None)

from transformers import SamModel
from transformers.models.sam import modeling_sam

# Replace the attention class in the modeling_sam module.
# NOTE(review): presumably this must run before the model is built below so
# that construction picks up the replacement class — confirm against the
# installed transformers version.
modeling_sam.SamVisionAttention = SamVisionAttentionSplit

# Load the pretrained SAM model
model = SamModel.from_pretrained("facebook/sam-vit-base")

from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],  # apply LoRA to the q and v projections
    lora_dropout=0.1,
    # task_type is intentionally left unset: "mask-generation" is not a PEFT
    # TaskType member. NOTE(review): recent PEFT releases appear to reject
    # unknown task types — confirm against the installed version. Leaving it
    # unset selects the generic PeftModel wrapper.
)

# Apply LoRA to the model
model = get_peft_model(model, config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

如何修改 Transformer 模型

示例：修改 Segment Anything Model (SAM) 的注意力机制

动机

实现

第一步：创建自定义注意力类

第二步：替换原始的注意力类

第三步：将 LoRA 应用于特定投影

第四步：验证可训练参数的数量

贡献你自己的“修改”方法