Source code for ual_adapter.binders.architectures

"""
Architecture-Specific Model Binders

Implementations for GPT-2, LLaMA, Pythia, and other model families.
"""

from typing import Dict, Any
from ual_adapter.binders.base import ModelBinder


class GPT2Binder(ModelBinder):
    """Binder describing where GPT-2 family models keep their projections.

    Query, key and value all point at the single ``attn.c_attn`` module and
    are flagged as fused, each tagged with a ``"q"``/``"k"``/``"v"`` slice.
    The remaining roles each point at their own, unfused module.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for GPT-2.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and a ``"fused"`` flag; fused entries additionally
            carry a ``"slice"`` tag.
        """
        block = "transformer.h.{layer}."
        qkv_path = block + "attn.c_attn"

        table: Dict[str, Dict[str, Any]] = {}

        # The three attention inputs share one module; only the slice differs.
        for role, part in (
            ("attention_query", "q"),
            ("attention_key", "k"),
            ("attention_value", "v"),
        ):
            table[role] = {"pattern": qkv_path, "fused": True, "slice": part}

        for role, suffix in (
            ("attention_output", "attn.c_proj"),
            ("mlp_up", "mlp.c_fc"),
            ("mlp_down", "mlp.c_proj"),
        ):
            table[role] = {"pattern": block + suffix, "fused": False}

        return table
class LLaMABinder(ModelBinder):
    """Binder describing where LLaMA family models keep their projections.

    Every role maps to its own module (nothing is fused), and the MLP exposes
    a gate projection in addition to the up and down projections.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for LLaMA.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and ``"fused": False``.
        """
        attn = "model.layers.{layer}.self_attn."
        mlp = "model.layers.{layer}.mlp."

        paths = (
            ("attention_query", attn + "q_proj"),
            ("attention_key", attn + "k_proj"),
            ("attention_value", attn + "v_proj"),
            ("attention_output", attn + "o_proj"),
            ("mlp_up", mlp + "up_proj"),
            ("mlp_down", mlp + "down_proj"),
            ("mlp_gate", mlp + "gate_proj"),
        )
        return {role: {"pattern": path, "fused": False} for role, path in paths}
class PythiaBinder(ModelBinder):
    """Binder describing where Pythia/GPT-NeoX models keep their projections.

    Query, key and value all point at the single
    ``attention.query_key_value`` module and are flagged as fused, each tagged
    with a ``"q"``/``"k"``/``"v"`` slice. Other roles are unfused.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for Pythia/GPT-NeoX.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and a ``"fused"`` flag; fused entries additionally
            carry a ``"slice"`` tag.
        """
        attention = "gpt_neox.layers.{layer}.attention."
        mlp = "gpt_neox.layers.{layer}.mlp."
        fused_qkv = attention + "query_key_value"

        slices = (
            ("attention_query", "q"),
            ("attention_key", "k"),
            ("attention_value", "v"),
        )
        mappings: Dict[str, Dict[str, Any]] = {
            role: {"pattern": fused_qkv, "fused": True, "slice": tag}
            for role, tag in slices
        }
        mappings.update(
            {
                "attention_output": {"pattern": attention + "dense", "fused": False},
                "mlp_up": {"pattern": mlp + "dense_h_to_4h", "fused": False},
                "mlp_down": {"pattern": mlp + "dense_4h_to_h", "fused": False},
            }
        )
        return mappings
class QwenBinder(ModelBinder):
    """Binder for Qwen family models (``transformer.h.*`` checkpoint layout).

    Query, key and value all point at the single ``attn.c_attn`` module and
    are flagged as fused, each tagged with a ``"q"``/``"k"``/``"v"`` slice.
    The MLP is gated, so it exposes three projections: up, gate and down.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for Qwen.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and a ``"fused"`` flag; fused entries additionally
            carry a ``"slice"`` tag.
        """
        return {
            "attention_query": {
                "pattern": "transformer.h.{layer}.attn.c_attn",
                "fused": True,
                "slice": "q",
            },
            "attention_key": {
                "pattern": "transformer.h.{layer}.attn.c_attn",
                "fused": True,
                "slice": "k",
            },
            "attention_value": {
                "pattern": "transformer.h.{layer}.attn.c_attn",
                "fused": True,
                "slice": "v",
            },
            "attention_output": {
                "pattern": "transformer.h.{layer}.attn.c_proj",
                "fused": False,
            },
            "mlp_up": {
                "pattern": "transformer.h.{layer}.mlp.w1",
                "fused": False,
            },
            # In Qwen's MLP both w1 and w2 project hidden -> intermediate
            # (w2 is the SiLU-gated branch); the projection back to the hidden
            # size is c_proj, so that is the "down" role, not w2.
            "mlp_down": {
                "pattern": "transformer.h.{layer}.mlp.c_proj",
                "fused": False,
            },
            # Same role key the LLaMA/Mistral binders use for their gate branch.
            "mlp_gate": {
                "pattern": "transformer.h.{layer}.mlp.w2",
                "fused": False,
            },
        }
class MistralBinder(ModelBinder):
    """Binder describing where Mistral family models keep their projections.

    Every role maps to its own module (nothing is fused), and the MLP exposes
    a gate projection in addition to the up and down projections.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for Mistral.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and ``"fused": False``.
        """
        root = "model.layers.{layer}"
        leaves = {
            "attention_query": "self_attn.q_proj",
            "attention_key": "self_attn.k_proj",
            "attention_value": "self_attn.v_proj",
            "attention_output": "self_attn.o_proj",
            "mlp_up": "mlp.up_proj",
            "mlp_down": "mlp.down_proj",
            "mlp_gate": "mlp.gate_proj",
        }

        result: Dict[str, Dict[str, Any]] = {}
        for role, leaf in leaves.items():
            result[role] = {"pattern": ".".join((root, leaf)), "fused": False}
        return result
class PhiBinder(ModelBinder):
    """Binder describing where Microsoft Phi models keep their projections.

    Query, key and value all point at the single ``mixer.Wqkv`` module and are
    flagged as fused, each tagged with a ``"q"``/``"k"``/``"v"`` slice. Other
    roles are unfused.

    NOTE(review): these paths assume a checkpoint layout with a fused
    ``mixer.Wqkv`` module; Phi checkpoints that expose separate
    q/k/v projections would not match -- confirm which layout is targeted.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for Phi.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and a ``"fused"`` flag; fused entries additionally
            carry a ``"slice"`` tag.
        """
        layer_root = "model.layers.{layer}."
        wqkv = layer_root + "mixer.Wqkv"

        def fused(tag: str) -> Dict[str, Any]:
            # One shared module; the tag says which third is meant.
            return {"pattern": wqkv, "fused": True, "slice": tag}

        def plain(suffix: str) -> Dict[str, Any]:
            return {"pattern": layer_root + suffix, "fused": False}

        return {
            "attention_query": fused("q"),
            "attention_key": fused("k"),
            "attention_value": fused("v"),
            "attention_output": plain("mixer.out_proj"),
            "mlp_up": plain("mlp.fc1"),
            "mlp_down": plain("mlp.fc2"),
        }
class BERTBinder(ModelBinder):
    """Binder describing where BERT family models keep their projections.

    Every role maps to its own module under ``bert.encoder.layer.{layer}``;
    nothing is fused.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for BERT.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and ``"fused": False``.
        """
        layer_root = "bert.encoder.layer.{layer}."
        roles = (
            "attention_query",
            "attention_key",
            "attention_value",
            "attention_output",
            "mlp_up",
            "mlp_down",
        )
        suffixes = (
            "attention.self.query",
            "attention.self.key",
            "attention.self.value",
            "attention.output.dense",
            "intermediate.dense",
            "output.dense",
        )
        return {
            role: {"pattern": layer_root + suffix, "fused": False}
            for role, suffix in zip(roles, suffixes)
        }
class T5Binder(ModelBinder):
    """Binder describing where T5 family models keep their projections.

    Only ``encoder.block`` paths are listed here; every role maps to its own
    module and nothing is fused.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build the role-to-module mapping table for T5.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder) and ``"fused": False``.
        """
        self_attention = "encoder.block.{layer}.layer.0.SelfAttention."
        feed_forward = "encoder.block.{layer}.layer.1.DenseReluDense."

        entries = [
            ("attention_query", self_attention + "q"),
            ("attention_key", self_attention + "k"),
            ("attention_value", self_attention + "v"),
            ("attention_output", self_attention + "o"),
            ("mlp_up", feed_forward + "wi"),
            ("mlp_down", feed_forward + "wo"),
        ]

        mappings: Dict[str, Dict[str, Any]] = {}
        for role, module_path in entries:
            mappings[role] = {"pattern": module_path, "fused": False}
        return mappings
class GenericBinder(ModelBinder):
    """Fallback binder for architectures without a dedicated binder.

    Each role has one primary pattern plus a list of alternative module names.
    Nothing is fused.
    """

    def _define_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Build a generic role-to-module mapping table.

        Returns:
            A newly created dict keyed by role name. Every value holds a
            ``"pattern"`` (dotted module path containing a ``{layer}``
            placeholder), ``"fused": False`` and an ``"alternatives"`` list of
            other module names (presumably tried when the primary pattern does
            not match -- confirm against ``ModelBinder``).
        """
        prefix = "layers.{layer}."
        rows = (
            ("attention_query", "attention.q_proj", ("q_proj", "query", "q_lin")),
            ("attention_key", "attention.k_proj", ("k_proj", "key", "k_lin")),
            ("attention_value", "attention.v_proj", ("v_proj", "value", "v_lin")),
            ("attention_output", "attention.o_proj", ("o_proj", "out_proj", "dense")),
            ("mlp_up", "mlp.up_proj", ("up_proj", "fc1", "w1", "dense_h_to_4h")),
            ("mlp_down", "mlp.down_proj", ("down_proj", "fc2", "w2", "dense_4h_to_h")),
        )
        return {
            role: {
                "pattern": prefix + tail,
                "fused": False,
                # Hand out a fresh list per call so callers may mutate it.
                "alternatives": list(names),
            }
            for role, tail, names in rows
        }