import torch.types
from PIL import Image
from torch import nn
-from torch.nn.init import trunc_normal_
from transformers import PretrainedConfig
from typing_extensions import NotRequired

from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig
from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
-from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.resampler import (Resampler2,
+from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2,
                                                   get_2d_sincos_pos_embed)
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -106,58 +104,6 @@ class MiniCPMVImagePixelInputs(TypedDict):
DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)


-class BaseResampler(nn.Module):
-    """
-    A 2D perceiver-resampler network with one cross attention layers by
-        (grid_size**2) learnable queries and 2d sincos pos_emb
-    Outputs:
-        A tensor with the shape of (grid_size**2, embed_dim)
-    """
-
-    def __init__(
-        self,
-        num_queries: int,
-        embed_dim: int,
-        num_heads: int,
-        kv_dim: Optional[int] = None,
-        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
-    ) -> None:
-        super().__init__()
-
-        self.num_queries = num_queries
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-
-        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
-        trunc_normal_(self.query, std=0.02)
-        if kv_dim is not None and kv_dim != embed_dim:
-            self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
-        else:
-            # Maintain the same return value with ReplicatedLinear.forward
-            self.kv_proj = lambda *args, **kwargs: (
-                nn.Identity()(*args, **kwargs),
-                None,
-            )
-        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
-        self.ln_q = norm_layer(embed_dim)
-        self.ln_kv = norm_layer(embed_dim)
-        self.ln_post = norm_layer(embed_dim)
-        self.proj = nn.Parameter(
-            (embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))
-
-    def _init_weights(self, m: nn.Module) -> None:
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=0.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    def _repeat(self, query, N: int):
-        return query.unsqueeze(1).repeat(1, N, 1)
-
-
class Resampler2_5(BaseResampler):

    def __init__(
@@ -869,7 +815,35 @@ def is_default_weight_loading(self, name: str) -> bool:
        return "resampler" in name


-class MiniCPMV2_6(MiniCPMVBaseModel):
+class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
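+    # Checkpoints store q/k/v and gate/up projections as separate tensors;
+    # this mapping lets LoRA adapters trained against those names target
+    # vLLM's fused qkv_proj and gate_up_proj modules.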
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA-specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name as in the vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    embedding_modules = {}
+    embedding_padding_modules = []

    def __init__(
        self,
@@ -894,15 +868,8 @@ def init_llm(
                          name="model")

    def init_vision_module(self) -> nn.Module:
-        # A custom version of SiglipVisionTransformer, won't work with TP
-        from vllm.model_executor.models.na_vit import SiglipVisionTransformer

-        if self.config._attn_implementation == "flash_attention_2":
-            self.config.vision_config._attn_implementation = "flash_attention_2"
-        else:
-            # not support sdpa
-            self.config.vision_config._attn_implementation = "eager"
-        model = SiglipVisionTransformer(self.config.vision_config)
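+        # Idefics2VisionTransformer is a tensor-parallel-compatible SigLIP
+        # implementation; the custom na_vit port removed above was not.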
+        model = Idefics2VisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]
        return model
@@ -928,7 +895,7 @@ def get_vision_embedding(
            pixel_values,
            patch_attention_mask=patch_attn_mask,
            tgt_sizes=tgt_sizes,
-        ).last_hidden_state
+        )
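+        # vLLM's Idefics2VisionTransformer returns the hidden-states tensor
+        # directly rather than a ModelOutput, so `.last_hidden_state` goes away.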
        return vision_embedding

    def get_vision_hidden_states(
@@ -960,12 +927,12 @@ def get_vision_hidden_states(
            all_pixel_values.type(dtype),
            patch_attention_mask=patch_attn_mask,
            tgt_sizes=tgt_sizes,
-        ).last_hidden_state
+        )

        return self.resampler(vision_embedding, tgt_sizes)

    def is_default_weight_loading(self, name: str) -> bool:
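+        # The Idefics2 vision tower ("vpm") now loads through the regular
+        # TP-sharded path, so only the resampler keeps default weight loading.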
-        return "resampler" in name or "vpm" in name
+        return "resampler" in name


_SUPPORT_VERSION = {