Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Model] Expose InternVL2 max_dynamic_patch as a mm_processor_kwarg #8946

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/offline_inference_vision_language_multi_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
)

placeholders = "\n".join(f"Image-{i}: <image>\n"
Expand Down
33 changes: 22 additions & 11 deletions vllm/model_executor/models/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,13 +175,16 @@ def get_internvl_num_patches(image_size: int, patch_size: int,
(downsample_ratio**2))


def get_max_internvl_image_tokens(ctx: InputContext):
def get_max_internvl_image_tokens(ctx: InputContext,
*,
max_dynamic_patch: Optional[int] = None):
hf_config = ctx.get_hf_config()
vision_config = hf_config.vision_config

if max_dynamic_patch is None:
max_dynamic_patch = hf_config.max_dynamic_patch
use_thumbnail = hf_config.use_thumbnail
max_dynamic_patch = hf_config.max_dynamic_patch
if use_thumbnail:
if use_thumbnail and max_dynamic_patch > 1:
max_dynamic_patch += 1
downsample_ratio = hf_config.downsample_ratio

Expand All @@ -192,7 +195,10 @@ def get_max_internvl_image_tokens(ctx: InputContext):
return num_patches * max_dynamic_patch


def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
def input_processor_for_internvl(ctx: InputContext,
llm_inputs: LLMInputs,
*,
max_dynamic_patch: Optional[int] = None):
multi_modal_data = llm_inputs.get("multi_modal_data")
if multi_modal_data is None or "image" not in multi_modal_data:
return llm_inputs
Expand All @@ -209,7 +215,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):

image_data = multi_modal_data["image"]
min_num = hf_config.min_dynamic_patch
max_num = hf_config.max_dynamic_patch
max_num = max_dynamic_patch or hf_config.max_dynamic_patch
use_thumbnail = hf_config.use_thumbnail
if isinstance(image_data, Image.Image):
width, height = image_data.size
Expand Down Expand Up @@ -253,12 +259,15 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
multi_modal_data=multi_modal_data)


def input_mapper_for_internvl(ctx: InputContext, data: object):
def input_mapper_for_internvl(ctx: InputContext,
data: object,
*,
max_dynamic_patch: Optional[int] = None):
hf_config = ctx.get_hf_config()

use_thumbnail = hf_config.use_thumbnail
min_num = hf_config.min_dynamic_patch
max_num = hf_config.max_dynamic_patch
max_num = max_dynamic_patch or hf_config.max_dynamic_patch
Copy link
Member

@DarkLight1337 DarkLight1337 Sep 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe instead resolve the default for `max_dynamic_patch` in the input processor, input mapper, and dummy data function, and require it to be an integer in `get_max_internvl_image_tokens`?

image_size = hf_config.vision_config.image_size

if isinstance(data, Image.Image):
Expand Down Expand Up @@ -292,8 +301,11 @@ def input_mapper_for_internvl(ctx: InputContext, data: object):
})


def dummy_data_for_internvl(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
def dummy_data_for_internvl(ctx: InputContext,
seq_len: int,
mm_counts: Mapping[str, int],
*,
max_dynamic_patch: Optional[int] = None):
num_images = mm_counts["image"]

image_feature_size = get_max_internvl_image_tokens(ctx)
Expand All @@ -315,7 +327,7 @@ def dummy_data_for_internvl(ctx: InputContext, seq_len: int,

image_size = vision_config.image_size
min_num = hf_config.min_dynamic_patch
max_num = hf_config.max_dynamic_patch
max_num = max_dynamic_patch or hf_config.max_dynamic_patch
max_image_width = max_num * image_size
max_image_height = min_num * image_size

Expand Down Expand Up @@ -470,7 +482,6 @@ def _process_image_input(
self,
image_input: InternVLImageInputs,
) -> torch.Tensor:

if image_input["type"] == "image_embeds":
return image_input["data"]

Expand Down
Loading