| | """OpenCUA-7B EXL2 — Standalone Visual Inference (Streaming) |
| | Tested On exllamav2 0.3.2, python3.12.9, torch 2.6.0+cu126 |
| | - Applies a minimal, safe monkey-patch so ExLlamaV2 knows how to wire the |
| | OpenCUA EXL2 architecture (Qwen2.5-style vision tower + Llama-like LM). |
| | - Keeps vision RoPE active (DO NOT neutralize positional embeddings). |
| | - Chooses a valid 1-D RoPE style if available (LLAMA > HF > default). |
| | - Loads model + vision tower, extracts EXL2 image embeddings. |
| | - Builds a chat-style prompt with the image alias and user instruction. |
| | - Streams tokens using ExLlamaV2DynamicGenerator / DynamicJob.""" |
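
# Dependencies (assumed from the imports below): exllamav2 >= 0.3.2 with a CUDA
# build of torch, plus pillow and requests for image handling.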

MODEL_PATH = r"C:\Users\44741\Desktop\OpenCUA-7B-exl2"
IMAGE_URL = "http://images.cocodataset.org/val2017/000000001584.jpg"
INSTRUCTION = "Describe in detail everything you can see."
MAX_NEW_TOKENS = 600

import traceback

import torch
import requests
from PIL import Image

from exllamav2.architecture import (
    ExLlamaV2ArchParams,
    RopeStyle,
    layer_keys_llama_norms,
    layer_keys_llama_attn,
    layer_keys_llama_mlp,
    expect_keys_llama,
)
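
# ExLlamaV2 has no built-in entry for "OpenCUAForConditionalGeneration", so the
# patch below extends ExLlamaV2ArchParams.__init__ at runtime. It only needs to
# be in place before ExLlamaV2Config is constructed in main().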

print(" -- Applying OpenCUA architecture monkey-patch for inference...")

# Keep a handle on the stock initializer so the patch can chain to it.
_original_arch_init = ExLlamaV2ArchParams.__init__


def _patched_arch_init(self, arch_string, read_config):
    # Run the stock initializer first, then overlay the OpenCUA-specific wiring.
    _original_arch_init(self, arch_string, read_config)

    if arch_string == "OpenCUAForConditionalGeneration":
        print(" -- Found OpenCUA architecture, applying keys & RoPE settings...")

        # Language model: Llama-style decoder layers under the
        # "language_model." prefix, with biased QKV projections (Qwen2-style).
        self.lm_prefix = "language_model."
        self.lm.layer_keys += (
            layer_keys_llama_norms + layer_keys_llama_attn + layer_keys_llama_mlp
        )
        self.lm.expect_keys += expect_keys_llama
        self.lm.attention_bias_qkv = True
        self.lm.supports_tp = True

        # Vision tower: Qwen2.5-style ViT. Map ExLlamaV2's expected tensor keys
        # onto the OpenCUA checkpoint layout.
        self.vt_prefix = "vision_tower."
        read_config["vision_config"].update({"model_type": "qwen2.5"})
        self.vt.keys.update({
            "fused_qkv": ".attn.qkv",
            "attn_o": ".attn.proj",
            "mlp_gate": ".mlp.gate_proj",
            "mlp_up": ".mlp.up_proj",
            "mlp_down": ".mlp.down_proj",
            "norm_1": ".norm1",
            "norm_2": ".norm2",
            "layers": "blocks",
            "patch_conv": "patch_embed.proj",
        })
        self.vt.mlp_gate = True
        self.vt.mlp_act_func = "silu"
        self.vt.norm = "rmsnorm"
        self.vt.mlp_bias = True
        self.vt.attention_bias_qkv = True
        self.vt.attention_bias_o = True
        self.vt.vision_input_norm = False
        self.vt.vision_conv3d = True

        # Keep vision RoPE active; prefer a 1-D style when the enum offers one.
        # Recent exllamav2 builds typically expose NONE / GPTJ / NEOX here, in
        # which case both checks fail and the default chosen by the stock
        # initializer is kept.
        try:
            if hasattr(RopeStyle, "LLAMA"):
                self.vt.rope_style = RopeStyle.LLAMA
            elif hasattr(RopeStyle, "HF"):
                self.vt.rope_style = RopeStyle.HF
        except Exception:
            pass

        # Multimodal projector: the merger MLP inside the vision tower that
        # maps vision features into the LM embedding space.
        self.vt.mlp_merger = True
        self.mmp_prefix = "vision_tower.merger."
        self.mmp.keys.update({
            "mlp_gate": None,
            "mlp_up": "mlp.0",
            "mlp_down": "mlp.2",
            "norm_2": "ln_q",
        })
        self.mmp.mlp_gate = False
        self.mmp.mlp_act_func = "gelu"
        self.mmp.mlp_bias = True
        self.mmp.norm = "layernorm"


ExLlamaV2ArchParams.__init__ = _patched_arch_init
print(" -- Patch applied successfully.")
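
# Rebinding the method on the class itself means the patch takes effect for the
# ExLlamaV2Config constructed in main(), even though parts of exllamav2 were
# already imported above.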

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
    ExLlamaV2DynamicJob,
)


def main():
    try:
        print(" -- Loading model/config...")
        config = ExLlamaV2Config(MODEL_PATH)

        model = ExLlamaV2(config)
        # lazy=True defers cache allocation so load_autosplit() can place the
        # layers and cache across the available GPUs as it loads.
        cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(cache)

        tokenizer = ExLlamaV2Tokenizer(config)

        print(" -- Loading vision tower...")
        vision_tower = ExLlamaV2VisionTower(config)
        vision_tower.load()
        try:
            print(f"[Debug] vt.rope_style = {getattr(vision_tower, 'rope_style', 'n/a')}")
        except Exception:
            pass

        generator = ExLlamaV2DynamicGenerator(model, cache, tokenizer)
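
        # Without flash-attn installed, the dynamic generator may fall back to
        # running without paged attention; for this single-job script that is fine.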

        print(f" -- Downloading test image from: {IMAGE_URL}")
        response = requests.get(IMAGE_URL, stream=True, timeout=30)
        response.raise_for_status()
        image = Image.open(response.raw).convert("RGB")
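
        # A local screenshot works the same way (hypothetical path):
        #   image = Image.open(r"C:\path\to\screenshot.png").convert("RGB")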

        print(" -- Processing image and building prompt...")
        image_embeddings = vision_tower.get_image_embeddings(model, tokenizer, image)

        # The embeddings object carries a text alias: a placeholder string that
        # tokenizer.encode() later resolves to the image-embedding token range.
        placeholders = image_embeddings.text_alias
        prompt = (
            f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n{placeholders}\n{INSTRUCTION}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )

        print("\n--- Prompt Sent to Model ---")
        print(prompt.replace(image_embeddings.text_alias, "<image>"))
        print("----------------------------\n")

        print("--- Model Output (streaming) ---")
        gen_settings = ExLlamaV2Sampler.Settings.greedy()
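
        # Greedy decoding keeps this test deterministic. For sampled output, an
        # ExLlamaV2Sampler.Settings() instance with temperature / top_k / top_p
        # set on it is the usual alternative.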

        # Passing the embeddings lets the tokenizer swap the alias string for
        # the reserved token IDs that index into the image embeddings.
        input_ids = tokenizer.encode(
            prompt,
            add_bos=True,
            encode_special_tokens=True,
            embeddings=[image_embeddings],
        )

        job = ExLlamaV2DynamicJob(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            decode_special_tokens=False,
            gen_settings=gen_settings,
            embeddings=[image_embeddings],
        )

        generator.enqueue(job)
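
        # If output tends to run past the reply, a stop condition can be added
        # to the job, e.g. stop_conditions=[tokenizer.single_id("<|im_end|>")]
        # (assuming that token exists in this tokenizer's vocabulary).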

        final_text = []
        try:
            # iterate() advances all queued jobs one step and returns a list of
            # per-job result dicts; streamed text arrives under the "text" key.
            while generator.num_remaining_jobs():
                results = generator.iterate()
                for r in results:
                    chunk = r.get("text", "")
                    if chunk:
                        print(chunk, end="", flush=True)
                        final_text.append(chunk)
        finally:
            print("\n\n--- Test Complete ---")
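
        # On completion a result dict also reports "eos" (with an "eos_reason"
        # such as hitting max_new_tokens), which a multi-job loop would check.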

        # Keep the full transcript for any downstream post-processing.
        full_output = "".join(final_text)
        return full_output

    except Exception as e:
        print(f"\nAn error occurred: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    # TF32 matmuls are faster on Ampere+ GPUs at a small precision cost; the
    # guard covers torch builds without CUDA support.
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
    except Exception:
        pass

    main()