mknolan committed
Commit ce32a95 · verified · Parent: da1f9eb

Upload InternVL2 implementation

Files changed (1)
  1. app_internvl2.py +118 -216
app_internvl2.py CHANGED
@@ -8,11 +8,6 @@ import warnings
 import stat
 import subprocess
 import sys
-import asyncio
-import nest_asyncio
-
-# Apply nest_asyncio to allow nested event loops
-nest_asyncio.apply()
 
 # Set environment variables
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
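For context on the deleted import pair: nest_asyncio monkey-patches asyncio so that an already-running event loop can be re-entered, which is the workaround the old code leaned on for running lmdeploy's async internals inside Gradio callbacks. A minimal illustration of what the patch enables (a standalone sketch, not code from this repo):

```python
import asyncio

import nest_asyncio

nest_asyncio.apply()  # patch the current loop so run_until_complete() can nest

async def inner():
    return "ok"

async def outer():
    # Re-entering the running loop; without nest_asyncio.apply() this raises
    # "RuntimeError: This event loop is already running".
    return asyncio.get_event_loop().run_until_complete(inner())

loop = asyncio.get_event_loop()
print(loop.run_until_complete(outer()))  # -> ok
```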
@@ -102,8 +97,7 @@ def check_gpu_availability():
     return False
 
 # Global variables
-internvl2_pipeline = None
-MODEL_LOADED = False
+internvl2_model = None
 USE_GPU = check_gpu_availability()
 
 if USE_GPU:
@@ -111,209 +105,119 @@ if USE_GPU:
 else:
     print("WARNING: GPU is not available or not working properly. This application requires GPU acceleration.")
 
-# Check if lmdeploy is available and try to import
+# ALTERNATIVE MODEL: Let's try a simpler vision model as backup
+try:
+    from transformers import BlipProcessor, BlipForConditionalGeneration
+    HAS_BLIP = True
+    blip_processor = None
+    blip_model = None
+    print("Successfully imported BLIP model")
+except ImportError:
+    HAS_BLIP = False
+    print("BLIP model not available, will try InternVL2")
+
+# Try importing lmdeploy for InternVL2
 try:
     from lmdeploy import pipeline, TurbomindEngineConfig
-    LMDEPLOY_AVAILABLE = True
+    HAS_LMDEPLOY = True
     print("Successfully imported lmdeploy")
 except ImportError as e:
-    LMDEPLOY_AVAILABLE = False
-    print(f"lmdeploy import failed: {str(e)}. Will use a placeholder for demos.")
+    HAS_LMDEPLOY = False
+    print(f"lmdeploy import failed: {str(e)}. Will try backup model.")
 
-# Model configuration
-MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ"  # 4-bit quantized model
-
-def load_internvl2_model():
-    """Load the InternVL2 model using lmdeploy"""
-    global internvl2_pipeline, MODEL_LOADED
-
-    # If already loaded, return
-    if internvl2_pipeline is not None:
-        return True
+# Try to load the appropriate model
+def load_model():
+    global internvl2_model, blip_processor, blip_model
 
-    # If lmdeploy is not available, we'll use a demo placeholder
-    if not LMDEPLOY_AVAILABLE:
-        print("lmdeploy not available. Using demo placeholder.")
-        MODEL_LOADED = False
-        return False
-
-    # Check if GPU is available
     if not USE_GPU:
-        print("Cannot load InternVL2 model without GPU acceleration.")
-        MODEL_LOADED = False
-        return False
-
-    print("Loading InternVL2 model...")
-    try:
-        # Force synchronous execution for everything
-        import os
-        # Set environment variables to force synchronous behavior
-        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-        # Disable asyncio in lmdeploy
-        os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
-
-        # Configure for AWQ quantized model
-        backend_config = TurbomindEngineConfig(
-            model_format='awq',
-            session_len=2048  # Explicitly set session length
-        )
-
-        # Create a synchronous pipeline to avoid asyncio issues
-        # Explicitly set all parameters that might default to async behavior
-        internvl2_pipeline = pipeline(
-            MODEL_ID,
-            backend_config=backend_config,
-            log_level='INFO',
-            model_name_or_path=None,
-            backend_name="turbomind",
-            stream=False,  # Important: disable streaming
-            tensor_parallel=1,  # Use single GPU to avoid distributed processing
-        )
-
-        print("InternVL2 model loaded successfully!")
-        MODEL_LOADED = True
-        return True
-    except Exception as e:
-        print(f"Error loading InternVL2 model: {str(e)}")
-        if "CUDA out of memory" in str(e):
-            print("Not enough GPU memory for the model")
-        elif "Found no NVIDIA driver" in str(e):
-            print("NVIDIA GPU driver not found or not properly configured")
-        MODEL_LOADED = False
+        print("Cannot load models without GPU acceleration.")
         return False
+
+    # First try to load InternVL2 if lmdeploy is available
+    if HAS_LMDEPLOY:
+        try:
+            print("Attempting to load InternVL2 model...")
+            # Configure for AWQ quantized model
+            backend_config = TurbomindEngineConfig(
+                model_format='awq',
+                session_len=2048  # Explicitly set session length
+            )
+
+            # Set to non-streaming mode
+            internvl2_model = pipeline(
+                "OpenGVLab/InternVL2-40B-AWQ",
+                backend_config=backend_config,
+                model_name_or_path=None,
+                backend_name="turbomind",
+                stream=False,  # Disable streaming
+            )
+
+            print("InternVL2 model loaded successfully!")
+            return True
+        except Exception as e:
+            print(f"Failed to load InternVL2: {str(e)}")
+            internvl2_model = None
+
+    # If InternVL2 failed or lmdeploy not available, try BLIP
+    if HAS_BLIP:
+        try:
+            print("Falling back to BLIP model...")
+            blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+            blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")
+            print("BLIP model loaded successfully!")
+            return True
+        except Exception as e:
+            print(f"Failed to load BLIP: {str(e)}")
+            blip_processor = None
+            blip_model = None
+
+    print("Could not load any model")
+    return False
+
+# Try to load a model at startup
+MODEL_LOADED = load_model()
+WHICH_MODEL = "InternVL2" if internvl2_model is not None else "BLIP" if blip_model is not None else "None"
 
 def analyze_image(image, prompt):
-    """Analyze the image using InternVL2 model"""
+    """Analyze the image using available model"""
+    if not MODEL_LOADED:
+        return "No model could be loaded. Please check the logs for details."
+
+    if not USE_GPU:
+        return "ERROR: This application requires GPU acceleration. No GPU detected."
+
     try:
-        start_time = time.time()
-
-        # Skip model loading if lmdeploy is not available
-        if not LMDEPLOY_AVAILABLE:
-            return ("This is a demo placeholder. The actual model couldn't be loaded because lmdeploy "
-                    "is not properly installed. Check your installation and dependencies.")
-
-        # Check for GPU
-        if not USE_GPU:
-            return ("ERROR: This application requires a GPU to run InternVL2. "
-                    "The NVIDIA driver was not detected on this system. "
-                    "Please make sure this Space is using a GPU-enabled instance and that the GPU is correctly initialized.")
-
-        # Make sure the model is loaded
-        if not load_internvl2_model():
-            return "Couldn't load InternVL2 model. See logs for details."
-
-        # Convert numpy array to PIL Image
+        # Convert image to right format if needed
         if isinstance(image, np.ndarray):
-            image_pil = Image.fromarray(image).convert('RGB')
+            pil_image = Image.fromarray(image).convert('RGB')
         else:
-            # If somehow it's already a PIL Image
-            image_pil = image.convert('RGB')
-
-        # We'll use a completely different approach - multiprocessing
-        # This runs the model in a separate process, avoiding any event loop conflicts
-        import multiprocessing as mp
-
-        # Define a function to run in a separate process
-        def run_in_process(prompt, image_path, result_queue):
+            pil_image = image.convert('RGB')
+
+        # If we have InternVL2 loaded, use it
+        if internvl2_model is not None:
             try:
-                # Set environment variables in the subprocess
-                import os
-                os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-                os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
-
-                # Import libraries inside the process
-                from lmdeploy import pipeline, TurbomindEngineConfig
-
-                # Save the image to a temporary file to pass between processes
-                import tempfile
-                import torch
-
-                # Check GPU in subprocess
-                print(f"Subprocess GPU available: {torch.cuda.is_available()}")
-
-                # Configure for AWQ quantized model
-                backend_config = TurbomindEngineConfig(
-                    model_format='awq',
-                    session_len=2048
-                )
-
-                # Create new pipeline in the subprocess
-                model_pipeline = pipeline(
-                    MODEL_ID,
-                    backend_config=backend_config,
-                    log_level='INFO',
-                    model_name_or_path=None,
-                    backend_name="turbomind",
-                    stream=False,
-                    tensor_parallel=1,
-                )
-
-                # Load the image in the subprocess
-                from PIL import Image
-                image = Image.open(image_path).convert('RGB')
-
-                # Run inference
-                response = model_pipeline((prompt, image))
+                print("Running inference with InternVL2...")
+                response = internvl2_model((prompt, pil_image))
                 result = response.text if hasattr(response, "text") else str(response)
-
-                # Put the result in the queue
-                result_queue.put(("success", result))
-
+                return f"[InternVL2] {result}"
+            except Exception as e:
+                print(f"Error with InternVL2: {str(e)}")
+                # If InternVL2 fails, fall back to BLIP if available
+
+        # If we have BLIP loaded, use it
+        if blip_model is not None and blip_processor is not None:
+            try:
+                print("Running inference with BLIP...")
+                # BLIP doesn't use prompts the same way, simplify
+                inputs = blip_processor(pil_image, return_tensors="pt").to("cuda")
+                out = blip_model.generate(**inputs, max_new_tokens=100)
+                result = blip_processor.decode(out[0], skip_special_tokens=True)
+                return f"[BLIP] {result} (Note: Custom prompts not supported with BLIP fallback model)"
             except Exception as e:
-                import traceback
-                error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}"
-                result_queue.put(("error", error_msg))
+                print(f"Error with BLIP: {str(e)}")
 
-        # Create a temporary file for the image
-        import tempfile
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_path = temp_file.name
-            image_pil.save(temp_path)
+        return "No model was able to analyze the image. See logs for details."
 
-        try:
-            # Create a process-safe queue
-            result_queue = mp.Queue()
-
-            # Start the process
-            print("Starting model inference in a separate process")
-            process = mp.Process(
-                target=run_in_process,
-                args=(prompt, temp_path, result_queue)
-            )
-
-            # Make it a daemon so it terminates when the main process ends
-            process.daemon = True
-            process.start()
-
-            # Wait for the process to complete (with timeout)
-            process.join(timeout=180)  # 3 minute timeout
-
-            # Delete the temporary file
-            try:
-                os.unlink(temp_path)
-            except:
-                pass
-
-            if process.is_alive():
-                # Terminate the process if it's still running after timeout
-                process.terminate()
-                return "Model inference timed out after 180 seconds. The model might be too slow on this hardware."
-
-            # Get the result from the queue (non-blocking to avoid hanging)
-            if not result_queue.empty():
-                status, result = result_queue.get(block=False)
-                if status == "error":
-                    return f"Error in model inference: {result}"
-                else:
-                    elapsed_time = time.time() - start_time
-                    return result
-            else:
-                return "Unknown error: Model inference process completed but did not produce a result"
-
-        except Exception as e:
-            print(f"Error in multiprocessing: {str(e)}")
-            return f"Error setting up multiprocessing: {str(e)}"
-
     except Exception as e:
         print(f"Error in image analysis: {str(e)}")
         # Try to clean up memory in case of error
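One hazard in the new fallback wiring: if the `transformers` import fails, `blip_processor` and `blip_model` are never bound, yet `WHICH_MODEL` and `analyze_image` later evaluate `blip_model is not None`, so the module would die with a `NameError` instead of quietly skipping the fallback. A minimal guard under the diff's own names (a suggested placement, not part of the commit):

```python
# Sketch: bind the fallback globals before attempting the import, so the
# later "blip_model is not None" checks degrade gracefully on ImportError.
blip_processor = None
blip_model = None
try:
    from transformers import BlipProcessor, BlipForConditionalGeneration
    HAS_BLIP = True
except ImportError:
    HAS_BLIP = False  # blip_processor/blip_model simply stay None
```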
@@ -351,11 +255,13 @@ def process_image(image, analysis_type="general"):
 # Define the Gradio interface
 def create_interface():
     with gr.Blocks(title="Image Analysis with InternVL2") as demo:
-        gr.Markdown("# Image Analysis with InternVL2-40B")
+        gr.Markdown(f"# Image Analysis with {WHICH_MODEL}")
 
         # System diagnostics
         system_info = f"""
         ## System Diagnostics:
+        - Model Used: {WHICH_MODEL}
+        - Model Loaded: {MODEL_LOADED}
         - PyTorch Version: {torch.__version__}
         - CUDA Available: {torch.cuda.is_available()}
         - GPU Working: {USE_GPU}
@@ -363,14 +269,14 @@ def create_interface():
         """
 
         gr.Markdown(system_info)
-        gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")
+        gr.Markdown(f"Upload an image to analyze it using the {WHICH_MODEL} model.")
 
         # Show warnings based on system status
-        if not LMDEPLOY_AVAILABLE:
-            gr.Markdown("⚠️ **WARNING**: lmdeploy is not properly installed. This demo will not function correctly.", elem_classes=["warning-message"])
+        if not MODEL_LOADED:
+            gr.Markdown("⚠️ **WARNING**: No model could be loaded. This demo will not function correctly.", elem_classes=["warning-message"])
 
         if not USE_GPU:
-            gr.Markdown("🚫 **ERROR**: NVIDIA GPU not detected. This application requires GPU acceleration to run InternVL2 model.", elem_classes=["error-message"])
+            gr.Markdown("🚫 **ERROR**: NVIDIA GPU not detected. This application requires GPU acceleration.", elem_classes=["error-message"])
 
         with gr.Row():
             with gr.Column(scale=1):
@@ -382,22 +288,34 @@
             )
             submit_btn = gr.Button("Analyze Image")
 
-            # Disable button if GPU is not available
-            if not USE_GPU:
+            # Disable button if GPU is not available or no model loaded
+            if not USE_GPU or not MODEL_LOADED:
                 submit_btn.interactive = False
 
         with gr.Column(scale=2):
             output_text = gr.Textbox(label="Analysis Result", lines=20)
             if not USE_GPU:
-                output_text.value = f"""ERROR: NVIDIA GPU driver not detected. This application requires GPU acceleration to run the InternVL2 model.
+                output_text.value = f"""ERROR: NVIDIA GPU driver not detected. This application requires GPU acceleration.
 
 Diagnostics:
+- Model Used: {WHICH_MODEL}
 - PyTorch Version: {torch.__version__}
 - CUDA Available via PyTorch: {torch.cuda.is_available()}
 - nvidia-smi Available: {nvidia_smi_available}
 - GPU Working: {USE_GPU}
 
 Please ensure this Space is using a GPU-enabled instance and that the GPU is correctly initialized."""
+            elif not MODEL_LOADED:
+                output_text.value = f"""ERROR: No model could be loaded.
+
+Diagnostics:
+- Model Used: {WHICH_MODEL}
+- PyTorch Version: {torch.__version__}
+- CUDA Available via PyTorch: {torch.cuda.is_available()}
+- nvidia-smi Available: {nvidia_smi_available}
+- GPU Working: {USE_GPU}
+
+Please check the logs for more details."""
 
         submit_btn.click(
             fn=process_image,
@@ -424,22 +342,6 @@ Please ensure this Space is using a GPU-enabled instance and that the GPU is cor
 
     If you're running this on Hugging Face Spaces, make sure to select a GPU-enabled hardware type.
     """)
-
-        # Examples
-        try:
-            gr.Examples(
-                examples=[
-                    ["data_temp/page_2.png", "general"],
-                    ["data_temp/page_2.png", "text"],
-                    ["data_temp/page_2.png", "chart"]
-                ],
-                inputs=[input_image, analysis_type],
-                outputs=output_text,
-                fn=process_image,
-                cache_examples=True
-            )
-        except Exception as e:
-            print(f"Warning: Could not load examples: {str(e)}")
 
     return demo
 
@@ -448,5 +350,5 @@ if __name__ == "__main__":
     # Create the Gradio interface
     demo = create_interface()
 
-    # Launch the interface (removed incompatible parameters)
+    # Launch the interface
     demo.launch(share=False, server_name="0.0.0.0")
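A closing note on the InternVL2 path: lmdeploy's documented vision-language usage is to build the pipeline from the model id plus a `TurbomindEngineConfig` and call it with a `(prompt, image)` tuple; the extra keywords in the diff (`model_name_or_path`, `backend_name`, `stream`) do not appear in the documented `pipeline()` signature of recent releases, so depending on the installed version they may be ignored or rejected. A minimal sketch of the documented call path, assuming a CUDA GPU and the AWQ checkpoint named in the diff:

```python
# Standalone InternVL2 inference sketch via lmdeploy (assumes a CUDA-capable
# GPU with enough memory for the AWQ-quantized 40B checkpoint).
from lmdeploy import pipeline, TurbomindEngineConfig
from PIL import Image

pipe = pipeline(
    "OpenGVLab/InternVL2-40B-AWQ",
    backend_config=TurbomindEngineConfig(model_format="awq", session_len=2048),
)

image = Image.open("data_temp/page_2.png").convert("RGB")  # example image from the repo
response = pipe(("Describe this image in detail.", image))
print(response.text)
```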
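On the `(Note: Custom prompts not supported with BLIP fallback model)` caveat: the same BLIP checkpoint does support a lightweight form of prompting, conditional captioning, where text passed to the processor becomes a prefix that the generated caption continues (per the `Salesforce/blip-image-captioning-base` model card). A sketch of that variant, should the fallback ever want to use the user's prompt as a prefix:

```python
# Conditional captioning with the same BLIP checkpoint the fallback loads:
# the text argument is a prefix the caption continues, not a free-form prompt.
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to("cuda")

image = Image.open("data_temp/page_2.png").convert("RGB")
inputs = processor(image, "a document page showing", return_tensors="pt").to("cuda")
out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))
```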