Spaces:
Running on Zero
Delay Model Loading Until Inside a GPU Context
Browse files
app.py
CHANGED
@@ -5,29 +5,20 @@ from transformers import CLIPProcessor, CLIPModel
|
|
5 |
|
6 |
# Load the CLIP model and processor on the CPU initially
|
7 |
model_name = "openai/clip-vit-base-patch32"
|
8 |
-
model = CLIPModel.from_pretrained(model_name)
|
9 |
-
processor = CLIPProcessor.from_pretrained(model_name)
|
10 |
|
11 |
@spaces.GPU
|
12 |
def clip_similarity(image, text):
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
"""
|
17 |
-
# Create a torch device for cuda
|
18 |
-
device = torch.device("cuda")
|
19 |
|
20 |
-
|
21 |
model.to(device)
|
22 |
|
23 |
-
# Preprocess the inputs and move tensors to GPU
|
24 |
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
|
25 |
-
inputs = {
|
26 |
|
27 |
-
# Run inference
|
28 |
outputs = model(**inputs)
|
29 |
-
|
30 |
-
# Extract similarity score (logits_per_image): higher value indicates better matching
|
31 |
similarity_score = outputs.logits_per_image.detach().cpu().numpy()[0]
|
32 |
return float(similarity_score)
|
33 |
|
|
|
# Checkpoint identifier only — the heavy from_pretrained() calls are deferred
# until the first call inside the @spaces.GPU context (ZeroGPU requirement:
# avoid touching CUDA / doing slow loads at import time).
model_name = "openai/clip-vit-base-patch32"


@spaces.GPU
def clip_similarity(image, text):
    """Score how well *text* matches *image* with CLIP.

    Args:
        image: a PIL image (anything CLIPProcessor accepts as ``images=``).
        text: a single caption string.

    Returns:
        float: CLIP's image-text logit (``logits_per_image``); higher means
        a better match.
    """
    # Lazy one-time load, cached as attributes on the function object so
    # repeated calls within the same process don't re-deserialize the
    # checkpoint. Still happens inside the GPU context, preserving the
    # "delay model loading" behavior of the original.
    # NOTE(review): if ZeroGPU ever runs each call in a fresh process this
    # cache simply misses and we fall back to the original per-call load.
    if not hasattr(clip_similarity, "_model"):
        clip_similarity._model = CLIPModel.from_pretrained(model_name)
        clip_similarity._processor = CLIPProcessor.from_pretrained(model_name)
    model = clip_similarity._model
    processor = clip_similarity._processor

    # CUDA is guaranteed available inside the @spaces.GPU context.
    device = torch.device("cuda")
    model.to(device)

    # Preprocess the inputs and move every tensor to the GPU.
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference.
    outputs = model(**inputs)

    # logits_per_image is (num_images, num_texts); with one image and one
    # caption, [0] yields the single similarity row. Detach + move to CPU
    # before converting to a plain Python float.
    similarity_score = outputs.logits_per_image.detach().cpu().numpy()[0]
    return float(similarity_score)