Spaces: Running on Zero
update
eagle_vl/serve/inference.py
CHANGED
@@ -22,7 +22,7 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -34,10 +34,10 @@
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
@@ -45,7 +45,7 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -57,10 +57,10 @@
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
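For reference, a minimal sketch of what the patched loader looks like after this commit. The lines between the two hunks are not shown in the diff, so the AutoModel class, the config=config argument to it, and the transformers version (the token= keyword needs >= 4.32, attn_implementation needs >= 4.36) are assumptions, not something the diff confirms:

import os

import torch
from transformers import AutoConfig, AutoModel, AutoProcessor


def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
    # Read the Hub token from the environment (e.g. a Space secret);
    # from_pretrained falls back to anonymous requests when token is None.
    token = os.environ.get("HF_TOKEN")

    # hotfix the model to use flash attention 2: force the override on the
    # top-level config and on both submodule configs
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"

    # AutoModel and config=config are assumptions: the call site on the
    # elided lines 29-33 does not appear in the diff.
    model = AutoModel.from_pretrained(
        model_path,
        config=config,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        token=token,
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)

    return model, processor

The substance of the commit is the three token=token arguments in each function: without them, every from_pretrained call hits the Hub anonymously, which fails on gated or private repos even when HF_TOKEN is set in the environment. load_model_from_eagle receives the identical fix with model_path = "NVEagle/Eagle2.5-VL-8B-Preview".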