Spaces: Running on Zero
update
eagle_vl/serve/inference.py
CHANGED
@@ -22,7 +22,7 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -34,10 +34,10 @@
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
@@ -45,7 +45,7 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -57,10 +57,10 @@
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
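For reference, a minimal sketch of what the patched loader looks like after this commit. The lines between the two hunks are not shown in the diff, so the AutoModel class, the config=config argument to it, and the transformers version (the token= keyword needs >= 4.32, attn_implementation needs >= 4.36) are assumptions, not something the diff confirms:

import os

import torch
from transformers import AutoConfig, AutoModel, AutoProcessor


def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
    # Read the Hub token from the environment (e.g. a Space secret);
    # from_pretrained falls back to anonymous requests when token is None.
    token = os.environ.get("HF_TOKEN")

    # hotfix the model to use flash attention 2: force the override on the
    # top-level config and on both submodule configs
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"

    # AutoModel and config=config are assumptions: the call site on the
    # elided lines 29-33 does not appear in the diff.
    model = AutoModel.from_pretrained(
        model_path,
        config=config,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        token=token,
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)

    return model, processor

The substance of the commit is the three token=token arguments in each function: without them, every from_pretrained call hits the Hub anonymously, which fails on gated or private repos even when HF_TOKEN is set in the environment. load_model_from_eagle receives the identical fix with model_path = "NVEagle/Eagle2.5-VL-8B-Preview".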