Spaces:
Running
on
Zero
Running
on
Zero
update
Browse files
eagle_vl/serve/inference.py
CHANGED
@@ -44,6 +44,8 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
|
|
44 |
def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
|
45 |
|
46 |
token = os.environ.get("HF_TOKEN")
|
|
|
|
|
47 |
# hotfix the model to use flash attention 2
|
48 |
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
|
49 |
config._attn_implementation = "flash_attention_2"
|
@@ -51,7 +53,6 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
|
|
51 |
config.text_config._attn_implementation = "flash_attention_2"
|
52 |
print("Successfully set the attn_implementation to flash_attention_2")
|
53 |
|
54 |
-
logger.info(f"token = {token[:4]}***{token[-2:]}")
|
55 |
model = AutoModel.from_pretrained(
|
56 |
model_path,
|
57 |
trust_remote_code=True,
|
|
|
44 |
def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
|
45 |
|
46 |
token = os.environ.get("HF_TOKEN")
|
47 |
+
logger.info(f"token = {token[:4]}***{token[-2:]}")
|
48 |
+
|
49 |
# hotfix the model to use flash attention 2
|
50 |
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
|
51 |
config._attn_implementation = "flash_attention_2"
|
|
|
53 |
config.text_config._attn_implementation = "flash_attention_2"
|
54 |
print("Successfully set the attn_implementation to flash_attention_2")
|
55 |
|
|
|
56 |
model = AutoModel.from_pretrained(
|
57 |
model_path,
|
58 |
trust_remote_code=True,
|