Zhiding committed
Commit e49e746 · Parent: 79b84d6
Files changed (1)
eagle_vl/serve/inference.py  +6 -6
eagle_vl/serve/inference.py CHANGED
@@ -22,7 +22,7 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, use_auth_token=token)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -34,10 +34,10 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-        use_auth_token=token
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, use_auth_token=token)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
 
@@ -45,7 +45,7 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
 
     token = os.environ.get("HF_TOKEN")
     # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, use_auth_token=token)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
     config._attn_implementation = "flash_attention_2"
     config.vision_config._attn_implementation = "flash_attention_2"
     config.text_config._attn_implementation = "flash_attention_2"
@@ -57,10 +57,10 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
         trust_remote_code=True,
         torch_dtype=torch.bfloat16,
         attn_implementation="flash_attention_2",
-        use_auth_token=token
+        token=token
     )
     model.to("cuda")
-    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, use_auth_token=token)
+    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, token=token)
 
     return model, processor
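For background: `use_auth_token` is deprecated in recent `transformers` releases (scheduled for removal in v5) in favor of `token`, which is why this commit swaps the keyword at all six call sites; `AutoConfig.from_pretrained`, `AutoModel.from_pretrained`, and `AutoProcessor.from_pretrained` all accept the new name. A minimal usage sketch of the fixed loaders follows. The import path and function names come from the diff; the CUDA device, gated-repo access, and the placeholder token value are assumptions, not part of this commit.

# Minimal sketch: exercising the loaders fixed by this commit. Assumes a
# CUDA device, access to the (possibly gated) checkpoints, and a
# transformers version new enough to accept `token=` in place of the
# deprecated `use_auth_token=`.
import os

os.environ.setdefault("HF_TOKEN", "hf_...")  # placeholder; set a real token

from eagle_vl.serve.inference import load_model_from_eagle, load_model_from_nv

model, processor = load_model_from_nv()       # nvidia/Eagle-2.5-8B
# model, processor = load_model_from_eagle()  # NVEagle/Eagle2.5-VL-8B-Preview

On current 4.x releases the old keyword still works but emits a FutureWarning, so the rename is forward-compatible cleanup rather than a behavior change.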