kimhyunwoo commited on
Commit
7ca8994
·
verified ·
1 Parent(s): 5ae42ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -138
app.py CHANGED
@@ -1,18 +1,10 @@
1
- # ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ์„ค์น˜ํ•˜๋Š” ๋ช…๋ น์–ด์ž…๋‹ˆ๋‹ค.
2
- # ์ด ๋ถ€๋ถ„์€ ์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰ ์ดˆ๋ฐ˜์— ํ•œ ๋ฒˆ ์‹คํ–‰๋ฉ๋‹ˆ๋‹ค.
3
  import os
4
- print("Installing required transformers branch...")
5
  os.system("pip install git+https://github.com/shumingma/transformers.git")
6
- print("Installation complete.")
7
 
8
- # ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋“ค์„ import ํ•ฉ๋‹ˆ๋‹ค.
9
  import threading
10
  import torch
11
  import torch._dynamo
12
- import gradio as gr
13
- import spaces # Hugging Face Spaces ๊ด€๋ จ ์œ ํ‹ธ๋ฆฌํ‹ฐ
14
-
15
- # torch._dynamo ์„ค์ • (์„ ํƒ ์‚ฌํ•ญ, ์„ฑ๋Šฅ ํ–ฅ์ƒ ์‹œ๋„)
16
  torch._dynamo.config.suppress_errors = True
17
 
18
  from transformers import (
@@ -20,37 +12,20 @@ from transformers import (
20
  AutoTokenizer,
21
  TextIteratorStreamer,
22
  )
 
 
23
 
24
- # --- ๋ชจ๋ธ ๋กœ๋“œ ---
25
- # ๋ชจ๋ธ ๊ฒฝ๋กœ ์„ค์ • (Hugging Face ๋ชจ๋ธ ID)
26
  model_id = "microsoft/bitnet-b1.58-2B-4T"
27
 
28
- # ๋ชจ๋ธ ๋กœ๋“œ ์‹œ ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€๋ฅผ ์ตœ์†Œํ™”ํ•˜๊ธฐ ์œ„ํ•ด ๋กœ๊น… ๋ ˆ๋ฒจ ์„ค์ •
29
- os.environ["TRANSFORMERS_VERBOSITY"] = "error"
30
-
31
- # AutoModelForCausalLM๊ณผ AutoTokenizer๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
32
- # trust_remote_code=True๊ฐ€ ํ•„์š”ํ•˜๋ฉฐ, device_map="auto"๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์ž๋™์œผ๋กœ ๋””๋ฐ”์ด์Šค ์„ค์ •
33
- try:
34
- print(f"๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘: {model_id}...")
35
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
36
- model = AutoModelForCausalLM.from_pretrained(
37
- model_id,
38
- torch_dtype=torch.bfloat16, # bf16 ์‚ฌ์šฉ (GPU ๊ถŒ์žฅ)
39
- device_map="auto", # ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๋””๋ฐ”์ด์Šค์— ์ž๋™์œผ๋กœ ๋ชจ๋ธ ๋ฐฐ์น˜
40
- trust_remote_code=True
41
- )
42
- print(f"๋ชจ๋ธ ๋””๋ฐ”์ด์Šค: {model.device}")
43
- print("๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ.")
44
-
45
- except Exception as e:
46
- print(f"๋ชจ๋ธ ๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
47
- tokenizer = None
48
- model = None
49
- print("๋ชจ๋ธ ๋กœ๋“œ์— ์‹คํŒจํ–ˆ์Šต๋‹ˆ๋‹ค. ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜์ด ์ œ๋Œ€๋กœ ๋™์ž‘ํ•˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
50
-
51
 
52
- # --- ํ…์ŠคํŠธ ์ƒ์„ฑ ํ•จ์ˆ˜ (Gradio ChatInterface์šฉ) ---
53
- @spaces.GPU # ์ด ํ•จ์ˆ˜๊ฐ€ GPU ์ž์›์„ ์‚ฌ์šฉํ•˜๋„๋ก ๋ช…์‹œ (Hugging Face Spaces)
54
  def respond(
55
  message: str,
56
  history: list[tuple[str, str]],
@@ -59,109 +34,98 @@ def respond(
59
  temperature: float,
60
  top_p: float,
61
  ):
62
- if model is None or tokenizer is None:
63
- yield "๋ชจ๋ธ ๋กœ๋“œ์— ์‹คํŒจํ•˜์—ฌ ํ…์ŠคํŠธ ์ƒ์„ฑ์„ ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
64
- return # ์ƒ์„ฑ๊ธฐ ํ•จ์ˆ˜์ด๋ฏ€๋กœ return ๋Œ€์‹  ๋นˆ yield ๋˜๋Š” ๊ทธ๋ƒฅ return
65
-
66
- try:
67
- # ๋ฉ”์‹œ์ง€ ํ˜•์‹์„ ๋ชจ๋ธ์˜ chat template์— ๋งž๊ฒŒ ๊ตฌ์„ฑ
68
- messages = [{"role": "system", "content": system_message}]
69
- for user_msg, bot_msg in history:
70
- if user_msg:
71
- messages.append({"role": "user", "content": user_msg})
72
- if bot_msg:
73
- messages.append({"role": "assistant", "content": bot_msg})
74
- messages.append({"role": "user", "content": message})
75
-
76
- prompt = tokenizer.apply_chat_template(
77
- messages, tokenize=False, add_generation_prompt=True
78
- )
79
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
80
-
81
- # ํ…์ŠคํŠธ ์ŠคํŠธ๋ฆฌ๋ฐ์„ ์œ„ํ•œ streamer ์„ค์ •
82
- streamer = TextIteratorStreamer(
83
- tokenizer, skip_prompt=True, skip_special_tokens=True
84
- )
85
- generate_kwargs = dict(
86
- **inputs,
87
- streamer=streamer,
88
- max_new_tokens=max_tokens,
89
- temperature=temperature,
90
- top_p=top_p,
91
- do_sample=True,
92
- pad_token_id=tokenizer.eos_token_id # ํŒจ๋”ฉ ํ† ํฐ ID ์„ค์ •
93
- )
94
-
95
- # ๋ชจ๋ธ ์ƒ์„ฑ์„ ๋ณ„๋„์˜ ์Šค๋ ˆ๋“œ์—์„œ ์‹คํ–‰
96
- thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
97
- thread.start()
98
-
99
- # ์ŠคํŠธ๋ฆฌ๋จธ์—์„œ ์ƒ์„ฑ๋œ ํ…์ŠคํŠธ๋ฅผ ์ฝ์–ด์™€ yield
100
- response = ""
101
- for new_text in streamer:
102
- response += new_text
103
- yield response # ์‹ค์‹œ๊ฐ„์œผ๋กœ ์‘๋‹ต์„ Gradio ์ธํ„ฐํŽ˜์ด์Šค๋กœ ์ „๋‹ฌ
104
-
105
- except Exception as e:
106
- yield f"ํ…์ŠคํŠธ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}"
107
- # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์Šค๋ ˆ๋“œ ์ฒ˜๋ฆฌ ๋กœ์ง ์ถ”๊ฐ€ ๊ณ ๋ ค ํ•„์š” (์„ ํƒ ์‚ฌํ•ญ)
108
-
109
 
110
- # --- Gradio ์ธํ„ฐํŽ˜์ด์Šค ์„ค์ • ---
111
- if model is not None and tokenizer is not None:
112
- demo = gr.ChatInterface(
113
- fn=respond,
114
- title="Bitnet-b1.58-2B-4T Chatbot",
115
- description="Microsoft Bitnet-b1.58-2B-4T ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•œ ์ฑ„ํŒ… ๋ฐ๋ชจ์ž…๋‹ˆ๋‹ค.",
116
- examples=[
117
- [
118
- "์•ˆ๋…•ํ•˜์„ธ์š”! ์ž๊ธฐ์†Œ๊ฐœ ํ•ด์ฃผ์„ธ์š”.",
119
- "๋‹น์‹ ์€ ์œ ๋Šฅํ•œ AI ๋น„์„œ์ž…๋‹ˆ๋‹ค.", # System message ์˜ˆ์‹œ
120
- 512, # Max new tokens ์˜ˆ์‹œ
121
- 0.7, # Temperature ์˜ˆ์‹œ
122
- 0.95, # Top-p ์˜ˆ์‹œ
123
- ],
124
- [
125
- "ํŒŒ์ด์ฌ์œผ๋กœ ๊ฐ„๋‹จํ•œ ์›น ์„œ๋ฒ„ ๋งŒ๋“œ๋Š” ์ฝ”๋“œ ์•Œ๋ ค์ค˜",
126
- "๋‹น์‹ ์€ ์œ ๋Šฅํ•œ AI ๊ฐœ๋ฐœ์ž์ž…๋‹ˆ๋‹ค.", # System message ์˜ˆ์‹œ
127
- 1024, # Max new tokens ์˜ˆ์‹œ
128
- 0.8, # Temperature ์˜ˆ์‹œ
129
- 0.9, # Top-p ์˜ˆ์‹œ
130
- ],
 
 
 
 
 
 
 
 
 
131
  ],
132
- additional_inputs=[
133
- gr.Textbox(
134
- value="๋‹น์‹ ์€ ์œ ๋Šฅํ•œ AI ๋น„์„œ์ž…๋‹ˆ๋‹ค.", # ๊ธฐ๋ณธ ์‹œ์Šคํ…œ ๋ฉ”์‹œ์ง€
135
- label="System message",
136
- lines=1
137
- ),
138
- gr.Slider(
139
- minimum=1,
140
- maximum=4096, # ๋ชจ๋ธ ์ตœ๋Œ€ ์ปจํ…์ŠคํŠธ ๊ธธ์ด ๊ณ ๋ ค (๋˜๋Š” ๋” ๊ธธ๊ฒŒ ์„ค์ •)
141
- value=512,
142
- step=1,
143
- label="Max new tokens"
144
- ),
145
- gr.Slider(
146
- minimum=0.1,
147
- maximum=2.0, # Temperature ๋ฒ”์œ„ ์กฐ์ • (ํ•„์š”์‹œ)
148
- value=0.7,
149
- step=0.1,
150
- label="Temperature"
151
- ),
152
- gr.Slider(
153
- minimum=0.0, # Top-p ๋ฒ”์œ„ ์กฐ์ • (ํ•„์š”์‹œ)
154
- maximum=1.0,
155
- value=0.95,
156
- step=0.05,
157
- label="Top-p (nucleus sampling)"
158
- ),
159
  ],
160
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # Gradio ์•ฑ ์‹คํ–‰
163
- # Hugging Face Spaces์—์„œ๋Š” share=True๊ฐ€ ์ž๋™์œผ๋กœ ์„ค์ •๋ฉ๋‹ˆ๋‹ค.
164
- # debug=True๋กœ ์„ค์ •ํ•˜๋ฉด ์ƒ์„ธ ๋กœ๊ทธ๋ฅผ ๋ณผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
165
- demo.launch(debug=True)
166
- else:
167
- print("๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ๋กœ ์ธํ•ด Gradio ์ธํ„ฐํŽ˜์ด์Šค๋ฅผ ์‹คํ–‰ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
 
 
 
1
# Install the patched transformers branch required for BitNet support.
# NOTE(review): installing at import time is a Hugging Face Spaces convention;
# the exit status is now checked so a failed install is visible in the logs.
import os

_install_status = os.system(
    "pip install git+https://github.com/shumingma/transformers.git"
)
if _install_status != 0:
    print(f"WARNING: transformers branch install exited with status {_install_status}")

import threading
import torch
import torch._dynamo

# Suppress torch.compile/dynamo errors instead of failing hard; execution
# falls back to eager mode if graph compilation breaks.
torch._dynamo.config.suppress_errors = True

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
import gradio as gr
import spaces

# Hugging Face model ID for Microsoft's 1.58-bit, 2B-parameter BitNet model.
model_id = "microsoft/bitnet-b1.58-2B-4T"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # bf16 weights — assumes GPU/bf16 support; TODO confirm
    device_map="auto",           # place the model on the best available device
)
print(model.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
@spaces.GPU  # request a GPU allocation on Hugging Face Spaces for this call
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """
    Generate a chat response using streaming with TextIteratorStreamer.

    Args:
        message: User's current message.
        history: List of (user, assistant) tuples from previous turns.
        system_message: Initial system prompt guiding the assistant.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus sampling probability.

    Yields:
        The growing response text as new tokens are generated.
    """
    # Rebuild the full conversation in the chat-template message format.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream tokens as they are produced instead of waiting for the full text.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        # Restored from the previous revision: silences the "Setting
        # pad_token_id to eos_token_id" warning on each generation call.
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks, so run it on a worker thread while this generator
    # drains the streamer and forwards partial text to the UI.
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response  # push the cumulative response to the client

    # Ensure the generation thread has fully finished before returning.
    thread.join()
80
+
81
# Gradio chat UI: `respond` streams partial responses back to the browser.
demo = gr.ChatInterface(
    fn=respond,
    title="Bitnet-b1.58-2B-4T Chatbot",
    description="This chat application is powered by Microsoft's SOTA Bitnet-b1.58-2B-4T and designed for natural and fast conversations.",
    # Each example row matches respond's signature after (message, history):
    # [message, system_message, max_tokens, temperature, top_p]
    examples=[
        [
            "Hello! How are you?",
            "You are a helpful AI assistant for everyday tasks.",
            512,
            0.7,
            0.95,
        ],
        [
            "Can you code a snake game in Python?",
            "You are a helpful AI assistant for coding.",
            2048,
            0.7,
            0.95,
        ],
    ],
    # Extra controls exposed in the UI, passed through to respond().
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful AI assistant.",
            label="System message"
        ),
        gr.Slider(
            minimum=1,
            maximum=8192,
            value=2048,
            step=1,
            label="Max new tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ],
)
129
 
130
# Script entry point; on Hugging Face Spaces the module may also be imported,
# in which case launch() is handled by the platform.
if __name__ == "__main__":
    demo.launch()