farmax committed · verified
Commit cafecc7 · 1 Parent(s): 01ed4ac

Update app.py

Files changed (1)
  1. app.py +17 -124
app.py CHANGED
@@ -1,11 +1,17 @@
  import gradio as gr
- import argparse
  from tabulate import tabulate
+ import os
+
+ # Definisci le variabili di ambiente
+ os.environ['NUM_GPU'] = '1'
+ os.environ['PROMPT_SZ'] = '4096'
+ os.environ['RESPONSE_SZ'] = '256'
+ os.environ['N_CONCURRENT_REQ'] = '10'
+ os.environ['CTX_WINDOW'] = '1024'

  def greet(name):
      return f"Ciao, {name}!"

- # Define all helper functions here
  def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
      result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
      return result if result >= 0 else "OOM"
@@ -23,40 +29,24 @@ def calc_estimated_response_time(prefill_time, generation_time, prompt_size, res
          return "OOM"
      return (prompt_size * prefill_time + response_size * generation_time) / 1000  # convert ms to seconds

- # Move estimate_capacity_latency outside of main()
  def estimate_capacity_latency(model, gpu):
-     kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
-     prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
-     generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
-     estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
+     kv_cache_tokens = calc_kv_cache_tokens(int(os.environ['NUM_GPU']), gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
+     prefill_time_per_token = calc_prefill_time_per_token(int(os.environ['NUM_GPU']), model['params_billion'], gpu['fp16_tflops'])
+     generation_time_per_token = calc_generation_time_per_token(int(os.environ['NUM_GPU']), model['params_billion'], gpu['memory_bandwidth_gbps'])
+     estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, int(os.environ['PROMPT_SZ']), int(os.environ['RESPONSE_SZ']))
      return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"

  def create_gradio_interface():
-     # Definisci gpu_specs qui così che sia disponibile nella funzione
      gpu_specs = [
          {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
          {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
-         {"name": "L40", "fp16_tflops": 181, "memory_bandwidth_gbps": 864},
-         {"name": "L40s", "fp16_tflops": 362, "memory_bandwidth_gbps": 864},
-         {"name": "A100 40 GB", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
-         {"name": "A100 40 GB SXM", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
-         {"name": "A100 80 GB PCIe", "fp16_tflops": 312, "memory_bandwidth_gbps": 1935},
-         {"name": "A100 80 GB SXM", "fp16_tflops": 312, "memory_bandwidth_gbps": 2039},
-         {"name": "H100 PCIe", "fp16_tflops": 1513, "memory_gb": 80, "memory_bandwidth_gbps": 2000},
-         {"name": "H100 SXM", "fp16_tflops": 1979, "memory_bandwidth_gbps": 3350},
-         {"name": "H100 NVL", "fp16_tflops": 3958, "memory_gb": 188, "memory_bandwidth_gbps": 7800}
+         # ... altri GPU ...
      ]
-
-     # Definisci model_specs qui così che sia disponibile nella funzione
+
      model_specs = [
          {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
          {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
-         {"name": "Llama-3.1-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 131072, "d_head": 128},
-         {"name": "Llama-3.1-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 131072, "d_head": 128},
-         {"name": "Mistral-7B-v0.3", "params_billion": 7, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 32768, "d_head": 128},
-         {"name": "Falcon-7B", "params_billion": 7, "d_model": 4544, "n_heads": 71, "n_layers": 32, "max_context_window": 2048, "d_head": 64},
-         {"name": "Falcon-40B", "params_billion": 40, "d_model": 8192, "n_heads": 128, "n_layers": 60, "max_context_window": 2048, "d_head": 64},
-         {"name": "Falcon-180B", "params_billion": 180, "d_model": 14848, "n_heads": 232, "n_layers": 80, "max_context_window": 2048, "d_head": 64}
+         # ... altri modelli ...
      ]

      demo = gr.Interface(
@@ -76,105 +66,8 @@ def create_gradio_interface():

      return demo

- # Create the Gradio interface
+ # Creare l'interfaccia Gradio
  gr_interface = create_gradio_interface()

- # Start the interface
+ # Avvia l'interfaccia
  gr_interface.launch()
-
- def main():
-     parser = argparse.ArgumentParser(description='Your script description')
-     parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
-     parser.add_argument('-p', '--prompt_sz', type=int, default=4096, help='Prompt size in tokens')
-     parser.add_argument('-r', '--response_sz', type=int, default=256, help='Response size in tokens')
-     parser.add_argument('-c', '--n_concurrent_req', type=int, default=10, help='Number of concurrent requests')
-     parser.add_argument('-w', '-cw', '--ctx_window', type=int, default=1024, help='Average context window')
-
-     args = parser.parse_args()
-
-     num_gpu = args.num_gpu
-     prompt_size = args.prompt_sz
-     response_size = args.response_sz
-     n_concurrent_request = args.n_concurrent_req
-     avg_context_window = args.ctx_window
-
-     # Print input
-     print(f" num_gpu = {num_gpu}, prompt_size = {prompt_size} tokens, response_size = {response_size} tokens")
-     print(f" n_concurrent_request = {n_concurrent_request}, avg_context_window = {avg_context_window} tokens")
-
-     # Define variables
-     gpu_specs = [
-         {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
-         {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
-         {"name": "L40", "fp16_tflops": 181, "memory_gb": 48, "memory_bandwidth_gbps": 864},
-         {"name": "L40s", "fp16_tflops": 362, "memory_gb": 48, "memory_bandwidth_gbps": 864},
-         {"name": "A100 40 GB", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
-         {"name": "A100 40 GB SXM", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
-         {"name": "A100 80 GB PCIe", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 1935},
-         {"name": "A100 80 GB SXM", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 2039},
-         {"name": "H100 PCIe", "fp16_tflops": 1513, "memory_gb": 80, "memory_bandwidth_gbps": 2000},
-         {"name": "H100 SXM", "fp16_tflops": 1979, "memory_gb": 80, "memory_bandwidth_gbps": 3350},
-         {"name": "H100 NVL", "fp16_tflops": 3958, "memory_gb": 188, "memory_bandwidth_gbps": 7800}
-         # Add or comment out GPU types as needed
-     ]
-
-     model_specs = [
-         {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
-         {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
-         {"name": "Llama-3.1-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 131072, "d_head": 128},
-         {"name": "Llama-3.1-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 131072, "d_head": 128},
-         {"name": "Mistral-7B-v0.3", "params_billion": 7, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 32768, "d_head": 128},
-         {"name": "Falcon-7B", "params_billion": 7, "d_model": 4544, "n_heads": 71, "n_layers": 32, "max_context_window": 2048, "d_head": 64},
-         {"name": "Falcon-40B", "params_billion": 40, "d_model": 8192, "n_heads": 128, "n_layers": 60, "max_context_window": 2048, "d_head": 64},
-         {"name": "Falcon-180B", "params_billion": 180, "d_model": 14848, "n_heads": 232, "n_layers": 80, "max_context_window": 2048, "d_head": 64}
-         # Add or comment out model specifications as needed
-     ]
-
-     BYTES_IN_GB = 1_073_741_824  # 1 GB = 1,073,741,824 bytes
-
-     def calc_kv_cache_size_per_token(n_layers, d_model):
-         return 2 * 2 * n_layers * d_model / BYTES_IN_GB  # GB/token
-
-     def calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window):
-         kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
-         target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
-         return target_gpu_mem
-
-     # ... rest of the code remains the same ...
-     def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
-         result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
-         return result if result >= 0 else "OOM"
-
-     def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
-         result = (2 * model_params_billion / num_gpu) / fp16_tflops
-         return result if result >= 0 else "OOM"
-
-     def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
-         result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
-         return result if result >= 0 else "OOM"
-
-     def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
-         if isinstance(prefill_time, str) or isinstance(generation_time, str):  # Check if any are "NA"
-             return "OOM"
-         return (prompt_size * prefill_time + response_size * generation_time) / 1000  # convert ms to seconds
-
-     print(f"\n******************** Estimate LLM Memory Footprint ********************")
-     memory_footprint_table = []
-     for model_spec in model_specs:
-         kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
-         memory_footprint = calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window)
-         memory_footprint_table.append([model_spec['name'], f"{kv_cache_size_per_token:.6f} GiB/token", f"{memory_footprint:.2f} GB"])
-     print(tabulate(memory_footprint_table, headers=['Model', 'KV Cache Size per Token', 'Memory Footprint'], tablefmt='orgtbl'))
-
-
-     capacity_latency_table = []
-     for model in model_specs:
-         for gpu in gpu_specs:
-             prefill_time, generation_time, estimated_response_time = estimate_capacity_latency(model, gpu)
-             capacity_latency_table.append([model['name'], gpu['name'], f"{prefill_time}", f"{generation_time}", f"{estimated_response_time}"])
-
-     print(f"\n******************** Estimate LLM Capacity and Latency ******************** ")
-     print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
-
- if __name__ == '__main__':
-     main()
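
Note on the new configuration: the commit assigns the settings with os.environ['...'] = '...' at import time, so values exported in the shell are overwritten when app.py starts. A minimal, hypothetical variant (not part of this commit) that keeps the same defaults but lets the shell environment override them could use os.environ.setdefault:

import os

# Hypothetical alternative to the hard-coded assignments above:
# setdefault only fills in a value when the variable is not already set,
# so e.g. `NUM_GPU=2 python app.py` would take effect.
for key, default in [
    ('NUM_GPU', '1'),
    ('PROMPT_SZ', '4096'),
    ('RESPONSE_SZ', '256'),
    ('N_CONCURRENT_REQ', '10'),
    ('CTX_WINDOW', '1024'),
]:
    os.environ.setdefault(key, default)

num_gpu = int(os.environ['NUM_GPU'])        # number of GPUs
prompt_size = int(os.environ['PROMPT_SZ'])  # prompt size in tokens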