Trouter-Library committed on
Commit e489ad0 · verified · 1 Parent(s): 60d47fc

Create benchmark.py

Files changed (1)
  1. benchmark.py +446 -0
benchmark.py ADDED
@@ -0,0 +1,446 @@
"""
Comprehensive benchmarking script for Helion-V2.0-Thinking
Measures performance, throughput, latency, and memory usage
"""

import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from typing import List, Dict, Any
import time
import psutil
import numpy as np
from PIL import Image
import json
from tqdm import tqdm
import gc


class HelionBenchmark:
    """Performance benchmarking for Helion-V2.0-Thinking"""

    def __init__(self, model_name: str = "DeepXR/Helion-V2.0-Thinking"):
        """Initialize benchmark suite"""
        print(f"Loading model for benchmarking: {model_name}")

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True
        )
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model.eval()

        print("Model loaded successfully")

        # Get device info
        if torch.cuda.is_available():
            self.device = "cuda"
            self.device_name = torch.cuda.get_device_name(0)
            self.total_vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        else:
            self.device = "cpu"
            self.device_name = "CPU"
            self.total_vram = 0

        print(f"Device: {self.device_name}")
        if self.device == "cuda":
            print(f"Total VRAM: {self.total_vram:.2f} GB")

    def measure_memory_usage(self) -> Dict[str, float]:
        """Measure current memory usage"""
        memory_stats = {}

        if self.device == "cuda":
            memory_stats['vram_allocated_gb'] = torch.cuda.memory_allocated() / (1024**3)
            memory_stats['vram_reserved_gb'] = torch.cuda.memory_reserved() / (1024**3)
            memory_stats['vram_peak_gb'] = torch.cuda.max_memory_allocated() / (1024**3)

        # System RAM
        memory_stats['ram_used_gb'] = psutil.Process().memory_info().rss / (1024**3)

        return memory_stats

    def benchmark_text_generation(
        self,
        prompts: List[str],
        max_new_tokens: int = 256
    ) -> Dict[str, Any]:
        """
        Benchmark text generation performance

        Returns:
            Dict with latency, throughput, and token metrics
        """
        print("\n=== Benchmarking Text Generation ===")

        latencies = []
        tokens_per_second = []

        # Warmup
        warmup_prompt = "Test prompt for warmup."
        inputs = self.processor(text=warmup_prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            _ = self.model.generate(**inputs, max_new_tokens=50)

        if self.device == "cuda":
            torch.cuda.synchronize()
            torch.cuda.reset_peak_memory_stats()

        # Benchmark
        for prompt in tqdm(prompts, desc="Text Generation"):
            inputs = self.processor(text=prompt, return_tensors="pt").to(self.model.device)

            start_time = time.time()

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            if self.device == "cuda":
                torch.cuda.synchronize()

            end_time = time.time()
            latency = end_time - start_time

            # Calculate tokens generated
            generated_tokens = outputs.shape[1] - inputs['input_ids'].shape[1]
            tps = generated_tokens / latency

            latencies.append(latency)
            tokens_per_second.append(tps)

        # Memory stats
        memory_stats = self.measure_memory_usage()

        return {
            "text_generation": {
                "avg_latency_ms": np.mean(latencies) * 1000,
                "p50_latency_ms": np.percentile(latencies, 50) * 1000,
                "p95_latency_ms": np.percentile(latencies, 95) * 1000,
                "p99_latency_ms": np.percentile(latencies, 99) * 1000,
                "avg_tokens_per_second": np.mean(tokens_per_second),
                "total_prompts": len(prompts),
                **memory_stats
            }
        }

    def benchmark_vision(
        self,
        image_prompts: List[tuple[Image.Image, str]],
        max_new_tokens: int = 256
    ) -> Dict[str, Any]:
        """
        Benchmark vision + text generation

        Args:
            image_prompts: List of (image, prompt) tuples

        Returns:
            Performance metrics
        """
        print("\n=== Benchmarking Vision Tasks ===")

        latencies = []
        tokens_per_second = []

        # Warmup
        if image_prompts:
            warmup_image, warmup_prompt = image_prompts[0]
            inputs = self.processor(
                text=warmup_prompt,
                images=warmup_image,
                return_tensors="pt"
            ).to(self.model.device)
            with torch.no_grad():
                _ = self.model.generate(**inputs, max_new_tokens=50)

        if self.device == "cuda":
            torch.cuda.synchronize()
            torch.cuda.reset_peak_memory_stats()

        # Benchmark
        for image, prompt in tqdm(image_prompts, desc="Vision Tasks"):
            inputs = self.processor(
                text=prompt,
                images=image,
                return_tensors="pt"
            ).to(self.model.device)

            start_time = time.time()

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            if self.device == "cuda":
                torch.cuda.synchronize()

            end_time = time.time()
            latency = end_time - start_time

            generated_tokens = outputs.shape[1] - inputs['input_ids'].shape[1]
            tps = generated_tokens / latency

            latencies.append(latency)
            tokens_per_second.append(tps)

        memory_stats = self.measure_memory_usage()

        return {
            "vision_tasks": {
                "avg_latency_ms": np.mean(latencies) * 1000,
                "p50_latency_ms": np.percentile(latencies, 50) * 1000,
                "p95_latency_ms": np.percentile(latencies, 95) * 1000,
                "p99_latency_ms": np.percentile(latencies, 99) * 1000,
                "avg_tokens_per_second": np.mean(tokens_per_second),
                "total_image_prompts": len(image_prompts),
                **memory_stats
            }
        }

    def benchmark_long_context(
        self,
        context_lengths: List[int] = [1000, 5000, 10000, 50000, 100000]
    ) -> Dict[str, Any]:
        """
        Benchmark performance with varying context lengths

        Args:
            context_lengths: List of context lengths to test

        Returns:
            Performance metrics by context length
        """
        print("\n=== Benchmarking Long Context Performance ===")

        results = {}

        for length in tqdm(context_lengths, desc="Context Lengths"):
            # Generate synthetic context (each repeated sentence is roughly 6 tokens)
            context = "This is a test sentence. " * (length // 6)
            prompt = f"{context}\n\nQuestion: What is the main topic of this text?\nAnswer:"

            inputs = self.processor(text=prompt, return_tensors="pt").to(self.model.device)

            # Skip if the synthetic prompt overshot the target context length
            if inputs['input_ids'].shape[1] > length:
                print(f"Skipping {length} - generated context too long")
                continue

            if self.device == "cuda":
                torch.cuda.synchronize()
                torch.cuda.reset_peak_memory_stats()

            start_time = time.time()
            outputs = None  # so the cleanup below is safe even if generation fails

            try:
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=128,
                        do_sample=False
                    )

                if self.device == "cuda":
                    torch.cuda.synchronize()

                end_time = time.time()
                latency = end_time - start_time

                memory_stats = self.measure_memory_usage()

                results[f"context_{length}"] = {
                    "latency_ms": latency * 1000,
                    "input_tokens": inputs['input_ids'].shape[1],
                    **memory_stats
                }

            except Exception as e:
                results[f"context_{length}"] = {
                    "error": str(e)
                }

            # Cleanup
            del inputs, outputs
            gc.collect()
            if self.device == "cuda":
                torch.cuda.empty_cache()

        return {"long_context": results}

    def benchmark_throughput(
        self,
        batch_sizes: List[int] = [1, 2, 4, 8],
        sequence_length: int = 512
    ) -> Dict[str, Any]:
        """
        Benchmark throughput with different batch sizes

        Args:
            batch_sizes: List of batch sizes to test
            sequence_length: Target sequence length

        Returns:
            Throughput metrics
        """
        print("\n=== Benchmarking Throughput ===")

        results = {}
        prompt = "Explain the concept of artificial intelligence. " * 10

        for batch_size in tqdm(batch_sizes, desc="Batch Sizes"):
            inputs = None   # so the cleanup below is safe even if tokenization fails
            outputs = None  # so the cleanup below is safe even if generation fails

            try:
                # Create batch
                prompts = [prompt] * batch_size
                inputs = self.processor(
                    text=prompts,
                    return_tensors="pt",
                    padding=True
                ).to(self.model.device)

                if self.device == "cuda":
                    torch.cuda.synchronize()
                    torch.cuda.reset_peak_memory_stats()

                start_time = time.time()

                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=256,
                        do_sample=False
                    )

                if self.device == "cuda":
                    torch.cuda.synchronize()

                end_time = time.time()
                latency = end_time - start_time

                # Calculate throughput (prompt + generated tokens across the batch)
                total_tokens = outputs.shape[0] * outputs.shape[1]
                throughput = total_tokens / latency

                memory_stats = self.measure_memory_usage()

                results[f"batch_{batch_size}"] = {
                    "latency_ms": latency * 1000,
                    "throughput_tokens_per_sec": throughput,
                    "tokens_per_sample": outputs.shape[1],
                    **memory_stats
                }

            except Exception as e:
                results[f"batch_{batch_size}"] = {
                    "error": str(e)
                }

            # Cleanup
            del inputs, outputs
            gc.collect()
            if self.device == "cuda":
                torch.cuda.empty_cache()

        return {"throughput": results}

    def run_full_benchmark(self) -> Dict[str, Any]:
        """Run complete benchmark suite"""
        print("\n" + "="*60)
        print("Starting Full Benchmark Suite")
        print(f"Device: {self.device_name}")
        print("="*60)

        results = {
            "system_info": {
                "device": self.device,
                "device_name": self.device_name,
                "total_vram_gb": self.total_vram if self.device == "cuda" else None,
                "pytorch_version": torch.__version__,
                "cuda_available": torch.cuda.is_available()
            }
        }

        # Text generation benchmark
        text_prompts = [
            "Explain quantum mechanics in simple terms.",
            "Write a short story about space exploration.",
            "What are the benefits of machine learning?",
            "Describe the process of photosynthesis.",
            "How does blockchain technology work?"
        ]
        results.update(self.benchmark_text_generation(text_prompts, max_new_tokens=256))

        # Long context benchmark
        results.update(self.benchmark_long_context([1000, 5000, 10000, 50000]))

        # Throughput benchmark
        results.update(self.benchmark_throughput([1, 2, 4]))

        print("\n" + "="*60)
        print("Benchmark Complete")
        print("="*60)

        return results

    def print_results(self, results: Dict[str, Any]):
        """Print benchmark results in a readable format"""
        print("\n" + "="*60)
        print("BENCHMARK RESULTS")
        print("="*60)

        def print_dict(d, indent=0):
            for key, value in d.items():
                if isinstance(value, dict):
                    print(" " * indent + f"{key}:")
                    print_dict(value, indent + 1)
                elif isinstance(value, float):
                    print(" " * indent + f"{key}: {value:.4f}")
                else:
                    print(" " * indent + f"{key}: {value}")

        print_dict(results)
        print("="*60 + "\n")

    def save_results(self, results: Dict[str, Any], filename: str = "benchmark_results.json"):
        """Save results to JSON file"""
        with open(filename, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {filename}")


def main():
    """Main benchmark function"""
    import argparse

    parser = argparse.ArgumentParser(description="Benchmark Helion-V2.0-Thinking")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2.0-Thinking",
        help="Model name or path"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results"
    )

    args = parser.parse_args()

    # Run benchmark
    benchmark = HelionBenchmark(args.model)
    results = benchmark.run_full_benchmark()
    benchmark.print_results(results)
    benchmark.save_results(results, args.output)


if __name__ == "__main__":
    main()
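
For quick reference, a minimal usage sketch (not part of the committed file): the full suite can be run as python benchmark.py --model DeepXR/Helion-V2.0-Thinking --output benchmark_results.json, or the class can be driven programmatically as below. The import assumes this file is saved as benchmark.py on the Python path; the prompt text and output filename are illustrative only.

# Hypothetical driver script; assumes benchmark.py (this commit) is importable
# and the model weights are reachable locally or via the Hub.
from benchmark import HelionBenchmark

bench = HelionBenchmark("DeepXR/Helion-V2.0-Thinking")

# Exercise only the text-generation benchmark with a single short prompt.
results = bench.benchmark_text_generation(
    ["Summarize the water cycle in two sentences."],
    max_new_tokens=64,
)

bench.print_results(results)
bench.save_results(results, "text_only_results.json")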