File size: 13,600 Bytes
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d34ea8
 
b613c3c
d2662cc
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d34ea8
 
 
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d34ea8
 
 
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d34ea8
b613c3c
0d34ea8
 
 
 
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d49c538
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d49c538
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d49c538
b613c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
"""
System monitoring service for Video Model Studio.
Tracks system resources like CPU, memory, and other metrics.
"""

import os
import time
import logging
import platform
import threading
from datetime import datetime, timedelta
from collections import deque
from typing import Dict, List, Optional, Tuple, Any

import psutil

# Force the use of the Agg backend which is thread-safe
import matplotlib
matplotlib.use('Agg')  # Must be before importing pyplot
import matplotlib.pyplot as plt

import numpy as np

from vms.ui.monitoring.services.gpu import GPUMonitoringService

# Module-level logger named after this module's import path.
# The level is pinned to INFO here, so DEBUG records from this module are
# dropped by the logger itself.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

class MonitoringService:
    """Service for monitoring system resources and performance"""
    
    def __init__(self, history_minutes: int = 10, sample_interval: int = 5):
        """Initialize the monitoring service and record a first sample.

        Every history buffer is a bounded deque, so once ``max_samples``
        entries are stored the oldest entry is dropped for each new one.

        Args:
            history_minutes: How many minutes of history to keep
            sample_interval: How many seconds between samples
        """
        self.history_minutes = history_minutes
        self.sample_interval = sample_interval
        # Keep at least one slot: a window shorter than a single interval
        # would otherwise yield maxlen=0 deques that silently store nothing.
        # int() lets a fractional interval work as a deque maxlen.
        self.max_samples = max(1, int((history_minutes * 60) // sample_interval))

        # Initialize data structures for metrics
        self.timestamps = deque(maxlen=self.max_samples)
        self.cpu_percent = deque(maxlen=self.max_samples)
        self.memory_percent = deque(maxlen=self.max_samples)
        self.memory_used = deque(maxlen=self.max_samples)
        self.memory_available = deque(maxlen=self.max_samples)

        # CPU temperature history (might not be available on all systems)
        self.cpu_temp = deque(maxlen=self.max_samples)

        # Per-core CPU history: core index -> bounded deque of usage percentages
        self.cpu_cores_percent = {}

        # Initialize GPU monitoring service with the same window and cadence
        self.gpu = GPUMonitoringService(history_minutes=history_minutes, sample_interval=sample_interval)

        # Track if the monitoring thread is running
        self.is_running = False
        self.thread = None

        # Initialize with current values: store the first sample in the
        # history instead of discarding it, so plots have data right away.
        self.update_history(self.collect_metrics())
        
    def collect_metrics(self) -> Dict[str, Any]:
        """Collect current system metrics

        Both ``psutil.cpu_percent`` calls are made with ``interval=0.1``, so
        this method is expected to block for roughly 0.2 seconds.

        Returns:
            Dictionary of current metrics with the keys ``timestamp``,
            ``cpu_percent``, ``memory_percent``, ``memory_used`` (GB),
            ``memory_available`` (GB), ``cpu_temp`` (``None`` when no reading
            was found) and ``per_cpu_percent`` (one value per core).
        """
        bytes_per_gb = 1024 ** 3

        timestamp = datetime.now()
        overall_cpu = psutil.cpu_percent(interval=0.1)
        # Query memory once so percent/used/available describe the same instant.
        memory = psutil.virtual_memory()

        metrics = {
            'timestamp': timestamp,
            'cpu_percent': overall_cpu,
            'memory_percent': memory.percent,
            'memory_used': memory.used / bytes_per_gb,  # GB
            'memory_available': memory.available / bytes_per_gb,  # GB
            'cpu_temp': None,
            'per_cpu_percent': psutil.cpu_percent(interval=0.1, percpu=True)
        }

        # Try to get CPU temperature (platform specific). Only Linux is
        # handled; macOS and Windows would need extra dependencies (an SMC
        # reader / WMI), so 'cpu_temp' stays None there.
        try:
            if platform.system() == 'Linux':
                temps = psutil.sensors_temperatures()
                for name, entries in temps.items():
                    # Skip sensors that report no entries instead of letting an
                    # IndexError abort the search through the remaining sensors.
                    if entries and name.startswith(('coretemp', 'k10temp', 'cpu_thermal')):
                        metrics['cpu_temp'] = entries[0].current
                        break
        except (AttributeError, KeyError, IndexError, NotImplementedError):
            # Sensors not available
            pass

        return metrics
    
    def update_history(self, metrics: Dict[str, Any]) -> None:
        """Update metric history with new values
        
        Args:
            metrics: New metrics to add to history
        """
        self.timestamps.append(metrics['timestamp'])
        self.cpu_percent.append(metrics['cpu_percent'])
        self.memory_percent.append(metrics['memory_percent'])
        self.memory_used.append(metrics['memory_used'])
        self.memory_available.append(metrics['memory_available'])
        
        if metrics['cpu_temp'] is not None:
            self.cpu_temp.append(metrics['cpu_temp'])
        
        # Update per-core CPU metrics
        for i, percent in enumerate(metrics['per_cpu_percent']):
            if i not in self.cpu_cores_percent:
                self.cpu_cores_percent[i] = deque(maxlen=self.max_samples)
            self.cpu_cores_percent[i].append(percent)
    
    def start_monitoring(self) -> None:
        """Start background thread for collecting metrics"""
        if self.is_running:
            logger.warning("Monitoring thread already running")
            return
            
        self.is_running = True

        # Start GPU monitoring if available
        self.gpu.start_monitoring()
        
        def _monitor_loop():
            while self.is_running:
                try:
                    metrics = self.collect_metrics()
                    self.update_history(metrics)
                    time.sleep(self.sample_interval)
                except Exception as e:
                    logger.error(f"Error in monitoring thread: {str(e)}", exc_info=True)
                    time.sleep(self.sample_interval)
        
        self.thread = threading.Thread(target=_monitor_loop, daemon=True)
        self.thread.start()
        logger.info("System monitoring thread started")
    
    def stop_monitoring(self) -> None:
        """Stop the monitoring thread"""
        if not self.is_running:
            return

        self.is_running = False

        # Stop GPU monitoring
        self.gpu.stop_monitoring()

        if self.thread:
            self.thread.join(timeout=1.0)
            logger.info("System monitoring thread stopped")
    
    def get_current_metrics(self) -> Dict[str, Any]:
        """Take a fresh reading of the system metrics

        Delegates to ``collect_metrics``; the reading is returned to the
        caller and is not added to the stored history.

        Returns:
            Dictionary with current system metrics
        """
        snapshot = self.collect_metrics()
        return snapshot
    
    def get_system_info(self) -> Dict[str, Any]:
        """Get general system information

        Returns:
            Dictionary with system details under the keys ``cpu``, ``memory``,
            ``disk`` (keyed by mount point) and ``system``. Sizes are in GB
            and ``uptime`` is in seconds.
        """
        bytes_per_gb = 1024 ** 3

        cpu_info = {
            'cores_physical': psutil.cpu_count(logical=False),
            'cores_logical': psutil.cpu_count(logical=True),
            'current_frequency': None,
            'architecture': platform.machine(),
        }

        # Try to get CPU frequency; it is optional, so a failure only leaves
        # 'current_frequency' as None (recorded at debug level).
        try:
            cpu_freq = psutil.cpu_freq()
            if cpu_freq:
                cpu_info['current_frequency'] = cpu_freq.current
        except Exception:
            logger.debug("CPU frequency not available", exc_info=True)

        # Query memory once so all four figures describe the same instant.
        memory = psutil.virtual_memory()
        memory_info = {
            'total': memory.total / bytes_per_gb,  # GB
            'available': memory.available / bytes_per_gb,  # GB
            'used': memory.used / bytes_per_gb,  # GB
            'percent': memory.percent
        }

        disk_info = {}
        for part in psutil.disk_partitions(all=False):
            if os.name == 'nt' and ('cdrom' in part.opts or part.fstype == ''):
                # Skip CD-ROM drives on Windows
                continue
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except OSError:
                # PermissionError is an OSError; any other OS-level failure
                # for one mount point should not abort the whole report either.
                logger.debug("Skipping unreadable mount point %s", part.mountpoint, exc_info=True)
                continue
            disk_info[part.mountpoint] = {
                'total': usage.total / bytes_per_gb,  # GB
                'used': usage.used / bytes_per_gb,  # GB
                'free': usage.free / bytes_per_gb,  # GB
                'percent': usage.percent
            }

        sys_info = {
            'system': platform.system(),
            'version': platform.version(),
            'platform': platform.platform(),
            'processor': platform.processor(),
            'hostname': platform.node(),
            'python_version': platform.python_version(),
            'uptime': time.time() - psutil.boot_time()
        }

        return {
            'cpu': cpu_info,
            'memory': memory_info,
            'disk': disk_info,
            'system': sys_info,
        }
    
    def generate_cpu_plot(self) -> plt.Figure:
        """Generate a plot of CPU usage over time

        CPU usage is drawn on the left axis. When temperature readings exist
        they are drawn on a secondary right axis and included in the legend.

        Returns:
            Matplotlib figure with CPU usage plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        # Copy the histories first: the monitoring thread appends to these
        # deques, and iterating them live can fail or give unequal lengths.
        timestamps = list(self.timestamps)
        cpu_usage = list(self.cpu_percent)
        temperatures = list(self.cpu_temp)

        sample_count = min(len(timestamps), len(cpu_usage))
        if sample_count == 0:
            ax.set_title("No CPU data available yet")
            return fig

        # Plot against integer positions and label the ticks with the time
        # strings, so samples sharing the same HH:MM:SS stay separate points.
        positions = list(range(sample_count))
        labels = [t.strftime('%H:%M:%S') for t in timestamps[:sample_count]]

        ax.plot(positions, cpu_usage[:sample_count], 'b-', label='CPU Usage %')

        ax2 = None
        if temperatures:
            # Plot temperature on a secondary y-axis if available
            temp_count = min(len(temperatures), sample_count)
            ax2 = ax.twinx()
            ax2.plot(positions[:temp_count], temperatures[:temp_count], 'r-', label='CPU Temp °C')
            ax2.set_ylabel('Temperature (°C)', color='r')
            ax2.tick_params(axis='y', colors='r')

        # Show about ten x-axis labels at most for readability
        step = max(1, sample_count // 10)
        ax.set_xticks(positions[::step])
        ax.set_xticklabels(labels[::step])

        ax.set_title('CPU Usage Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Usage %')
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 100)

        # Add legend; merge in the secondary axis' entries when it exists.
        # (The previous hasattr(locals(), 'ax2') check was always False, so
        # the temperature line never reached the legend.)
        lines, legend_labels = ax.get_legend_handles_labels()
        if ax2 is not None:
            lines2, legend_labels2 = ax2.get_legend_handles_labels()
            ax.legend(lines + lines2, legend_labels + legend_labels2, loc='upper left')
        else:
            ax.legend(loc='upper left')

        plt.tight_layout()
        return fig
    
    def generate_memory_plot(self) -> plt.Figure:
        """Generate a plot of memory usage over time

        Usage percent is drawn on the left axis; used and available memory in
        GB are drawn on a secondary right axis.

        Returns:
            Matplotlib figure with memory usage plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        # Copy the histories first: the monitoring thread appends to these
        # deques, and iterating them live can fail or give unequal lengths.
        timestamps = list(self.timestamps)
        percent = list(self.memory_percent)
        used = list(self.memory_used)
        available = list(self.memory_available)

        sample_count = min(len(timestamps), len(percent), len(used), len(available))
        if sample_count == 0:
            ax.set_title("No memory data available yet")
            return fig

        # Plot against integer positions and label the ticks with the time
        # strings, so samples sharing the same HH:MM:SS stay separate points.
        positions = list(range(sample_count))
        labels = [t.strftime('%H:%M:%S') for t in timestamps[:sample_count]]

        ax.plot(positions, percent[:sample_count], 'g-', label='Memory Usage %')

        # Add secondary y-axis for absolute memory values
        ax2 = ax.twinx()
        ax2.plot(positions, used[:sample_count], 'm--', label='Used (GB)')
        ax2.plot(positions, available[:sample_count], 'c--', label='Available (GB)')
        ax2.set_ylabel('Memory (GB)')

        # Show about ten x-axis labels at most for readability
        step = max(1, sample_count // 10)
        ax.set_xticks(positions[::step])
        ax.set_xticklabels(labels[::step])

        ax.set_title('Memory Usage Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Usage %')
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 100)

        # Add legend combining the entries of both axes
        lines, legend_labels = ax.get_legend_handles_labels()
        lines2, legend_labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, legend_labels + legend_labels2, loc='upper left')

        plt.tight_layout()
        return fig
    
    def generate_per_core_plot(self) -> plt.Figure:
        """Generate a plot of per-core CPU usage

        Each core gets its own subplot in a grid with shared axes. Up to 16
        cores use the fixed layouts (2x2, 2x3, 3x3, 3x4, 4x4); larger counts
        get a near-square grid so every core is shown.

        Returns:
            Matplotlib figure with per-core CPU usage
        """
        # Close earlier figures on every path (not only when there is no
        # data), consistent with the other plot methods, so repeated calls do
        # not pile up figures inside pyplot.
        plt.close('all')

        # Copy the histories first: the monitoring thread adds cores and
        # appends samples, and iterating the live containers can fail.
        core_series = {
            core_id: list(values)
            for core_id, values in list(self.cpu_cores_percent.items())
        }
        timestamps = list(self.timestamps)

        num_cores = len(core_series)
        if num_cores == 0:
            # No data yet
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.set_title("No per-core CPU data available yet")
            return fig

        # Determine grid layout based on number of cores
        if num_cores <= 4:
            rows, cols = 2, 2
        elif num_cores <= 6:
            rows, cols = 2, 3
        elif num_cores <= 9:
            rows, cols = 3, 3
        elif num_cores <= 12:
            rows, cols = 3, 4
        elif num_cores <= 16:
            rows, cols = 4, 4
        else:
            # More than 16 cores: grow a near-square grid instead of silently
            # dropping the cores that do not fit into 4x4.
            cols = 4
            while cols * cols < num_cores:
                cols += 1
            rows = -(-num_cores // cols)  # ceiling division

        # 12x8 for the fixed layouts; scale up only for the larger grids.
        fig, axes = plt.subplots(
            rows, cols,
            figsize=(max(12, 3 * cols), max(8, 2 * rows)),
            sharex=True, sharey=True,
        )
        axes = axes.flatten()

        labels = [t.strftime('%H:%M:%S') for t in timestamps]
        # Show about five x-axis labels at most for readability
        step = max(1, len(labels) // 5)
        tick_positions = list(range(0, len(labels), step))
        tick_labels = [labels[pos] for pos in tick_positions]

        for index, (core_id, values) in enumerate(core_series.items()):
            ax = axes[index]
            # Integer positions keep samples with the same HH:MM:SS label
            # apart; truncate so x and y always have equal length.
            count = min(len(values), len(labels))
            ax.plot(range(count), values[:count], 'b-')
            ax.set_title(f'Core {core_id}')
            ax.set_ylim(0, 100)
            ax.grid(True, alpha=0.3)

            # Label the lowest *used* subplot of each column. With sharex the
            # tick labels are hidden everywhere but the last grid row, which
            # may consist of unused (hidden) subplots.
            if index + cols >= num_cores:
                ax.tick_params(axis='x', labelbottom=True)
                ax.set_xticks(tick_positions)
                ax.set_xticklabels(tick_labels, rotation=45)

        # Hide unused subplots
        for unused_ax in axes[num_cores:]:
            unused_ax.set_visible(False)

        plt.tight_layout()
        return fig