模型部署后性能监控方案设计
在大模型推理加速实践中,部署后的性能监控是确保系统稳定运行的关键环节。本文将介绍一套完整的性能监控方案,涵盖关键指标采集、阈值设置和告警机制。
核心监控指标
import time
import psutil
import torch
from datetime import datetime
class ModelMonitor:
    """Collects runtime performance samples (CPU, memory, latency,
    throughput) for a deployed inference model.

    Each call to collect_metrics() appends one sample to the rolling
    history in ``self.metrics`` and returns a snapshot dict for that call.
    """

    def __init__(self):
        # Rolling per-sample history, one parallel list per metric.
        self.metrics = {
            'cpu_utilization': [],
            'memory_usage': [],
            'inference_time': [],
            'throughput': []
        }

    def collect_metrics(self, model, input_data):
        """Run one inference pass and record a performance sample.

        Args:
            model: a callable (e.g. torch.nn.Module) invoked as
                ``model(input_data)``.
            input_data: the input forwarded to ``model`` unchanged.

        Returns:
            dict with keys 'timestamp', 'cpu_percent', 'memory_usage',
            'inference_time_ms', 'throughput_qps'.
        """
        # System-wide CPU utilization averaged over a 1-second blocking
        # sample (psutil semantics) — each call therefore takes >= 1 s.
        cpu_percent = psutil.cpu_percent(interval=1)
        # System-wide virtual-memory usage as a percentage.
        memory_usage = psutil.virtual_memory().percent
        # BUG FIX: time.time() is wall-clock and can jump (NTP, DST);
        # perf_counter() is the monotonic clock intended for durations.
        start_time = time.perf_counter()
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            model(input_data)
        inference_time = (time.perf_counter() - start_time) * 1000  # ms
        # Single-request throughput estimate; guard division by zero.
        throughput = 1000 / inference_time if inference_time > 0 else 0
        self.metrics['cpu_utilization'].append(cpu_percent)
        self.metrics['memory_usage'].append(memory_usage)
        self.metrics['inference_time'].append(inference_time)
        self.metrics['throughput'].append(throughput)
        return {
            'timestamp': datetime.now().isoformat(),
            'cpu_percent': cpu_percent,
            'memory_usage': memory_usage,
            'inference_time_ms': inference_time,
            'throughput_qps': throughput
        }
告警阈值设置
# Performance alert thresholds.
ALERT_THRESHOLDS = {
    'cpu_threshold': 85.0,        # alert when CPU utilization exceeds 85%
    'memory_threshold': 90.0,     # alert when memory usage exceeds 90%
    'latency_threshold': 100.0,   # alert when average latency exceeds 100 ms
    'throughput_threshold': 5.0   # alert when throughput drops below 5 QPS
}


def check_alerts(metrics):
    """Return a list of alert strings for every threshold violation.

    Accepts either a snapshot dict as returned by
    ModelMonitor.collect_metrics() (keys 'cpu_percent', 'memory_usage',
    'inference_time_ms', 'throughput_qps') or a dict carrying an
    'inference_time' history list; the original code mixed both shapes.
    """
    alerts = []
    if metrics['cpu_percent'] > ALERT_THRESHOLDS['cpu_threshold']:
        alerts.append(f'CPU使用率过高: {metrics["cpu_percent"]}%')
    if metrics['memory_usage'] > ALERT_THRESHOLDS['memory_threshold']:
        alerts.append(f'内存使用率过高: {metrics["memory_usage"]}%')
    # BUG FIX: the original indexed metrics['inference_time'][-10:], a key
    # that only exists on ModelMonitor.metrics as a list — the snapshot
    # dict carries the scalar 'inference_time_ms' instead, so this raised
    # KeyError. Support both shapes, and guard the ZeroDivisionError the
    # original hit on an empty history.
    if 'inference_time_ms' in metrics:
        avg_latency = metrics['inference_time_ms']
    else:
        recent = metrics.get('inference_time', [])[-10:]
        avg_latency = sum(recent) / len(recent) if recent else 0.0
    if avg_latency > ALERT_THRESHOLDS['latency_threshold']:
        alerts.append(f'平均延迟过高: {avg_latency:.2f}ms')
    # BUG FIX: 'throughput_threshold' was configured but never checked.
    throughput = metrics.get('throughput_qps')
    if throughput is not None and throughput < ALERT_THRESHOLDS['throughput_threshold']:
        alerts.append(f'吞吐量过低: {throughput:.2f} QPS')
    return alerts
实施建议
- 数据采集频率:每秒采集一次性能指标
- 历史数据存储:使用InfluxDB或Prometheus存储时间序列数据
- 可视化展示:集成Grafana进行实时监控面板展示
- 自动化告警:通过Webhook将告警发送到钉钉/企业微信
部署后应持续优化监控策略,结合实际业务场景调整阈值参数。

讨论