模型输出概率分布非正态性监控方法
在生产环境中,模型输出的概率分布偏离正态性往往预示着模型性能下降或数据分布漂移。本文介绍如何通过统计检验和可视化手段进行实时监控。
核心监控指标
- Shapiro-Wilk检验统计量:用于检验数据是否符合正态分布
- Kolmogorov-Smirnov检验p值:显著性阈值设置为0.05,p值低于该阈值时认为两个分布存在显著差异
- 偏度(Skewness):绝对值超过2认为存在显著偏斜
- 峰度(Kurtosis):此处指超额峰度(正态分布的超额峰度为0),绝对值超过3表示分布异常
实施方案
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
class DistributionMonitor:
    """Monitor model outputs for non-normality and drift from a reference sample.

    Normality is judged with the Shapiro-Wilk test plus bounds on sample
    skewness and excess kurtosis. Drift relative to a reference sample is
    judged with a two-sample Kolmogorov-Smirnov test.

    Args:
        threshold: Significance level applied to both the Shapiro-Wilk and
            the Kolmogorov-Smirnov p-values.
        skew_threshold: Absolute skewness at or above which a sample is
            treated as non-normal.
        kurtosis_threshold: Absolute excess kurtosis at or above which a
            sample is treated as non-normal.
    """

    def __init__(self, threshold=0.05, skew_threshold=2.0,
                 kurtosis_threshold=3.0):
        self.threshold = threshold
        self.skew_threshold = skew_threshold
        self.kurtosis_threshold = kurtosis_threshold

    def check_normality(self, predictions):
        """Run the normality diagnostics on a sample.

        Args:
            predictions: Array-like of numeric values; flattened to 1-D.

        Returns:
            dict with keys ``is_normal`` (bool), ``shapiro_stat``,
            ``shapiro_p``, ``skewness`` and ``kurtosis`` (floats).

        Raises:
            ValueError: Propagated from ``scipy.stats.shapiro`` when the
                sample is too small for the test.
        """
        # Build the 1-D sample once and reuse it for every statistic.
        sample = pd.Series(np.asarray(predictions, dtype=float).ravel())

        # Shapiro-Wilk test.
        shapiro_stat, shapiro_p = stats.shapiro(sample.to_numpy())

        # pandas reports bias-corrected skewness and *excess* kurtosis
        # (a normal distribution scores 0 on both).
        skewness = float(sample.skew())
        kurtosis = float(sample.kurtosis())

        # All three criteria must hold for the sample to count as normal.
        # Cast to a plain bool so callers do not receive numpy.bool_.
        is_normal = bool(
            shapiro_p > self.threshold
            and abs(skewness) < self.skew_threshold
            and abs(kurtosis) < self.kurtosis_threshold
        )

        return {
            'is_normal': is_normal,
            'shapiro_stat': float(shapiro_stat),
            'shapiro_p': float(shapiro_p),
            'skewness': skewness,
            'kurtosis': kurtosis,
        }

    def alert_on_distribution_drift(self, current_data, reference_data):
        """Print an alert and return True when the current sample looks abnormal.

        An alert is raised when either:
          * the current sample fails the normality diagnostics, or
          * a two-sample Kolmogorov-Smirnov test rejects, at ``threshold``,
            the hypothesis that the current and reference samples come
            from the same distribution.

        Args:
            current_data: Array-like of the most recent values.
            reference_data: Array-like of baseline values to compare against.

        Returns:
            True if an alert was printed, otherwise False.
        """
        current_result = self.check_normality(current_data)
        if not current_result['is_normal']:
            print(f"[ALERT] 非正态分布检测到: 偏度={current_result['skewness']:.2f}")
            return True

        # A sample can stay perfectly normal while its location or scale
        # moves, so the reference must be compared directly.
        current = np.asarray(current_data, dtype=float).ravel()
        reference = np.asarray(reference_data, dtype=float).ravel()
        ks_stat, ks_p = stats.ks_2samp(current, reference)
        if ks_p < self.threshold:
            print(f"[ALERT] 分布漂移检测到: KS统计量={ks_stat:.3f}, p值={ks_p:.4f}")
            return True

        return False
讨论