引言
在现代软件开发中,构建高可用、可扩展的应用系统已经成为基本要求。对于Node.js后端服务而言,良好的可观测性是保障系统稳定运行的关键。可观测性不仅包括对应用性能的监控,还涵盖了日志收集、错误追踪、指标分析等多个维度。
本文将深入探讨如何为Node.js后端服务构建完整的可观测性体系,重点介绍如何集成Prometheus监控系统和ELK(Elasticsearch, Logstash, Kibana)日志收集栈。通过实际的技术方案和代码示例,帮助开发者构建一套高效、可靠的监控与日志分析系统。
什么是可观测性
可观测性是现代软件工程中的重要概念,它指的是通过系统的输出来理解系统内部状态的能力。对于后端服务而言,可观测性主要包含三个核心维度:
- 指标(Metrics):量化系统运行状态的数值数据
- 日志(Logs):系统运行过程中的详细记录
- 追踪(Traces):请求在分布式系统中的完整路径
这三者相互配合,共同构成了完整的可观测性体系。在Node.js应用中,我们主要通过Prometheus来收集指标数据,通过ELK Stack来处理和分析日志信息。
Prometheus监控系统集成
Prometheus简介
Prometheus是一个开源的系统监控和告警工具包,特别适合云原生环境下的监控需求。它采用多维数据模型,通过HTTP拉取方式获取指标数据,具有强大的查询语言PromQL。
Node.js Prometheus客户端集成
首先,我们需要在Node.js项目中集成Prometheus客户端库:
npm install prom-client
基础监控指标配置
const client = require('prom-client');
const express = require('express');
// Dedicated registry that holds every metric exposed by this module.
const { collectDefaultMetrics, Registry } = client;
const register = new Registry();

// Collect prom-client's default metrics (the original note lists CPU and
// memory among them) into the dedicated registry.
collectDefaultMetrics({ register });
// Custom application metrics.
//
// Each metric is bound to the dedicated `register` through the `registers`
// option. Without that option prom-client also adds the metric to its global
// default registry, so it would end up registered twice.

// Latency of HTTP requests, in seconds.
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.1, 0.5, 1, 2, 5, 10],
  registers: [register],
});

// Number of HTTP requests handled.
const httpRequestCount = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
  registers: [register],
});

// Number of application errors, by error type and service.
const errorCounter = new client.Counter({
  name: 'app_errors_total',
  help: 'Total number of application errors',
  labelNames: ['error_type', 'service'],
  registers: [register],
});
// Express middleware that records one request count and one latency
// observation for every finished response.
//
// The `route` label is the matched route pattern (for example `/users/:id`)
// prefixed with the router mount path. Requests that match no route share the
// fixed value `unmatched` instead of their raw URL path, so arbitrary URLs
// cannot create an unbounded number of time series.
const metricsMiddleware = (req, res, next) => {
  const end = httpRequestDuration.startTimer();

  res.on('finish', () => {
    const route = req.route
      ? `${req.baseUrl || ''}${req.route.path}`
      : 'unmatched';
    const labels = {
      method: req.method,
      route,
      status_code: res.statusCode,
    };

    // Count the request.
    httpRequestCount.inc({ ...labels });
    // Stop the timer and record the elapsed time.
    end({ ...labels });
  });

  next();
};
// Express error-handling middleware: counts the error on `errorCounter`,
// labelled with the error's name, then passes the same error on to the next
// error handler. All four parameters are declared.
const errorMiddleware = (error, req, res, next) => {
  const { name: errorType } = error;
  const labels = { error_type: errorType, service: 'nodejs-api' };
  errorCounter.inc(labels);
  return next(error);
};
module.exports = {
register,
metricsMiddleware,
errorMiddleware
};
创建监控端点
const express = require('express');
const { register, metricsMiddleware } = require('./metrics');
const app = express();

// Record metrics for every request handled by this app.
app.use(metricsMiddleware);

// Scrape endpoint: serves the current contents of the metrics registry.
app.get('/metrics', async (req, res) => {
  try {
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (ex) {
    // res.end() accepts only a string or Buffer; passing the Error object
    // itself would throw inside this catch block, so send its message text.
    const message = ex instanceof Error ? ex.message : String(ex);
    res.status(500).set('Content-Type', 'text/plain').end(message);
  }
});

// Other application routes.
app.get('/', (req, res) => {
  res.json({ message: 'Hello World' });
});

module.exports = app;
Prometheus配置文件
创建prometheus.yml配置文件:
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Node.js application metrics.
  # The target uses the docker-compose service name: inside the Prometheus
  # container "localhost" is the Prometheus container itself, not the app.
  # Use 'localhost:3000' instead when Prometheus runs directly on the host.
  - job_name: 'nodejs-app'
    static_configs:
      - targets: ['nodejs-app:3000']
    metrics_path: '/metrics'
    scrape_interval: 5s

  # Host-level metrics.
  # NOTE(review): no node-exporter service is defined in the compose file
  # shown in this article; adjust or remove this target to match your setup.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
Docker部署示例
# Dockerfile
# Builds the image for the Node.js application.
FROM node:18-alpine
# All following paths are relative to /app inside the image.
WORKDIR /app
# Copy only the package manifests first, then install dependencies, so the
# install layer can be reused when only application code changes.
COPY package*.json ./
RUN npm install
# Copy the rest of the application source.
COPY . .
# Documents the port the container is expected to serve on.
EXPOSE 3000
# Start the application through the package's "start" script.
CMD ["npm", "start"]
# docker-compose.yml
version: '3.8'
services:
  nodejs-app:
    build: .
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
    depends_on:
      - prometheus

  prometheus:
    image: prom/prometheus:v2.37.0
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'

  grafana:
    image: grafana/grafana-enterprise:9.5.0
    ports:
      # Grafana listens on port 3000 inside the container. It is published on
      # host port 3001 because host port 3000 is already used by nodejs-app.
      - "3001:3000"
    depends_on:
      - prometheus
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
ELK日志收集系统
ELK Stack架构
ELK Stack由三个核心组件组成:
- Elasticsearch:分布式搜索和分析引擎
- Logstash:数据处理管道
- Kibana:数据可视化界面
Node.js日志收集配置
Winston日志库集成
npm install winston express-winston
const winston = require('winston');
const expressWinston = require('express-winston');
// 创建日志记录器
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
defaultMeta: { service: 'nodejs-api' },
transports: [
// 控制台输出
new winston.transports.Console({
format: winston.format.combine(
winston.format.colorize(),
winston.format.simple()
)
}),
// 文件输出
new winston.transports.File({
filename: 'logs/error.log',
level: 'error'
}),
// JSON文件输出
new winston.transports.File({
filename: 'logs/combined.log'
})
]
});
// Express中间件
const requestLogger = expressWinston.logger({
transports: [
new winston.transports.File({ filename: 'logs/request.log' })
],
format: winston.format.combine(
winston.format.json(),
winston.format.timestamp()
),
expressFormat: true,
colorize: false
});
// 错误日志中间件
const errorLogger = expressWinston.errorLogger({
transports: [
new winston.transports.File({ filename: 'logs/error.log' })
],
format: winston.format.combine(
winston.format.json(),
winston.format.timestamp()
)
});
module.exports = {
logger,
requestLogger,
errorLogger
};
结构化日志格式
// Build a structured logger for one service.
//
// Returns `{ logger, logWithContext }`: `logger` is the winston logger, and
// `logWithContext(level, message, context)` logs through it with the context
// fields plus an ISO timestamp attached to the entry.
const createStructuredLogger = (service) => {
  const { format, transports } = winston;

  // One console line: timestamp, level, message, then every remaining field
  // serialized as JSON.
  const renderLine = ({ timestamp, level, message, ...extra }) =>
    `${timestamp} [${level}] ${message} ${JSON.stringify(extra)}`;

  const structuredFormat = format.combine(
    format.timestamp(),
    format.errors({ stack: true }),
    format.json()
  );

  const consoleTransport = new transports.Console({
    format: format.combine(format.colorize(), format.printf(renderLine)),
  });

  const logger = winston.createLogger({
    level: 'info',
    format: structuredFormat,
    defaultMeta: { service },
    transports: [consoleTransport],
  });

  // Attach request context to a log entry; the timestamp set here replaces
  // any `timestamp` key supplied in the context.
  function logWithContext(level, message, context = {}) {
    const entry = Object.assign({}, context, {
      timestamp: new Date().toISOString(),
    });
    logger.log(level, message, entry);
  }

  return { logger, logWithContext };
};
// Usage example
const { logger, logWithContext } = createStructuredLogger('user-service');

app.get('/users/:id', async (req, res) => {
  // Captured up front so the success log can report the elapsed time.
  // (Without it the `responseTime` expression throws a ReferenceError, which
  // the catch block below turns into a 500 for every request.)
  const startTime = Date.now();
  const userId = req.params.id;

  logWithContext('info', 'Fetching user details', {
    userId,
    requestId: req.headers['x-request-id'],
    ip: req.ip,
  });

  try {
    const user = await getUserById(userId);
    logWithContext('info', 'User fetched successfully', {
      userId,
      responseTime: Date.now() - startTime,
    });
    res.json(user);
  } catch (error) {
    logWithContext('error', 'Failed to fetch user', {
      userId,
      error: error.message,
      stack: error.stack,
    });
    res.status(500).json({ error: 'Internal server error' });
  }
});
Logstash配置
创建logstash.conf文件:
input {
  beats {
    port => 5044
    host => "0.0.0.0"
  }

  # Tail the application's log files.
  file {
    path => "/app/logs/*.log"
    start_position => "beginning"
    # No read position is persisted, so files are re-read from the beginning
    # whenever Logstash restarts.
    sincedb_path => "/dev/null"
    type => "nodejs-logs"
  }
}

filter {
  if [type] == "nodejs-logs" {
    json {
      source => "message"
      skip_on_invalid_json => true
    }

    # Parse the entry timestamp. The Node.js loggers write ISO 8601
    # timestamps (Date#toISOString), so match that format.
    date {
      match => [ "timestamp", "ISO8601" ]
      target => "@timestamp"
    }

    # Tag application entries.
    mutate {
      add_tag => [ "nodejs", "application" ]
    }
  }

  # Tag error-level entries.
  if [level] == "error" {
    mutate {
      add_tag => [ "error" ]
    }
  }
}

output {
  elasticsearch {
    hosts => ["http://elasticsearch:9200"]
    index => "nodejs-app-%{+YYYY.MM.dd}"
  }

  stdout {
    codec => rubydebug
  }
}
Docker Compose部署
version: '3.8'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0
    container_name: elasticsearch
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms1g -Xmx1g
    ports:
      - "9200:9200"
    volumes:
      - esdata:/usr/share/elasticsearch/data
    networks:
      - elk

  logstash:
    image: docker.elastic.co/logstash/logstash:8.7.0
    container_name: logstash
    ports:
      - "5044:5044"
      - "9600:9600"
    volumes:
      - ./logstash.conf:/usr/share/logstash/pipeline/logstash.conf
      - ./logs:/app/logs
    depends_on:
      - elasticsearch
    networks:
      - elk

  kibana:
    image: docker.elastic.co/kibana/kibana:8.7.0
    container_name: kibana
    ports:
      - "5601:5601"
    depends_on:
      - elasticsearch
    networks:
      - elk

  nodejs-app:
    build: .
    container_name: nodejs-app
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      - LOG_LEVEL=info
    volumes:
      # Share the application's log directory with the logstash service,
      # which mounts the same host directory at /app/logs. Without this mount
      # the log files stay inside the app container and are never collected.
      - ./logs:/app/logs
    depends_on:
      - elasticsearch
      - logstash
    networks:
      - elk

volumes:
  esdata:

networks:
  elk:
    driver: bridge
高级监控功能实现
自定义指标收集
const client = require('prom-client');
// Gauge for the number of database pool connections, labelled by pool name.
const dbPool = new client.Gauge({
  name: 'db_pool_connections',
  help: 'Number of database connections in pool',
  labelNames: ['pool_name'],
});

// Cache counters (hits, misses, evictions), built from a small spec table:
// [property name, metric name, help text].
const cacheMetrics = Object.fromEntries(
  [
    ['hits', 'cache_hits_total', 'Total number of cache hits'],
    ['misses', 'cache_misses_total', 'Total number of cache misses'],
    ['evictions', 'cache_evictions_total', 'Total number of cache evictions'],
  ].map(([key, name, help]) => [key, new client.Counter({ name, help })])
);
// Response-time quantiles per endpoint and method.
// `percentiles` is an option of prom-client's Summary type; a Histogram is
// configured with `buckets` and would not apply the requested percentiles,
// so a Summary is used here. Both types expose observe() and startTimer().
const responseTimeQuantiles = new client.Summary({
  name: 'response_time_seconds',
  help: 'Response time in seconds',
  labelNames: ['endpoint', 'method'],
  percentiles: [0.5, 0.9, 0.95, 0.99],
});
// Gauge reporting process memory in bytes, one series per `type` label.
const memoryUsage = new client.Gauge({
  name: 'nodejs_memory_usage_bytes',
  help: 'Memory usage in bytes',
  labelNames: ['type'],
});

// Refresh the gauge every 5 seconds from process.memoryUsage().
setInterval(() => {
  const snapshot = process.memoryUsage();
  for (const type of ['rss', 'heapTotal', 'heapUsed']) {
    memoryUsage.set({ type }, snapshot[type]);
  }
}, 5000);

module.exports = { dbPool, cacheMetrics, responseTimeQuantiles, memoryUsage };
异常监控与告警
const { errorCounter } = require('./metrics');
const axios = require('axios');
// Exception handling and monitoring.
//
// Counts every reported error on the Prometheus `errorCounter`, keeps a
// per-error tally, and sends a webhook alert once the same error has been
// seen `alertThreshold` times within `alertWindow` milliseconds.
class ErrorMonitor {
  constructor() {
    // errorKey -> { count, timestamp }; `timestamp` is the start of the
    // current counting window for that error.
    this.errorCounts = new Map();
    this.alertThreshold = 10;
    this.alertWindow = 60000; // 1 minute
  }

  /**
   * Record one error and alert when it has reached the threshold.
   *
   * @param {*} error - The thrown value or rejection reason. It may be any
   *   value, not only an Error, so it is normalized before use.
   * @param {object} [context] - Extra details included in the alert payload.
   * @returns {Promise<void>}
   */
  async handleException(error, context = {}) {
    const isErrorLike =
      error != null && typeof error === 'object' && typeof error.message === 'string';
    const err = isErrorLike ? error : new Error(String(error));
    const errorType = err.name || 'Error';
    const errorKey = `${errorType}_${err.message.substring(0, 50)}`;

    // Record the error. Only the labels declared on the counter are passed:
    // prom-client rejects label names that are not in the metric's labelNames,
    // so the context must not be spread into the label set.
    errorCounter.inc({
      error_type: errorType,
      service: 'nodejs-api',
    });

    // Drop tallies whose window has expired, so the threshold applies per
    // window and the map does not grow without bound.
    this.cleanup();

    // Track the error frequency.
    let errorInfo = this.errorCounts.get(errorKey);
    if (!errorInfo) {
      errorInfo = { count: 0, timestamp: Date.now() };
      this.errorCounts.set(errorKey, errorInfo);
    }
    errorInfo.count++;

    // Alert when the threshold is reached.
    if (errorInfo.count >= this.alertThreshold) {
      const count = errorInfo.count;

      // Reset before awaiting so errors reported while the alert is being
      // sent start a fresh window instead of triggering duplicate alerts.
      errorInfo.count = 0;
      errorInfo.timestamp = Date.now();

      await this.sendAlert({
        error: errorType,
        message: err.message,
        context,
        count,
        timestamp: new Date().toISOString(),
      });
    }
  }

  /**
   * Post the alert to the webhook in ALERT_WEBHOOK_URL (if set) and log it.
   * Failures are logged, never thrown.
   *
   * @param {object} alertData - Fields: error, message, context, count, timestamp.
   * @returns {Promise<void>}
   */
  async sendAlert(alertData) {
    try {
      // Send to the alerting system (e.g. Slack, email).
      const webhookUrl = process.env.ALERT_WEBHOOK_URL;
      if (webhookUrl) {
        await axios.post(webhookUrl, {
          text: `⚠️ Node.js Application Alert`,
          attachments: [{
            color: 'danger',
            fields: [
              {
                title: 'Error Type',
                value: alertData.error,
                short: true,
              },
              {
                title: 'Message',
                value: alertData.message,
                short: false,
              },
              {
                title: 'Count',
                value: alertData.count,
                short: true,
              },
            ],
          }],
        });
      }

      console.error('Alert sent:', JSON.stringify(alertData));
    } catch (alertError) {
      console.error('Failed to send alert:', alertError);
    }
  }

  // Remove tallies whose counting window started more than `alertWindow`
  // milliseconds ago.
  cleanup() {
    const now = Date.now();
    for (const [key, info] of this.errorCounts.entries()) {
      if (now - info.timestamp > this.alertWindow) {
        this.errorCounts.delete(key);
      }
    }
  }
}
const errorMonitor = new ErrorMonitor();
// Global error handling.

// Forward a process-level error to the error monitor. A failure inside the
// monitor is caught and logged here: left unhandled, it would itself raise an
// 'unhandledRejection' event and re-enter the handler below.
const reportProcessError = async (error, type) => {
  try {
    await errorMonitor.handleException(error, { type });
  } catch (monitorError) {
    console.error('Failed to record process error:', monitorError);
  }
};

// NOTE(review): the Node.js documentation advises against resuming normal
// operation after 'uncaughtException'; consider shutting down after reporting.
process.on('uncaughtException', (error) => {
  console.error('Uncaught Exception:', error);
  void reportProcessError(error, 'uncaught_exception');
});

process.on('unhandledRejection', (reason, promise) => {
  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
  void reportProcessError(reason, 'unhandled_rejection');
});
module.exports = errorMonitor;
监控面板和可视化
Grafana仪表板配置
创建grafana-dashboard.json:
{
"dashboard": {
"id": null,
"title": "Node.js Application Dashboard",
"tags": ["nodejs", "prometheus"],
"timezone": "browser",
"schemaVersion": 16,
"version": 0,
"refresh": "5s",
"panels": [
{
"id": 1,
"title": "HTTP Request Rate",
"type": "graph",
"datasource": "Prometheus",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Request Duration",
"type": "graph",
"datasource": "Prometheus",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "95th percentile"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "Error Rate",
"type": "graph",
"datasource": "Prometheus",
"targets": [
{
"expr": "rate(app_errors_total[5m])",
"legendFormat": "{{error_type}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
}
},
{
"id": 4,
"title": "Memory Usage",
"type": "graph",
"datasource": "Prometheus",
"targets": [
{
"expr": "nodejs_memory_usage_bytes",
"legendFormat": "{{type}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
}
}
]
}
}
告警规则配置
# alerting-rules.yml
groups:
  - name: nodejs-app-alerts
    rules:
      # More than one application error per second, sustained for 2 minutes.
      - alert: HighErrorRate
        expr: rate(app_errors_total[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second"

      # 95th percentile request duration above 5 seconds for 2 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }} seconds"

      # RSS above 1073741824 bytes (1 GiB) for 5 minutes.
      - alert: MemoryUsageHigh
        expr: nodejs_memory_usage_bytes{type="rss"} > 1073741824
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "RSS memory usage is {{ $value }} bytes"
最佳实践和优化建议
性能优化
// 优化指标收集性能
const client = require('prom-client');
// Label-value cache intended to reduce repeated allocations.
class OptimizedMetrics {
  constructor() {
    // Label values seen so far. NOTE(review): this set is never pruned.
    this.commonLabels = new Set();
    // "labelName:value" -> cached value.
    this.labelCache = new Map();
    // "labelName:value" -> time of last access (ms since epoch). Read by
    // cleanupCache(); it has to exist before the first cleanup runs.
    this.cacheTimestamps = new Map();

    // Clean the cache every 5 minutes.
    this.cleanupTimer = setInterval(() => {
      this.cleanupCache();
    }, 300000);
    // Housekeeping must not keep the process alive on its own.
    if (typeof this.cleanupTimer.unref === 'function') {
      this.cleanupTimer.unref();
    }
  }

  /**
   * Return the cached value for a label, caching it on first use.
   *
   * @param {string} labelName - Name of the label.
   * @param {*} value - Label value.
   * @returns {*} The cached value for `labelName:value`.
   */
  getLabelValue(labelName, value) {
    const key = `${labelName}:${value}`;
    if (!this.labelCache.has(key)) {
      this.labelCache.set(key, value);
      this.commonLabels.add(value);
    }
    // Refresh the access time so entries in active use are not evicted.
    this.cacheTimestamps.set(key, Date.now());
    return this.labelCache.get(key);
  }

  // Evict cache entries that have not been accessed for more than one hour.
  cleanupCache() {
    const now = Date.now();
    for (const [key, timestamp] of this.cacheTimestamps.entries()) {
      if (now - timestamp > 3600000) {
        this.labelCache.delete(key);
        this.cacheTimestamps.delete(key);
      }
    }
  }
}
/**
 * Create a histogram whose timers also watch the cost of recording.
 *
 * The returned histogram's startTimer() behaves like the original one, but
 * the stop function it returns measures how long the recording call itself
 * takes and logs a warning when that exceeds 100 ms. Only the recording call
 * is measured, not the duration being observed, so slow requests do not
 * trigger the warning.
 *
 * @param {string} name - Metric name.
 * @param {string} help - Metric help text.
 * @param {string[]} labelNames - Label names for the metric.
 * @param {number[]} buckets - Histogram bucket boundaries.
 * @returns {object} The histogram instance with startTimer() wrapped.
 */
const createOptimizedHistogram = (name, help, labelNames, buckets) => {
  const histogram = new client.Histogram({
    name,
    help,
    labelNames,
    buckets,
  });

  const originalStartTimer = histogram.startTimer;

  histogram.startTimer = function (...args) {
    const stopTimer = originalStartTimer.apply(this, args);

    return (labels) => {
      const recordStart = process.hrtime.bigint();
      const result = stopTimer(labels);
      const recordMs = Number(process.hrtime.bigint() - recordStart) / 1000000; // ns -> ms

      // Warn when recording the observation was itself slow.
      if (recordMs > 100) {
        console.warn(`Histogram ${name} took ${recordMs}ms to record`);
      }

      return result;
    };
  };

  return histogram;
};
日志管理策略
// 日志轮转配置
const winston = require('winston');
require('winston-daily-rotate-file');
// Create a winston logger for `serviceName` that writes JSON entries to two
// daily-rotated files (errors only, and everything) and to the console.
const createRotatingLogger = (serviceName) => {
  const { format, transports } = winston;

  const entryFormat = format.combine(
    format.timestamp(),
    format.errors({ stack: true }),
    format.json()
  );

  // Rotation settings shared by both file transports.
  const rotation = {
    datePattern: 'YYYY-MM-DD',
    zippedArchive: true,
    maxSize: '20m',
    maxFiles: '14d',
  };

  // Rotated file for error-level entries.
  const errorFile = new transports.DailyRotateFile({
    filename: 'logs/error-%DATE%.log',
    ...rotation,
    level: 'error',
  });

  // Rotated file for all entries.
  const combinedFile = new transports.DailyRotateFile({
    filename: 'logs/combined-%DATE%.log',
    ...rotation,
  });

  // Console output.
  const consoleOutput = new transports.Console({
    format: format.combine(format.colorize(), format.simple()),
  });

  return winston.createLogger({
    level: 'info',
    format: entryFormat,
    defaultMeta: { service: serviceName },
    transports: [errorFile, combinedFile, consoleOutput],
  });
};
// Log cleanup script

/**
 * Delete old log files from a directory.
 *
 * A directory entry is a candidate when its name starts with `log-` or ends
 * with `.log`, it is a regular file, and it was last modified more than
 * `maxAgeMs` milliseconds ago. Errors are logged and never thrown; a failure
 * on one entry does not stop the remaining entries from being processed.
 *
 * @param {string} [logsDir='./logs'] - Directory to clean.
 * @param {number} [maxAgeMs] - Maximum age in milliseconds; defaults to one week.
 * @returns {Promise<string[]>} Names of the files that were deleted.
 */
const cleanupOldLogs = async (logsDir = './logs', maxAgeMs = 7 * 24 * 60 * 60 * 1000) => {
  // Promise-based fs API so the cleanup does not block the event loop.
  const fs = require('fs').promises;
  const path = require('path');

  const deleted = [];

  let files;
  try {
    files = await fs.readdir(logsDir);
  } catch (error) {
    console.error('Log cleanup error:', error);
    return deleted;
  }

  const cutoff = Date.now() - maxAgeMs;

  for (const file of files) {
    if (!(file.startsWith('log-') || file.endsWith('.log'))) {
      continue;
    }

    const filePath = path.join(logsDir, file);
    try {
      const stats = await fs.stat(filePath);
      // Skip directories and other non-file entries that match the pattern.
      if (!stats.isFile()) {
        continue;
      }
      if (stats.mtime.getTime() < cutoff) {
        await fs.unlink(filePath);
        deleted.push(file);
        console.log(`Deleted old log file: ${file}`);
      }
    } catch (error) {
      console.error('Log cleanup error:', error);
    }
  }

  return deleted;
};

// Run the cleanup once a day. The timer is unref'ed so this housekeeping job
// does not keep the process alive on its own.
setInterval(() => {
  cleanupOldLogs();
}, 24 * 60 * 60 * 1000).unref();
安全配置
// 监控端点安全配置
const express = require('express');
const helmet = require('helmet');
const rateLimit = require('express-rate-limit');
/**
 * Apply security middleware to an Express app and protect the `/metrics`
 * endpoint.
 *
 * Adds helmet's security headers, rate-limits `/metrics`, and — when NODE_ENV
 * is `production` — requires `Authorization: Bearer <MONITORING_TOKEN>` on
 * `/metrics`. The guard is registered as a route handler, so this function
 * has to be called before the actual `/metrics` handler is registered.
 *
 * @param {object} app - Express application.
 * @returns {object} The same application instance.
 */
const secureApp = (app) => {
  const crypto = require('crypto');

  // Compare two strings without leaking, through timing, where they differ.
  // Both sides are hashed first because timingSafeEqual requires inputs of
  // equal length.
  const digest = (text) => crypto.createHash('sha256').update(text).digest();
  const safeEqual = (left, right) => crypto.timingSafeEqual(digest(left), digest(right));

  // Security headers
  app.use(helmet());

  // Rate limiting for the metrics endpoint
  const limiter = rateLimit({
    windowMs: 15 * 60 * 1000, // 15 minutes
    max: 100, // at most 100 requests per IP
  });

  app.use('/metrics', limiter);

  // Metrics endpoint authentication (production only)
  if (process.env.NODE_ENV === 'production') {
    app.get('/metrics', (req, res, next) => {
      const expectedToken = process.env.MONITORING_TOKEN;
      const auth = req.headers.authorization;

      // Fail closed when no token is configured: otherwise the expected
      // header would be the literal string "Bearer undefined".
      const authorized =
        Boolean(expectedToken) &&
        typeof auth === 'string' &&
        safeEqual(auth, `Bearer ${expectedToken}`);

      if (!authorized) {
        return res.status(401).json({ error: 'Unauthorized' });
      }

      next();
    });
  }

  return app;
};
module.exports = secureApp;
总结
通过本文的详细介绍,我们了解了如何为Node.js后端服务构建完整的可观测性体系:使用Prometheus收集指标数据,并使用ELK Stack收集和分析日志。

评论 (0)