Node.js后端服务监控与日志收集最佳实践:基于Prometheus和ELK Stack的可观测性建设

SwiftGuru
SwiftGuru 2026-01-14T21:08:40+08:00
0 0 0

引言

在现代软件开发中,构建高可用、可扩展的应用系统已经成为基本要求。对于Node.js后端服务而言,良好的可观测性是保障系统稳定运行的关键。可观测性不仅包括对应用性能的监控,还涵盖了日志收集、错误追踪、指标分析等多个维度。

本文将深入探讨如何为Node.js后端服务构建完整的可观测性体系,重点介绍如何集成Prometheus监控系统和ELK(Elasticsearch, Logstash, Kibana)日志收集栈。通过实际的技术方案和代码示例,帮助开发者构建一套高效、可靠的监控与日志分析系统。

什么是可观测性

可观测性是现代软件工程中的重要概念,它指的是通过系统的输出来理解系统内部状态的能力。对于后端服务而言,可观测性主要包含三个核心维度:

  1. 指标(Metrics):量化系统运行状态的数值数据
  2. 日志(Logs):系统运行过程中的详细记录
  3. 追踪(Traces):请求在分布式系统中的完整路径

这三者相互配合,共同构成了完整的可观测性体系。在Node.js应用中,我们主要通过Prometheus来收集指标数据,通过ELK Stack来处理和分析日志信息。

Prometheus监控系统集成

Prometheus简介

Prometheus是一个开源的系统监控和告警工具包,特别适合云原生环境下的监控需求。它采用多维数据模型,通过HTTP拉取方式获取指标数据,具有强大的查询语言PromQL。

Node.js Prometheus客户端集成

首先,我们需要在Node.js项目中集成Prometheus客户端库:

npm install prom-client

基础监控指标配置

const client = require('prom-client');
const express = require('express');

// Registry that this module fills and exports.
const { collectDefaultMetrics, Registry } = client;
const register = new Registry();

// Default process metrics (CPU, memory, ...) are collected into `register`.
collectDefaultMetrics({ register });

// Custom metrics.
// HTTP metrics share one label set so they can be joined in queries.
const httpLabelNames = ['method', 'route', 'status_code'];

const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: [...httpLabelNames],
  buckets: [0.1, 0.5, 1, 2, 5, 10]
});

const httpRequestCount = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: [...httpLabelNames]
});

const errorCounter = new client.Counter({
  name: 'app_errors_total',
  help: 'Total number of application errors',
  labelNames: ['error_type', 'service']
});

// Attach the custom metrics to the registry, in declaration order.
for (const metric of [httpRequestDuration, httpRequestCount, errorCounter]) {
  register.registerMetric(metric);
}

// Express middleware that records one request count and one latency
// observation for every response that finishes.
//
// The `route` label is always a route *pattern* (e.g. `/api/users/:id`), never
// the concrete URL. Labelling by raw path creates a new time series for every
// distinct URL (ids, scanner probes, typos), which grows without bound.
// Requests that matched no route are grouped under the constant `unmatched`.
const metricsMiddleware = (req, res, next) => {
  const end = httpRequestDuration.startTimer();

  res.on('finish', () => {
    // req.route is only set when a route handler matched. req.baseUrl carries
    // the mount path when the route lives on a sub-router.
    const route = req.route
      ? `${req.baseUrl || ''}${req.route.path}`
      : 'unmatched';
    const buildLabels = () => ({
      method: req.method,
      route,
      status_code: res.statusCode
    });

    // Count the request.
    httpRequestCount.inc(buildLabels());

    // Stop the timer started above and record the duration.
    end(buildLabels());
  });

  next();
};

// Express error-handling middleware: counts the error in `app_errors_total`
// and passes it on unchanged to the next error handler.
const errorMiddleware = (error, req, res, next) => {
  // Anything can be handed to next(), including strings or plain objects that
  // have no `name`; fall back to a fixed label value instead of `undefined`.
  const errorType = (error && error.name) || 'UnknownError';

  errorCounter.inc({
    error_type: errorType,
    service: 'nodejs-api'
  });

  next(error);
};

module.exports = {
  register,
  metricsMiddleware,
  errorMiddleware
};

创建监控端点

const express = require('express');
const { register, metricsMiddleware } = require('./metrics');

const app = express();

// Install the metrics middleware before any route so every request is measured.
app.use(metricsMiddleware);

// Scrape endpoint: serialises everything held by `register`.
app.get('/metrics', async (req, res) => {
  try {
    res.set('Content-Type', register.contentType);
    res.end(await register.metrics());
  } catch (ex) {
    // res.end() accepts only a string or Buffer; handing it the Error object
    // would throw inside this handler, so send the message text instead.
    const message = ex instanceof Error ? ex.message : String(ex);
    res.status(500).set('Content-Type', 'text/plain').end(message);
  }
});

// Other application routes
app.get('/', (req, res) => {
  res.json({ message: 'Hello World' });
});

module.exports = app;

Prometheus配置文件

创建prometheus.yml配置文件:

global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'nodejs-app'
    # docker-compose mounts this file into the Prometheus container, where
    # "localhost" is the Prometheus container itself. Address the app by its
    # compose service name instead. Use 'localhost:3000' only when Prometheus
    # runs directly on the same host as the app.
    static_configs:
      - targets: ['nodejs-app:3000']
    metrics_path: '/metrics'
    scrape_interval: 5s

  - job_name: 'node-exporter'
    # NOTE(review): no node-exporter service is defined in the compose files in
    # this article; this target stays down until one is added and the address
    # is pointed at it.
    static_configs:
      - targets: ['localhost:9100']

Docker部署示例

# Dockerfile
# Image for the Node.js application, based on Node 18 on Alpine.
FROM node:18-alpine

# All following paths (and the running process) are relative to /app.
WORKDIR /app

# Copy only the package manifests first and install dependencies, so the
# install layer can be reused when only application code changes.
COPY package*.json ./
RUN npm install

# Copy the rest of the application source.
COPY . .

# Port the application listens on (matches the 3000:3000 compose mapping).
EXPOSE 3000

# Start through the package.json "start" script.
CMD ["npm", "start"]
# docker-compose.yml
# Application + Prometheus + Grafana.
version: '3.8'

services:
  nodejs-app:
    build: .
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
    depends_on:
      - prometheus

  prometheus:
    image: prom/prometheus:v2.37.0
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'

  grafana:
    image: grafana/grafana-enterprise:9.5.0
    ports:
      # Host port 3001 (3000 is taken by nodejs-app) -> Grafana's in-container
      # port 3000. Mapping to container port 3001 reaches nothing, because
      # Grafana does not listen there by default.
      - "3001:3000"
    depends_on:
      - prometheus
    environment:
      # Default admin password: change it before exposing Grafana anywhere.
      - GF_SECURITY_ADMIN_PASSWORD=admin

ELK日志收集系统

ELK Stack架构

ELK Stack由三个核心组件组成:

  • Elasticsearch:分布式搜索和分析引擎
  • Logstash:数据处理管道
  • Kibana:数据可视化界面

Node.js日志收集配置

Winston日志库集成

# winston is the logger; express-winston provides the request/error logging
# middleware that the code below loads with require('express-winston').
npm install winston express-winston
const winston = require('winston');
const expressWinston = require('express-winston');

// Application logger.
// Entries are JSON with a timestamp and, for errors, the stack trace; every
// entry carries `service: 'nodejs-api'`.
const logger = winston.createLogger({
  // Honour the LOG_LEVEL variable set in the compose file; default to 'info'.
  level: process.env.LOG_LEVEL || 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    winston.format.json()
  ),
  defaultMeta: { service: 'nodejs-api' },
  transports: [
    // Console output: colourised, human-readable
    new winston.transports.Console({
      format: winston.format.combine(
        winston.format.colorize(),
        winston.format.simple()
      )
    }),

    // File output: errors only
    new winston.transports.File({
      filename: 'logs/error.log',
      level: 'error'
    }),

    // File output: everything at or above the logger level, as JSON
    new winston.transports.File({
      filename: 'logs/combined.log'
    })
  ]
});

// Express request-logging middleware (one entry per request, written to
// logs/request.log).
//
// Format order matters: combine() applies its formats in sequence and json()
// produces the final serialised line, so timestamp() has to run before it.
// With json() first, the written entries contain no timestamp field.
const requestLogger = expressWinston.logger({
  transports: [
    new winston.transports.File({ filename: 'logs/request.log' })
  ],
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()
  ),
  expressFormat: true,
  colorize: false
});

// Express error-logging middleware (written to logs/error.log).
// Same ordering rule as above: timestamp() first, json() last.
const errorLogger = expressWinston.errorLogger({
  transports: [
    new winston.transports.File({ filename: 'logs/error.log' })
  ],
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()
  )
});

module.exports = {
  logger,
  requestLogger,
  errorLogger
};

结构化日志格式

// Builds a console logger for one service together with a helper that logs a
// message plus a context object.
//
// Returns `{ logger, logWithContext }`:
//   - logger: the winston logger (level 'info', defaultMeta `{ service }`)
//   - logWithContext(level, message, context = {}): logs `message` at `level`
//     with the context fields and an ISO timestamp as metadata
const createStructuredLogger = (service) => {
  const { combine, timestamp, errors, json, colorize, printf } = winston.format;

  // Console line layout: "<timestamp> [<level>] <message> <remaining fields as JSON>"
  const renderLine = ({ timestamp: ts, level, message, ...meta }) =>
    `${ts} [${level}] ${message} ${JSON.stringify(meta)}`;

  const baseFormat = combine(timestamp(), errors({ stack: true }), json());
  const consoleFormat = combine(colorize(), printf(renderLine));
  const consoleTransport = new winston.transports.Console({ format: consoleFormat });

  const logger = winston.createLogger({
    level: 'info',
    format: baseFormat,
    defaultMeta: { service },
    transports: [consoleTransport]
  });

  // Attach request context (and the time of the call) to a log entry.
  function logWithContext(level, message, context = {}) {
    const meta = Object.assign({}, context, {
      timestamp: new Date().toISOString()
    });
    logger.log(level, message, meta);
  }

  return { logger, logWithContext };
};

// Usage example
const { logger, logWithContext } = createStructuredLogger('user-service');

// GET /users/:id logs the lookup, its outcome and how long it took.
app.get('/users/:id', async (req, res) => {
  const userId = req.params.id;
  // Taken before any work so responseTime covers the whole lookup. Without it
  // the success branch throws a ReferenceError and every request ends as a 500.
  const startTime = Date.now();

  logWithContext('info', 'Fetching user details', {
    userId,
    requestId: req.headers['x-request-id'],
    ip: req.ip
  });

  try {
    const user = await getUserById(userId);
    logWithContext('info', 'User fetched successfully', {
      userId,
      responseTime: Date.now() - startTime
    });

    res.json(user);
  } catch (error) {
    logWithContext('error', 'Failed to fetch user', {
      userId,
      error: error.message,
      stack: error.stack
    });

    // Details stay in the log; the client only gets a generic message.
    res.status(500).json({ error: 'Internal server error' });
  }
});

Logstash配置

创建logstash.conf文件:

input {
  # Events shipped by Beats agents
  beats {
    port => 5044
    host => "0.0.0.0"
  }

  # Tail the application's log files
  file {
    path => "/app/logs/*.log"
    start_position => "beginning"
    # NOTE(review): with no persisted read position and start_position
    # "beginning", files are read again from the start after every Logstash
    # restart, which re-indexes old entries. Confirm this is intended outside
    # of local testing.
    sincedb_path => "/dev/null"
    type => "nodejs-logs"
  }
}

filter {
  if [type] == "nodejs-logs" {
    # Each line is one JSON document; expand it into event fields
    json {
      source => "message"
      skip_on_invalid_json => true
    }

    # Parse the entry's own timestamp into @timestamp.
    # The application writes ISO-8601 strings (e.g. 2024-01-01T12:00:00.000Z),
    # which the pattern "yyyy-MM-dd HH:mm:ss.SSS" does not match.
    date {
      match => [ "timestamp", "ISO8601" ]
      target => "@timestamp"
    }

    # Tag application events
    mutate {
      add_tag => [ "nodejs", "application" ]
    }
  }

  # Tag error-level entries
  if [level] == "error" {
    mutate {
      add_tag => [ "error" ]
    }
  }
}

output {
  # One index per day
  elasticsearch {
    hosts => ["http://elasticsearch:9200"]
    index => "nodejs-app-%{+YYYY.MM.dd}"
  }

  # Echo events to stdout for debugging
  stdout {
    codec => rubydebug
  }
}

Docker Compose部署

version: '3.8'

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0
    container_name: elasticsearch
    environment:
      - discovery.type=single-node
      # Security is switched off here; do not expose this setup publicly.
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms1g -Xmx1g
    ports:
      - "9200:9200"
    volumes:
      - esdata:/usr/share/elasticsearch/data
    networks:
      - elk

  logstash:
    image: docker.elastic.co/logstash/logstash:8.7.0
    container_name: logstash
    ports:
      - "5044:5044"
      - "9600:9600"
    volumes:
      - ./logstash.conf:/usr/share/logstash/pipeline/logstash.conf
      # Log files read by the pipeline's file input (/app/logs/*.log)
      - ./logs:/app/logs
    depends_on:
      - elasticsearch
    networks:
      - elk

  kibana:
    image: docker.elastic.co/kibana/kibana:8.7.0
    container_name: kibana
    ports:
      - "5601:5601"
    depends_on:
      - elasticsearch
    networks:
      - elk

  nodejs-app:
    build: .
    container_name: nodejs-app
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      - LOG_LEVEL=info
    volumes:
      # The app writes to logs/ under its working directory /app. Share that
      # directory with the host folder Logstash mounts; otherwise the log files
      # stay inside this container and Logstash never sees them.
      - ./logs:/app/logs
    depends_on:
      - elasticsearch
      - logstash
    networks:
      - elk

volumes:
  esdata:

networks:
  elk:
    driver: bridge

高级监控功能实现

自定义指标收集

const client = require('prom-client');

// Gauge for the number of connections in a database pool, labelled by pool name.
const dbPool = new client.Gauge({
  labelNames: ['pool_name'],
  help: 'Number of database connections in pool',
  name: 'db_pool_connections'
});

// Cache counters: one Counter each for hits, misses and evictions, exposed as
// cache_hits_total, cache_misses_total and cache_evictions_total.
const cacheMetrics = Object.fromEntries(
  ['hits', 'misses', 'evictions'].map((event) => [
    event,
    new client.Counter({
      name: `cache_${event}_total`,
      help: `Total number of cache ${event}`
    })
  ])
);

// Response-time quantiles (p50, p90, p95, p99) per endpoint and method.
// `percentiles` is an option of Summary; a Histogram is configured with
// `buckets` instead, so the quantiles listed here only take effect on a
// Summary. Both types offer observe() and startTimer().
const responseTimeQuantiles = new client.Summary({
  name: 'response_time_seconds',
  help: 'Response time in seconds',
  labelNames: ['endpoint', 'method'],
  percentiles: [0.5, 0.9, 0.95, 0.99]
});

// Memory usage of this process in bytes, one series per `type`
// (rss, heapTotal, heapUsed).
const memoryUsage = new client.Gauge({
  name: 'nodejs_memory_usage_bytes',
  help: 'Memory usage in bytes',
  labelNames: ['type']
});

// Copies the current process.memoryUsage() figures into the gauge.
const updateMemoryUsage = () => {
  const { rss, heapTotal, heapUsed } = process.memoryUsage();
  memoryUsage.set({ type: 'rss' }, rss);
  memoryUsage.set({ type: 'heapTotal' }, heapTotal);
  memoryUsage.set({ type: 'heapUsed' }, heapUsed);
};

// Set the values once right away so the series exist before the first tick,
// then refresh every 5 seconds. unref() keeps this timer from being the only
// thing that holds the process open (for example during shutdown).
updateMemoryUsage();
setInterval(updateMemoryUsage, 5000).unref();

module.exports = {
  dbPool,
  cacheMetrics,
  responseTimeQuantiles,
  memoryUsage
};

异常监控与告警

const { errorCounter } = require('./metrics');
const axios = require('axios');

// Exception handling and monitoring.
//
// Every reported error is counted in the `app_errors_total` counter. When the
// same error (name + first 50 characters of the message) is reported
// `alertThreshold` times within `alertWindow` milliseconds, one alert is sent
// and the count starts over.
class ErrorMonitor {
  constructor() {
    // errorKey -> { count, timestamp }: count within the current window and
    // the time that window started.
    this.errorCounts = new Map();
    this.alertThreshold = 10;
    this.alertWindow = 60000; // 1 minute
    // Time of the last sweep of expired entries (see handleException).
    this.lastCleanup = Date.now();
  }

  /**
   * Records one error and sends an alert if it keeps recurring.
   *
   * @param {*} error - The error. Values that are not Error instances (for
   *   example a promise rejected with a string) are wrapped in an Error.
   * @param {object} [context={}] - Extra information included in the alert
   *   payload. It is not used for metric labels.
   * @returns {Promise<void>}
   */
  async handleException(error, context = {}) {
    const err = error instanceof Error ? error : new Error(String(error));
    const errorKey = `${err.name}_${err.message.substring(0, 50)}`;

    // Only the labels declared on the counter are passed. Spreading `context`
    // in here adds keys such as `type` that the counter was not created with.
    errorCounter.inc({
      error_type: err.name,
      service: 'nodejs-api'
    });

    const now = Date.now();

    // Drop expired entries at most once per window so the map cannot grow
    // forever when many distinct errors are reported.
    if (now - this.lastCleanup > this.alertWindow) {
      this.cleanup();
      this.lastCleanup = now;
    }

    // Start a fresh window when the error is new or its window has elapsed.
    let errorInfo = this.errorCounts.get(errorKey);
    if (!errorInfo || now - errorInfo.timestamp > this.alertWindow) {
      errorInfo = { count: 0, timestamp: now };
      this.errorCounts.set(errorKey, errorInfo);
    }
    errorInfo.count++;

    if (errorInfo.count >= this.alertThreshold) {
      const count = errorInfo.count;

      // Reset before awaiting: reports that arrive while the alert request is
      // still in flight must not each trigger another alert.
      errorInfo.count = 0;
      errorInfo.timestamp = now;

      await this.sendAlert({
        error: err.name,
        message: err.message,
        context,
        count,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Posts the alert to the webhook in ALERT_WEBHOOK_URL (if set) and writes
   * it to stderr. Failures are logged and never thrown.
   *
   * @param {{error: string, message: string, context: object, count: number, timestamp: string}} alertData
   * @returns {Promise<void>}
   */
  async sendAlert(alertData) {
    try {
      // Send to the alerting system (Slack, e-mail gateway, ...)
      const webhookUrl = process.env.ALERT_WEBHOOK_URL;

      if (webhookUrl) {
        await axios.post(webhookUrl, {
          text: `⚠️ Node.js Application Alert`,
          attachments: [{
            color: 'danger',
            fields: [
              {
                title: 'Error Type',
                value: alertData.error,
                short: true
              },
              {
                title: 'Message',
                value: alertData.message,
                short: false
              },
              {
                title: 'Count',
                value: alertData.count,
                short: true
              }
            ]
          }]
        });
      }

      console.error('Alert sent:', JSON.stringify(alertData));
    } catch (alertError) {
      console.error('Failed to send alert:', alertError);
    }
  }

  // Removes entries whose window started more than `alertWindow` ms ago.
  cleanup() {
    const now = Date.now();
    for (const [key, info] of this.errorCounts.entries()) {
      if (now - info.timestamp > this.alertWindow) {
        this.errorCounts.delete(key);
      }
    }
  }
}

const errorMonitor = new ErrorMonitor();

// Hands an error to the monitor and makes sure a failure inside the monitor is
// only logged. handleException() is async: if its promise rejected unobserved,
// that rejection would fire the 'unhandledRejection' handler below, which
// reports to the monitor again, and so on in a loop.
const reportToMonitor = (error, context) => {
  errorMonitor.handleException(error, context).catch((monitorError) => {
    console.error('Error monitor failed:', monitorError);
  });
};

// Global error handling
// NOTE(review): after an 'uncaughtException' the process keeps running here.
// Node's documentation advises against resuming normal operation after this
// event; confirm whether the service should shut down once the report is sent.
process.on('uncaughtException', (error) => {
  console.error('Uncaught Exception:', error);
  reportToMonitor(error, { type: 'uncaught_exception' });
});

process.on('unhandledRejection', (reason, promise) => {
  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
  reportToMonitor(reason, { type: 'unhandled_rejection' });
});

module.exports = errorMonitor;

监控面板和可视化

Grafana仪表板配置

创建grafana-dashboard.json

{
  "dashboard": {
    "id": null,
    "title": "Node.js Application Dashboard",
    "tags": ["nodejs", "prometheus"],
    "timezone": "browser",
    "schemaVersion": 16,
    "version": 0,
    "refresh": "5s",
    "panels": [
      {
        "id": 1,
        "title": "HTTP Request Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{route}}"
          }
        ],
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 0
        }
      },
      {
        "id": 2,
        "title": "Request Duration",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "95th percentile"
          }
        ],
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 0
        }
      },
      {
        "id": 3,
        "title": "Error Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(app_errors_total[5m])",
            "legendFormat": "{{error_type}}"
          }
        ],
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 8
        }
      },
      {
        "id": 4,
        "title": "Memory Usage",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "nodejs_memory_usage_bytes",
            "legendFormat": "{{type}}"
          }
        ],
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 8
        }
      }
    ]
  }
}

告警规则配置

# alerting-rules.yml
# Alert rules for the Node.js application. Each rule fires only after its
# expression has stayed true for the whole `for` duration.
# NOTE(review): the prometheus.yml shown in this article has no `rule_files`
# entry, and the compose file does not mount this file; confirm how these
# rules are loaded.
groups:
  - name: nodejs-app-alerts
    rules:
      # More than 1 error per second (5-minute rate of app_errors_total),
      # sustained for 2 minutes.
      - alert: HighErrorRate
        expr: rate(app_errors_total[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Application is experiencing {{ $value }} errors per second"

      # 95th-percentile request duration above 5 seconds, sustained for
      # 2 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }} seconds"

      # RSS above 1073741824 bytes (1 GiB), sustained for 5 minutes.
      - alert: MemoryUsageHigh
        expr: nodejs_memory_usage_bytes{type="rss"} > 1073741824
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "RSS memory usage is {{ $value }} bytes"

最佳实践和优化建议

性能优化

// 优化指标收集性能
const client = require('prom-client');

// Cache of label values keyed by "<labelName>:<value>". Entries that have not
// been requested for one hour are evicted by a sweep that runs every 5 minutes.
class OptimizedMetrics {
  constructor() {
    // Every value currently held in labelCache.
    this.commonLabels = new Set();
    // "<labelName>:<value>" -> value
    this.labelCache = new Map();
    // "<labelName>:<value>" -> time of the last getLabelValue() call (ms).
    // cleanupCache() iterates this map, so it has to exist from the start.
    this.cacheTimestamps = new Map();

    // Sweep expired entries every 5 minutes. unref() keeps this timer from
    // holding the process open by itself.
    const sweepTimer = setInterval(() => {
      this.cleanupCache();
    }, 300000);
    if (typeof sweepTimer.unref === 'function') {
      sweepTimer.unref();
    }
  }

  /**
   * Returns the cached value for a label, caching it on first use.
   * @param {string} labelName - Label name, used as part of the cache key.
   * @param {*} value - Label value.
   * @returns {*} The cached value.
   */
  getLabelValue(labelName, value) {
    const key = `${labelName}:${value}`;
    if (!this.labelCache.has(key)) {
      this.labelCache.set(key, value);
      this.commonLabels.add(value);
    }
    // Every access counts as a use, so busy entries are never evicted.
    this.cacheTimestamps.set(key, Date.now());
    return this.labelCache.get(key);
  }

  // Evicts entries that were last requested more than one hour ago.
  cleanupCache() {
    const now = Date.now();
    let evicted = false;

    for (const [key, timestamp] of this.cacheTimestamps.entries()) {
      if (now - timestamp > 3600000) { // expires after 1 hour
        this.labelCache.delete(key);
        this.cacheTimestamps.delete(key);
        evicted = true;
      }
    }

    // The same value can be cached under several label names, so rebuild the
    // value set from what is left instead of deleting values one by one.
    if (evicted) {
      this.commonLabels.clear();
      for (const value of this.labelCache.values()) {
        this.commonLabels.add(value);
      }
    }
  }
}

/**
 * Creates a Histogram whose timers warn when *recording* an observation is
 * slow.
 *
 * The time measured is the call that stores the observation, not the span
 * between startTimer() and the end call. That span is the duration being
 * observed, so warning on it would flag every slow request as a slow metric.
 *
 * @param {string} name - Metric name.
 * @param {string} help - Metric help text.
 * @param {string[]} labelNames - Label names.
 * @param {number[]} buckets - Histogram bucket boundaries.
 * @returns {object} The histogram, with startTimer() wrapped.
 */
const createOptimizedHistogram = (name, help, labelNames, buckets) => {
  const histogram = new client.Histogram({
    name,
    help,
    labelNames,
    buckets
  });

  // Wrap startTimer() so the end function also measures its own cost.
  const originalStartTimer = histogram.startTimer;
  histogram.startTimer = function(...args) {
    const timer = originalStartTimer.apply(this, args);

    return (labels) => {
      const recordStart = process.hrtime.bigint();
      const observed = timer(labels);
      // Nanoseconds -> milliseconds
      const duration = Number(process.hrtime.bigint() - recordStart) / 1000000;

      // Warn when recording takes longer than 100 ms
      if (duration > 100) {
        console.warn(`Histogram ${name} took ${duration}ms to record`);
      }

      return observed;
    };
  };

  return histogram;
};

日志管理策略

// 日志轮转配置
const winston = require('winston');
require('winston-daily-rotate-file');

// Builds a logger for `serviceName` that writes to daily-rotated files and to
// the console.
//
// Files rotate on a YYYY-MM-DD pattern, are capped at 20m each, are zipped
// after rotation and are kept for 14d. Error-level entries additionally go to
// their own error-%DATE%.log file.
const createRotatingLogger = (serviceName) => {
  const { combine, timestamp, errors, json, colorize, simple } = winston.format;
  const { DailyRotateFile, Console } = winston.transports;

  // Rotation settings shared by both file transports.
  const rotation = {
    datePattern: 'YYYY-MM-DD',
    zippedArchive: true,
    maxSize: '20m',
    maxFiles: '14d'
  };

  const format = combine(timestamp(), errors({ stack: true }), json());

  // Rotating file for error-level entries only
  const errorFile = new DailyRotateFile({
    filename: 'logs/error-%DATE%.log',
    ...rotation,
    level: 'error'
  });

  // Rotating file for all entries
  const combinedFile = new DailyRotateFile({
    filename: 'logs/combined-%DATE%.log',
    ...rotation
  });

  // Console output
  const consoleOut = new Console({
    format: combine(colorize(), simple())
  });

  return winston.createLogger({
    level: 'info',
    format,
    defaultMeta: { service: serviceName },
    transports: [errorFile, combinedFile, consoleOut]
  });
};

// Log cleanup script
/**
 * Deletes old log files from a directory.
 *
 * A file is removed when its name starts with `log-` or ends with `.log` and
 * it was last modified more than `maxAgeMs` ago. Directories are skipped. A
 * failure on one file is logged and the sweep continues with the next one; a
 * missing directory is treated as "nothing to clean".
 *
 * @param {string} [logsDir='./logs'] - Directory to sweep.
 * @param {number} [maxAgeMs=604800000] - Maximum age in ms (default: 7 days).
 * @returns {Promise<number>} Number of files deleted.
 */
const cleanupOldLogs = async (
  logsDir = './logs',
  maxAgeMs = 7 * 24 * 60 * 60 * 1000
) => {
  // Promise-based fs calls: the sweep no longer blocks the event loop.
  const fs = require('fs').promises;
  const path = require('path');

  const cutoff = Date.now() - maxAgeMs;
  let deleted = 0;

  let files;
  try {
    files = await fs.readdir(logsDir);
  } catch (error) {
    // No log directory yet simply means there is nothing to delete.
    if (error.code !== 'ENOENT') {
      console.error('Log cleanup error:', error);
    }
    return deleted;
  }

  for (const file of files) {
    if (!file.startsWith('log-') && !file.endsWith('.log')) {
      continue;
    }

    const filePath = path.join(logsDir, file);
    try {
      const stats = await fs.stat(filePath);

      if (stats.isFile() && stats.mtime.getTime() < cutoff) {
        await fs.unlink(filePath);
        deleted += 1;
        console.log(`Deleted old log file: ${file}`);
      }
    } catch (error) {
      // Keep going: one unreadable or locked file must not stop the sweep.
      console.error('Log cleanup error:', error);
    }
  }

  return deleted;
};

// Run the log cleanup on a fixed schedule
setInterval(cleanupOldLogs, 24 * 60 * 60 * 1000); // once every 24 hours

安全配置

// 监控端点安全配置
const express = require('express');
const helmet = require('helmet');
const rateLimit = require('express-rate-limit');

/**
 * Hardens an Express app: security headers, rate limiting on /metrics and, in
 * production, bearer-token authentication on /metrics.
 *
 * NOTE(review): Express runs handlers in registration order, so this has to be
 * called before the app registers its own `/metrics` route; otherwise the
 * guard below is never reached. Confirm the call order in the caller.
 *
 * @param {object} app - Express application.
 * @returns {object} The same app, for chaining.
 */
const secureApp = (app) => {
  const crypto = require('crypto');

  // Constant-time string comparison, so response timing does not reveal how
  // much of the token matched.
  const safeEqual = (provided, expected) => {
    const a = Buffer.from(provided);
    const b = Buffer.from(expected);
    return a.length === b.length && crypto.timingSafeEqual(a, b);
  };

  // Security headers
  app.use(helmet());

  // Rate limit for the metrics endpoint: 60 requests per minute per IP.
  // The Prometheus job in this article scrapes every 5 s (12 per minute,
  // 180 per 15 minutes), so a budget of 100 per 15 minutes would lock the
  // scraper out.
  const limiter = rateLimit({
    windowMs: 60 * 1000, // 1 minute
    max: 60 // requests per IP per window
  });

  app.use('/metrics', limiter);

  // Metrics endpoint authentication (production only)
  if (process.env.NODE_ENV === 'production') {
    app.get('/metrics', (req, res, next) => {
      const token = process.env.MONITORING_TOKEN;
      const auth = req.headers.authorization;

      // Reject when no token is configured: otherwise the expected header
      // would be the literal "Bearer undefined", which anyone can send.
      const authorized =
        Boolean(token) &&
        typeof auth === 'string' &&
        safeEqual(auth, `Bearer ${token}`);

      if (!authorized) {
        return res.status(401).json({ error: 'Unauthorized' });
      }

      next();
    });
  }

  return app;
};

module.exports = secureApp;

总结

通过本文的详细介绍,我们了解了如何为Node.js后端服务构建完整的可观测性体系:使用Prometheus采集和暴露应用指标,使用Winston输出结构化日志并通过ELK Stack进行收集与分析,再结合Grafana仪表板和告警规则实现可视化与及时告警。

相关推荐
广告位招租

相似文章

    评论 (0)

    0/2000