可观测性:日志、指标与追踪
# 可观测性：日志、指标与追踪

大家好，我是欧阳瑞（Rich Own）。今天想和大家聊聊可观测性这个重要话题。作为一个全栈开发者，我认为可观测性是保障系统稳定运行的关键。今天就来分享一下日志、指标与追踪的实战经验。

## 可观测性概述

### 什么是可观测性

可观测性是指通过外部输出（日志、指标、追踪）理解系统内部状态的能力。

### 三大支柱

| 支柱 | 说明 | 工具 |
| --- | --- | --- |
| 日志 | 事件的结构化记录 | ELK、Loki |
| 指标 | 数值型数据的聚合 | Prometheus |
| 追踪 | 分布式系统的请求路径 | Jaeger、Zipkin |

## 日志管理

### 日志收集

```javascript
const winston = require('winston');

const logger = winston.createLogger({
  level: 'info',
  format: winston.format.json(),
  transports: [
    new winston.transports.File({ filename: 'error.log', level: 'error' }),
    new winston.transports.File({ filename: 'combined.log' }),
    new winston.transports.Console({ format: winston.format.simple() })
  ]
});

// 使用
logger.info('User logged in', { userId: 123 });
logger.error('Database connection failed', { error: err.message });
```

### 日志结构化

```javascript
const structuredLogger = winston.createLogger({
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    winston.format.json()
  ),
  transports: [new winston.transports.Console()]
});

// 结构化日志输出
// {
//   "level": "info",
//   "message": "Order created",
//   "timestamp": "2024-01-01T12:00:00.000Z",
//   "orderId": "abc123",
//   "userId": 456
// }
```

## 指标监控

### Prometheus配置

```yaml
# prometheus.yml
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:9100']

  - job_name: 'app_metrics'
    static_configs:
      - targets: ['localhost:3000']
```

### 自定义指标

```javascript
const client = require('prom-client');

const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code']
});

const requestCounter = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// 在中间件中使用
app.use((req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    httpRequestDuration.observe({ method: req.method, route: req.path, status_code: res.statusCode }, duration);
    requestCounter.inc({ method: req.method, route: req.path, status_code: res.statusCode });
  });

  next();
});

// 暴露指标端点
app.get('/metrics', (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.send(client.register.metrics());
});
```

## 分布式追踪

### OpenTelemetry配置

```javascript
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { ConsoleSpanExporter } = require('@opentelemetry/sdk-trace-base');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');

const sdk = new NodeSDK({
  traceExporter: new ConsoleSpanExporter(),
  instrumentations: [getNodeAutoInstrumentations()]
});

sdk.start();
```

### 自定义追踪

```javascript
const { trace } = require('@opentelemetry/api');

const tracer = trace.getTracer('my-service');

async function processOrder(orderId) {
  return tracer.startActiveSpan('process-order', async (span) => {
    span.setAttribute('orderId', orderId);

    try {
      await validateOrder(orderId);
      await updateInventory(orderId);
      await sendNotification(orderId);
    } finally {
      span.end();
    }
  });
}
```

## 实战案例

### 完整监控系统

```javascript
class MonitoringSystem {
  constructor() {
    this.logger = this.setupLogger();
    this.metrics = this.setupMetrics();
    this.tracer = this.setupTracer();
  }

  setupLogger() {
    return winston.createLogger({
      level: 'info',
      format: winston.format.json(),
      transports: [new winston.transports.Console()]
    });
  }

  setupMetrics() {
    const client = require('prom-client');
    return {
      requests: new client.Counter({ name: 'app_requests_total', help: 'Total requests' }),
      latency: new client.Histogram({ name: 'app_request_latency_seconds', help: 'Request latency' })
    };
  }

  setupTracer() {
    const { trace } = require('@opentelemetry/api');
    return trace.getTracer('app-tracer');
  }

  log(level, message, metadata) {
    this.logger[level](message, metadata);
  }

  recordRequest(duration) {
    this.metrics.requests.inc();
    this.metrics.latency.observe(duration);
  }
}
```

## 最佳实践

### 1. 统一日志格式

```javascript
// 使用ECS格式
const ecsFormat = winston.format((info) => {
  return {
    '@timestamp': info.timestamp,
    'log.level': info.level,
    message: info.message,
    'service.name': 'my-service',
    ...info.metadata
  };
});
```

### 2. 设置告警规则

```yaml
# Prometheus Alertmanager配置
groups:
  - name: example
    rules:
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
```

## 总结

可观测性是现代系统运维的核心。通过日志、指标和追踪，可以全面了解系统状态，快速定位问题。

我的鬃狮蜥 Hash 对可观测性也有自己的理解——它总是能感知周围环境的变化，这也许就是自然界的可观测性吧！

如果你对可观测性有任何问题，欢迎留言交流！

我是欧阳瑞，极客之路，永无止境！

技术栈：可观测性 · 日志 · 指标 · 追踪