You can't fix what you can't see. Most teams discover problems when customers report them — by then, the damage is done. Proper monitoring means you know about issues before users do, you have data to diagnose them quickly, and you can see trends before they become incidents.
Prometheus + Grafana is the most widely used open-source monitoring stack in 2026. Here's how to set it up for a Node.js application.
Expose a /metrics endpoint that Prometheus reads. The key insight: Prometheus pulls metrics from your app (instead of your app pushing to Prometheus). This means Prometheus controls the scrape interval and your app just needs to maintain an accurate metrics endpoint.
npm install prom-client

// lib/metrics.ts
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client'

// Dedicated registry so the /metrics endpoint serves exactly what this
// module registers (instead of relying on prom-client's implicit global).
export const register = new Registry()

// Default Node.js process metrics: CPU, memory, event loop, GC.
collectDefaultMetrics({ register })

// Counter: every HTTP request, split by method / route / status code.
export const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  registers: [register],
  labelNames: ['method', 'route', 'status_code'],
})

// Histogram: request latency in seconds; buckets span 1ms to 5s.
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  registers: [register],
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5],
})

// Gauge: connections currently open against the server.
export const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register],
})

// --- Business metrics -------------------------------------------------

// Signups, labeled by signup method (e.g. 'email', 'google').
export const userRegistrationsTotal = new Counter({
  name: 'user_registrations_total',
  help: 'Total number of user registrations',
  registers: [register],
  labelNames: ['method'],
})

// Orders placed, labeled by outcome.
export const ordersTotal = new Counter({
  name: 'orders_total',
  help: 'Total number of orders placed',
  registers: [register],
  labelNames: ['status'],
})
// middleware/metrics.ts
import { Request, Response, NextFunction } from 'express'
import { httpRequestsTotal, httpRequestDuration } from '../lib/metrics'

/**
 * Express middleware that records one counter increment and one duration
 * observation per completed response (hooked on the 'finish' event, so the
 * recorded status code is the one actually sent).
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const start = Date.now()
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000
    // Normalize route to avoid high cardinality:
    // /api/users/123 → /api/users/:id
    // req.route is only set once a route has matched. Falling back to the raw
    // req.path would create one label value per concrete URL (every 404, every
    // /api/users/<n>) — an unbounded-cardinality leak in Prometheus. Instead,
    // use the route pattern (prefixed with the router mount point) and collapse
    // everything that matched no route into a single bucket.
    const route = req.route ? req.baseUrl + req.route.path : 'unmatched'
    const labels = {
      method: req.method,
      route,
      status_code: String(res.statusCode),
    }
    httpRequestsTotal.inc(labels)
    httpRequestDuration.observe(labels, duration)
  })
  next()
}
// app.ts — add the metrics endpoint and middleware
import { register } from './lib/metrics'
import { metricsMiddleware } from './middleware/metrics'

// Record request metrics for every route registered after this point.
app.use(metricsMiddleware)

// Metrics endpoint — only accessible internally.
// In production, add an IP allowlist or internal-only network rule.
app.get('/metrics', async (req, res) => {
  const body = await register.metrics()
  res.set('Content-Type', register.contentType)
  res.send(body)
})
Now visit http://localhost:3000/metrics and you'll see raw Prometheus metrics.
Technical metrics (CPU, memory, request rate) tell you the system is struggling. Business metrics tell you what's actually happening to your users:
// In your auth controller
import { userRegistrationsTotal } from '../lib/metrics'
export async function register(req, res) {
// ... registration logic
userRegistrationsTotal.inc({ method: 'email' })
// or { method: 'google' } for OAuth
}
// In your orders controller
import { Request, Response } from 'express'
import { ordersTotal } from '../lib/metrics'

// Typed params (req/res were implicit `any`); `catch (error: unknown)`
// matches strict-mode `useUnknownInCatchVariables`.
export async function createOrder(req: Request, res: Response) {
  try {
    const order = await processOrder(req.body)
    ordersTotal.inc({ status: 'success' })
    res.json(order)
  } catch (error: unknown) {
    // Count the failure, then rethrow so the app's error handler still runs.
    ordersTotal.inc({ status: 'failed' })
    throw error
  }
}
# docker-compose.yml
# Indentation restored — the flattened listing above is not valid YAML.
version: '3.8'  # ignored by Compose v2+; kept for older docker-compose clients

services:
  app:
    build: .
    ports:
      - "3000:3000"
    # NOTE(review): these labels only matter with Docker-based service
    # discovery; the static_configs in prometheus.yml below ignores them.
    labels:
      - "prometheus.scrape=true"
      - "prometheus.port=3000"
      - "prometheus.path=/metrics"

  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus  # persist the TSDB across restarts
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'

  grafana:
    image: grafana/grafana:latest
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=changeme  # change this in production
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana_data:/var/lib/grafana  # persist dashboards and settings
    ports:
      - "3001:3000"
    depends_on:
      - prometheus

volumes:
  prometheus_data:
  grafana_data:
# prometheus.yml
# Indentation restored — the flattened listing above is not valid YAML.
global:
  scrape_interval: 15s      # how often Prometheus pulls each target's /metrics
  evaluation_interval: 15s  # how often alerting/recording rules are evaluated

scrape_configs:
  - job_name: 'nodejs-app'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['app:3000']  # Compose service name resolves on the shared network
Once Prometheus is scraping your app, these queries cover most use cases:
# Request rate per second (last 5 minutes)
rate(http_requests_total[5m])

# Error rate (4xx + 5xx) as a percentage of all traffic.
# Both sides MUST be aggregated with sum(): dividing raw rate() vectors
# matches series label-for-label, so every 4xx/5xx series would just divide
# by itself and report 100%.
sum(rate(http_requests_total{status_code=~"[45].."}[5m]))
  / sum(rate(http_requests_total[5m])) * 100

# 95th percentile latency (aggregate buckets across series, keeping 'le')
histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))

# Top 10 routes by request rate (aggregate per route, not per raw series)
topk(10, sum by (route) (rate(http_requests_total[5m])))

# Memory usage (MB)
process_resident_memory_bytes / 1024 / 1024

# Event loop lag (Node.js performance indicator)
nodejs_eventloop_lag_seconds
Open Grafana at http://localhost:3001 (admin/changeme) and add Prometheus as a data source at http://prometheus:9090. A basic dashboard should show:
# alerting_rules.yml — indentation restored (the flattened listing is not valid YAML)
groups:
  - name: app_alerts
    rules:
      - alert: HighErrorRate
        # sum() both sides: dividing unaggregated rate() vectors matches
        # series label-for-label, so every 5xx series divides by itself —
        # the ratio is always 1 and the alert fires on any single 5xx.
        expr: |
          sum(rate(http_requests_total{status_code=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 5%"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighLatency
        # Aggregate histogram buckets across series (keeping 'le') before
        # computing the quantile.
        expr: |
          histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 1 second"
Key takeaway: Prometheus scrapes the /metrics endpoint your app exposes — instrument your app with prom-client.