Application Monitoring
Observability for Production Apps
Explanation
Why Monitor?
Monitoring provides visibility into application health, performance, and behavior. It enables proactive issue detection and data-driven decisions.
Three Pillars of Observability
| Pillar | Description | Tools |
|--------|-------------|-------|
| Metrics | Numeric measurements | Prometheus, Datadog |
| Logs | Event records | ELK Stack, Loki |
| Traces | Request flow | Jaeger, Zipkin |
Key Metrics
- RED Method: Rate, Errors, Duration
- USE Method: Utilization, Saturation, Errors
- Golden Signals: Latency, Traffic, Errors, Saturation
Demonstration
Example 1: Prometheus Metrics
// Express with prometheus-client
const client = require('prom-client');
const express = require('express');
// Create Registry
// A dedicated registry; the /metrics handler serializes this object.
const register = new client.Registry();

// Add default metrics
client.collectDefaultMetrics({ register });

// Custom metrics
// Request latency histogram, labelled by method, route and status code.
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
});

// Monotonic request counter sharing the histogram's label set.
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
});

// Unlabelled gauge for the current number of connections.
const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
});

// Attach every custom metric to the dedicated registry, in declaration order.
for (const metric of [httpRequestDuration, httpRequestsTotal, activeConnections]) {
  register.registerMetric(metric);
}
// Middleware to collect metrics
/**
 * Express middleware that records one latency observation and one request
 * count for every response that emits `finish`.
 *
 * Elapsed time is measured with the monotonic `process.hrtime.bigint()` clock
 * instead of `Date.now()`, so wall-clock adjustments cannot yield negative or
 * inflated durations.
 *
 * @param {object} req - Incoming request; `method`, `route` and `path` are read.
 * @param {object} res - Response; `statusCode` is read when `finish` fires.
 * @param {Function} next - Called synchronously after the listener is attached.
 */
function metricsMiddleware(req, res, next) {
  const startedAt = process.hrtime.bigint();

  res.on('finish', () => {
    // Nanoseconds -> seconds, matching the histogram's unit.
    const durationSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;

    // Prefer the matched route pattern; fall back to the raw path otherwise.
    // NOTE(review): the raw-path fallback produces one label value per
    // distinct unmatched URL - confirm that cardinality is acceptable.
    const route = req.route?.path || req.path;
    const labelValues = [req.method, route, res.statusCode];

    httpRequestDuration.labels(...labelValues).observe(durationSeconds);
    httpRequestsTotal.labels(...labelValues).inc();
  });

  next();
}
// Expose metrics endpoint
const app = express();
app.use(metricsMiddleware);

// Serializes the dedicated registry for scraping. The body is produced before
// any header is written so a serialization failure can still be answered with
// a clean 500 instead of leaving the request pending on an unhandled rejection.
app.get('/metrics', async (req, res) => {
  try {
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error) {
    res.status(500).end('Failed to collect metrics');
  }
});
// Business metrics
// Counts processed orders, labelled by outcome ('success' / 'failure').
const ordersProcessed = new client.Counter({
  name: 'orders_processed_total',
  help: 'Total orders processed',
  labelNames: ['status'],
});
// The /metrics endpoint serializes `register`, so the metric has to be
// attached to it explicitly, the same way the HTTP metrics are.
register.registerMetric(ordersProcessed);

// Distribution of order totals, bucketed in dollars.
const orderValue = new client.Histogram({
  name: 'order_value_dollars',
  help: 'Order value in dollars',
  buckets: [10, 50, 100, 500, 1000, 5000],
});
register.registerMetric(orderValue);
// In your order processing
/**
 * Saves an order and records the outcome in the business metrics.
 *
 * On success the 'success' counter is incremented and the order total is
 * observed. Any error raised inside the try block increments the 'failure'
 * counter and is rethrown unchanged.
 *
 * @param {object} order - Order to persist; `order.total` is observed.
 * @returns {Promise<void>}
 */
async function processOrder(order) {
  const countOutcome = (status) => ordersProcessed.labels(status).inc();

  try {
    await saveOrder(order);
    countOutcome('success');
    orderValue.observe(order.total);
  } catch (failure) {
    countOutcome('failure');
    throw failure;
  }
}
Example 2: Structured Logging
const winston = require('winston');
const { format } = winston;
// Create logger
// Timestamped JSON entries; error stacks are included.
const jsonLogFormat = format.combine(
  format.timestamp(),
  format.errors({ stack: true }),
  format.json(),
);

// Fields attached to every log entry, taken from the environment with fallbacks.
const serviceMetadata = {
  service: process.env.SERVICE_NAME || 'api',
  version: process.env.APP_VERSION || '1.0.0',
  environment: process.env.NODE_ENV || 'development',
};

// Console output, an error-level file, and a file without its own level.
const logTransports = [
  new winston.transports.Console(),
  new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
  new winston.transports.File({ filename: 'logs/combined.log' }),
];

const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: jsonLogFormat,
  defaultMeta: serviceMetadata,
  transports: logTransports,
});
// Request ID middleware
const { v4: uuidv4 } = require('uuid');
// Inbound request IDs are client-controlled, so only short, plain tokens are
// reused; anything else is replaced with a freshly generated ID.
const SAFE_REQUEST_ID = /^[\w.:-]{1,128}$/;

/**
 * Assigns a correlation ID to the request and echoes it on the response.
 *
 * A well-formed `x-request-id` header (word characters, '.', ':' or '-',
 * at most 128 characters) is reused. A missing or malformed header is
 * replaced by `uuidv4()` so untrusted text never reaches logs or the
 * response header.
 *
 * @param {object} req - Request; `headers['x-request-id']` is read and
 *   `requestId` is assigned.
 * @param {object} res - Response; receives the `X-Request-Id` header.
 * @param {Function} next - Called synchronously once the ID is set.
 */
function requestIdMiddleware(req, res, next) {
  const inbound = req.headers['x-request-id'];
  const isUsable = typeof inbound === 'string' && SAFE_REQUEST_ID.test(inbound);

  req.requestId = isUsable ? inbound : uuidv4();
  res.setHeader('X-Request-Id', req.requestId);
  next();
}
// Request logging middleware
/**
 * Logs each request twice: once on arrival and once when the response
 * finishes. The completion entry's level and message depend on the status
 * code (>= 500 error, >= 400 warn, otherwise info).
 *
 * @param {object} req - Request; identifying fields are read at both log points.
 * @param {object} res - Response; status and content length are read on `finish`.
 * @param {Function} next - Called synchronously after the listener is attached.
 */
function requestLogger(req, res, next) {
  const startedAt = Date.now();

  // Log request
  const arrival = {
    requestId: req.requestId,
    method: req.method,
    path: req.path,
    query: req.query,
    userAgent: req.get('user-agent'),
    ip: req.ip,
  };
  logger.info('Request received', arrival);

  // Log response
  const onFinish = () => {
    const summary = {
      requestId: req.requestId,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      duration: Date.now() - startedAt, // milliseconds
      contentLength: res.get('content-length'),
    };

    let level = 'info';
    let message = 'Request completed';
    if (res.statusCode >= 500) {
      level = 'error';
      message = 'Request failed';
    } else if (res.statusCode >= 400) {
      level = 'warn';
      message = 'Request error';
    }

    logger[level](message, summary);
  };
  res.on('finish', onFinish);

  next();
}
// Child logger with context
/**
 * Thin wrapper around a child of the shared `logger` that carries the request
 * and user IDs passed to the constructor.
 */
class RequestLogger {
  /**
   * @param {string} requestId - Correlation ID bound to the child logger.
   * @param {*} userId - User identifier bound to the child logger.
   */
  constructor(requestId, userId) {
    this.logger = logger.child({
      requestId,
      userId,
    });
  }

  /**
   * Logs an informational message.
   *
   * @param {string} message - Log message.
   * @param {object} [data={}] - Extra structured fields.
   */
  info(message, data = {}) {
    this.logger.info(message, data);
  }

  /**
   * Logs an error together with its message and stack.
   *
   * JavaScript allows any value to be thrown, so `error` may be a string,
   * `null` or `undefined`. Those values are stringified rather than
   * dereferenced, so logging a failure can never throw a TypeError itself.
   *
   * @param {string} message - Log message.
   * @param {*} error - The caught value, normally an `Error`.
   * @param {object} [data={}] - Extra structured fields.
   */
  error(message, error, data = {}) {
    const isErrorLike = error !== null && typeof error === 'object';

    this.logger.error(message, {
      ...data,
      error: isErrorLike ? error.message : String(error),
      stack: isErrorLike ? error.stack : undefined,
    });
  }
}
// Usage in handlers
// Creates an order and logs each step through a request-scoped logger.
// Everything that can throw runs inside the try block or behind a guard, so a
// missing user or malformed body produces an HTTP response instead of an
// unhandled rejection.
app.post('/orders', async (req, res) => {
  // `req.user` may be absent when no auth middleware ran; log without a user ID then.
  const log = new RequestLogger(req.requestId, req.user?.id);

  const items = req.body?.items;
  if (!Array.isArray(items)) {
    res.status(400).json({ error: 'Order items must be an array' });
    return;
  }

  log.info('Creating order', { items: items.length });
  try {
    const order = await createOrder(req.body);
    log.info('Order created', { orderId: order.id });
    res.json(order);
  } catch (error) {
    log.error('Failed to create order', error);
    res.status(500).json({ error: 'Failed to create order' });
  }
});
Example 3: Distributed Tracing
const { SpanStatusCode, trace } = require('@opentelemetry/api');
const { NodeTracerProvider } = require('@opentelemetry/node');
const { SimpleSpanProcessor } = require('@opentelemetry/tracing');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');
// Initialize tracing
// Builds the tracer provider, attaches a Jaeger exporter through a span
// processor, and registers the provider before instrumentations are enabled.
const provider = new NodeTracerProvider();
// Exporter target: host comes from JAEGER_HOST (default 'localhost'), port is fixed at 6832.
const jaegerExporter = new JaegerExporter({
serviceName: 'api-service',
host: process.env.JAEGER_HOST || 'localhost',
port: 6832
});
// NOTE(review): SimpleSpanProcessor presumably hands each span to the exporter
// as it ends - confirm whether a batching processor is preferable in production.
provider.addSpanProcessor(new SimpleSpanProcessor(jaegerExporter));
provider.register();
// Auto-instrumentation
// Enables the HTTP and Express instrumentations; no manual spans are needed for them.
registerInstrumentations({
instrumentations: [
new HttpInstrumentation(),
new ExpressInstrumentation()
]
});
// Tracer used for the manually created spans further down.
const tracer = provider.getTracer('api-service');
// Manual span creation
/**
 * Runs the order pipeline (validate, pay, confirm) under a 'process-order'
 * span with one child span per step.
 *
 * Child spans are parented by passing a context that carries the root span
 * as the third argument of `startSpan`. Each child is ended in a `finally`,
 * so a failing step cannot leave its span open. On failure the root span is
 * marked ERROR, the exception is recorded, and the error is rethrown.
 *
 * @param {*} orderId - Identifier recorded on the root span as 'order.id'.
 * @returns {Promise<void>}
 * @throws Rethrows whatever a step throws.
 */
async function processOrder(orderId) {
  const span = tracer.startSpan('process-order');
  span.setAttribute('order.id', orderId);

  // Context that makes `span` the parent of every step span below.
  const parentContext = trace.setSpan(context.active(), span);

  // Runs one step inside its own child span, always ending that span.
  const runStep = async (name, step, attributes = {}) => {
    const stepSpan = tracer.startSpan(name, {}, parentContext);
    for (const [key, value] of Object.entries(attributes)) {
      stepSpan.setAttribute(key, value);
    }
    try {
      await step();
    } finally {
      stepSpan.end();
    }
  };

  try {
    // Validate order
    await runStep('validate-order', () => validateOrder(orderId));
    // Process payment
    await runStep('process-payment', () => processPayment(orderId), {
      'payment.method': 'credit_card',
    });
    // Send confirmation
    await runStep('send-confirmation', () => sendConfirmationEmail(orderId));

    span.setStatus({ code: SpanStatusCode.OK });
  } catch (error) {
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: error.message,
    });
    span.recordException(error);
    throw error;
  } finally {
    span.end();
  }
}
// Context propagation
const { context, propagation } = require('@opentelemetry/api');
/**
 * POSTs `data` as JSON to the external service inside an
 * 'external-service-call' span and returns the parsed response body.
 *
 * The trace headers are injected from a context that carries this span, so
 * the downstream service sees this call as its parent rather than whatever
 * span happened to be active. Failures (network or body parsing) are
 * recorded on the span before being rethrown.
 *
 * @param {*} data - JSON-serializable request payload.
 * @returns {Promise<*>} Parsed JSON response body.
 * @throws Rethrows any error from `fetch` or from reading the body.
 */
async function callExternalService(data) {
  const span = tracer.startSpan('external-service-call');

  try {
    const headers = {};
    propagation.inject(trace.setSpan(context.active(), span), headers);

    const response = await fetch('https://external-service.com/api', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        ...headers, // Inject trace context
      },
      body: JSON.stringify(data),
    });
    span.setAttribute('http.status_code', response.status);

    // Awaited here so body-parsing failures are caught and recorded below.
    return await response.json();
  } catch (error) {
    span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
    span.recordException(error);
    throw error;
  } finally {
    span.end();
  }
}
Example 4: Health Checks
// Health check endpoints
// Upper bound for the external dependency probe so a stalled upstream cannot
// stall the health endpoints that await it.
const EXTERNAL_API_TIMEOUT_MS = 5000;

/**
 * Runs one dependency probe and converts the outcome into a health result.
 *
 * @param {Function} probe - Async function; may resolve to a status string
 *   ('healthy' / 'degraded'). Resolving to nothing counts as 'healthy'.
 * @returns {Promise<object>} `{ status, latency }` with latency in
 *   milliseconds, or `{ status: 'unhealthy', error }` when the probe throws.
 */
async function runTimedCheck(probe) {
  const startedAt = Date.now();
  try {
    const status = (await probe()) ?? 'healthy';
    return { status, latency: Date.now() - startedAt };
  } catch (error) {
    return { status: 'unhealthy', error: error.message };
  }
}

// Each entry is an async function that never rejects and resolves to a health
// result. Latency is measured for every dependency, including the database.
const healthChecks = {
  database: () =>
    runTimedCheck(async () => {
      await db.query('SELECT 1');
    }),

  redis: () =>
    runTimedCheck(async () => {
      await redis.ping();
    }),

  externalApi: () =>
    runTimedCheck(async () => {
      const response = await fetch('https://api.example.com/health', {
        signal: AbortSignal.timeout(EXTERNAL_API_TIMEOUT_MS),
      });
      // A reachable but non-2xx upstream is degraded, not down.
      return response.ok ? 'healthy' : 'degraded';
    }),
};
// Liveness probe - is the app running?
// Answers without touching any dependency.
app.get('/health/live', (req, res) => {
  res.json({ status: 'alive' });
});

// Readiness probe - is the app ready to serve traffic?
// Runs every registered check concurrently, so the probe takes as long as the
// slowest dependency rather than the sum of all of them. A check that rejects
// is reported as unhealthy instead of leaving the request unanswered.
app.get('/health/ready', async (req, res) => {
  const entries = await Promise.all(
    Object.entries(healthChecks).map(async ([name, check]) => {
      try {
        return [name, await check()];
      } catch (error) {
        return [name, { status: 'unhealthy', error: error.message }];
      }
    }),
  );

  const results = Object.fromEntries(entries);
  // Any unhealthy dependency makes the instance not ready; degraded is tolerated.
  const isReady = entries.every(([, result]) => result.status !== 'unhealthy');

  res.status(isReady ? 200 : 503).json({
    status: isReady ? 'ready' : 'not_ready',
    checks: results,
    timestamp: new Date().toISOString(),
  });
});
// Detailed health check
// Aggregates every dependency check with process-level details.
// Status precedence is unhealthy > degraded > healthy: an unhealthy dependency
// always yields 503, even when another dependency is only degraded. Checks run
// concurrently, and a check that rejects is reported as unhealthy.
app.get('/health', async (req, res) => {
  const entries = await Promise.all(
    Object.entries(healthChecks).map(async ([name, check]) => {
      try {
        return [name, await check()];
      } catch (error) {
        return [name, { status: 'unhealthy', error: error.message }];
      }
    }),
  );
  const results = Object.fromEntries(entries);
  const statuses = Object.values(results).map((result) => result.status);

  let overall = 'unhealthy';
  if (statuses.every((status) => status === 'healthy')) {
    overall = 'healthy';
  } else if (!statuses.includes('unhealthy') && statuses.includes('degraded')) {
    overall = 'degraded';
  }

  res.status(overall === 'unhealthy' ? 503 : 200).json({
    status: overall,
    version: process.env.APP_VERSION,
    uptime: process.uptime(),
    memory: process.memoryUsage(),
    checks: results,
  });
});
Example 5: Alerting Rules
# prometheus/alerts.yml
# Alerting rules for the API service. Each rule must hold for its `for:`
# duration before it fires.
groups:
  - name: api_alerts
    rules:
      # High error rate: more than 5% of requests returned a 5xx status.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status_code=~"5.."}[5m])) /
          sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is {{ $value | humanizePercentage }}

      # High latency: 95th percentile request duration above 1 second.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High latency detected
          description: 95th percentile latency is {{ $value }}s

      # Service down: the scrape target reports `up == 0`.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Service is down
          description: "{{ $labels.instance }} has been down for more than 1 minute"

      # Memory usage: resident memory above 500 MB.
      - alert: HighMemoryUsage
        expr: |
          process_resident_memory_bytes / 1024 / 1024 > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: High memory usage
          description: Memory usage is {{ $value }}MB

      # Database connection pool: fewer than 5 connections available.
      - alert: DatabaseConnectionPoolExhausted
        expr: db_pool_available_connections < 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Database connection pool nearly exhausted
          description: "Only {{ $value }} database connections are available"
Example 6: Dashboard Configuration
// Grafana dashboard JSON (simplified)
// Builds a panel that is backed by exactly one query target.
const singleTargetPanel = (title, type, target) => ({
  title,
  type,
  targets: [target],
});

// Four panels: request rate, error percentage, p95 latency, active connections.
const dashboardConfig = {
  title: 'API Dashboard',
  panels: [
    singleTargetPanel('Request Rate', 'graph', {
      expr: 'sum(rate(http_requests_total[5m])) by (route)',
      legendFormat: '{{ route }}',
    }),
    singleTargetPanel('Error Rate', 'graph', {
      expr: 'sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100',
      legendFormat: 'Error %',
    }),
    singleTargetPanel('Latency (p95)', 'graph', {
      expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))',
      legendFormat: '{{ route }}',
    }),
    singleTargetPanel('Active Connections', 'stat', {
      expr: 'active_connections',
    }),
  ],
};
// Custom metrics for business monitoring
const businessMetrics = {
  // User activity
  activeUsers: new client.Gauge({
    name: 'active_users',
    help: 'Number of active users in the last 15 minutes',
  }),
  // Revenue tracking
  revenueTotal: new client.Counter({
    name: 'revenue_total_dollars',
    help: 'Total revenue in dollars',
    labelNames: ['product_type'],
  }),
  // Feature usage
  featureUsage: new client.Counter({
    name: 'feature_usage_total',
    help: 'Feature usage count',
    labelNames: ['feature', 'user_tier'],
  }),
};

// The /metrics endpoint serializes `register`, so each business metric has to
// be attached to it explicitly or it will never be scraped.
for (const metric of Object.values(businessMetrics)) {
  register.registerMetric(metric);
}
// Update metrics
// Refreshes the active-user gauge once a minute. A failed lookup is logged and
// the gauge keeps its previous value; without the catch, the rejection from
// this async timer callback would go unhandled.
setInterval(async () => {
  try {
    const count = await getActiveUserCount();
    businessMetrics.activeUsers.set(count);
  } catch (error) {
    logger.error('Failed to refresh active user gauge', { error: error.message });
  }
}, 60000);
// Track revenue
/**
 * Adds an order's total to the revenue counter for its product type.
 *
 * @param {object} order - Purchase; `productType` selects the label value and
 *   `total` is the amount added.
 * @returns {Promise<void>}
 */
async function recordPurchase(order) {
  const revenueForProduct = businessMetrics.revenueTotal.labels(order.productType);
  revenueForProduct.inc(order.total);
}
Key Takeaways:
- Collect metrics, logs, and traces
- Use structured logging
- Implement health checks
- Set up meaningful alerts
- Create actionable dashboards
Imitation
Challenge 1: Implement SLO Monitoring
Task: Create Service Level Objective monitoring with error budgets.
Solution
/**
 * Tracks Service Level Objectives and their error budgets, storing success
 * and total counters in a Redis hash keyed by SLO name and calendar month.
 */
class SLOMonitor {
  /**
   * @param {object} redis - Client exposing promise-returning `hincrby`,
   *   `hgetall` and `expire`.
   */
  constructor(redis) {
    this.redis = redis;
    this.slos = new Map();
  }

  /**
   * Registers (or replaces) an SLO definition.
   *
   * @param {string} name - SLO identifier.
   * @param {object} config - `target` (e.g. 0.999), `window` and `metric`.
   *   NOTE(review): `window` is stored but counters are always bucketed by
   *   calendar month (see getCurrentWindow) - confirm that is intended.
   */
  defineSLO(name, config) {
    this.slos.set(name, {
      target: config.target, // e.g., 0.999 (99.9%)
      window: config.window, // e.g., 30 days
      metric: config.metric, // e.g., 'availability'
    });
  }

  /**
   * Counts one event against the SLO's current window.
   *
   * @param {string} sloName - SLO identifier.
   * @param {boolean} success - Whether the event met the objective.
   * @returns {Promise<void>}
   */
  async recordEvent(sloName, success) {
    const key = `slo:${sloName}:${this.getCurrentWindow()}`;
    await this.redis.hincrby(key, 'total', 1);
    if (success) {
      await this.redis.hincrby(key, 'success', 1);
    }
    // Set expiry (35 days) so hashes for past months are eventually dropped.
    await this.redis.expire(key, 60 * 60 * 24 * 35);
  }

  /**
   * Computes the current compliance and error-budget figures for an SLO.
   *
   * @param {string} sloName - SLO identifier.
   * @returns {Promise<object|null>} Status object, or `null` when the SLO
   *   has not been defined.
   */
  async getSLOStatus(sloName) {
    const slo = this.slos.get(sloName);
    if (!slo) return null;

    const key = `slo:${sloName}:${this.getCurrentWindow()}`;
    // Some clients resolve to null rather than {} for a missing hash.
    const data = (await this.redis.hgetall(key)) ?? {};
    const total = Number.parseInt(data.total, 10) || 0;
    const success = Number.parseInt(data.success, 10) || 0;

    // With no events yet the SLO is treated as fully met.
    const current = total > 0 ? success / total : 1;
    const errorBudget = 1 - slo.target;
    const consumedBudget = 1 - current;
    const remainingBudget = Math.max(0, errorBudget - consumedBudget);

    // A 100% target leaves no budget to divide by: report the budget as
    // intact until the first failure, and exhausted afterwards.
    let remainingBudgetPercent;
    if (errorBudget > 0) {
      remainingBudgetPercent = (remainingBudget / errorBudget) * 100;
    } else {
      remainingBudgetPercent = consumedBudget > 0 ? 0 : 100;
    }

    return {
      name: sloName,
      target: slo.target,
      current,
      errorBudget,
      consumedBudget,
      remainingBudgetPercent,
      totalEvents: total,
      successfulEvents: success,
    };
  }

  /**
   * @returns {string} Current window key in `YYYY-MM` form, derived from the
   *   UTC ISO timestamp.
   */
  getCurrentWindow() {
    return new Date().toISOString().slice(0, 7); // YYYY-MM
  }
}
// Usage
const sloMonitor = new SLOMonitor(redis);
sloMonitor.defineSLO('api_availability', {
  target: 0.999,
  window: 30,
  metric: 'availability',
});

// In middleware
// Counts every finished response against the availability SLO; anything below
// 500 is a success. Recording is best-effort: a storage failure is logged
// rather than left as an unhandled rejection inside the 'finish' listener.
app.use((req, res, next) => {
  res.on('finish', () => {
    const success = res.statusCode < 500;
    sloMonitor.recordEvent('api_availability', success).catch((error) => {
      logger.error('Failed to record SLO event', { error: error.message });
    });
  });
  next();
});

// Endpoint to check SLO
// Responds with the current SLO status, or 500 when it cannot be loaded.
app.get('/slo/status', async (req, res) => {
  try {
    const status = await sloMonitor.getSLOStatus('api_availability');
    res.json(status);
  } catch (error) {
    logger.error('Failed to load SLO status', { error: error.message });
    res.status(500).json({ error: 'Failed to load SLO status' });
  }
});
Practice
Exercise 1: Custom Metrics Library
Difficulty: Intermediate
Build a metrics library:
- Counter, Gauge, Histogram
- Label support
- Prometheus export format
Exercise 2: Log Aggregation
Difficulty: Advanced
Implement log aggregation:
- Collect from multiple services
- Search and filter
- Alerting on patterns
Summary
What you learned:
- Metrics collection with Prometheus
- Structured logging
- Distributed tracing
- Health check patterns
- Alerting and dashboards
Next Steps:
- Read: Error Handling
- Practice: Add monitoring to your app
- Explore: Grafana, Datadog
