Build a Service Health Monitoring Tool
Introduction
Service health monitoring tools track the availability and performance of services. They detect failures early, enable quick response to issues, and provide visibility into system health.
What You'll Build
- Health check framework
- Multiple check types
- Alerting system
- Status reporting (a per-service summary of the latest result and uptime)
Core Concepts
Health Checks
Periodic tests that verify a service is functioning correctly.
Uptime
Percentage of time a service is available over a period. This tool approximates it as the percentage of health checks that passed within a time window.
Prerequisites
- Python 3.8+
- requests library (psycopg2 and redis are additionally needed only for the database and Redis checks)
Health Monitor
Create healthmon/monitor.py:
import logging
import threading
import time
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from typing import Callable, Dict, List, Optional
class Status(Enum):
    """Health states that can be attached to a check result.

    Each member's value is the lowercase string form of its name.
    """

    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"
    DEGRADED = "degraded"
    UNKNOWN = "unknown"
@dataclass
class HealthResult:
    """Outcome of one execution of a service's health check."""

    service: str  # name of the service that was checked
    status: Status
    message: str  # "OK" on success, otherwise a description of the failure
    timestamp: datetime  # when the result was recorded
    response_time: float  # how long the check took, in seconds
    # Optional extra data about the check; None when nothing was supplied.
    # Annotated as Optional because the default is None, not a dict.
    details: Optional[dict] = None
class Service:
    """A monitored service: a named check callable plus its recent results.

    Attributes:
        name: Identifier used in results.
        check_fn: Zero-argument callable that performs the check.
        interval: Intended number of seconds between checks. This class only
            stores the value; scheduling is up to the caller.
        last_result: Most recent ``HealthResult``, or None before any check.
        history: Recent results, oldest first, capped at ``MAX_HISTORY``.
        consecutive_failures: Number of failed checks since the last success.
    """

    # Maximum number of results kept in ``history``.
    MAX_HISTORY = 1000

    def __init__(self, name: str, check_fn: Callable, interval: int = 60):
        self.name = name
        self.check_fn = check_fn
        self.interval = interval
        self.last_result: Optional[HealthResult] = None
        self.history: List[HealthResult] = []
        self.consecutive_failures = 0

    def check(self) -> HealthResult:
        """Run the check once, record the outcome, and return it.

        Only a return value of exactly ``True`` counts as healthy. Any other
        return value is a failure; if it is truthy its string form becomes
        the message, otherwise the message is "Check failed". An exception
        raised by the check is also a failure and never propagates.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted (or go negative) when the system clock is adjusted.
        started = time.perf_counter()
        try:
            outcome = self.check_fn()
            error = None
        except Exception as exc:
            outcome, error = None, exc
        response_time = time.perf_counter() - started

        if error is not None:
            status = Status.UNHEALTHY
            # Some exceptions stringify to "", which would make an empty
            # alert message; fall back to the exception type name.
            message = str(error) or type(error).__name__
        elif outcome is True:
            status = Status.HEALTHY
            message = "OK"
        else:
            status = Status.UNHEALTHY
            message = str(outcome) if outcome else "Check failed"

        if status is Status.HEALTHY:
            self.consecutive_failures = 0
        else:
            self.consecutive_failures += 1

        health_result = HealthResult(
            service=self.name,
            status=status,
            message=message,
            timestamp=datetime.now(),
            response_time=response_time,
        )
        self.last_result = health_result
        self.history.append(health_result)
        # Trim in place so the list object stays the same for any holder
        # of a reference to ``history``.
        if len(self.history) > self.MAX_HISTORY:
            del self.history[:-self.MAX_HISTORY]
        return health_result

    def get_uptime(self, window: timedelta = timedelta(hours=1)) -> float:
        """Return the percentage of healthy results within ``window``.

        Uptime is computed over recorded check results, not elapsed time.
        Returns 0.0 when no result falls inside the window.
        """
        cutoff = datetime.now() - window
        recent = [r for r in self.history if r.timestamp > cutoff]
        if not recent:
            return 0.0
        healthy = sum(1 for r in recent if r.status == Status.HEALTHY)
        return (healthy / len(recent)) * 100
class HealthMonitor:
    """Registry of monitored services with on-demand and background checks.

    ``check_all`` runs every registered check immediately. ``start`` launches
    a daemon thread that runs each service's check once its own ``interval``
    has elapsed. Any result whose status is not ``Status.HEALTHY`` is passed
    to every handler registered through ``on_alert``.

    Args:
        poll_interval: Longest time, in seconds, the background thread sleeps
            between scheduling passes. It bounds how long a newly registered
            service waits for its first background check.
    """

    # Lower bound on the background sleep so a zero or negative service
    # interval cannot turn the loop into a busy spin.
    _MIN_WAIT_SECONDS = 0.05

    def __init__(self, poll_interval: float = 30.0):
        self.services: Dict[str, "Service"] = {}
        self.alert_handlers: List[Callable] = []
        self.poll_interval = poll_interval
        self._running = False
        self._thread: Optional[threading.Thread] = None
        # Set by stop(); lets the background thread wake up immediately
        # instead of finishing a fixed sleep.
        self._stop_event = threading.Event()
        # Service name -> time.monotonic() value at which it is next due.
        self._next_due: Dict[str, float] = {}

    def register_service(self, name: str, check_fn: Callable, interval: int = 60):
        """Create, store, and return a ``Service``; replaces any same-named one."""
        service = Service(name, check_fn, interval)
        self.services[name] = service
        # Forget the schedule of a previous service with this name so the
        # new one is checked on the next background pass.
        self._next_due.pop(name, None)
        return service

    def unregister_service(self, name: str):
        """Remove the named service; unknown names are ignored."""
        self.services.pop(name, None)
        self._next_due.pop(name, None)

    def check_service(self, name: str) -> Optional["HealthResult"]:
        """Run one service's check now and return the result.

        Returns None for an unknown name. Alert handlers are not invoked.
        """
        service = self.services.get(name)
        return service.check() if service else None

    def check_all(self) -> List["HealthResult"]:
        """Run every registered check now, alerting on non-healthy results."""
        # Iterate over a snapshot so a concurrent register/unregister cannot
        # raise "dictionary changed size during iteration".
        return [self._check_and_alert(service)
                for service in list(self.services.values())]

    def get_status(self) -> Dict:
        """Return a dict mapping service name to its latest status summary.

        Each summary holds 'status', 'message', 'response_time', 'uptime_1h'
        and 'uptime_24h'. Services that were never checked report the status
        'unknown', an empty message and a response time of 0.
        """
        report = {}
        for name, service in list(self.services.items()):
            # Read last_result once so every field comes from the same result.
            last = service.last_result
            report[name] = {
                'status': last.status.value if last else 'unknown',
                'message': last.message if last else '',
                'response_time': last.response_time if last else 0,
                'uptime_1h': service.get_uptime(timedelta(hours=1)),
                'uptime_24h': service.get_uptime(timedelta(hours=24)),
            }
        return report

    def on_alert(self, handler: Callable):
        """Register ``handler(service, result)`` for non-healthy results."""
        self.alert_handlers.append(handler)

    def _check_and_alert(self, service: "Service") -> "HealthResult":
        """Run one check and notify the alert handlers unless it is healthy."""
        result = service.check()
        if result.status != Status.HEALTHY:
            self._trigger_alerts(service, result)
        return result

    def _trigger_alerts(self, service: "Service", result: "HealthResult"):
        """Call every alert handler; a failing handler never stops the rest."""
        for handler in list(self.alert_handlers):
            try:
                handler(service, result)
            except Exception:
                # Still best-effort, but record the failure instead of
                # hiding a broken alert channel.
                logging.getLogger(__name__).exception(
                    "Alert handler %r failed for service %r",
                    handler, getattr(service, "name", service))

    def start(self):
        """Start the background checking thread; no-op if already running."""
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._running = True
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Signal the background thread to stop and wait for it to finish."""
        self._running = False
        self._stop_event.set()
        thread = self._thread
        # A thread cannot join itself (e.g. stop() called from a handler).
        if thread is not None and thread is not threading.current_thread():
            thread.join()
            self._thread = None

    def _run_loop(self):
        """Background loop: check due services, then sleep until the next one."""
        while not self._stop_event.is_set():
            self._check_due_services()
            # wait() returns as soon as stop() sets the event.
            self._stop_event.wait(self._seconds_until_next_due())

    def _check_due_services(self):
        """Check each service whose interval has elapsed since its last run."""
        now = time.monotonic()
        for name, service in list(self.services.items()):
            if self._stop_event.is_set():
                return
            if now < self._next_due.get(name, 0.0):
                continue
            self._check_and_alert(service)
            # Only reschedule if this exact service is still registered.
            if self.services.get(name) is service:
                self._next_due[name] = time.monotonic() + max(service.interval, 0)

    def _seconds_until_next_due(self) -> float:
        """Return how long the loop may sleep before a service becomes due."""
        now = time.monotonic()
        waits = [self._next_due.get(name, now) - now
                 for name in list(self.services)]
        soonest = min(waits, default=self.poll_interval)
        return min(self.poll_interval, max(soonest, self._MIN_WAIT_SECONDS))
Health Checks
import requests
def http_check(url: str, expected_status: int = 200, timeout: int = 5):
    """Return True when a GET of ``url`` answers with ``expected_status``.

    ``timeout`` is passed through to ``requests.get``. Exceptions raised by
    ``requests`` are not caught here.
    """
    actual_status = requests.get(url, timeout=timeout).status_code
    return actual_status == expected_status
def tcp_check(host: str, port: int, timeout: int = 5):
    """Return True when a TCP connection to ``host:port`` can be opened.

    Uses an IPv4 stream socket with the given timeout in seconds. A refused
    or timed-out connection yields False; errors raised before the attempt
    (for example an out-of-range port or a host that cannot be resolved)
    propagate to the caller.
    """
    import socket

    # The context manager closes the socket even when connect_ex raises,
    # which a plain close() call after it would not.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(timeout)
        # connect_ex returns an errno-style code instead of raising for
        # ordinary connection failures; 0 means the connection succeeded.
        return sock.connect_ex((host, port)) == 0
def ping_check(host: str, timeout: int = 5):
    """Return True when a single ``ping`` to ``host`` succeeds.

    Runs the system ``ping`` command with one echo request. Returns False
    when ping exits with a non-zero status, does not finish within
    ``timeout + 1`` seconds, or cannot be started at all.

    NOTE(review): ``-W`` is a per-reply timeout in seconds for Linux
    iputils ping; other platforms may interpret the flag differently.
    """
    import subprocess

    try:
        completed = subprocess.run(
            ['ping', '-c', '1', '-W', str(timeout), host],
            capture_output=True, timeout=timeout + 1)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Timed out, ping missing or not executable, or an argument that
        # cannot be passed to a process. Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit through.
        return False
    # subprocess.run does not raise on a non-zero exit status, so the
    # result of the ping has to be read from the return code.
    return completed.returncode == 0
def database_check(connection_string: str):
    """Open and immediately close a psycopg2 connection.

    Returns True on success. Any exception, including a failed import of
    psycopg2, is caught and its string form is returned instead.
    """
    try:
        import psycopg2
        psycopg2.connect(connection_string).close()
    except Exception as error:
        return str(error)
    return True
def redis_check(host: str = 'localhost', port: int = 6379):
    """Ping a Redis server and return the result of ``ping()``.

    Exceptions from the import or from the redis client are not caught here.
    """
    import redis

    return redis.Redis(host=host, port=port).ping()
def custom_check():
    """Template for a user-defined check; always reports success."""
    # Your custom health logic
    return True
Alerts
def console_alert(service, result):
    """Print a one-line alert with the service name, status and message."""
    line = f"[ALERT] {service.name}: {result.status.value} - {result.message}"
    print(line)
def email_alert(service, result):
    """Placeholder email alert: prints what would be sent."""
    # Send email notification
    line = f"Would send email: {service.name} is {result.status.value}"
    print(line)
def slack_alert(service, result):
    """Placeholder Slack alert: prints what would be sent."""
    # Send to Slack webhook
    line = f"Would send Slack: {service.name} is {result.status.value}"
    print(line)
def pagerduty_alert(service, result):
    """Placeholder PagerDuty alert: prints what would be triggered."""
    # Trigger PagerDuty incident
    line = f"Would trigger PagerDuty: {service.name} is {result.status.value}"
    print(line)
# Example wiring: monitor an HTTP endpoint and a Redis server in the
# background for one minute, sending alerts to three handlers.
import time

monitor = HealthMonitor()
monitor.register_service('api', lambda: http_check('http://localhost:8080/health'))
# Named 'redis' because this check pings Redis; the previous name
# 'database' made alerts point at the wrong system.
monitor.register_service('redis', lambda: redis_check())
monitor.on_alert(console_alert)
monitor.on_alert(email_alert)
monitor.on_alert(slack_alert)
monitor.start()
try:
    time.sleep(60)
finally:
    # Stop the background thread even if the sleep is interrupted.
    monitor.stop()
Testing
from healthmon import HealthMonitor, http_check
# Smoke test: register two HTTP health endpoints, run every check once,
# then print each result followed by the aggregated status.
monitor = HealthMonitor()
endpoints = {
    'web': 'http://localhost:3000/health',
    'api': 'http://localhost:8080/health',
}
for service_name, url in endpoints.items():
    # Bind url as a default argument so each lambda keeps its own endpoint.
    monitor.register_service(service_name, lambda url=url: http_check(url))
results = monitor.check_all()
for result in results:
    print(f"{result.service}: {result.status.value} ({result.response_time:.3f}s)")
status = monitor.get_status()
print(f"\nStatus: {status}")
Summary
You built a service health monitoring tool with health checks, alerting, uptime tracking, and status reporting.