Improved health check and monitoring

This commit is contained in:
2025-06-05 11:22:22 +02:00
Verified
parent fed2e0df07
commit 996fe387df
4 changed files with 107 additions and 52 deletions
+4 -3
View File
@@ -1,5 +1,6 @@
from datetime import datetime, timedelta
from logging import Logger
from typing import Optional
from zoneinfo import ZoneInfo
from starlette.responses import JSONResponse
@@ -18,12 +19,12 @@ async def get_health_v1() -> JSONResponse:
@app.get("/v1/health", response_class=JSONResponse)
async def get_health_v1() -> JSONResponse:
if app.status is None or app.status._last_update < (
async def get_health_v1(detailed: Optional[bool] = False) -> JSONResponse:
if app.status is None or app.status.get_last_update() < (
datetime.now(tz=ZoneInfo("UTC")) - timedelta(seconds=30)
):
app.update_status(await ApplicationHealth.from_data(app, db))
health: ApplicationHealth = app.status
return JSONResponse(health.to_json())
return JSONResponse(health.to_json(detailed=detailed if detailed is not None else False))
+94 -42
View File
@@ -1,15 +1,21 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Any
from logging import Logger
from typing import Dict, Any, Optional
from zoneinfo import ZoneInfo
from libbot.cache.classes import Cache
from libbot.pycord.classes import PycordBot
from pymongo.asynchronous.database import AsyncDatabase
from pymongo.errors import ConnectionFailure
from classes.enums import HealthStatus
from classes.fastapi import FastAPI
from classes.service_status import ServiceStatus
from modules.database import db_client
from modules.utils import get_logger
logger: Logger = get_logger(__name__)
@dataclass
@@ -23,14 +29,17 @@ class ApplicationHealth:
@classmethod
async def from_data(cls, app: FastAPI, database: AsyncDatabase) -> "ApplicationHealth":
database_health: ServiceStatus = await ApplicationHealth.get_database_health(database)
cache_health: ServiceStatus = ApplicationHealth.get_cache_health(app.bot.cache)
data: Dict[str, Any] = {
"bot": ApplicationHealth.get_bot_health(app.bot),
"cache": ApplicationHealth.get_cache_health(app.bot.cache),
"database": await ApplicationHealth.get_database_health(database),
"bot": ApplicationHealth.get_bot_health(app.bot, cache_health, database_health),
"cache": cache_health,
"database": database_health,
}
data["api"] = ApplicationHealth.get_api_health(
data["bot"], data["cache"], data["database"]
data["bot"], data["cache"], database_health
)
data["_last_update"] = datetime.now(tz=ZoneInfo("UTC"))
@@ -42,10 +51,29 @@ class ApplicationHealth:
# TODO Fix the message
@staticmethod
def get_bot_health(bot: PycordBot) -> ServiceStatus:
return ServiceStatus(
HealthStatus.OPERATIONAL if bot.is_ready() else HealthStatus.FAILED, None
)
def get_bot_health(
bot: PycordBot, cache_status: ServiceStatus, database_status: ServiceStatus
) -> ServiceStatus:
if not bot.is_ready():
return ServiceStatus(HealthStatus.FAILED, "discord connection has failed")
if database_status.status != HealthStatus.OPERATIONAL:
match database_status.status:
case HealthStatus.FAILED, HealthStatus.UNKNOWN:
return ServiceStatus(HealthStatus.FAILED, "database connection has failed")
case HealthStatus.DEGRADED:
return ServiceStatus(
HealthStatus.DEGRADED, "database connection is degraded"
)
if cache_status.status not in [HealthStatus.UNKNOWN, HealthStatus.OPERATIONAL]:
match cache_status.status:
case HealthStatus.FAILED:
return ServiceStatus(HealthStatus.DEGRADED, "cache connection has failed")
case HealthStatus.DEGRADED:
return ServiceStatus(HealthStatus.DEGRADED, "cache is degraded")
return ServiceStatus(HealthStatus.OPERATIONAL, None)
# TODO Fix the message
# TODO Implement this method
@@ -60,44 +88,68 @@ class ApplicationHealth:
@staticmethod
async def get_database_health(database: AsyncDatabase) -> ServiceStatus:
try:
return ServiceStatus(
(
HealthStatus.OPERATIONAL
if (await database.client.server_info()) is not None
else HealthStatus.FAILED
),
None,
)
except Exception as exc:
await db_client.admin.command("ping")
except ConnectionFailure as exc:
return ServiceStatus(HealthStatus.FAILED, str(exc))
@staticmethod
def get_api_health(
bot_status: ServiceStatus, cache_status: ServiceStatus, database_status: ServiceStatus
) -> ServiceStatus:
if database_status.status != HealthStatus.OPERATIONAL:
ServiceStatus(
HealthStatus.FAILED,
"database connection has failed",
)
elif (
bot_status.status != HealthStatus.OPERATIONAL
or cache_status.status != HealthStatus.OPERATIONAL
):
return ServiceStatus(
HealthStatus.DEGRADED,
None,
)
return ServiceStatus(
HealthStatus.OPERATIONAL,
None,
)
def to_json(self) -> Dict[str, Dict[str, str | None]]:
return {
"api": self.api.to_json(),
"bot": self.bot.to_json(),
"cache": self.cache.to_json(),
"database": self.database.to_json(),
@staticmethod
def get_api_health(
bot_status: ServiceStatus, cache_status: ServiceStatus, database_status: ServiceStatus
) -> ServiceStatus:
if database_status.status != HealthStatus.OPERATIONAL:
match database_status.status:
case HealthStatus.FAILED, HealthStatus.UNKNOWN:
return ServiceStatus(
HealthStatus.FAILED,
"database connection has failed",
)
case HealthStatus.DEGRADED:
return ServiceStatus(
HealthStatus.DEGRADED,
"database connection is degraded",
)
if bot_status.status != HealthStatus.OPERATIONAL:
match bot_status.status:
case HealthStatus.FAILED, HealthStatus.UNKNOWN:
return ServiceStatus(
HealthStatus.DEGRADED,
"bot integration has failed",
)
case HealthStatus.DEGRADED:
return ServiceStatus(
HealthStatus.DEGRADED,
"bot integration is degraded",
)
if cache_status.status not in [HealthStatus.OPERATIONAL, HealthStatus.UNKNOWN]:
match cache_status.status:
case HealthStatus.FAILED:
return ServiceStatus(HealthStatus.DEGRADED, "cache connection has failed")
case HealthStatus.DEGRADED:
return ServiceStatus(HealthStatus.DEGRADED, "cache is degraded")
return ServiceStatus(
HealthStatus.OPERATIONAL,
None,
)
def get_last_update(self) -> datetime:
    """Return the timestamp stored in this object's ``_last_update`` attribute."""
    last_update: datetime = self._last_update
    return last_update
def to_json(self, detailed: Optional[bool] = False) -> Dict[str, Dict[str, str | None]]:
output: Dict[str, Any] = {
"api": self.api.to_json(detailed),
"bot": self.bot.to_json(detailed),
}
if detailed:
output["cache"] = self.cache.to_json(detailed)
output["database"] = self.database.to_json(detailed)
return output
+8 -6
View File
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Literal, Dict
from typing import Literal, Dict, Optional
from classes.enums import HealthStatus
@@ -14,8 +14,10 @@ class ServiceStatus:
]
message: str | None
def to_json(self) -> Dict[str, str | None]:
return {
"status": self.status.value,
"message": self.message,
}
def to_json(self, detailed: Optional[bool] = False) -> Dict[str, str | None]:
output: Dict[str, str | None] = {"status": self.status.value}
if detailed:
output["message"] = self.message
return output
+1 -1
View File
@@ -23,7 +23,7 @@ else:
)
# Async declarations
db_client = AsyncMongoClient(con_string, timeoutms=5000)
db_client = AsyncMongoClient(con_string, connectTimeoutMS=3000)
db: AsyncDatabase = db_client.get_database(name=db_config["name"])
col_users: AsyncCollection = db.get_collection("users")