Просмотр исходного кода

feat: add K8s health check probes with DB/Redis/Proxy checks (#1019)

* feat: add K8s health check probes with DB/Redis/Proxy checks

Replace the static "ok" health endpoint with proper K8s liveness/readiness
probes that check database, Redis, and Hono proxy layer health with
timeout protection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address code review findings from PR #1019

- Fix timer type: use `| undefined` instead of non-null assertion
- Distinguish Redis "not configured" vs "init failed" via REDIS_URL check
- Extract shared `handleReadinessRequest()` to eliminate route duplication
- Preserve legacy `/api/actions/health` response format (`status: "ok"`)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Ding 1 день назад
Родитель
Коммит
165285eaba

+ 19 - 8
src/app/api/actions/[...route]/route.ts

@@ -2157,14 +2157,25 @@ app.get(
   })
 );
 
-// 健康检查端点
-app.get("/health", (c) =>
-  c.json({
-    status: "ok",
-    timestamp: new Date().toISOString(),
-    version: "1.0.0",
-  })
-);
+// Legacy health endpoint. Keeps the old `{ status, timestamp, version }` body shape
+// for existing consumers; use /api/health/ready for the detailed probe.
+// A failed check is reported both in the body (`status: "error"`) and as HTTP 503,
+// so monitors that only inspect the status code also notice the failure.
+app.get("/health", async (c) => {
+  try {
+    // The checker is resolved at request time through a dynamic import.
+    const { checkReadiness, getAppVersion } = await import("@/lib/health/checker");
+    const health = await checkReadiness();
+    // Only "unhealthy" maps to an error; "degraded" still answers "ok" / 200.
+    const unhealthy = health.status === "unhealthy";
+    return c.json(
+      {
+        status: unhealthy ? "error" : "ok",
+        timestamp: health.timestamp,
+        version: getAppVersion(),
+        details: health,
+      },
+      unhealthy ? 503 : 200
+    );
+  } catch (error) {
+    // Record why the check could not run instead of discarding the exception.
+    // Logging is best-effort: a logger failure must not mask the 503 response.
+    try {
+      const { logger } = await import("@/lib/logger");
+      logger.error({
+        action: "actions_health_check_failed",
+        error: error instanceof Error ? error.message : String(error),
+      });
+    } catch {
+      // Nothing more can be done here; fall through to the error response.
+    }
+    return c.json(
+      {
+        status: "error",
+        timestamp: new Date().toISOString(),
+        version: "unknown",
+      },
+      503
+    );
+  }
+});
 
 // 导出处理器 (Vercel Edge Functions 格式)
 export const GET = handle(app);

+ 11 - 0
src/app/api/health/live/route.ts

@@ -0,0 +1,11 @@
+import { NextResponse } from "next/server";
+
+// Next.js route segment config: Node.js runtime, always rendered dynamically.
+export const runtime = "nodejs";
+export const dynamic = "force-dynamic";
+
+/**
+ * Liveness probe: answers 200 with `{ status: "alive", timestamp }` and performs
+ * no dependency checks.
+ */
+export function GET() {
+  const body = { status: "alive", timestamp: new Date().toISOString() };
+  return NextResponse.json(body, { status: 200 });
+}

+ 8 - 0
src/app/api/health/ready/route.ts

@@ -0,0 +1,8 @@
+import { handleReadinessRequest } from "@/lib/health/checker";
+
+// Next.js route segment config: Node.js runtime, always rendered dynamically.
+export const runtime = "nodejs";
+export const dynamic = "force-dynamic";
+
+/** Readiness probe: delegates to the shared handler, tagging error logs with this route's action. */
+export function GET() {
+  const logAction = "health_readiness_check_failed";
+  return handleReadinessRequest(logAction);
+}

+ 8 - 0
src/app/api/health/route.ts

@@ -0,0 +1,8 @@
+import { handleReadinessRequest } from "@/lib/health/checker";
+
+// Next.js route segment config: Node.js runtime, always rendered dynamically.
+export const runtime = "nodejs";
+export const dynamic = "force-dynamic";
+
+/** Combined health endpoint: same readiness handler as /api/health/ready, with its own log action. */
+export function GET() {
+  const logAction = "health_check_failed";
+  return handleReadinessRequest(logAction);
+}

+ 5 - 0
src/app/v1/[...route]/route.ts

@@ -40,9 +40,14 @@ app.post("/chat/completions", handleProxyRequest);
 // Response API 路由(支持 Codex)
 app.post("/responses", handleProxyRequest);
 
+// Internal health self-check endpoint: answers directly instead of going through the
+// proxy handler, so it only verifies that the Hono middleware chain is usable.
+app.get("/_ping", (c) => {
+  return c.json({ status: "pong" });
+});
+
 // Claude API 和其他所有请求(fallback)
 app.all("*", handleProxyRequest);
 
+export { app as v1App };
+
 export const GET = handle(app);
 export const POST = handle(app);
 export const PUT = handle(app);

+ 159 - 0
src/lib/health/checker.ts

@@ -0,0 +1,159 @@
+import { sql } from "drizzle-orm";
+import { NextResponse } from "next/server";
+import { db } from "@/drizzle/db";
+import { getRedisClient } from "@/lib/redis/client";
+import { APP_VERSION } from "@/lib/version";
+import type { ComponentHealth, HealthCheckResponse } from "./types";
+
+// -- Version --
+
+/** APP_VERSION with one leading "v"/"V" removed; computed once when the module loads. */
+const versionWithoutPrefix = APP_VERSION.replace(/^v/i, "");
+
+/** Returns the application version string without its leading "v" prefix. */
+export function getAppVersion(): string {
+  return versionWithoutPrefix;
+}
+
+// -- Timeout helper --
+
+const DB_CHECK_TIMEOUT_MS = 3_000;
+const REDIS_CHECK_TIMEOUT_MS = 2_000;
+
+/**
+ * Waits for `promise`, but gives up after `timeoutMs` milliseconds.
+ *
+ * @param promise   Operation to wait for. It is not cancelled on timeout; only the wait ends.
+ * @param timeoutMs Maximum time to wait, in milliseconds.
+ * @param label     Component name used in the timeout error message.
+ * @returns The value `promise` resolves with.
+ * @throws Whatever `promise` rejects with, or an Error saying the check timed out.
+ */
+async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, label: string): Promise<T> {
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  const deadline = new Promise<never>((_resolve, reject) => {
+    timer = setTimeout(() => {
+      reject(new Error(`${label} health check timed out after ${timeoutMs}ms`));
+    }, timeoutMs);
+  });
+  // Whichever side settles first wins; the timer is cleared in either case.
+  return await Promise.race([promise, deadline]).finally(() => {
+    clearTimeout(timer);
+  });
+}
+
+// -- Database check --
+
+/**
+ * Probes the database with `SELECT 1`, bounded by DB_CHECK_TIMEOUT_MS.
+ *
+ * @returns "up" with the rounded latency, or "down" with latency and the error
+ *          message when the query fails or times out. Never rejects.
+ */
+export async function checkDatabase(): Promise<ComponentHealth> {
+  const startedAt = performance.now();
+  const elapsedMs = (): number => Math.round(performance.now() - startedAt);
+
+  try {
+    await withTimeout(db.execute(sql`SELECT 1`), DB_CHECK_TIMEOUT_MS, "database");
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    return { status: "down", latencyMs: elapsedMs(), message };
+  }
+  return { status: "up", latencyMs: elapsedMs() };
+}
+
+// -- Redis check --
+
+/**
+ * Probes Redis with PING, bounded by REDIS_CHECK_TIMEOUT_MS.
+ *
+ * @returns "unchecked" when REDIS_URL is empty or unset; "down" when the client
+ *          cannot be obtained, reports status "end"/"close", or the ping fails or
+ *          times out; otherwise "up" with the rounded latency. Never rejects.
+ */
+export async function checkRedis(): Promise<ComponentHealth> {
+  const startedAt = performance.now();
+  const elapsedMs = (): number => Math.round(performance.now() - startedAt);
+  const down = (message: string): ComponentHealth => ({
+    status: "down",
+    latencyMs: elapsedMs(),
+    message,
+  });
+
+  try {
+    // No URL means Redis is intentionally absent, which is not a failure.
+    const redisUrl = process.env.REDIS_URL?.trim();
+    if (!redisUrl) {
+      return { status: "unchecked", message: "Redis not configured" };
+    }
+
+    // A URL is configured, so a missing client is treated as an initialization failure.
+    const client = getRedisClient({ allowWhenRateLimitDisabled: true });
+    if (!client) {
+      return down("Redis client initialization failed");
+    }
+
+    // Skip the ping when the client already reports a closed connection.
+    if (["end", "close"].includes(client.status)) {
+      return down(`Redis client status: ${client.status}`);
+    }
+
+    await withTimeout(client.ping(), REDIS_CHECK_TIMEOUT_MS, "redis");
+    return { status: "up", latencyMs: elapsedMs() };
+  } catch (error) {
+    return down(error instanceof Error ? error.message : String(error));
+  }
+}
+
+// -- Hono proxy layer self-check --
+
+const PROXY_CHECK_TIMEOUT_MS = 2_000;
+
+/**
+ * Probes the v1 Hono app by sending `GET /v1/_ping` through the app's `request()`
+ * method, bounded by PROXY_CHECK_TIMEOUT_MS.
+ *
+ * @returns "up" with the rounded latency when the response is ok; "down" with the
+ *          HTTP status or error message otherwise. Never rejects.
+ */
+export async function checkProxy(): Promise<ComponentHealth> {
+  const startedAt = performance.now();
+  const elapsedMs = (): number => Math.round(performance.now() - startedAt);
+
+  try {
+    // The route module is resolved at call time through a dynamic import.
+    const { v1App } = await import("@/app/v1/[...route]/route");
+    const pending = Promise.resolve(v1App.request("/v1/_ping", { method: "GET" }));
+    const res = await withTimeout(pending, PROXY_CHECK_TIMEOUT_MS, "proxy");
+
+    if (!res.ok) {
+      return {
+        status: "down",
+        latencyMs: elapsedMs(),
+        message: `Proxy returned HTTP ${res.status}`,
+      };
+    }
+    return { status: "up", latencyMs: elapsedMs() };
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    return { status: "down", latencyMs: elapsedMs(), message };
+  }
+}
+
+// -- Overall verdict --
+
+/**
+ * Runs the database, Redis and proxy checks concurrently and combines them.
+ *
+ * The database is required: if it is down the result is "unhealthy".
+ * Redis and the proxy are optional: if either is down the result is "degraded",
+ * which callers in this module still serve with HTTP 200.
+ *
+ * @returns Overall status plus timestamp, version, process uptime in whole seconds
+ *          and the per-component results.
+ */
+export async function checkReadiness(): Promise<HealthCheckResponse> {
+  const version = getAppVersion();
+  const [database, redis, proxy] = await Promise.all([checkDatabase(), checkRedis(), checkProxy()]);
+
+  const overall = (): HealthCheckResponse["status"] => {
+    if (database.status === "down") {
+      return "unhealthy";
+    }
+    const optionalDown = [redis, proxy].some((component) => component.status === "down");
+    return optionalDown ? "degraded" : "healthy";
+  };
+
+  return {
+    status: overall(),
+    timestamp: new Date().toISOString(),
+    version,
+    uptime: Math.round(process.uptime()),
+    components: { database, redis, proxy },
+  };
+}
+
+// -- Shared route handler --
+
+/**
+ * Builds the HTTP response for a readiness-style route.
+ *
+ * @param action Value logged as `action` if the check throws unexpectedly.
+ * @returns The readiness result as JSON: 503 when the status is "unhealthy",
+ *          200 otherwise. If the check itself throws, the error is logged and a
+ *          minimal 503 "unhealthy" body is returned.
+ */
+export async function handleReadinessRequest(action: string): Promise<NextResponse> {
+  try {
+    const health = await checkReadiness();
+    const isUnhealthy = health.status === "unhealthy";
+    return NextResponse.json(health, { status: isUnhealthy ? 503 : 200 });
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    // The logger is resolved at call time through a dynamic import.
+    const { logger } = await import("@/lib/logger");
+    logger.error({
+      action,
+      error: reason,
+    });
+
+    const fallbackBody = {
+      status: "unhealthy",
+      timestamp: new Date().toISOString(),
+      error: "Health check failed",
+    };
+    return NextResponse.json(fallbackBody, { status: 503 });
+  }
+}

+ 19 - 0
src/lib/health/types.ts

@@ -0,0 +1,19 @@
+/** Status reported for a single component. */
+export type ComponentStatus = "up" | "down" | "degraded" | "unchecked";
+
+/** Result of probing one component. */
+export interface ComponentHealth {
+  status: ComponentStatus;
+  /** Time spent on the check, in milliseconds. */
+  latencyMs?: number;
+  /** Human-readable detail, such as the error text when the component is down. */
+  message?: string;
+}
+
+/** Keys that may appear under `HealthCheckResponse.components`. */
+type ComponentName = "database" | "redis" | "proxy";
+
+/** Body returned by the readiness endpoints. */
+export interface HealthCheckResponse {
+  status: "healthy" | "degraded" | "unhealthy";
+  /** Time at which the response was built, as a string. */
+  timestamp: string;
+  version: string;
+  uptime: number;
+  /** Per-component results; every entry is optional. */
+  components?: Partial<Record<ComponentName, ComponentHealth>>;
+}

+ 102 - 0
tests/unit/api/health-routes.test.ts

@@ -0,0 +1,102 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+// -- mocks --
+
+// Created through vi.hoisted so the mock function already exists when the
+// vi.mock factory below runs.
+const mocks = vi.hoisted(() => ({
+  handleReadinessRequest: vi.fn(),
+}));
+
+// Replace the checker module so the route handlers under test call the mock
+// instead of running real dependency checks.
+vi.mock("@/lib/health/checker", () => ({
+  handleReadinessRequest: mocks.handleReadinessRequest,
+}));
+
+// Helper: builds a plain Response carrying `body` as JSON with the given HTTP status,
+// used as a stand-in for a NextResponse.
+function jsonResponse(body: Record<string, unknown>, status: number) {
+  const headers = { "content-type": "application/json" };
+  const payload = JSON.stringify(body);
+  return new Response(payload, { status, headers });
+}
+
+// -- liveness --
+
+describe("GET /api/health/live", () => {
+  // Start every test with clean mocks and a fresh module registry.
+  beforeEach(() => {
+    vi.clearAllMocks();
+    vi.resetModules();
+  });
+
+  it("returns 200 with alive status", async () => {
+    const liveRoute = await import("@/app/api/health/live/route");
+    const response = liveRoute.GET();
+    expect(response.status).toBe(200);
+
+    const payload = await response.json();
+    expect(payload.status).toBe("alive");
+    expect(payload.timestamp).toBeDefined();
+  });
+});
+
+// -- readiness --
+
+describe("GET /api/health/ready", () => {
+  // Imports the route fresh (modules are reset before each test) and invokes it.
+  const callRoute = async () => {
+    const { GET } = await import("@/app/api/health/ready/route");
+    return GET();
+  };
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    vi.resetModules();
+  });
+
+  it("returns 200 for healthy", async () => {
+    const stubbed = jsonResponse({ status: "healthy", version: "0.6.8" }, 200);
+    mocks.handleReadinessRequest.mockResolvedValue(stubbed);
+
+    const response = await callRoute();
+    expect(response.status).toBe(200);
+    const payload = await response.json();
+    expect(payload.status).toBe("healthy");
+  });
+
+  it("returns 200 for degraded", async () => {
+    const stubbed = jsonResponse({ status: "degraded", version: "0.6.8" }, 200);
+    mocks.handleReadinessRequest.mockResolvedValue(stubbed);
+
+    const response = await callRoute();
+    expect(response.status).toBe(200);
+    const payload = await response.json();
+    expect(payload.status).toBe("degraded");
+  });
+
+  it("returns 503 for unhealthy", async () => {
+    const stubbed = jsonResponse({ status: "unhealthy" }, 503);
+    mocks.handleReadinessRequest.mockResolvedValue(stubbed);
+
+    const response = await callRoute();
+    expect(response.status).toBe(503);
+    const payload = await response.json();
+    expect(payload.status).toBe("unhealthy");
+  });
+});
+
+// -- combined /api/health --
+
+describe("GET /api/health", () => {
+  // Imports the route fresh (modules are reset before each test) and invokes it.
+  const callRoute = async () => {
+    const { GET } = await import("@/app/api/health/route");
+    return GET();
+  };
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    vi.resetModules();
+  });
+
+  it("returns 200 for healthy", async () => {
+    const stubbed = jsonResponse({ status: "healthy", version: "0.6.8" }, 200);
+    mocks.handleReadinessRequest.mockResolvedValue(stubbed);
+
+    const response = await callRoute();
+    expect(response.status).toBe(200);
+  });
+
+  it("returns 503 for unhealthy", async () => {
+    const stubbed = jsonResponse({ status: "unhealthy" }, 503);
+    mocks.handleReadinessRequest.mockResolvedValue(stubbed);
+
+    const response = await callRoute();
+    expect(response.status).toBe(503);
+  });
+});

+ 276 - 0
tests/unit/lib/health-checker.test.ts

@@ -0,0 +1,276 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+// -- mocks --
+
+// Created through vi.hoisted so these values already exist when the vi.mock
+// factories below run.
+const mocks = vi.hoisted(() => ({
+  dbExecute: vi.fn(),
+  getRedisClient: vi.fn(),
+  APP_VERSION: "v0.6.8",
+  v1App: {
+    request: vi.fn(),
+  },
+}));
+
+// Database: only `db.execute` is provided.
+vi.mock("@/drizzle/db", () => ({
+  db: { execute: mocks.dbExecute },
+}));
+
+// `sql` tag stand-in: returns the template's literal parts joined into one string.
+vi.mock("drizzle-orm", () => ({
+  sql: (strings: TemplateStringsArray) => strings.join(""),
+}));
+
+// Redis: each test decides what getRedisClient returns.
+vi.mock("@/lib/redis/client", () => ({
+  getRedisClient: mocks.getRedisClient,
+}));
+
+// Version constant with a leading "v", so prefix stripping can be asserted.
+vi.mock("@/lib/version", () => ({
+  APP_VERSION: mocks.APP_VERSION,
+}));
+
+// v1 route module: exposes only the `v1App.request` mock.
+vi.mock("@/app/v1/[...route]/route", () => ({
+  v1App: mocks.v1App,
+}));
+
+// -- tests --
+
+describe("health/checker", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+    vi.resetModules();
+  });
+
+  // -- getAppVersion --
+
+  describe("getAppVersion", () => {
+    it("returns version without v prefix", async () => {
+      const checker = await import("@/lib/health/checker");
+      const version = checker.getAppVersion();
+      expect(version).toBe("0.6.8");
+    });
+  });
+
+  // -- checkDatabase --
+
+  describe("checkDatabase", () => {
+    // Imports the checker fresh (modules are reset before each test) and runs the DB check.
+    const runCheck = async () => {
+      const { checkDatabase } = await import("@/lib/health/checker");
+      return checkDatabase();
+    };
+
+    it("returns up when SELECT 1 succeeds", async () => {
+      mocks.dbExecute.mockResolvedValue([{ "?column?": 1 }]);
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("up");
+      expect(outcome.latencyMs).toBeGreaterThanOrEqual(0);
+    });
+
+    it("returns down when query throws", async () => {
+      mocks.dbExecute.mockRejectedValue(new Error("connection refused"));
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("down");
+      expect(outcome.message).toContain("connection refused");
+    });
+
+    it("returns down on timeout", async () => {
+      // The query outlives the checker's timeout, so the timeout path must win.
+      mocks.dbExecute.mockImplementation(
+        () => new Promise((resolve) => setTimeout(resolve, 5_000))
+      );
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("down");
+      expect(outcome.message).toContain("timed out");
+    }, 10_000);
+  });
+
+  // -- checkRedis --
+
+  describe("checkRedis", () => {
+    // Runs `body` with REDIS_URL set and always removes the variable afterwards,
+    // so a failing assertion cannot leak it into later tests.
+    async function withRedisUrl(body: () => Promise<void>): Promise<void> {
+      process.env.REDIS_URL = "redis://localhost:6379";
+      try {
+        await body();
+      } finally {
+        delete process.env.REDIS_URL;
+      }
+    }
+
+    // Imports the checker fresh (modules are reset before each test) and runs the Redis check.
+    const runCheck = async () => {
+      const { checkRedis } = await import("@/lib/health/checker");
+      return checkRedis();
+    };
+
+    it("returns up when ping succeeds", () =>
+      withRedisUrl(async () => {
+        mocks.getRedisClient.mockReturnValue({
+          status: "ready",
+          ping: vi.fn().mockResolvedValue("PONG"),
+        });
+        const result = await runCheck();
+        expect(result.status).toBe("up");
+      }));
+
+    it("returns unchecked when REDIS_URL is not set", async () => {
+      delete process.env.REDIS_URL;
+      mocks.getRedisClient.mockReturnValue(null);
+      const result = await runCheck();
+      expect(result.status).toBe("unchecked");
+      expect(result.message).toContain("not configured");
+    });
+
+    it("returns down when REDIS_URL is set but client is null", () =>
+      withRedisUrl(async () => {
+        mocks.getRedisClient.mockReturnValue(null);
+        const result = await runCheck();
+        expect(result.status).toBe("down");
+        expect(result.message).toContain("initialization failed");
+      }));
+
+    it("returns down when client status is end", () =>
+      withRedisUrl(async () => {
+        mocks.getRedisClient.mockReturnValue({ status: "end" });
+        const result = await runCheck();
+        expect(result.status).toBe("down");
+        expect(result.message).toContain("end");
+      }));
+
+    it("returns down when client status is close", () =>
+      withRedisUrl(async () => {
+        mocks.getRedisClient.mockReturnValue({ status: "close" });
+        const result = await runCheck();
+        expect(result.status).toBe("down");
+        expect(result.message).toContain("close");
+      }));
+
+    it("returns down when ping throws", () =>
+      withRedisUrl(async () => {
+        mocks.getRedisClient.mockReturnValue({
+          status: "ready",
+          ping: vi.fn().mockRejectedValue(new Error("ECONNRESET")),
+        });
+        const result = await runCheck();
+        expect(result.status).toBe("down");
+        expect(result.message).toContain("ECONNRESET");
+      }));
+
+    it(
+      "returns down on ping timeout",
+      () =>
+        withRedisUrl(async () => {
+          // The ping outlives the checker's timeout, so the timeout path must win.
+          mocks.getRedisClient.mockReturnValue({
+            status: "ready",
+            ping: vi.fn().mockImplementation(() => new Promise((r) => setTimeout(r, 5_000))),
+          });
+          const result = await runCheck();
+          expect(result.status).toBe("down");
+          expect(result.message).toContain("timed out");
+        }),
+      10_000
+    );
+  });
+
+  // -- checkProxy --
+
+  describe("checkProxy", () => {
+    // Imports the checker fresh (modules are reset before each test) and runs the proxy check.
+    const runCheck = async () => {
+      const { checkProxy } = await import("@/lib/health/checker");
+      return checkProxy();
+    };
+
+    it("returns up when _ping returns 200", async () => {
+      const pong = new Response('{"status":"pong"}', { status: 200 });
+      mocks.v1App.request.mockResolvedValue(pong);
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("up");
+      expect(outcome.latencyMs).toBeGreaterThanOrEqual(0);
+    });
+
+    it("returns down when _ping returns non-200", async () => {
+      const failure = new Response("error", { status: 500 });
+      mocks.v1App.request.mockResolvedValue(failure);
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("down");
+      expect(outcome.message).toContain("HTTP 500");
+    });
+
+    it("returns down when request throws", async () => {
+      mocks.v1App.request.mockRejectedValue(new Error("middleware crashed"));
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("down");
+      expect(outcome.message).toContain("middleware crashed");
+    });
+
+    it("returns down on timeout", async () => {
+      // The request outlives the checker's timeout, so the timeout path must win.
+      mocks.v1App.request.mockImplementation(() => new Promise((r) => setTimeout(r, 5_000)));
+      const outcome = await runCheck();
+      expect(outcome.status).toBe("down");
+      expect(outcome.message).toContain("timed out");
+    }, 10_000);
+  });
+
+  // -- checkReadiness --
+
+  describe("checkReadiness", () => {
+    // Runs `body` with REDIS_URL set and always removes the variable afterwards,
+    // so a failing assertion cannot leak it into later tests.
+    async function withRedisUrl(body: () => Promise<void>): Promise<void> {
+      process.env.REDIS_URL = "redis://localhost:6379";
+      try {
+        await body();
+      } finally {
+        delete process.env.REDIS_URL;
+      }
+    }
+
+    // Imports the checker fresh (modules are reset before each test) and runs the full check.
+    const runCheck = async () => {
+      const { checkReadiness } = await import("@/lib/health/checker");
+      return checkReadiness();
+    };
+
+    const pongResponse = () => new Response('{"status":"pong"}', { status: 200 });
+
+    it("returns healthy when all components are up", () =>
+      withRedisUrl(async () => {
+        mocks.dbExecute.mockResolvedValue([{ "?column?": 1 }]);
+        mocks.getRedisClient.mockReturnValue({
+          status: "ready",
+          ping: vi.fn().mockResolvedValue("PONG"),
+        });
+        mocks.v1App.request.mockResolvedValue(pongResponse());
+        const result = await runCheck();
+        expect(result.status).toBe("healthy");
+        expect(result.version).toBe("0.6.8");
+        expect(result.uptime).toBeGreaterThanOrEqual(0);
+        expect(result.components?.database?.status).toBe("up");
+        expect(result.components?.redis?.status).toBe("up");
+        expect(result.components?.proxy?.status).toBe("up");
+      }));
+
+    it("returns degraded when Redis is down but DB and proxy are up", () =>
+      withRedisUrl(async () => {
+        mocks.dbExecute.mockResolvedValue([{ "?column?": 1 }]);
+        mocks.getRedisClient.mockReturnValue({
+          status: "ready",
+          ping: vi.fn().mockRejectedValue(new Error("ECONNRESET")),
+        });
+        mocks.v1App.request.mockResolvedValue(pongResponse());
+        const result = await runCheck();
+        expect(result.status).toBe("degraded");
+        expect(result.components?.database?.status).toBe("up");
+        expect(result.components?.redis?.status).toBe("down");
+      }));
+
+    it("returns degraded when proxy is down but DB and Redis are up", () =>
+      withRedisUrl(async () => {
+        mocks.dbExecute.mockResolvedValue([{ "?column?": 1 }]);
+        mocks.getRedisClient.mockReturnValue({
+          status: "ready",
+          ping: vi.fn().mockResolvedValue("PONG"),
+        });
+        mocks.v1App.request.mockRejectedValue(new Error("middleware crashed"));
+        const result = await runCheck();
+        expect(result.status).toBe("degraded");
+        expect(result.components?.proxy?.status).toBe("down");
+      }));
+
+    it("returns unhealthy when DB is down", () =>
+      withRedisUrl(async () => {
+        mocks.dbExecute.mockRejectedValue(new Error("connection refused"));
+        mocks.getRedisClient.mockReturnValue({
+          status: "ready",
+          ping: vi.fn().mockResolvedValue("PONG"),
+        });
+        mocks.v1App.request.mockResolvedValue(pongResponse());
+        const result = await runCheck();
+        expect(result.status).toBe("unhealthy");
+        expect(result.components?.database?.status).toBe("down");
+      }));
+
+    it("returns healthy when Redis is unchecked (not configured)", async () => {
+      // Clear the variable explicitly so the test does not depend on the ambient
+      // environment or on cleanup done by an earlier test.
+      delete process.env.REDIS_URL;
+      mocks.dbExecute.mockResolvedValue([{ "?column?": 1 }]);
+      mocks.getRedisClient.mockReturnValue(null);
+      mocks.v1App.request.mockResolvedValue(pongResponse());
+      const result = await runCheck();
+      expect(result.status).toBe("healthy");
+      expect(result.components?.redis?.status).toBe("unchecked");
+    });
+  });
+});