Sfoglia il codice sorgente

Add CLI integration coverage for stdin stream routing/race invariants (#11846)

Add integration coverage for stdin stream routing and race invariants
Chris Estreich 1 mese fa
parent
commit
9a58f76299

+ 161 - 0
apps/cli/scripts/integration/cases/cancel-message-recovery-race.ts

@@ -0,0 +1,161 @@
+import { runStreamCase, StreamEvent } from "../lib/stream-harness"
+
+const START_PROMPT =
+	'Run exactly this command and do not summarize until it finishes: sleep 12 && echo "done". After it finishes, reply with exactly "done".'
+const FOLLOWUP_PROMPT = 'After cancellation, reply with only "RACE-OK".'
+
+/**
+ * Integration case: cancel a running command and immediately send a follow-up message.
+ *
+ * Flow, driven entirely by stream events:
+ *  1. On the first `system`/`init` event, send `start` with a prompt that runs a long `sleep`.
+ *  2. On the first `tool_use` event (subtype `command`) for the start request, send `cancel`
+ *     and a follow-up `message` back-to-back.
+ *  3. Record the `done` codes of both control commands and how the follow-up surfaces
+ *     in the stream (user turn vs. tool_result).
+ *  4. When the final `result` for the follow-up request arrives, validate the recorded
+ *     state, log `[PASS]` lines and send `shutdown`.
+ *
+ * Failed expectations are thrown from the event handler.
+ */
+async function main() {
+	const startRequestId = `start-${Date.now()}`
+	const cancelRequestId = `cancel-${Date.now()}`
+	const followupRequestId = `message-${Date.now()}`
+	const shutdownRequestId = `shutdown-${Date.now()}`
+
+	let initSeen = false
+	let sentCancelAndFollowup = false
+	let sentShutdown = false
+	let cancelDoneCode: string | undefined
+	let followupDoneCode: string | undefined
+	let followupResult = ""
+	let sawFollowupUserTurn = false
+	let sawMisroutedToolResult = false
+	let sawMessageControlError = false
+
+	await runStreamCase({
+		onEvent(event: StreamEvent, context) {
+			// Kick off the long-running task once, on the first init event.
+			if (event.type === "system" && event.subtype === "init" && !initSeen) {
+				initSeen = true
+				context.sendCommand({
+					command: "start",
+					requestId: startRequestId,
+					prompt: START_PROMPT,
+				})
+				return
+			}
+
+			// Any control error fails the case; remember whether it belonged to the follow-up.
+			if (event.type === "control" && event.subtype === "error") {
+				if (event.requestId === followupRequestId) {
+					sawMessageControlError = true
+				}
+				throw new Error(
+					`received control error for requestId=${event.requestId ?? "unknown"} command=${event.command ?? "unknown"} code=${event.code ?? "unknown"} content=${event.content ?? ""}`,
+				)
+			}
+
+			// The race itself: as soon as the command tool is in use, cancel and
+			// send the follow-up without waiting for the cancel to settle.
+			if (
+				!sentCancelAndFollowup &&
+				event.type === "tool_use" &&
+				event.requestId === startRequestId &&
+				event.subtype === "command"
+			) {
+				context.sendCommand({
+					command: "cancel",
+					requestId: cancelRequestId,
+				})
+				context.sendCommand({
+					command: "message",
+					requestId: followupRequestId,
+					prompt: FOLLOWUP_PROMPT,
+				})
+				sentCancelAndFollowup = true
+				return
+			}
+
+			if (
+				event.type === "control" &&
+				event.command === "cancel" &&
+				event.subtype === "done" &&
+				event.requestId === cancelRequestId
+			) {
+				cancelDoneCode = event.code
+				return
+			}
+
+			if (
+				event.type === "control" &&
+				event.command === "message" &&
+				event.subtype === "done" &&
+				event.requestId === followupRequestId
+			) {
+				followupDoneCode = event.code
+				return
+			}
+
+			// A follow-up that shows up inside a tool_result wrapped in <user_message>
+			// was routed incorrectly.
+			if (
+				event.type === "tool_result" &&
+				event.requestId === followupRequestId &&
+				typeof event.content === "string" &&
+				event.content.includes("<user_message>")
+			) {
+				sawMisroutedToolResult = true
+				return
+			}
+
+			if (event.type === "user" && event.requestId === followupRequestId) {
+				// Latch: once the follow-up has been observed as a user turn, a later
+				// user event for the same request must not reset the flag.
+				if (typeof event.content === "string" && event.content.includes("RACE-OK")) {
+					sawFollowupUserTurn = true
+				}
+				return
+			}
+
+			// Everything below runs only for the final result of the follow-up request.
+			if (event.type !== "result" || event.done !== true || event.requestId !== followupRequestId) {
+				return
+			}
+
+			followupResult = event.content ?? ""
+
+			if (followupResult.trim().length === 0) {
+				throw new Error("follow-up after cancel produced an empty result")
+			}
+			if (cancelDoneCode !== "cancel_requested") {
+				throw new Error(
+					`cancel done code mismatch; expected cancel_requested, got "${cancelDoneCode ?? "none"}"`,
+				)
+			}
+			if (followupDoneCode !== "responded" && followupDoneCode !== "queued") {
+				throw new Error(
+					`unexpected follow-up done code after cancel race; expected responded|queued, got "${followupDoneCode ?? "none"}"`,
+				)
+			}
+			if (sawMessageControlError) {
+				throw new Error("follow-up message emitted control error in cancel recovery race")
+			}
+			if (sawMisroutedToolResult) {
+				throw new Error(
+					"follow-up message was misrouted into tool_result (<user_message>) in cancel recovery race",
+				)
+			}
+			if (!sawFollowupUserTurn) {
+				throw new Error("follow-up after cancel did not appear as a normal user turn")
+			}
+
+			console.log(`[PASS] cancel done code: "${cancelDoneCode}"`)
+			console.log(`[PASS] follow-up done code: "${followupDoneCode}"`)
+			console.log(`[PASS] follow-up user turn observed: ${sawFollowupUserTurn}`)
+			console.log(`[PASS] follow-up result: "${followupResult}"`)
+
+			if (!sentShutdown) {
+				context.sendCommand({
+					command: "shutdown",
+					requestId: shutdownRequestId,
+				})
+				sentShutdown = true
+			}
+		},
+		// Dump every tracked flag so a timeout shows how far the case progressed.
+		onTimeoutMessage() {
+			return [
+				"timed out waiting for cancel-message-recovery-race validation",
+				`initSeen=${initSeen}`,
+				`sentCancelAndFollowup=${sentCancelAndFollowup}`,
+				`cancelDoneCode=${cancelDoneCode ?? "none"}`,
+				`followupDoneCode=${followupDoneCode ?? "none"}`,
+				`sawFollowupUserTurn=${sawFollowupUserTurn}`,
+				`sawMisroutedToolResult=${sawMisroutedToolResult}`,
+				`sawMessageControlError=${sawMessageControlError}`,
+				`haveFollowupResult=${Boolean(followupResult)}`,
+			].join(" ")
+		},
+	})
+}
+
+// Report any failure from main() with a [FAIL] prefix and exit with status 1.
+main().catch((error) => {
+	const reason = error instanceof Error ? error.message : String(error)
+	console.error(`[FAIL] ${reason}`)
+	process.exit(1)
+})

+ 73 - 0
apps/cli/scripts/integration/cases/cancel-without-active-task.ts

@@ -0,0 +1,73 @@
+import { runStreamCase, StreamEvent } from "../lib/stream-harness"
+
+/**
+ * Integration case: send `cancel` when no task has been started.
+ *
+ * On the first `system`/`init` event a `cancel` command is sent. The case then
+ * records the `ack` for that request and, on its `done` event, requires
+ * `code === "no_active_task"` and `success === true` before sending `shutdown`.
+ * Any control `error` event fails the case.
+ */
+async function main() {
+	const cancelRequestId = `cancel-${Date.now()}`
+	const shutdownRequestId = `shutdown-${Date.now()}`
+
+	let initSeen = false
+	let cancelAckSeen = false
+	let cancelDoneSeen = false
+	let shutdownSent = false
+
+	await runStreamCase({
+		onEvent(event: StreamEvent, context) {
+			// Send the cancel once, straight after init, before any task exists.
+			if (event.type === "system" && event.subtype === "init" && !initSeen) {
+				initSeen = true
+				context.sendCommand({
+					command: "cancel",
+					requestId: cancelRequestId,
+				})
+				return
+			}
+
+			if (
+				event.type === "control" &&
+				event.subtype === "ack" &&
+				event.command === "cancel" &&
+				event.requestId === cancelRequestId
+			) {
+				// Recorded for the timeout diagnostics only; not asserted on.
+				cancelAckSeen = true
+				return
+			}
+
+			if (
+				event.type === "control" &&
+				event.subtype === "done" &&
+				event.command === "cancel" &&
+				event.requestId === cancelRequestId
+			) {
+				cancelDoneSeen = true
+
+				if (event.code !== "no_active_task") {
+					throw new Error(`cancel without task should return no_active_task, got "${event.code ?? "none"}"`)
+				}
+				if (event.success !== true) {
+					throw new Error("cancel without task should be treated as successful no-op")
+				}
+
+				if (!shutdownSent) {
+					context.sendCommand({
+						command: "shutdown",
+						requestId: shutdownRequestId,
+					})
+					shutdownSent = true
+				}
+				return
+			}
+
+			if (event.type === "control" && event.subtype === "error") {
+				// Include the requestId, like the sibling cases, so the failing command is identifiable.
+				throw new Error(
+					`unexpected control error requestId=${event.requestId ?? "unknown"} command=${event.command ?? "unknown"} code=${event.code ?? "unknown"} content=${event.content ?? ""}`,
+				)
+			}
+		},
+		// Dump every tracked flag so a timeout shows how far the case progressed.
+		onTimeoutMessage() {
+			return `timed out waiting for cancel-without-active-task validation (initSeen=${initSeen}, cancelAckSeen=${cancelAckSeen}, cancelDoneSeen=${cancelDoneSeen}, shutdownSent=${shutdownSent})`
+		},
+	})
+}
+
+// Report any failure from main() with a [FAIL] prefix and exit with status 1.
+main().catch((error) => {
+	const reason = error instanceof Error ? error.message : String(error)
+	console.error(`[FAIL] ${reason}`)
+	process.exit(1)
+})

+ 47 - 13
apps/cli/scripts/integration/cases/followup-after-completion.ts

@@ -7,18 +7,9 @@ function parseEventContent(text: string | undefined): string {
 	return typeof text === "string" ? text : ""
 }
 
-function validateFollowupAnswer(text: string): void {
-	const normalized = text.toLowerCase()
-	const containsExpected = /\b6\b/.test(normalized) || normalized.includes("six")
-	const containsOldAnswer = /\b1\+1\b/.test(normalized) || /\b2\b/.test(normalized)
-	const containsQuestionReference = normalized.includes("3+3")
-
-	if (!containsExpected) {
-		throw new Error(`follow-up result did not answer the follow-up question; result="${text}"`)
-	}
-
-	if (!containsQuestionReference && containsOldAnswer && !containsExpected) {
-		throw new Error(`follow-up result appears anchored to first question; result="${text}"`)
+function validateFollowupResult(text: string): void {
+	if (text.trim().length === 0) {
+		throw new Error("follow-up produced an empty result")
 	}
 }
 
@@ -32,6 +23,9 @@ async function main() {
 	let sentShutdown = false
 	let firstResult = ""
 	let followupResult = ""
+	let followupDoneCode: string | undefined
+	let sawFollowupUserTurn = false
+	let sawMisroutedToolResult = false
 
 	await runStreamCase({
 		onEvent(event: StreamEvent, context) {
@@ -52,6 +46,31 @@ async function main() {
 			}
 
 			if (event.type !== "result" || event.done !== true) {
+				if (
+					event.type === "control" &&
+					event.requestId === followupRequestId &&
+					event.command === "message" &&
+					event.subtype === "done"
+				) {
+					followupDoneCode = event.code
+					return
+				}
+
+				if (
+					event.type === "tool_result" &&
+					event.requestId === followupRequestId &&
+					typeof event.content === "string" &&
+					event.content.includes("<user_message>")
+				) {
+					sawMisroutedToolResult = true
+					return
+				}
+
+				if (event.type === "user" && event.requestId === followupRequestId) {
+					sawFollowupUserTurn = typeof event.content === "string" && event.content.includes("3+3")
+					return
+				}
+
 				return
 			}
 
@@ -77,7 +96,22 @@ async function main() {
 			}
 
 			followupResult = parseEventContent(event.content)
-			validateFollowupAnswer(followupResult)
+			validateFollowupResult(followupResult)
+
+			if (followupDoneCode !== "responded") {
+				throw new Error(
+					`follow-up message was not routed as ask response; code="${followupDoneCode ?? "none"}"`,
+				)
+			}
+
+			if (!sawFollowupUserTurn) {
+				throw new Error("follow-up did not appear as a normal user turn in stream output")
+			}
+
+			if (sawMisroutedToolResult) {
+				throw new Error("follow-up message was misrouted into tool_result (<user_message>), old bug reproduced")
+			}
+
 			console.log(`[PASS] first result="${firstResult}"`)
 			console.log(`[PASS] follow-up result="${followupResult}"`)
 

+ 136 - 0
apps/cli/scripts/integration/cases/followup-completion-ask-response-images.ts

@@ -0,0 +1,136 @@
+import { runStreamCase, StreamEvent } from "../lib/stream-harness"
+
+const START_PROMPT = 'Answer this question and finish: What is 1+1? Reply with only "2", then complete the task.'
+const FOLLOWUP_PROMPT = 'Different question now: what is 3+3? Reply with only "6".'
+const ONE_PIXEL_IMAGE =
+	"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAusB9Y9R4WQAAAAASUVORK5CYII="
+
+/**
+ * Integration case: follow-up `message` carrying an image after the first task completes.
+ *
+ * Flow, driven entirely by stream events:
+ *  1. On the first `system`/`init` event, send `start` with START_PROMPT.
+ *  2. When the final `result` for the start request arrives, send a `message`
+ *     with FOLLOWUP_PROMPT and one image.
+ *  3. On the `done` control event for that message, record its code and send `shutdown`.
+ *  4. On the `done` control event for the shutdown, require that the message code was
+ *     "responded", that no queue event carried image metadata, and that the message
+ *     did not surface inside a tool_result.
+ *
+ * Whether the follow-up appeared as a user turn is only logged, not asserted.
+ * Failed expectations are thrown from the event handler.
+ */
+async function main() {
+	const startRequestId = `start-${Date.now()}`
+	const followupRequestId = `message-${Date.now()}`
+	const shutdownRequestId = `shutdown-${Date.now()}`
+
+	let initSeen = false
+	let sentFollowup = false
+	let sentShutdown = false
+	let followupDoneCode: string | undefined
+	let sawFollowupUserTurn = false
+	let sawMisroutedToolResult = false
+	let sawQueueImageMetadata = false
+	let shutdownDoneSeen = false
+
+	await runStreamCase({
+		onEvent(event: StreamEvent, context) {
+			// Start the first task once, on the first init event.
+			if (event.type === "system" && event.subtype === "init" && !initSeen) {
+				initSeen = true
+				context.sendCommand({
+					command: "start",
+					requestId: startRequestId,
+					prompt: START_PROMPT,
+				})
+				return
+			}
+
+			if (event.type === "control" && event.subtype === "error") {
+				throw new Error(
+					`received control error for requestId=${event.requestId ?? "unknown"} command=${event.command ?? "unknown"} code=${event.code ?? "unknown"} content=${event.content ?? ""}`,
+				)
+			}
+
+			// The routing decision is reported on the message's done event; shut down right after.
+			if (
+				event.type === "control" &&
+				event.command === "message" &&
+				event.subtype === "done" &&
+				event.requestId === followupRequestId
+			) {
+				followupDoneCode = event.code
+				if (!sentShutdown) {
+					context.sendCommand({
+						command: "shutdown",
+						requestId: shutdownRequestId,
+					})
+					sentShutdown = true
+				}
+				return
+			}
+
+			// Final validation happens once the shutdown has completed.
+			if (
+				event.type === "control" &&
+				event.command === "shutdown" &&
+				event.subtype === "done" &&
+				event.requestId === shutdownRequestId
+			) {
+				shutdownDoneSeen = true
+
+				if (followupDoneCode !== "responded") {
+					throw new Error(
+						`follow-up image message was not routed as ask response; code="${followupDoneCode ?? "none"}"`,
+					)
+				}
+				if (sawQueueImageMetadata) {
+					throw new Error("follow-up image message was unexpectedly queued (observed queue image metadata)")
+				}
+				if (sawMisroutedToolResult) {
+					throw new Error("follow-up image message was misrouted into tool_result (<user_message>)")
+				}
+
+				console.log(`[PASS] follow-up image control code: "${followupDoneCode}"`)
+				console.log(`[PASS] follow-up image user turn observed before shutdown: ${sawFollowupUserTurn}`)
+				return
+			}
+
+			// A queue item reporting one image means the message was queued instead of answered.
+			if (
+				event.type === "queue" &&
+				Array.isArray(event.queue) &&
+				event.queue.some((item) => item?.imageCount === 1)
+			) {
+				sawQueueImageMetadata = true
+				return
+			}
+
+			if (
+				event.type === "tool_result" &&
+				event.requestId === followupRequestId &&
+				typeof event.content === "string" &&
+				event.content.includes("<user_message>")
+			) {
+				sawMisroutedToolResult = true
+				return
+			}
+
+			if (event.type === "user" && event.requestId === followupRequestId) {
+				// Latch: once the follow-up has been observed as a user turn, a later
+				// user event for the same request must not reset the flag.
+				if (typeof event.content === "string" && event.content.includes("3+3")) {
+					sawFollowupUserTurn = true
+				}
+				return
+			}
+
+			// First task finished: send the follow-up with the image attached, once.
+			if (event.type === "result" && event.done === true && event.requestId === startRequestId && !sentFollowup) {
+				context.sendCommand({
+					command: "message",
+					requestId: followupRequestId,
+					prompt: FOLLOWUP_PROMPT,
+					images: [ONE_PIXEL_IMAGE],
+				})
+				sentFollowup = true
+				return
+			}
+		},
+		// Dump every tracked flag so a timeout shows how far the case progressed.
+		onTimeoutMessage() {
+			return [
+				"timed out waiting for followup-completion-ask-response-images validation",
+				`initSeen=${initSeen}`,
+				`sentFollowup=${sentFollowup}`,
+				`sentShutdown=${sentShutdown}`,
+				`shutdownDoneSeen=${shutdownDoneSeen}`,
+				`followupDoneCode=${followupDoneCode ?? "none"}`,
+				`sawFollowupUserTurn=${sawFollowupUserTurn}`,
+				`sawMisroutedToolResult=${sawMisroutedToolResult}`,
+				`sawQueueImageMetadata=${sawQueueImageMetadata}`,
+			].join(" ")
+		},
+	})
+}
+
+// Report any failure from main() with a [FAIL] prefix and exit with status 1.
+main().catch((error) => {
+	const reason = error instanceof Error ? error.message : String(error)
+	console.error(`[FAIL] ${reason}`)
+	process.exit(1)
+})

+ 10 - 0
apps/cli/scripts/integration/cases/followup-completion-ask-response.ts

@@ -16,6 +16,7 @@ async function main() {
 	let followupDoneCode: string | undefined
 	let sawFollowupUserTurn = false
 	let sawMisroutedToolResult = false
+	let sawQueueEventForFollowupRequest = false
 	let followupResult = ""
 
 	await runStreamCase({
@@ -54,6 +55,11 @@ async function main() {
 				return
 			}
 
+			if (event.type === "queue" && event.requestId === followupRequestId) {
+				sawQueueEventForFollowupRequest = true
+				return
+			}
+
 			if (
 				event.type === "tool_result" &&
 				event.requestId === followupRequestId &&
@@ -97,6 +103,9 @@ async function main() {
 			if (sawMisroutedToolResult) {
 				throw new Error("follow-up message was misrouted into tool_result (<user_message>), old bug reproduced")
 			}
+			if (sawQueueEventForFollowupRequest) {
+				throw new Error("follow-up message produced queue events despite responded routing")
+			}
 
 			if (!sawFollowupUserTurn) {
 				throw new Error("follow-up did not appear as a normal user turn in stream output")
@@ -131,6 +140,7 @@ async function main() {
 				`followupDoneCode=${followupDoneCode ?? "none"}`,
 				`sawFollowupUserTurn=${sawFollowupUserTurn}`,
 				`sawMisroutedToolResult=${sawMisroutedToolResult}`,
+				`sawQueueEventForFollowupRequest=${sawQueueEventForFollowupRequest}`,
 				`haveFollowupResult=${Boolean(followupResult)}`,
 			].join(" ")
 		},

+ 148 - 0
apps/cli/scripts/integration/cases/mixed-command-ordering.ts

@@ -0,0 +1,148 @@
+import { runStreamCase, StreamEvent } from "../lib/stream-harness"
+
+const START_PROMPT =
+	'Run exactly this command and do not summarize until it finishes: sleep 8 && echo "done". After it finishes, reply with exactly "done".'
+
+/**
+ * Integration case: interleave `ping`, `message`, `ping` while a task is running.
+ *
+ * Flow, driven entirely by stream events:
+ *  1. On the first `system`/`init` event, send `start` with a prompt that runs a long `sleep`.
+ *  2. On the `ack` for the start request, send ping A, a message and ping B back-to-back.
+ *  3. Record the subtype of every control event per requestId, in arrival order.
+ *  4. When the final `result` for the message request arrives, require that each of the
+ *     three requests saw exactly `ack,done`, that the message's done code was "queued",
+ *     and that a queue `enqueued` event with depth 1 was observed; then send `shutdown`.
+ *
+ * Failed expectations are thrown from the event handler.
+ */
+async function main() {
+	const startRequestId = `start-${Date.now()}`
+	const pingARequestId = `ping-a-${Date.now()}`
+	const messageRequestId = `message-${Date.now()}`
+	const pingBRequestId = `ping-b-${Date.now()}`
+	const shutdownRequestId = `shutdown-${Date.now()}`
+
+	let initSeen = false
+	let sentInterleavedCommands = false
+	let sentShutdown = false
+
+	const eventOrderByRequestId = new Map<string, string[]>()
+	let messageDoneCode: string | undefined
+	let messageQueueEnqueuedSeen = false
+	let messageResultSeen = false
+
+	/** Appends the subtype of a control event to the list kept for its requestId. */
+	function recordControlEvent(event: StreamEvent): void {
+		if (!event.requestId || event.type !== "control" || !event.subtype) {
+			return
+		}
+		const existing = eventOrderByRequestId.get(event.requestId) ?? []
+		existing.push(event.subtype)
+		eventOrderByRequestId.set(event.requestId, existing)
+	}
+
+	/** Comma-joined control subtypes recorded for a request, in arrival order ("" when none). */
+	function controlOrder(requestId: string): string {
+		return (eventOrderByRequestId.get(requestId) ?? []).join(",")
+	}
+
+	await runStreamCase({
+		onEvent(event: StreamEvent, context) {
+			// Start the long-running task once, on the first init event.
+			if (event.type === "system" && event.subtype === "init" && !initSeen) {
+				initSeen = true
+				context.sendCommand({
+					command: "start",
+					requestId: startRequestId,
+					prompt: START_PROMPT,
+				})
+				return
+			}
+
+			// Record before any early return below so every control event is counted.
+			recordControlEvent(event)
+
+			if (event.type === "control" && event.subtype === "error") {
+				throw new Error(
+					`received control error for requestId=${event.requestId ?? "unknown"} command=${event.command ?? "unknown"} code=${event.code ?? "unknown"} content=${event.content ?? ""}`,
+				)
+			}
+
+			// Start acknowledged: fire the three interleaved commands without waiting in between.
+			if (
+				!sentInterleavedCommands &&
+				event.type === "control" &&
+				event.subtype === "ack" &&
+				event.command === "start" &&
+				event.requestId === startRequestId
+			) {
+				context.sendCommand({
+					command: "ping",
+					requestId: pingARequestId,
+				})
+				context.sendCommand({
+					command: "message",
+					requestId: messageRequestId,
+					prompt: 'When this queued message is processed, reply with only "INTERLEAVED".',
+				})
+				context.sendCommand({
+					command: "ping",
+					requestId: pingBRequestId,
+				})
+				sentInterleavedCommands = true
+				return
+			}
+
+			if (
+				event.type === "control" &&
+				event.subtype === "done" &&
+				event.command === "message" &&
+				event.requestId === messageRequestId
+			) {
+				messageDoneCode = event.code
+				return
+			}
+
+			// NOTE(review): the enqueued event is matched on the start requestId, not the
+			// message requestId — presumably queue events carry the active task's id; confirm.
+			if (
+				event.type === "queue" &&
+				event.subtype === "enqueued" &&
+				event.requestId === startRequestId &&
+				event.queueDepth === 1
+			) {
+				messageQueueEnqueuedSeen = true
+				return
+			}
+
+			if (event.type === "result" && event.done === true && event.requestId === messageRequestId) {
+				messageResultSeen = true
+
+				const pingAOrder = controlOrder(pingARequestId)
+				const pingBOrder = controlOrder(pingBRequestId)
+				const messageOrder = controlOrder(messageRequestId)
+
+				if (pingAOrder !== "ack,done") {
+					throw new Error(`ping A control order mismatch: ${pingAOrder || "none"}`)
+				}
+				if (pingBOrder !== "ack,done") {
+					throw new Error(`ping B control order mismatch: ${pingBOrder || "none"}`)
+				}
+				if (messageOrder !== "ack,done") {
+					throw new Error(`message control order mismatch: ${messageOrder || "none"}`)
+				}
+				if (messageDoneCode !== "queued") {
+					throw new Error(
+						`expected interleaved message done code "queued", got "${messageDoneCode ?? "none"}"`,
+					)
+				}
+				if (!messageQueueEnqueuedSeen) {
+					throw new Error("expected queue enqueued event after interleaved message")
+				}
+
+				if (!sentShutdown) {
+					context.sendCommand({
+						command: "shutdown",
+						requestId: shutdownRequestId,
+					})
+					sentShutdown = true
+				}
+			}
+		},
+		// Dump every tracked flag so a timeout shows how far the case progressed.
+		onTimeoutMessage() {
+			return [
+				"timed out waiting for mixed-command-ordering validation",
+				`initSeen=${initSeen}`,
+				`sentInterleavedCommands=${sentInterleavedCommands}`,
+				`messageDoneCode=${messageDoneCode ?? "none"}`,
+				`messageQueueEnqueuedSeen=${messageQueueEnqueuedSeen}`,
+				`messageResultSeen=${messageResultSeen}`,
+				`pingAOrder=${controlOrder(pingARequestId) || "none"}`,
+				`messageOrder=${controlOrder(messageRequestId) || "none"}`,
+				`pingBOrder=${controlOrder(pingBRequestId) || "none"}`,
+			].join(" ")
+		},
+	})
+}
+
+// Report any failure from main() with a [FAIL] prefix and exit with status 1.
+main().catch((error) => {
+	const reason = error instanceof Error ? error.message : String(error)
+	console.error(`[FAIL] ${reason}`)
+	process.exit(1)
+})

+ 10 - 2
apps/cli/src/commands/cli/__tests__/parse-stdin-command.test.ts

@@ -168,8 +168,16 @@ describe("shouldSendMessageAsAskResponse", () => {
 		expect(shouldSendMessageAsAskResponse(true, "completion_result")).toBe(true)
 	})
 
-	it("routes followup asks as ask responses", () => {
-		expect(shouldSendMessageAsAskResponse(true, "followup")).toBe(true)
+	it.each([
+		"followup",
+		"tool",
+		"command",
+		"use_mcp_server",
+		"resume_task",
+		"resume_completed_task",
+		"mistake_limit_reached",
+	])("routes %s asks as ask responses", (ask) => {
+		expect(shouldSendMessageAsAskResponse(true, ask)).toBe(true)
 	})
 
 	it("does not route when not waiting for input", () => {