Kaynağa Gözat

feat: allow read tool to handle images (#3052)

Aiden Cline 4 ay önce
ebeveyn
işleme
225adc46ba

+ 6 - 0
packages/opencode/src/provider/models.ts

@@ -28,6 +28,12 @@ export namespace ModelsDev {
         context: z.number(),
         output: z.number(),
       }),
+      modalities: z
+        .object({
+          input: z.array(z.enum(["text", "audio", "image", "video", "pdf"])),
+          output: z.array(z.enum(["text", "audio", "image", "video", "pdf"])),
+        })
+        .optional(),
       experimental: z.boolean().optional(),
       options: z.record(z.string(), z.any()),
       provider: z.object({ npm: z.string() }).optional(),

+ 5 - 0
packages/opencode/src/provider/provider.ts

@@ -279,6 +279,11 @@ export namespace Provider {
               context: 0,
               output: 0,
             },
+          modalities: model.modalities ??
+            existing?.modalities ?? {
+              input: ["text"],
+              output: ["text"],
+            },
           provider: model.provider ?? existing?.provider,
         }
         parsed.models[modelID] = parsedModel

+ 97 - 77
packages/opencode/src/session/message-v2.ts

@@ -17,71 +17,6 @@ export namespace MessageV2 {
     }),
   )
 
-  export const ToolStatePending = z
-    .object({
-      status: z.literal("pending"),
-    })
-    .meta({
-      ref: "ToolStatePending",
-    })
-
-  export type ToolStatePending = z.infer<typeof ToolStatePending>
-
-  export const ToolStateRunning = z
-    .object({
-      status: z.literal("running"),
-      input: z.any(),
-      title: z.string().optional(),
-      metadata: z.record(z.string(), z.any()).optional(),
-      time: z.object({
-        start: z.number(),
-      }),
-    })
-    .meta({
-      ref: "ToolStateRunning",
-    })
-  export type ToolStateRunning = z.infer<typeof ToolStateRunning>
-
-  export const ToolStateCompleted = z
-    .object({
-      status: z.literal("completed"),
-      input: z.record(z.string(), z.any()),
-      output: z.string(),
-      title: z.string(),
-      metadata: z.record(z.string(), z.any()),
-      time: z.object({
-        start: z.number(),
-        end: z.number(),
-        compacted: z.number().optional(),
-      }),
-    })
-    .meta({
-      ref: "ToolStateCompleted",
-    })
-  export type ToolStateCompleted = z.infer<typeof ToolStateCompleted>
-
-  export const ToolStateError = z
-    .object({
-      status: z.literal("error"),
-      input: z.record(z.string(), z.any()),
-      error: z.string(),
-      metadata: z.record(z.string(), z.any()).optional(),
-      time: z.object({
-        start: z.number(),
-        end: z.number(),
-      }),
-    })
-    .meta({
-      ref: "ToolStateError",
-    })
-  export type ToolStateError = z.infer<typeof ToolStateError>
-
-  export const ToolState = z
-    .discriminatedUnion("status", [ToolStatePending, ToolStateRunning, ToolStateCompleted, ToolStateError])
-    .meta({
-      ref: "ToolState",
-    })
-
   const PartBase = z.object({
     id: z.string(),
     sessionID: z.string(),
@@ -134,17 +69,6 @@ export namespace MessageV2 {
   })
   export type ReasoningPart = z.infer<typeof ReasoningPart>
 
-  export const ToolPart = PartBase.extend({
-    type: z.literal("tool"),
-    callID: z.string(),
-    tool: z.string(),
-    state: ToolState,
-    metadata: z.record(z.string(), z.any()).optional(),
-  }).meta({
-    ref: "ToolPart",
-  })
-  export type ToolPart = z.infer<typeof ToolPart>
-
   const FilePartSourceBase = z.object({
     text: z
       .object({
@@ -228,6 +152,83 @@ export namespace MessageV2 {
   })
   export type StepFinishPart = z.infer<typeof StepFinishPart>
 
+  export const ToolStatePending = z
+    .object({
+      status: z.literal("pending"),
+    })
+    .meta({
+      ref: "ToolStatePending",
+    })
+
+  export type ToolStatePending = z.infer<typeof ToolStatePending>
+
+  export const ToolStateRunning = z
+    .object({
+      status: z.literal("running"),
+      input: z.any(),
+      title: z.string().optional(),
+      metadata: z.record(z.string(), z.any()).optional(),
+      time: z.object({
+        start: z.number(),
+      }),
+    })
+    .meta({
+      ref: "ToolStateRunning",
+    })
+  export type ToolStateRunning = z.infer<typeof ToolStateRunning>
+
+  export const ToolStateCompleted = z
+    .object({
+      status: z.literal("completed"),
+      input: z.record(z.string(), z.any()),
+      output: z.string(),
+      title: z.string(),
+      metadata: z.record(z.string(), z.any()),
+      time: z.object({
+        start: z.number(),
+        end: z.number(),
+        compacted: z.number().optional(),
+      }),
+      attachments: FilePart.array().optional(),
+    })
+    .meta({
+      ref: "ToolStateCompleted",
+    })
+  export type ToolStateCompleted = z.infer<typeof ToolStateCompleted>
+
+  export const ToolStateError = z
+    .object({
+      status: z.literal("error"),
+      input: z.record(z.string(), z.any()),
+      error: z.string(),
+      metadata: z.record(z.string(), z.any()).optional(),
+      time: z.object({
+        start: z.number(),
+        end: z.number(),
+      }),
+    })
+    .meta({
+      ref: "ToolStateError",
+    })
+  export type ToolStateError = z.infer<typeof ToolStateError>
+
+  export const ToolState = z
+    .discriminatedUnion("status", [ToolStatePending, ToolStateRunning, ToolStateCompleted, ToolStateError])
+    .meta({
+      ref: "ToolState",
+    })
+
+  export const ToolPart = PartBase.extend({
+    type: z.literal("tool"),
+    callID: z.string(),
+    tool: z.string(),
+    state: ToolState,
+    metadata: z.record(z.string(), z.any()).optional(),
+  }).meta({
+    ref: "ToolPart",
+  })
+  export type ToolPart = z.infer<typeof ToolPart>
+
   const Base = z.object({
     id: z.string(),
     sessionID: z.string(),
@@ -531,7 +532,25 @@ export namespace MessageV2 {
                 },
               ]
             if (part.type === "tool") {
-              if (part.state.status === "completed")
+              if (part.state.status === "completed") {
+                if (part.state.attachments?.length) {
+                  result.push({
+                    id: Identifier.ascending("message"),
+                    role: "user",
+                    parts: [
+                      {
+                        type: "text",
+                        text: `Tool ${part.tool} returned an attachment:`,
+                      },
+                      ...part.state.attachments.map((attachment) => ({
+                        type: "file" as const,
+                        url: attachment.url,
+                        mediaType: attachment.mime,
+                        filename: attachment.filename,
+                      })),
+                    ],
+                  })
+                }
                 return [
                   {
                     type: ("tool-" + part.tool) as `tool-${string}`,
@@ -542,6 +561,7 @@ export namespace MessageV2 {
                     callProviderMetadata: part.metadata,
                   },
                 ]
+              }
               if (part.state.status === "error")
                 return [
                   {

+ 5 - 0
packages/opencode/src/session/prompt.ts

@@ -457,6 +457,10 @@ export namespace SessionPrompt {
             abort: options.abortSignal!,
             messageID: input.processor.message.id,
             callID: options.toolCallId,
+            extra: {
+              modelID: input.modelID,
+              providerID: input.providerID,
+            },
             agent: input.agent.name,
             metadata: async (val) => {
               const match = input.processor.partFromToolCall(options.toolCallId)
@@ -989,6 +993,7 @@ export namespace SessionPrompt {
                         start: match.state.time.start,
                         end: Date.now(),
                       },
+                      attachments: value.output.attachments,
                     },
                   })
                   delete toolcalls[value.toolCallId]

+ 41 - 4
packages/opencode/src/tool/read.ts

@@ -7,6 +7,8 @@ import { FileTime } from "../file/time"
 import DESCRIPTION from "./read.txt"
 import { Filesystem } from "../util/filesystem"
 import { Instance } from "../project/instance"
+import { Provider } from "../provider/provider"
+import { Identifier } from "../id/id"
 
 const DEFAULT_READ_LIMIT = 2000
 const MAX_LINE_LENGTH = 2000
@@ -23,6 +25,8 @@ export const ReadTool = Tool.define("read", {
     if (!path.isAbsolute(filepath)) {
       filepath = path.join(process.cwd(), filepath)
     }
+    const title = path.relative(Instance.worktree, filepath)
+
     if (!ctx.extra?.["bypassCwdCheck"] && !Filesystem.contains(Instance.directory, filepath)) {
       throw new Error(`File ${filepath} is not in the current working directory`)
     }
@@ -48,12 +52,45 @@ export const ReadTool = Tool.define("read", {
       throw new Error(`File not found: ${filepath}`)
     }
 
-    const limit = params.limit ?? DEFAULT_READ_LIMIT
-    const offset = params.offset || 0
     const isImage = isImageFile(filepath)
-    if (isImage) throw new Error(`This is an image file of type: ${isImage}\nUse a different tool to process images`)
+    const supportsImages = await (async () => {
+      if (!ctx.extra?.["providerID"] || !ctx.extra?.["modelID"]) return false
+      const providerID = ctx.extra["providerID"] as string
+      const modelID = ctx.extra["modelID"] as string
+      const model = await Provider.getModel(providerID, modelID).catch(() => undefined)
+      if (!model) return false
+      return model.info.modalities?.input?.includes("image") ?? false
+    })()
+    if (isImage) {
+      if (!supportsImages) {
+        throw new Error(`Failed to read image: ${filepath}, model may not be able to read images`)
+      }
+      const mime = file.type
+      const msg = "Image read successfully"
+      return {
+        title,
+        output: msg,
+        metadata: {
+          preview: msg,
+        },
+        attachments: [
+          {
+            id: Identifier.ascending("part"),
+            sessionID: ctx.sessionID,
+            messageID: ctx.messageID,
+            type: "file",
+            mime,
+            url: `data:${mime};base64,${Buffer.from(await file.bytes()).toString("base64")}`,
+          },
+        ],
+      }
+    }
+
     const isBinary = await isBinaryFile(filepath, file)
     if (isBinary) throw new Error(`Cannot read binary file: ${filepath}`)
+
+    const limit = params.limit ?? DEFAULT_READ_LIMIT
+    const offset = params.offset || 0
     const lines = await file.text().then((text) => text.split("\n"))
     const raw = lines.slice(offset, offset + limit).map((line) => {
       return line.length > MAX_LINE_LENGTH ? line.substring(0, MAX_LINE_LENGTH) + "..." : line
@@ -76,7 +113,7 @@ export const ReadTool = Tool.define("read", {
     FileTime.read(ctx.sessionID, filepath)
 
     return {
-      title: path.relative(Instance.worktree, filepath),
+      title,
       output,
       metadata: {
         preview,

+ 2 - 2
packages/opencode/src/tool/read.txt

@@ -7,6 +7,6 @@ Usage:
 - You can optionally specify a line offset and limit (especially handy for long files), but it's recommended to read the whole file by not providing these parameters
 - Any lines longer than 2000 characters will be truncated
 - Results are returned using cat -n format, with line numbers starting at 1
-- This tool cannot read binary files, including images
-- You have the capability to call multiple tools in a single response. It is always better to speculatively read multiple files as a batch that are potentially useful. 
+- You have the capability to call multiple tools in a single response. It is always better to speculatively read multiple files as a batch that are potentially useful.
 - If you read a file that exists but has empty contents you will receive a system reminder warning in place of file contents.
+- You can read image files using this tool.

+ 3 - 0
packages/opencode/src/tool/tool.ts

@@ -1,9 +1,11 @@
 import z from "zod/v4"
+import type { MessageV2 } from "../session/message-v2"
 
 export namespace Tool {
   interface Metadata {
     [key: string]: any
   }
+
   export type Context<M extends Metadata = Metadata> = {
     sessionID: string
     messageID: string
@@ -25,6 +27,7 @@ export namespace Tool {
         title: string
         metadata: M
         output: string
+        attachments?: MessageV2.FilePart[]
       }>
     }>
   }