fetch.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. package fetch
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "io"
  7. "net/http"
  8. "net/url"
  9. "strconv"
  10. "strings"
  11. "time"
  12. md "github.com/JohannesKaufmann/html-to-markdown"
  13. "github.com/go-shiori/go-readability"
  14. mcpservers "github.com/labring/aiproxy/mcp-servers"
  15. "github.com/mark3labs/mcp-go/mcp"
  16. "github.com/mark3labs/mcp-go/server"
  17. "github.com/temoto/robotstxt"
  18. )
const (
	// DefaultUserAgentAutonomous is the User-Agent sent when the model fetches a
	// page on its own initiative (subject to robots.txt checks).
	DefaultUserAgentAutonomous = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
	// DefaultUserAgentManual is the User-Agent sent when the fetch was explicitly
	// requested by the user via the fetch prompt (robots.txt is not consulted).
	DefaultUserAgentManual = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
)
  23. // extractContentFromHTML extracts and converts HTML content to Markdown format
  24. func extractContentFromHTML(htmlContent string) string {
  25. article, err := readability.FromReader(strings.NewReader(htmlContent), nil)
  26. if err != nil {
  27. return "<error>Page failed to be simplified from HTML</error>"
  28. }
  29. if article.Content == "" {
  30. return "<error>Page failed to be simplified from HTML</error>"
  31. }
  32. converter := md.NewConverter("", true, nil)
  33. markdown, err := converter.ConvertString(article.Content)
  34. if err != nil {
  35. return "<error>Failed to convert HTML to markdown</error>"
  36. }
  37. return markdown
  38. }
  39. // getRobotsTxtURL gets the robots.txt URL for a given website URL
  40. func getRobotsTxtURL(urlStr string) (string, error) {
  41. parsedURL, err := url.Parse(urlStr)
  42. if err != nil {
  43. return "", err
  44. }
  45. robotsURL := &url.URL{
  46. Scheme: parsedURL.Scheme,
  47. Host: parsedURL.Host,
  48. Path: "/robots.txt",
  49. }
  50. return robotsURL.String(), nil
  51. }
  52. // checkMayAutonomouslyFetchURL checks if the URL can be fetched according to robots.txt
  53. func checkMayAutonomouslyFetchURL(ctx context.Context, urlStr, userAgent, proxyURL string) error {
  54. robotsTxtURL, err := getRobotsTxtURL(urlStr)
  55. if err != nil {
  56. return fmt.Errorf("failed to construct robots.txt URL: %w", err)
  57. }
  58. client := &http.Client{
  59. Timeout: 30 * time.Second,
  60. }
  61. if proxyURL != "" {
  62. proxyURLParsed, err := url.Parse(proxyURL)
  63. if err != nil {
  64. return fmt.Errorf("invalid proxy URL: %w", err)
  65. }
  66. client.Transport = &http.Transport{
  67. Proxy: http.ProxyURL(proxyURLParsed),
  68. }
  69. }
  70. req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsTxtURL, nil)
  71. if err != nil {
  72. return fmt.Errorf("failed to create robots.txt request: %w", err)
  73. }
  74. req.Header.Set("User-Agent", userAgent)
  75. resp, err := client.Do(req)
  76. if err != nil {
  77. return fmt.Errorf(
  78. "failed to fetch robots.txt %s due to a connection issue: %w",
  79. robotsTxtURL,
  80. err,
  81. )
  82. }
  83. defer resp.Body.Close()
  84. if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
  85. return fmt.Errorf(
  86. "when fetching robots.txt (%s), received status %d so assuming that autonomous fetching is not allowed, the user can try manually fetching by using the fetch prompt",
  87. robotsTxtURL,
  88. resp.StatusCode,
  89. )
  90. }
  91. if resp.StatusCode >= 400 && resp.StatusCode < 500 {
  92. return nil // Assume robots.txt doesn't exist, allow fetching
  93. }
  94. robotsTxtBody, err := io.ReadAll(resp.Body)
  95. if err != nil {
  96. return fmt.Errorf("failed to read robots.txt: %w", err)
  97. }
  98. robots, err := robotstxt.FromBytes(robotsTxtBody)
  99. if err != nil {
  100. return nil // If we can't parse robots.txt, allow fetching
  101. }
  102. if !robots.TestAgent(urlStr, userAgent) {
  103. return fmt.Errorf(
  104. `the sites robots.txt (%s) specifies that autonomous fetching of this page is not allowed
  105. <useragent>%s</useragent>
  106. <url>%s</url>
  107. <robots>
  108. %s
  109. </robots>
  110. The assistant must let the user know that it failed to view the page. The assistant may provide further guidance based on the above information.
  111. The assistant can tell the user that they can try manually fetching the page by using the fetch prompt within their UI`,
  112. robotsTxtURL,
  113. userAgent,
  114. urlStr,
  115. string(robotsTxtBody),
  116. )
  117. }
  118. return nil
  119. }
  120. // fetchURL fetches the URL and returns the content in a form ready for the LLM
  121. func fetchURL(
  122. ctx context.Context,
  123. urlStr, userAgent string,
  124. forceRaw bool,
  125. proxyURL string,
  126. ) (string, string, error) {
  127. client := &http.Client{
  128. Timeout: 30 * time.Second,
  129. }
  130. if proxyURL != "" {
  131. proxyURLParsed, err := url.Parse(proxyURL)
  132. if err != nil {
  133. return "", "", fmt.Errorf("invalid proxy URL: %w", err)
  134. }
  135. client.Transport = &http.Transport{
  136. Proxy: http.ProxyURL(proxyURLParsed),
  137. }
  138. }
  139. req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
  140. if err != nil {
  141. return "", "", fmt.Errorf("failed to create request: %w", err)
  142. }
  143. req.Header.Set("User-Agent", userAgent)
  144. resp, err := client.Do(req)
  145. if err != nil {
  146. return "", "", fmt.Errorf("failed to fetch %s: %w", urlStr, err)
  147. }
  148. defer resp.Body.Close()
  149. if resp.StatusCode >= 400 {
  150. return "", "", fmt.Errorf("failed to fetch %s - status code %d", urlStr, resp.StatusCode)
  151. }
  152. pageRaw, err := io.ReadAll(resp.Body)
  153. if err != nil {
  154. return "", "", fmt.Errorf("failed to read response body: %w", err)
  155. }
  156. contentType := resp.Header.Get("Content-Type")
  157. pageRawStr := string(pageRaw)
  158. isPageHTML := strings.Contains(pageRawStr[:min(100, len(pageRawStr))], "<html") ||
  159. strings.Contains(contentType, "text/html") ||
  160. contentType == ""
  161. if isPageHTML && !forceRaw {
  162. content := extractContentFromHTML(pageRawStr)
  163. return content, "", nil
  164. }
  165. prefix := fmt.Sprintf(
  166. "Content type %s cannot be simplified to markdown, but here is the raw content:\n",
  167. contentType,
  168. )
  169. return pageRawStr, prefix, nil
  170. }
// createFetchServer creates a new MCP server exposing a "fetch" tool and a
// "fetch" prompt. customUserAgent, when non-empty, overrides both the
// autonomous and manual default User-Agent strings. ignoreRobotsTxt disables
// the robots.txt check for tool calls (the prompt path never checks it).
// proxyURL, when non-empty, routes all HTTP requests through that proxy.
func createFetchServer(
	customUserAgent string,
	ignoreRobotsTxt bool,
	proxyURL string,
) *server.MCPServer {
	mcpServer := server.NewMCPServer("mcp-fetch", "1.0.0")

	// The autonomous UA is used for tool calls (model-initiated); the manual UA
	// for prompt calls (user-initiated). A custom UA replaces both.
	userAgentAutonomous := DefaultUserAgentAutonomous
	userAgentManual := DefaultUserAgentManual
	if customUserAgent != "" {
		userAgentAutonomous = customUserAgent
		userAgentManual = customUserAgent
	}

	// Add fetch tool
	fetchTool := mcp.NewTool(
		"fetch",
		mcp.WithString("url", mcp.Description("URL to fetch"), mcp.Required()),
		mcp.WithNumber("max_length",
			mcp.Description("Maximum number of characters to return"),
			mcp.DefaultNumber(5000)),
		mcp.WithNumber(
			"start_index",
			mcp.Description(
				"On return output starting at this character index, useful if a previous fetch was truncated and more context is required",
			),
			mcp.DefaultNumber(0),
		),
		mcp.WithBoolean(
			"raw",
			mcp.Description(
				"Get the actual HTML content of the requested page, without simplification",
			),
			mcp.DefaultBool(false),
		),
		mcp.WithDescription(
			`Fetches a URL from the internet and optionally extracts its contents as markdown.
Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.`,
		),
	)

	mcpServer.AddTool(
		fetchTool,
		func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
			// JSON numbers arrive as float64; booleans as bool. Missing or
			// mistyped optional arguments silently fall back to their defaults.
			args := request.GetArguments()
			urlStr, ok := args["url"].(string)
			if !ok || urlStr == "" {
				return nil, errors.New("URL is required")
			}
			maxLength := 5000
			if ml, ok := args["max_length"].(float64); ok {
				maxLength = int(ml)
			}
			startIndex := 0
			if si, ok := args["start_index"].(float64); ok {
				startIndex = int(si)
			}
			raw := false
			if r, ok := args["raw"].(bool); ok {
				raw = r
			}
			// Validate max_length and start_index
			if maxLength <= 0 || maxLength >= 1000000 {
				return nil, errors.New("max_length must be between 1 and 999999")
			}
			if startIndex < 0 {
				return nil, errors.New("start_index must be >= 0")
			}
			// Check robots.txt if not ignored
			if !ignoreRobotsTxt {
				if err := checkMayAutonomouslyFetchURL(ctx, urlStr, userAgentAutonomous, proxyURL); err != nil {
					return nil, err
				}
			}
			// Fetch the URL
			content, prefix, err := fetchURL(ctx, urlStr, userAgentAutonomous, raw, proxyURL)
			if err != nil {
				return nil, err
			}
			// Paginate: return content[startIndex : startIndex+maxLength] and,
			// when the window is full and more remains, append a hint telling
			// the model which start_index to use for the next call.
			// NOTE(review): indices are byte offsets (len on a Go string), so a
			// window boundary may split a multi-byte UTF-8 character.
			originalLength := len(content)
			if startIndex >= originalLength {
				content = "<error>No more content available.</error>"
			} else {
				truncatedContent := content[startIndex:]
				if len(truncatedContent) > maxLength {
					truncatedContent = truncatedContent[:maxLength]
				}
				if truncatedContent == "" {
					content = "<error>No more content available.</error>"
				} else {
					content = truncatedContent
					actualContentLength := len(truncatedContent)
					remainingContent := originalLength - (startIndex + actualContentLength)
					// Only add the prompt to continue fetching if there is still remaining content
					if actualContentLength == maxLength && remainingContent > 0 {
						nextStart := startIndex + actualContentLength
						content += fmt.Sprintf("\n\n<error>Content truncated. Call the fetch tool with a start_index of %d to get more content.</error>", nextStart)
					}
				}
			}
			result := fmt.Sprintf("%sContents of %s:\n%s", prefix, urlStr, content)
			return mcp.NewToolResultText(result), nil
		},
	)

	// Add fetch prompt
	fetchPrompt := mcp.NewPrompt("fetch",
		mcp.WithPromptDescription("Fetch a URL and extract its contents as markdown"),
		mcp.WithArgument("url", mcp.ArgumentDescription("URL to fetch"), mcp.RequiredArgument()),
	)
	mcpServer.AddPrompt(
		fetchPrompt,
		func(ctx context.Context, request mcp.GetPromptRequest) (*mcp.GetPromptResult, error) {
			args := request.Params.Arguments
			if args == nil {
				return nil, errors.New("URL is required")
			}
			urlStr, ok := args["url"]
			if !ok || urlStr == "" {
				return nil, errors.New("URL is required")
			}
			// Manual path: user explicitly asked, so no robots.txt check, no
			// raw mode, no pagination. Fetch failures are reported as a prompt
			// result rather than a Go error.
			content, prefix, err := fetchURL(ctx, urlStr, userAgentManual, false, proxyURL)
			if err != nil {
				return &mcp.GetPromptResult{
					Description: "Failed to fetch " + urlStr,
					Messages: []mcp.PromptMessage{
						{
							Role: mcp.RoleUser,
							Content: mcp.TextContent{
								Type: "text",
								Text: err.Error(),
							},
						},
					},
				}, nil
			}
			return &mcp.GetPromptResult{
				Description: "Contents of " + urlStr,
				Messages: []mcp.PromptMessage{
					{
						Role: mcp.RoleUser,
						Content: mcp.TextContent{
							Type: "text",
							Text: prefix + content,
						},
					},
				},
			}, nil
		},
	)

	return mcpServer
}
// configTemplates declares the optional init-time configuration knobs accepted
// by NewServer: a custom User-Agent, a robots.txt bypass flag, and a proxy URL.
var configTemplates = mcpservers.ConfigTemplates{
	"user-agent": {
		Name:        "user-agent",
		Required:    mcpservers.ConfigRequiredTypeInitOptional,
		Example:     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
		Description: "Custom User-Agent string to use for requests",
	},
	"ignore-robots": {
		Name:        "ignore-robots",
		Required:    mcpservers.ConfigRequiredTypeInitOptional,
		Example:     "true",
		Description: "Whether to ignore robots.txt restrictions",
		// Must parse as a Go bool ("true", "1", "false", ...).
		Validator: func(value string) error {
			_, err := strconv.ParseBool(value)
			return err
		},
	},
	"proxy": {
		Name:        "proxy",
		Required:    mcpservers.ConfigRequiredTypeInitOptional,
		Example:     "http://127.0.0.1:7890",
		Description: "Proxy URL to use for requests",
		// NOTE(review): url.Parse accepts almost any string, so this validator
		// rejects very little — confirm whether stricter checking is wanted.
		Validator: func(value string) error {
			_, err := url.Parse(value)
			return err
		},
	},
}
  348. func NewServer(config, _ map[string]string) (mcpservers.Server, error) {
  349. customUserAgent := config["user-agent"]
  350. ignoreRobotsTxt, _ := strconv.ParseBool(config["ignore-robots"])
  351. proxyURL := config["proxy"]
  352. return createFetchServer(customUserAgent, ignoreRobotsTxt, proxyURL), nil
  353. }
// ListTools reports the tools exposed by a default fetch server (no custom
// User-Agent, robots.txt honored, no proxy).
func ListTools(ctx context.Context) ([]mcp.Tool, error) {
	fetchServer := createFetchServer("", false, "")
	return mcpservers.ListServerTools(ctx, fetchServer)
}