engine.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. package bingcn
  2. import (
  3. "context"
  4. "crypto/tls"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "net/http"
  9. "net/url"
  10. "strings"
  11. "sync"
  12. "time"
  13. "github.com/PuerkitoBio/goquery"
  14. "golang.org/x/text/encoding/simplifiedchinese"
  15. "golang.org/x/text/transform"
  16. )
  // SearchResult is a single entry parsed from a Bing results page.
  type SearchResult struct {
  	// ID is the key under which the engine stores the result for later lookup.
  	ID string `json:"id"`
  	// Title is the trimmed text of the result's heading link.
  	Title string `json:"title"`
  	// Link is the result URL as found in the page, with links that do not
  	// start with "http" prefixed by the Bing host.
  	Link string `json:"link"`
  	// Snippet is the summary text extracted for the result.
  	Snippet string `json:"snippet"`
  }
  // DefaultUserAgent is the desktop-Chrome User-Agent string that
  // NewSearchEngine falls back to when the caller passes an empty one.
  const DefaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  // SearchEngine scrapes cn.bing.com result pages and remembers every result
  // it has returned so that it can be looked up again by ID.
  //
  // Create instances with NewSearchEngine; the zero value has no HTTP client.
  type SearchEngine struct {
  	// userAgent is sent as the User-Agent header on every search request.
  	userAgent string
  	// client performs the HTTP requests.
  	client *http.Client
  	// searchResults maps a result ID (string) to its *SearchResult.
  	// NOTE(review): nothing in the code visible here removes entries, so the
  	// map grows with every search for the lifetime of the engine.
  	searchResults sync.Map
  }
  // SearchOptions describes a single search request.
  type SearchOptions struct {
  	// Query is the search text. Search rejects an empty query.
  	Query string
  	// NumResults is the maximum number of results to return. Search
  	// substitutes 5 when the value is zero or negative.
  	NumResults int
  	// Language, when non-empty, replaces the default "zh-CN" value of the
  	// setlang URL parameter.
  	Language string
  }
  38. // NewSearchEngine creates a new search engine instance
  39. func NewSearchEngine(userAgent string, timeout time.Duration) *SearchEngine {
  40. if userAgent == "" {
  41. userAgent = DefaultUserAgent
  42. }
  43. client := &http.Client{
  44. Timeout: timeout,
  45. Transport: &http.Transport{
  46. TLSClientConfig: &tls.Config{
  47. InsecureSkipVerify: false,
  48. MinVersion: tls.VersionTLS12,
  49. },
  50. },
  51. }
  52. return &SearchEngine{
  53. userAgent: userAgent,
  54. client: client,
  55. }
  56. }
  57. // Search performs a Bing search and returns results
  58. func (e *SearchEngine) Search(ctx context.Context, options SearchOptions) ([]*SearchResult, error) {
  59. if options.Query == "" {
  60. return nil, errors.New("query cannot be empty")
  61. }
  62. if options.NumResults <= 0 {
  63. options.NumResults = 5
  64. }
  65. // Build search URL
  66. searchURL := e.buildSearchURL(options.Query, options.Language)
  67. // Create and execute request
  68. req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
  69. if err != nil {
  70. return nil, fmt.Errorf("failed to create request: %w", err)
  71. }
  72. e.setSearchHeaders(req)
  73. resp, err := e.client.Do(req)
  74. if err != nil {
  75. return nil, fmt.Errorf("failed to execute request: %w", err)
  76. }
  77. defer resp.Body.Close()
  78. if resp.StatusCode != http.StatusOK {
  79. return nil, fmt.Errorf("search request failed with status: %d", resp.StatusCode)
  80. }
  81. // Read and decode response
  82. body, err := io.ReadAll(resp.Body)
  83. if err != nil {
  84. return nil, fmt.Errorf("failed to read response body: %w", err)
  85. }
  86. content := e.decodeContent(body, resp.Header.Get("Content-Type"))
  87. // Parse HTML
  88. doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
  89. if err != nil {
  90. return nil, fmt.Errorf("failed to parse HTML: %w", err)
  91. }
  92. // Extract results
  93. return e.extractSearchResults(doc, options.Query, options.NumResults), nil
  94. }
  95. // GetSearchResult retrieves a stored search result by ID
  96. func (e *SearchEngine) GetSearchResult(resultID string) (*SearchResult, bool) {
  97. value, ok := e.searchResults.Load(resultID)
  98. if !ok {
  99. return nil, false
  100. }
  101. result, ok := value.(*SearchResult)
  102. return result, ok
  103. }
  104. // buildSearchURL constructs the Bing search URL
  105. func (e *SearchEngine) buildSearchURL(query, language string) string {
  106. baseURL := "https://cn.bing.com/search"
  107. params := url.Values{}
  108. params.Set("q", query)
  109. params.Set("setlang", "zh-CN")
  110. params.Set("ensearch", "0")
  111. if language != "" {
  112. params.Set("setlang", language)
  113. }
  114. return fmt.Sprintf("%s?%s", baseURL, params.Encode())
  115. }
  116. // setSearchHeaders sets appropriate headers for search requests
  117. func (e *SearchEngine) setSearchHeaders(req *http.Request) {
  118. headers := map[string]string{
  119. "User-Agent": e.userAgent,
  120. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
  121. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  122. "Cache-Control": "no-cache",
  123. "Pragma": "no-cache",
  124. "Sec-Fetch-Dest": "document",
  125. "Sec-Fetch-Mode": "navigate",
  126. "Sec-Fetch-Site": "none",
  127. "Sec-Fetch-User": "?1",
  128. "Upgrade-Insecure-Requests": "1",
  129. "Cookie": "SRCHHPGUSR=SRCHLANG=zh-Hans; _EDGE_S=ui=zh-cn; _EDGE_V=1",
  130. }
  131. for key, value := range headers {
  132. req.Header.Set(key, value)
  133. }
  134. }
  135. // decodeContent attempts to properly decode content based on encoding
  136. func (e *SearchEngine) decodeContent(body []byte, contentType string) string {
  137. content := string(body)
  138. // Check if content type suggests GBK encoding
  139. if strings.Contains(strings.ToLower(contentType), "gbk") ||
  140. strings.Contains(strings.ToLower(contentType), "gb2312") {
  141. if decoded, err := e.decodeGBK(body); err == nil {
  142. content = decoded
  143. }
  144. }
  145. return content
  146. }
  147. // decodeGBK decodes GBK encoded content to UTF-8
  148. func (e *SearchEngine) decodeGBK(data []byte) (string, error) {
  149. reader := transform.NewReader(
  150. strings.NewReader(string(data)),
  151. simplifiedchinese.GBK.NewDecoder(),
  152. )
  153. decoded, err := io.ReadAll(reader)
  154. if err != nil {
  155. return "", err
  156. }
  157. return string(decoded), nil
  158. }
  159. // extractSearchResults extracts search results from parsed HTML
  160. func (e *SearchEngine) extractSearchResults(
  161. doc *goquery.Document,
  162. query string,
  163. numResults int,
  164. ) []*SearchResult {
  165. var results []*SearchResult
  166. // Try different selectors for Bing search results
  167. selectors := []string{
  168. "#b_results > li.b_algo",
  169. "#b_results > .b_ans",
  170. "#b_results > li",
  171. }
  172. for _, selector := range selectors {
  173. doc.Find(selector).Each(func(i int, element *goquery.Selection) {
  174. if len(results) >= numResults {
  175. return
  176. }
  177. result := e.parseSearchResultElement(element, i)
  178. if result != nil {
  179. // Store result for later retrieval
  180. e.searchResults.Store(result.ID, result)
  181. results = append(results, result)
  182. }
  183. })
  184. // If we found results with this selector, stop trying others
  185. if len(results) > 0 {
  186. break
  187. }
  188. }
  189. // If no results found, create a fallback result
  190. if len(results) == 0 {
  191. fallbackResult := e.createFallbackResult(query)
  192. e.searchResults.Store(fallbackResult.ID, fallbackResult)
  193. results = append(results, fallbackResult)
  194. }
  195. return results
  196. }
  197. // parseSearchResultElement parses a single search result element
  198. func (e *SearchEngine) parseSearchResultElement(
  199. element *goquery.Selection,
  200. index int,
  201. ) *SearchResult {
  202. // Skip ads
  203. if element.HasClass("b_ad") {
  204. return nil
  205. }
  206. title, link := e.extractTitleAndLink(element)
  207. snippet := e.extractSnippet(element, title)
  208. // Fix incomplete links
  209. if link != "" && !strings.HasPrefix(link, "http") {
  210. link = e.fixIncompleteLink(link)
  211. }
  212. // Skip if no meaningful content
  213. if title == "" && snippet == "" {
  214. return nil
  215. }
  216. // Create unique ID
  217. id := fmt.Sprintf("result_%d_%d", time.Now().UnixNano(), index)
  218. return &SearchResult{
  219. ID: id,
  220. Title: title,
  221. Link: link,
  222. Snippet: snippet,
  223. }
  224. }
  225. // extractTitleAndLink extracts title and link from a search result element
  226. func (e *SearchEngine) extractTitleAndLink(element *goquery.Selection) (string, string) {
  227. // Try to find title and link in h2 a
  228. titleElement := element.Find("h2 a").First()
  229. if titleElement.Length() > 0 {
  230. title := strings.TrimSpace(titleElement.Text())
  231. link, _ := titleElement.Attr("href")
  232. return title, link
  233. }
  234. // Try alternative selectors
  235. altTitleElement := element.Find(".b_title a, a.tilk, a strong").First()
  236. if altTitleElement.Length() > 0 {
  237. title := strings.TrimSpace(altTitleElement.Text())
  238. link, _ := altTitleElement.Attr("href")
  239. return title, link
  240. }
  241. return "", ""
  242. }
  243. // extractSnippet extracts snippet from a search result element
  244. func (e *SearchEngine) extractSnippet(element *goquery.Selection, title string) string {
  245. // Try to find snippet in common Bing snippet selectors
  246. snippetElement := element.Find(".b_caption p, .b_snippet, .b_algoSlug").First()
  247. if snippetElement.Length() > 0 {
  248. return strings.TrimSpace(snippetElement.Text())
  249. }
  250. // If no snippet found, use entire element text and clean it up
  251. snippet := strings.TrimSpace(element.Text())
  252. // Remove title from snippet
  253. if title != "" && strings.Contains(snippet, title) {
  254. snippet = strings.ReplaceAll(snippet, title, "")
  255. snippet = strings.TrimSpace(snippet)
  256. }
  257. // Truncate if too long
  258. if len(snippet) > 150 {
  259. snippet = snippet[:150] + "..."
  260. }
  261. return snippet
  262. }
  263. // fixIncompleteLink fixes incomplete URLs
  264. func (e *SearchEngine) fixIncompleteLink(link string) string {
  265. if strings.HasPrefix(link, "/") {
  266. return "https://cn.bing.com" + link
  267. }
  268. return "https://cn.bing.com/" + link
  269. }
  270. // createFallbackResult creates a fallback result when no results are found
  271. func (e *SearchEngine) createFallbackResult(query string) *SearchResult {
  272. id := fmt.Sprintf("result_%d_fallback", time.Now().UnixNano())
  273. return &SearchResult{
  274. ID: id,
  275. Title: "搜索结果: " + query,
  276. Link: "https://cn.bing.com/search?q=" + url.QueryEscape(query),
  277. Snippet: fmt.Sprintf("未能解析关于 \"%s\" 的搜索结果,但您可以直接访问必应搜索页面查看。", query),
  278. }
  279. }