segmenter.go 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. package segmenter
  2. import (
  3. "bufio"
  4. "os"
  5. "path/filepath"
  6. "regexp"
  7. "strings"
  8. "sync"
  9. "unicode"
  10. "github.com/beego/beego/v2/core/logs"
  11. "github.com/mindoc-org/mindoc/conf"
  12. "github.com/yanyiwu/gojieba"
  13. )
  14. var (
  15. // jieba 分词器实例
  16. segmenterOnce sync.Once
  17. jiebaCut *gojieba.Jieba
  18. // 停用词集合
  19. stopWords map[string]bool
  20. technicalTermPattern = regexp.MustCompile(`(?i)[a-z0-9][a-z0-9+#._/-]{1,63}`)
  21. )
  22. // techTermWhitelist 技术术语白名单
  23. // 这些词虽然是常见英语词汇,但同时也是 Linux/Unix 命令、编程语言
  24. // 或重要技术术语,不应被停用词过滤,否则用户搜索相关命令时将无法找到文档
  25. var techTermWhitelist = map[string]bool{
  26. // Linux/Unix 常用命令(同时也是英语常见词)
  27. "find": true, "top": true, "last": true, "more": true, "less": true,
  28. "who": true, "which": true, "done": true, "move": true, "give": true,
  29. "make": true, "take": true, "fill": true, "split": true, "cut": true,
  30. // 编程语言/框架名称
  31. "go": true, "net": true, "next": true,
  32. // HTTP 方法 / 数据库操作
  33. "get": true, "put": true, "call": true, "show": true, "describe": true,
  34. "like": true,
  35. // 系统/运维/容器/网络相关
  36. "system": true, "volume": true, "name": true, "save": true, "keep": true,
  37. "re": true, "mine": true, "near": true, "fire": true, "front": true,
  38. "full": true, "empty": true, "computer": true, "detail": true, "part": true,
  39. "back": true, "down": true, "up": true, "bar": true, "round": true,
  40. "side": true, "bottom": true,
  41. // 工具/软件名称(同时也是英语单词)
  42. "everything": true,
  43. }
  44. // getDictDir 获取词典目录
  45. func getDictDir() string {
  46. // 使用项目根目录下的 lib/jieba 目录
  47. return filepath.Join(conf.WorkingDirectory, "lib", "jieba")
  48. }
  49. // ensureDictDir 确保词典目录存在
  50. func ensureDictDir() error {
  51. dictDir := getDictDir()
  52. if _, err := os.Stat(dictDir); os.IsNotExist(err) {
  53. return os.MkdirAll(dictDir, 0755)
  54. }
  55. return nil
  56. }
  57. // initJieba 初始化 jieba 分词器
  58. func initJieba() {
  59. segmenterOnce.Do(func() {
  60. dictDir := getDictDir()
  61. // 显式传入词典路径
  62. jiebaDict := filepath.Join(dictDir, "jieba.dict.utf8")
  63. hmmDict := filepath.Join(dictDir, "hmm_model.utf8")
  64. userDict := filepath.Join(dictDir, "user.dict.utf8")
  65. idfDict := filepath.Join(dictDir, "idf.utf8")
  66. stopWordsDict := filepath.Join(dictDir, "stop_words.utf8")
  67. // 确保词典目录存在
  68. if err := ensureDictDir(); err != nil {
  69. logs.Error("创建词典目录失败 ->", err)
  70. }
  71. // 创建分词器
  72. jiebaCut = gojieba.NewJieba(jiebaDict, hmmDict, userDict, idfDict, stopWordsDict)
  73. // 加载停用词表
  74. stopWords = loadStopWords(stopWordsDict)
  75. logs.Info("jieba分词器初始化完成, 停用词数:", len(stopWords))
  76. })
  77. }
  78. // loadStopWords 从文件加载停用词集合
  79. func loadStopWords(filePath string) map[string]bool {
  80. sw := make(map[string]bool)
  81. f, err := os.Open(filePath)
  82. if err != nil {
  83. logs.Error("加载停用词表失败 ->", err)
  84. return sw
  85. }
  86. defer f.Close()
  87. scanner := bufio.NewScanner(f)
  88. for scanner.Scan() {
  89. word := strings.TrimSpace(scanner.Text())
  90. if word != "" {
  91. sw[strings.ToLower(word)] = true
  92. }
  93. }
  94. return sw
  95. }
  96. // Segment 中文分词器
  97. // 使用 jieba 分词库的搜索引擎模式进行分词
  98. func Segment(text string) []string {
  99. text = strings.TrimSpace(text)
  100. if text == "" {
  101. return []string{}
  102. }
  103. // 初始化分词器
  104. initJieba()
  105. // 使用 jieba 分词,搜索引擎模式
  106. // CutForSearch 第二个参数为 true 表示使用 HMM 模型
  107. words := jiebaCut.CutForSearch(text, true)
  108. // 过滤结果
  109. result := make([]string, 0)
  110. for _, word := range words {
  111. word = strings.TrimSpace(word)
  112. if word == "" {
  113. continue
  114. }
  115. // 转小写(英文)
  116. word = strings.ToLower(word)
  117. // 过滤单字符标点符号/特殊字符,避免匹配大量无关文档
  118. runes := []rune(word)
  119. if len(runes) == 1 && !unicode.IsLetter(runes[0]) && !unicode.IsDigit(runes[0]) {
  120. continue
  121. }
  122. // 过滤停用词(白名单中的技术术语不过滤)
  123. if stopWords[word] && !techTermWhitelist[word] {
  124. continue
  125. }
  126. result = append(result, word)
  127. }
  128. for _, word := range extractTechnicalTerms(text) {
  129. if stopWords[word] && !techTermWhitelist[word] {
  130. continue
  131. }
  132. result = append(result, word)
  133. }
  134. return result
  135. }
  136. func extractTechnicalTerms(text string) []string {
  137. matches := technicalTermPattern.FindAllString(text, -1)
  138. if len(matches) == 0 {
  139. return nil
  140. }
  141. result := make([]string, 0, len(matches))
  142. for _, match := range matches {
  143. word := strings.ToLower(strings.TrimSpace(match))
  144. if word == "" {
  145. continue
  146. }
  147. if len([]rune(word)) < 2 {
  148. continue
  149. }
  150. if !strings.ContainsAny(word, ".+#/_-") {
  151. continue
  152. }
  153. hasAlphaNumeric := false
  154. for _, r := range word {
  155. if unicode.IsLetter(r) || unicode.IsDigit(r) {
  156. hasAlphaNumeric = true
  157. break
  158. }
  159. }
  160. if !hasAlphaNumeric {
  161. continue
  162. }
  163. result = append(result, word)
  164. }
  165. return result
  166. }