html.go 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. package utils
  2. import (
  3. "bytes"
  4. "regexp"
  5. "strings"
  6. "github.com/PuerkitoBio/goquery"
  7. "github.com/mindoc-org/mindoc/conf"
  8. )
  9. func StripTags(s string) string {
  10. //将HTML标签全转换成小写
  11. re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
  12. src := re.ReplaceAllStringFunc(s, strings.ToLower)
  13. //去除STYLE
  14. re, _ = regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")
  15. src = re.ReplaceAllString(src, "")
  16. //去除SCRIPT
  17. re, _ = regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")
  18. src = re.ReplaceAllString(src, "")
  19. //去除所有尖括号内的HTML代码,并换成换行符
  20. re, _ = regexp.Compile("\\<[\\S\\s]+?\\>")
  21. src = re.ReplaceAllString(src, "\n")
  22. //去除连续的换行符
  23. re, _ = regexp.Compile("\\s{2,}")
  24. src = re.ReplaceAllString(src, "\n")
  25. return src
  26. }
  27. //自动提取文章摘要
  28. func AutoSummary(body string, l int) string {
  29. //匹配图片,如果图片语法是在代码块中,这里同样会处理
  30. re := regexp.MustCompile(`<p>(.*?)</p>`)
  31. contents := re.FindAllString(body, -1)
  32. if len(contents) <= 0 {
  33. return ""
  34. }
  35. content := ""
  36. for _, s := range contents {
  37. b := strings.Replace(StripTags(s), "\n", "", -1)
  38. if l <= 0 {
  39. break
  40. }
  41. l = l - len([]rune(b))
  42. content += b
  43. }
  44. return content
  45. }
  46. //安全处理HTML文档,过滤危险标签和属性.
  47. func SafetyProcessor(html string) string {
  48. //安全过滤,移除危险标签和属性
  49. if docQuery, err := goquery.NewDocumentFromReader(bytes.NewBufferString(html)); err == nil {
  50. docQuery.Find("script").Remove()
  51. docQuery.Find("form").Remove()
  52. docQuery.Find("link").Remove()
  53. docQuery.Find("applet").Remove()
  54. docQuery.Find("frame").Remove()
  55. docQuery.Find("meta").Remove()
  56. if !conf.GetEnableIframe() {
  57. docQuery.Find("iframe").Remove()
  58. }
  59. docQuery.Find("*").Each(func(i int, selection *goquery.Selection) {
  60. if href, ok := selection.Attr("href"); ok && strings.HasPrefix(href, "javascript:") {
  61. selection.SetAttr("href", "#")
  62. }
  63. if src, ok := selection.Attr("src"); ok && strings.HasPrefix(src, "javascript:") {
  64. selection.SetAttr("src", "#")
  65. }
  66. selection.RemoveAttr("onafterprint").
  67. RemoveAttr("onbeforeprint").
  68. RemoveAttr("onbeforeunload").
  69. RemoveAttr("onload").
  70. RemoveAttr("onclick").
  71. RemoveAttr("onkeydown").
  72. RemoveAttr("onkeypress").
  73. RemoveAttr("onkeyup").
  74. RemoveAttr("ondblclick").
  75. RemoveAttr("onmousedown").
  76. RemoveAttr("onmousemove").
  77. RemoveAttr("onmouseout").
  78. RemoveAttr("onmouseover").
  79. RemoveAttr("onmouseup")
  80. })
  81. //处理外链
  82. docQuery.Find("a").Each(func(i int, contentSelection *goquery.Selection) {
  83. if src, ok := contentSelection.Attr("href"); ok {
  84. if strings.HasPrefix(src, "http://") || strings.HasPrefix(src, "https://") {
  85. if conf.BaseUrl != "" && !strings.HasPrefix(src, conf.BaseUrl) {
  86. contentSelection.SetAttr("target", "_blank")
  87. }
  88. }
  89. }
  90. })
  91. //添加文档标签包裹
  92. if selector := docQuery.Find("article.markdown-article-inner").First(); selector.Size() <= 0 {
  93. docQuery.Children().WrapAllHtml("<article class=\"markdown-article-inner\"></article>")
  94. }
  95. //解决文档内容缺少包裹标签的问题
  96. if selector := docQuery.Find("div.markdown-article").First(); selector.Size() <= 0 {
  97. if selector := docQuery.Find("div.markdown-toc").First(); selector.Size() > 0 {
  98. docQuery.Find("div.markdown-toc").NextAll().WrapAllHtml("<div class=\"markdown-article\"></div>")
  99. }
  100. }
  101. if html, err := docQuery.Html(); err == nil {
  102. return strings.TrimSuffix(strings.TrimPrefix(strings.TrimSpace(html), "<html><head></head><body>"), "</body></html>")
  103. }
  104. }
  105. return html
  106. }