html.go 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. package utils
  2. import (
  3. "bytes"
  4. "regexp"
  5. "strings"
  6. "github.com/PuerkitoBio/goquery"
  7. "github.com/mindoc-org/mindoc/conf"
  8. )
  9. func StripTags(s string) string {
  10. //将HTML标签全转换成小写
  11. re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
  12. src := re.ReplaceAllStringFunc(s, strings.ToLower)
  13. //去除STYLE
  14. re, _ = regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")
  15. src = re.ReplaceAllString(src, "")
  16. //去除SCRIPT
  17. re, _ = regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")
  18. src = re.ReplaceAllString(src, "")
  19. //去除所有尖括号内的HTML代码,并换成换行符
  20. re, _ = regexp.Compile("\\<[\\S\\s]+?\\>")
  21. src = re.ReplaceAllString(src, "\n")
  22. //去除连续的换行符
  23. re, _ = regexp.Compile("\\s{2,}")
  24. src = re.ReplaceAllString(src, "\n")
  25. return src
  26. }
  27. //自动提取文章摘要
  28. func AutoSummary(body string,l int) string {
  29. //匹配图片,如果图片语法是在代码块中,这里同样会处理
  30. re := regexp.MustCompile(`<p>(.*?)</p>`)
  31. contents := re.FindAllString(body, -1)
  32. if len(contents) <= 0 {
  33. return ""
  34. }
  35. content := ""
  36. for _,s := range contents {
  37. b := strings.Replace(StripTags(s),"\n","", -1)
  38. if l <= 0 {
  39. break
  40. }
  41. l = l - len([]rune(b))
  42. content += b
  43. }
  44. return content
  45. }
  46. //安全处理HTML文档,过滤危险标签和属性.
  47. func SafetyProcessor(html string) string {
  48. //安全过滤,移除危险标签和属性
  49. if docQuery, err := goquery.NewDocumentFromReader(bytes.NewBufferString(html)); err == nil {
  50. docQuery.Find("script").Remove()
  51. docQuery.Find("form").Remove()
  52. docQuery.Find("link").Remove()
  53. docQuery.Find("applet").Remove()
  54. docQuery.Find("frame").Remove()
  55. docQuery.Find("meta").Remove()
  56. docQuery.Find("iframe").Remove()
  57. docQuery.Find("*").Each(func(i int, selection *goquery.Selection) {
  58. if href, ok := selection.Attr("href"); ok && strings.HasPrefix(href, "javascript:") {
  59. selection.SetAttr("href", "#")
  60. }
  61. if src, ok := selection.Attr("src"); ok && strings.HasPrefix(src, "javascript:") {
  62. selection.SetAttr("src", "#")
  63. }
  64. selection.RemoveAttr("onafterprint").
  65. RemoveAttr("onbeforeprint").
  66. RemoveAttr("onbeforeunload").
  67. RemoveAttr("onload").
  68. RemoveAttr("onclick").
  69. RemoveAttr("onkeydown").
  70. RemoveAttr("onkeypress").
  71. RemoveAttr("onkeyup").
  72. RemoveAttr("ondblclick").
  73. RemoveAttr("onmousedown").
  74. RemoveAttr("onmousemove").
  75. RemoveAttr("onmouseout").
  76. RemoveAttr("onmouseover").
  77. RemoveAttr("onmouseup")
  78. })
  79. //处理外链
  80. docQuery.Find("a").Each(func(i int, contentSelection *goquery.Selection) {
  81. if src, ok := contentSelection.Attr("href"); ok {
  82. if strings.HasPrefix(src, "http://") || strings.HasPrefix(src, "https://") {
  83. if conf.BaseUrl != "" && !strings.HasPrefix(src, conf.BaseUrl) {
  84. contentSelection.SetAttr("target", "_blank")
  85. }
  86. }
  87. }
  88. })
  89. //添加文档标签包裹
  90. if selector := docQuery.Find("article.markdown-article-inner").First(); selector.Size() <= 0 {
  91. docQuery.Children().WrapAllHtml("<article class=\"markdown-article-inner\"></article>")
  92. }
  93. //解决文档内容缺少包裹标签的问题
  94. if selector := docQuery.Find("div.markdown-article").First(); selector.Size() <= 0 {
  95. if selector := docQuery.Find("div.markdown-toc").First(); selector.Size() > 0 {
  96. docQuery.Find("div.markdown-toc").NextAll().WrapAllHtml("<div class=\"markdown-article\"></div>")
  97. }
  98. }
  99. if html, err := docQuery.Html(); err == nil {
  100. return strings.TrimSuffix(strings.TrimPrefix(strings.TrimSpace(html), "<html><head></head><body>"), "</body></html>")
  101. }
  102. }
  103. return html
  104. }