HtmlTools.cs 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. using AngleSharp;
  2. using AngleSharp.Dom;
  3. using Ganss.Xss;
  4. using System.Collections.Generic;
  5. using System.Linq;
  6. using System.Text.RegularExpressions;
  7. namespace Masuit.Tools.Html
  8. {
  /// <summary>
  /// HTML helper extension methods: XSS sanitizing, tag stripping and img/src handling.
  /// </summary>
  12. public static class HtmlTools
  13. {
  /// <summary>
  /// Sanitizes HTML against XSS using this library's standard rule set.
  /// </summary>
  /// <remarks>
  /// Starts from a default <see cref="HtmlSanitizer"/> with <c>KeepChildNodes</c> enabled,
  /// then removes the <c>id</c> and <c>alt</c> attributes, the <c>font-family</c> CSS property
  /// and the input/button/iframe/frame/textarea/select/form tags from the allow lists, and
  /// adds <c>src</c>, <c>class</c> and <c>style</c> to the allowed attributes.
  /// </remarks>
  /// <param name="html">The HTML to sanitize.</param>
  /// <returns>The sanitized HTML.</returns>
  public static string HtmlSanitizerStandard(this string html)
  {
      var cleaner = new HtmlSanitizer();
      cleaner.KeepChildNodes = true;

      // Attributes and CSS that are never allowed through.
      foreach (var attribute in new[] { "id", "alt" })
      {
          cleaner.AllowedAttributes.Remove(attribute);
      }

      cleaner.AllowedCssProperties.Remove("font-family");

      // Interactive / embedding elements that are never allowed through.
      foreach (var tag in new[] { "input", "button", "iframe", "frame", "textarea", "select", "form" })
      {
          cleaner.AllowedTags.Remove(tag);
      }

      // Attributes that must always be allowed.
      foreach (var attribute in new[] { "src", "class", "style" })
      {
          cleaner.AllowedAttributes.Add(attribute);
      }

      return cleaner.Sanitize(html);
  }
  /// <summary>
  /// Sanitizes HTML against XSS using the standard rule set plus caller-supplied removals.
  /// </summary>
  /// <remarks>
  /// The same baseline as the standard sanitizer is applied first (default allow lists minus
  /// <c>id</c>/<c>alt</c>, <c>font-family</c> and the form/frame related tags, plus
  /// <c>src</c>/<c>class</c>/<c>style</c>). The caller's tags, attributes and CSS properties
  /// are removed afterwards, so they can also revoke entries the baseline has just added.
  /// </remarks>
  /// <param name="html">The source HTML.</param>
  /// <param name="labels">Additional tag names to disallow; may be null.</param>
  /// <param name="attributes">Additional attribute names to disallow; may be null.</param>
  /// <param name="styles">Additional CSS property names to disallow; may be null.</param>
  /// <returns>The sanitized HTML.</returns>
  public static string HtmlSanitizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
  {
      var cleaner = new HtmlSanitizer();
      cleaner.KeepChildNodes = true;

      // Baseline restrictions shared with the standard sanitizer.
      foreach (var attribute in new[] { "id", "alt" })
      {
          cleaner.AllowedAttributes.Remove(attribute);
      }

      cleaner.AllowedCssProperties.Remove("font-family");

      foreach (var tag in new[] { "input", "button", "iframe", "frame", "textarea", "select", "form" })
      {
          cleaner.AllowedTags.Remove(tag);
      }

      foreach (var attribute in new[] { "src", "class", "style" })
      {
          cleaner.AllowedAttributes.Add(attribute);
      }

      // Caller-specific restrictions; a null array means "nothing extra to remove".
      foreach (var tag in labels ?? new string[0])
      {
          cleaner.AllowedTags.Remove(tag);
      }

      foreach (var attribute in attributes ?? new string[0])
      {
          cleaner.AllowedAttributes.Remove(attribute);
      }

      foreach (var property in styles ?? new string[0])
      {
          cleaner.AllowedCssProperties.Remove(property);
      }

      return cleaner.Sanitize(html);
  }
  /// <summary>
  /// Strips all markup from an HTML string and optionally truncates the remaining text.
  /// </summary>
  /// <param name="html">The source HTML. Null or empty input yields an empty string.</param>
  /// <param name="length">
  /// Maximum number of characters to return. Values less than or equal to zero (the default)
  /// disable truncation.
  /// </param>
  /// <returns>
  /// The text content of the parsed document body, cut to <paramref name="length"/> characters
  /// when <paramref name="length"/> is positive and the text is longer than that.
  /// </returns>
  public static string RemoveHtmlTag(this string html, int length = 0)
  {
      // Nothing to parse; this also keeps a null string away from the parser request.
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      var context = BrowsingContext.New(Configuration.Default);

      // The markup is supplied directly as the response content; the method is synchronous,
      // so the resulting task is awaited by blocking on it.
      var document = context.OpenAsync(req => req.Content(html)).Result;

      // Guard against a document without a body element instead of dereferencing null.
      var text = document.Body?.TextContent ?? string.Empty;

      return length > 0 && text.Length > length
          ? text.Substring(0, length)
          : text;
  }
  /// <summary>
  /// Prefixes the <c>src</c> of img tags with a base location, turning relative image paths
  /// into absolute ones.
  /// </summary>
  /// <remarks>
  /// This is a plain, case-sensitive text replacement of the exact sequence
  /// <c>&lt;img src="</c>; img tags written differently are left untouched.
  /// </remarks>
  /// <param name="html">The source HTML.</param>
  /// <param name="imgDest">The base location to insert, followed by a '/', before each src value.</param>
  /// <returns>The HTML with every matched img src prefixed.</returns>
  public static string ReplaceHtmlImgSource(this string html, string imgDest)
  {
      const string marker = "<img src=\"";
      var prefixed = marker + imgDest + "/";
      return html.Replace(marker, prefixed);
  }
  /// <summary>
  /// Rewrites absolute img <c>src</c> URLs to root-relative paths by dropping the scheme and host.
  /// </summary>
  /// <remarks>
  /// Both <c>http://</c> and <c>https://</c> sources are handled. Only img tags that start with
  /// the exact sequence <c>&lt;img src="</c> are matched. A URL without any path after the host
  /// (no '/' before the closing quote) is left unchanged.
  /// </remarks>
  /// <param name="s">The source HTML.</param>
  /// <returns>The HTML with matched img sources made root-relative.</returns>
  public static string ConvertImgSrcToRelativePath(this string s)
  {
      // The host part is limited to characters other than '/' and '"', so the match can never
      // run past the end of the attribute value into the following markup.
      return Regex.Replace(s, @"<img src=""https?://[^/""]+/", @"<img src=""/");
  }
  /// <summary>
  /// Parses the HTML and returns all img elements found in the document body.
  /// </summary>
  /// <param name="html">The source HTML.</param>
  /// <returns>The collection of img elements inside the parsed body.</returns>
  public static IHtmlCollection<IElement> MatchImgTags(this string html)
  {
      // The markup is supplied directly as the response content and the task is blocked on,
      // because this method is synchronous.
      var document = BrowsingContext
          .New(Configuration.Default)
          .OpenAsync(request => request.Content(html))
          .Result;

      return document.Body.GetElementsByTagName("img");
  }
  /// <summary>
  /// Collects the <c>src</c> attribute values of all img elements in the HTML.
  /// </summary>
  /// <param name="html">The source HTML.</param>
  /// <returns>The src values of every img element that has a src attribute.</returns>
  public static IEnumerable<string> MatchImgSrcs(this string html)
  {
      // img elements without a src attribute are skipped.
      return from image in html.MatchImgTags()
             where image.HasAttribute("src")
             select image.GetAttribute("src");
  }
  /// <summary>
  /// Gets the <c>src</c> of the first img element in the HTML that has one.
  /// </summary>
  /// <param name="html">The source HTML.</param>
  /// <returns>The first img src value, or null when no img element has a src attribute.</returns>
  public static string MatchFirstImgSrc(this string html) => html.MatchImgSrcs().FirstOrDefault();
  /// <summary>
  /// Picks the <c>src</c> of a randomly chosen img element in the HTML.
  /// </summary>
  /// <param name="html">The source HTML.</param>
  /// <returns>
  /// The first src value after the src values have been passed through <c>OrderByRandom</c>
  /// (declared outside this file; presumably a random shuffle), or the default value when
  /// there are none.
  /// </returns>
  public static string MatchRandomImgSrc(this string html) => html.MatchImgSrcs().OrderByRandom().FirstOrDefault();
  /// <summary>
  /// Replaces the characters <c>,</c>, <c>'</c> and <c>;</c> with the placeholder tokens
  /// <c>&amp;def</c>, <c>&amp;dot</c> and <c>&amp;dec</c> respectively.
  /// </summary>
  /// <remarks>
  /// The tokens are project-specific placeholders, not standard HTML entities.
  /// </remarks>
  /// <param name="strHtml">The text to encode. Null is treated like an empty string.</param>
  /// <returns>The encoded text, or an empty string for null or empty input.</returns>
  public static string EncodeHtml(this string strHtml)
  {
      // Null must not reach Replace, where it would raise a NullReferenceException.
      if (string.IsNullOrEmpty(strHtml))
      {
          return "";
      }

      return strHtml.Replace(",", "&def").Replace("'", "&dot").Replace(";", "&dec");
  }
  174. }
  175. }