HtmlTools.cs 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. using AngleSharp;
  2. using AngleSharp.Dom;
  3. using Ganss.Xss;
  4. using Masuit.Tools.RandomSelector;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Linq;
  8. using System.Text.RegularExpressions;
  9. namespace Masuit.Tools.Html
  10. {
    /// <summary>
    /// HTML helper extensions: XSS sanitizing, tag stripping, and img src extraction and rewriting.
    /// </summary>
  14. public static class HtmlTools
  15. {
  16. /// <summary>
  17. /// 标准的防止html的xss净化器
  18. /// </summary>
  19. /// <param name="html"></param>
  20. /// <returns></returns>
  21. public static string HtmlSanitizerStandard(this string html)
  22. {
  23. var sanitizer = new HtmlSanitizer
  24. {
  25. KeepChildNodes = true
  26. };
  27. sanitizer.AllowedAttributes.Remove("id");
  28. sanitizer.AllowedAttributes.Remove("alt");
  29. sanitizer.AllowedCssProperties.Remove("font-family");
  30. sanitizer.AllowedTags.Remove("input");
  31. sanitizer.AllowedTags.Remove("button");
  32. sanitizer.AllowedTags.Remove("iframe");
  33. sanitizer.AllowedTags.Remove("frame");
  34. sanitizer.AllowedTags.Remove("textarea");
  35. sanitizer.AllowedTags.Remove("select");
  36. sanitizer.AllowedTags.Remove("form");
  37. sanitizer.AllowedAttributes.Add("src");
  38. sanitizer.AllowedAttributes.Add("class");
  39. sanitizer.AllowedAttributes.Add("style");
  40. return sanitizer.Sanitize(html);
  41. }
  42. /// <summary>
  43. /// 自定义的防止html的xss净化器
  44. /// </summary>
  45. /// <param name="html">源html</param>
  46. /// <param name="labels">需要移除的标签集合</param>
  47. /// <param name="attributes">需要移除的属性集合</param>
  48. /// <param name="styles">需要移除的样式集合</param>
  49. /// <returns></returns>
  50. public static string HtmlSanitizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
  51. {
  52. var sanitizer = new HtmlSanitizer
  53. {
  54. KeepChildNodes = true
  55. };
  56. sanitizer.AllowedAttributes.Remove("id");
  57. sanitizer.AllowedAttributes.Remove("alt");
  58. sanitizer.AllowedCssProperties.Remove("font-family");
  59. sanitizer.AllowedTags.Remove("input");
  60. sanitizer.AllowedTags.Remove("button");
  61. sanitizer.AllowedTags.Remove("iframe");
  62. sanitizer.AllowedTags.Remove("frame");
  63. sanitizer.AllowedTags.Remove("textarea");
  64. sanitizer.AllowedTags.Remove("select");
  65. sanitizer.AllowedTags.Remove("form");
  66. sanitizer.AllowedAttributes.Add("src");
  67. sanitizer.AllowedAttributes.Add("class");
  68. sanitizer.AllowedAttributes.Add("style");
  69. if (labels != null)
  70. {
  71. foreach (string label in labels)
  72. {
  73. sanitizer.AllowedTags.Remove(label);
  74. }
  75. }
  76. if (attributes != null)
  77. {
  78. foreach (string attr in attributes)
  79. {
  80. sanitizer.AllowedAttributes.Remove(attr);
  81. }
  82. }
  83. if (styles != null)
  84. {
  85. foreach (string p in styles)
  86. {
  87. sanitizer.AllowedCssProperties.Remove(p);
  88. }
  89. }
  90. sanitizer.KeepChildNodes = true;
  91. return sanitizer.Sanitize(html);
  92. }
  93. /// <summary>
  94. /// 去除html标签后并截取字符串
  95. /// </summary>
  96. /// <param name="html">源html</param>
  97. /// <param name="length">截取长度</param>
  98. /// <returns></returns>
  99. public static string RemoveHtmlTag(this string html, int length = 0)
  100. {
  101. var context = BrowsingContext.New(Configuration.Default);
  102. var doc = context.OpenAsync(req => req.Content(html)).Result;
  103. var strText = doc.Body.TextContent;
  104. if (length > 0 && strText.Length > length)
  105. {
  106. return strText.Substring(0, length);
  107. }
  108. return strText;
  109. }
  110. /// <summary>
  111. /// 替换html的img路径为绝对路径
  112. /// </summary>
  113. /// <param name="html"></param>
  114. /// <param name="imgDest"></param>
  115. /// <returns></returns>
  116. public static string ReplaceHtmlImgSource(this string html, string imgDest) => html.Replace("<img src=\"", "<img src=\"" + imgDest + "/");
  117. /// <summary>
  118. /// 将src的绝对路径换成相对路径
  119. /// </summary>
  120. /// <param name="s"></param>
  121. /// <returns></returns>
  122. public static string ConvertImgSrcToRelativePath(this string s)
  123. {
  124. return Regex.Replace(s, @"<img src=""(http:\/\/.+?)/", @"<img src=""/");
  125. }
  126. /// <summary>
  127. /// 匹配html的所有img标签集合
  128. /// </summary>
  129. /// <param name="html"></param>
  130. /// <returns></returns>
  131. public static IHtmlCollection<IElement> MatchImgTags(this string html)
  132. {
  133. var context = BrowsingContext.New(Configuration.Default);
  134. var doc = context.OpenAsync(req => req.Content(html)).Result;
  135. return doc.Body.GetElementsByTagName("img");
  136. }
  137. /// <summary>
  138. /// 匹配html的所有img标签的src集合
  139. /// </summary>
  140. /// <param name="html"></param>
  141. /// <returns></returns>
  142. public static IEnumerable<string> MatchImgSrcs(this string html)
  143. {
  144. return MatchImgTags(html).Where(n => n.HasAttribute("src")).Select(n => n.GetAttribute("src"));
  145. }
  146. /// <summary>
  147. /// 获取html中第一个img标签的src
  148. /// </summary>
  149. /// <param name="html"></param>
  150. /// <returns></returns>
  151. public static string MatchFirstImgSrc(this string html)
  152. {
  153. return MatchImgSrcs(html).FirstOrDefault();
  154. }
  155. /// <summary>
  156. /// 随机获取html代码中的img标签的src属性
  157. /// </summary>
  158. /// <param name="html"></param>
  159. /// <returns></returns>
  160. public static string MatchRandomImgSrc(this string html)
  161. {
  162. var srcs = MatchImgSrcs(html).ToList();
  163. var rnd = new Random();
  164. return srcs.Count > 0 ? srcs[rnd.Next(srcs.Count)] : default;
  165. }
  166. /// <summary>
  167. /// 按顺序优先获取html代码中的img标签的src属性
  168. /// </summary>
  169. /// <param name="html"></param>
  170. /// <returns></returns>
  171. public static string MatchSeqRandomImgSrc(this string html)
  172. {
  173. var srcs = MatchImgSrcs(html).ToList();
  174. return srcs.Count > 0 ? srcs.Select((s, i) => new WeightedItem<string>(s, srcs.Count - i)).WeightedItem() : default;
  175. }
  176. /// <summary>
  177. /// 替换html字符
  178. /// </summary>
  179. /// <param name="strHtml">html</param>
  180. public static string EncodeHtml(this string strHtml)
  181. {
  182. if (strHtml != "")
  183. {
  184. return strHtml.Replace(",", "&def").Replace("'", "&dot").Replace(";", "&dec");
  185. }
  186. return "";
  187. }
  188. }
  189. }