HtmlTools.cs 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. using AngleSharp;
  2. using AngleSharp.Dom;
  3. using Ganss.Xss;
  4. using Masuit.Tools.RandomSelector;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Linq;
  8. using System.Text.RegularExpressions;
  9. namespace Masuit.Tools.Html
  10. {
  11. /// <summary>
  12. /// html工具类
  13. /// </summary>
  14. public static partial class HtmlTools
  15. {
  /// <summary>
  /// Sanitizer instance shared by the sanitizing helpers of this class.
  /// It is configured once by the static constructor.
  /// </summary>
  private static readonly HtmlSanitizer Sanitizer = new HtmlSanitizer();

  /// <summary>
  /// Applies the baseline configuration to the shared sanitizer: drops a few attributes,
  /// CSS properties and form/frame related tags from its allow-lists, and makes sure
  /// <c>src</c>, <c>class</c> and <c>style</c> attributes are allowed.
  /// </summary>
  static HtmlTools()
  {
      string[] removedAttributes = { "id", "alt" };
      string[] removedCssProperties = { "font-family", "background-color" };
      string[] removedTags = { "input", "button", "iframe", "frame", "textarea", "select", "form" };
      string[] addedAttributes = { "src", "class", "style" };

      foreach (var attribute in removedAttributes)
      {
          Sanitizer.AllowedAttributes.Remove(attribute);
      }

      foreach (var property in removedCssProperties)
      {
          Sanitizer.AllowedCssProperties.Remove(property);
      }

      Sanitizer.KeepChildNodes = true;

      foreach (var tag in removedTags)
      {
          Sanitizer.AllowedTags.Remove(tag);
      }

      foreach (var attribute in addedAttributes)
      {
          Sanitizer.AllowedAttributes.Add(attribute);
      }
  }
  /// <summary>
  /// Standard anti-XSS HTML sanitizing: runs the input through the shared sanitizer.
  /// </summary>
  /// <param name="html">HTML to sanitize.</param>
  /// <returns>The value returned by the shared sanitizer for <paramref name="html"/>.</returns>
  public static string HtmlSantinizerStandard(this string html) => Sanitizer.Sanitize(html);
  /// <summary>
  /// Custom anti-XSS HTML sanitizing: starts from the allow-lists of the shared sanitizer and
  /// additionally disallows the given tags, attributes and CSS properties for this call only.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <param name="labels">Tag names to remove from the allowed tags; <c>null</c> to remove none.</param>
  /// <param name="attributes">Attribute names to remove from the allowed attributes; <c>null</c> to remove none.</param>
  /// <param name="styles">CSS property names to remove from the allowed CSS properties; <c>null</c> to remove none.</param>
  /// <returns>The sanitized HTML.</returns>
  public static string HtmlSantinizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
  {
      // Work on a private sanitizer. Removing entries from the shared static instance would
      // permanently tighten the rules for every later call (including HtmlSantinizerStandard)
      // and would mutate its sets while other threads may be using them.
      var sanitizer = new HtmlSanitizer { KeepChildNodes = true };

      // The static constructor only customises these three allow-lists (plus KeepChildNodes),
      // so copying them reproduces the shared baseline configuration.
      sanitizer.AllowedTags.Clear();
      sanitizer.AllowedTags.UnionWith(Sanitizer.AllowedTags);
      sanitizer.AllowedAttributes.Clear();
      sanitizer.AllowedAttributes.UnionWith(Sanitizer.AllowedAttributes);
      sanitizer.AllowedCssProperties.Clear();
      sanitizer.AllowedCssProperties.UnionWith(Sanitizer.AllowedCssProperties);

      if (labels != null)
      {
          sanitizer.AllowedTags.ExceptWith(labels);
      }

      if (attributes != null)
      {
          sanitizer.AllowedAttributes.ExceptWith(attributes);
      }

      if (styles != null)
      {
          sanitizer.AllowedCssProperties.ExceptWith(styles);
      }

      return sanitizer.Sanitize(html);
  }
  /// <summary>
  /// Strips the HTML markup by parsing the input and taking the text content of the body,
  /// optionally truncating the result.
  /// </summary>
  /// <param name="html">Source HTML; <c>null</c> or empty yields an empty string.</param>
  /// <param name="length">Maximum number of characters to return; values of 0 or less mean no truncation.</param>
  /// <returns>The body text, cut to <paramref name="length"/> characters when it is longer than that.</returns>
  public static string RemoveHtmlTag(this string html, int length = 0)
  {
      // Nothing to parse; also avoids handing null to the parser.
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      var context = BrowsingContext.New(Configuration.Default);

      // This API is synchronous, so the async open is awaited by blocking on its result.
      // The document is disposed once its text has been read.
      using (var doc = context.OpenAsync(req => req.Content(html)).Result)
      {
          var text = doc.Body?.TextContent ?? string.Empty;
          if (length > 0 && text.Length > length)
          {
              return text.Substring(0, length);
          }

          return text;
      }
  }
  /// <summary>
  /// Turns img paths into absolute ones by inserting <paramref name="imgDest"/> followed by
  /// <c>/</c> right after every literal <c>&lt;img src="</c> occurrence.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <param name="imgDest">Prefix (for example a site root) to put in front of each src value.</param>
  /// <returns>The HTML with every <c>&lt;img src="</c> occurrence prefixed.</returns>
  public static string ReplaceHtmlImgSource(this string html, string imgDest)
  {
      const string imgSrcOpening = "<img src=\"";
      var prefixedOpening = imgSrcOpening + imgDest + "/";
      return html.Replace(imgSrcOpening, prefixedOpening);
  }
  /// <summary>
  /// Converts absolute img src URLs into root-relative paths by dropping the scheme and host,
  /// i.e. <c>&lt;img src="http(s)://host/</c> becomes <c>&lt;img src="/</c>.
  /// </summary>
  /// <param name="s">Source HTML.</param>
  /// <returns>The HTML with the scheme and host removed from matching img src values.</returns>
  public static string ConvertImgSrcToRelativePath(this string s)
  {
      // "https?" also covers https URLs. The host part is limited to characters other than
      // '/' and '"' so a host-only URL (no path) cannot make the match run past the closing
      // quote up to some later slash in the markup.
      return Regex.Replace(s, @"<img src=""https?://[^/""]+/", @"<img src=""/");
  }
  /// <summary>
  /// Parses the HTML and collects all img elements found under the document body.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <returns>The collection of img elements of the parsed document body.</returns>
  public static IHtmlCollection<IElement> MatchImgTags(this string html)
  {
      // Blocks on the async open because this API is synchronous.
      var document = BrowsingContext.New(Configuration.Default)
          .OpenAsync(request => request.Content(html))
          .Result;

      return document.Body.GetElementsByTagName("img");
  }
  /// <summary>
  /// Collects the src attribute values of all img tags in the HTML.
  /// Tags without a src attribute are skipped.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <returns>A lazily evaluated sequence of src values, in the order of the img tags.</returns>
  public static IEnumerable<string> MatchImgSrcs(this string html)
  {
      return from img in html.MatchImgTags()
             where img.HasAttribute("src")
             select img.GetAttribute("src");
  }
  /// <summary>
  /// Gets the src of the first img tag in the HTML that has a src attribute.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <returns>The first src value, or <c>null</c> when no img tag with a src attribute exists.</returns>
  public static string MatchFirstImgSrc(this string html) => html.MatchImgSrcs().FirstOrDefault();
  // Single generator shared by all MatchRandomImgSrc calls. Creating a new Random per call
  // can yield identical picks for calls made in quick succession on runtimes that seed from
  // the clock. Access is serialized with a lock because System.Random is not thread-safe.
  private static readonly Random ImgSrcRandom = new Random();

  /// <summary>
  /// Picks the src of a randomly chosen img tag in the HTML.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <returns>A randomly selected src value, or <c>null</c> when the HTML has no img tag with a src attribute.</returns>
  public static string MatchRandomImgSrc(this string html)
  {
      var srcs = MatchImgSrcs(html).ToList();
      if (srcs.Count == 0)
      {
          return null;
      }

      int index;
      lock (ImgSrcRandom)
      {
          index = ImgSrcRandom.Next(srcs.Count);
      }

      return srcs[index];
  }
  /// <summary>
  /// Selects an img src from the HTML with priority given by document order: each src is
  /// wrapped in a weighted item whose weight is the number of srcs minus its index, so
  /// earlier images carry larger weights. The pick itself is delegated to the
  /// <c>WeightedItem()</c> selector (presumably a weighted random choice; see its implementation).
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <returns>The selected src value, or <c>null</c> when the HTML has no img tag with a src attribute.</returns>
  public static string MatchSeqRandomImgSrc(this string html)
  {
      var sources = html.MatchImgSrcs().ToList();
      if (sources.Count <= 0)
      {
          return default;
      }

      var candidates = sources.Select((src, position) => new WeightedItem<string>(src, sources.Count - position));
      return candidates.WeightedItem();
  }
  /// <summary>
  /// Replaces line breaks with HTML line breaks: every <c>\r\n</c> and every remaining
  /// <c>\n</c> becomes <c>&lt;br /&gt;</c>.
  /// </summary>
  /// <param name="str">Text to convert.</param>
  /// <returns>The text with its line breaks replaced by <c>&lt;br /&gt;</c>.</returns>
  public static string StrFormat(this string str)
  {
      const string htmlLineBreak = "<br />";

      // Handle CRLF pairs first so they produce one break, then the lone LFs.
      var withoutCrLf = str.Replace("\r\n", htmlLineBreak);
      return withoutCrLf.Replace("\n", htmlLineBreak);
  }
  /// <summary>
  /// Replaces selected characters with placeholder tokens: <c>,</c> becomes <c>&amp;def</c>,
  /// <c>'</c> becomes <c>&amp;dot</c> and <c>;</c> becomes <c>&amp;dec</c>.
  /// </summary>
  /// <param name="strHtml">Text to encode; <c>null</c> or empty yields an empty string.</param>
  /// <returns>The text with the characters above replaced, or an empty string for null/empty input.</returns>
  public static string EncodeHtml(this string strHtml)
  {
      // A plain comparison with "" lets null through and then fails on Replace.
      if (string.IsNullOrEmpty(strHtml))
      {
          return "";
      }

      return strHtml.Replace(",", "&def").Replace("'", "&dot").Replace(";", "&dec");
  }
  181. }
  182. }