HtmlTools.cs 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. using Ganss.XSS;
  2. using HtmlAgilityPack;
  3. using Masuit.Tools.RandomSelector;
  4. using System;
  5. using System.Collections.Generic;
  6. using System.Linq;
  7. using System.Text.RegularExpressions;
  8. namespace Masuit.Tools.Html
  9. {
  10. /// <summary>
  11. /// html工具类
  12. /// </summary>
  13. public static partial class HtmlTools
  14. {
  /// <summary>
  /// Sanitizer instance used by the sanitizing helpers of this class.
  /// Its allow-lists are adjusted once, by the static constructor below.
  /// </summary>
  private static readonly HtmlSanitizer Sanitizer = new HtmlSanitizer();

  /// <summary>
  /// Configures the sanitizer: removes form/frame related tags, the
  /// <c>id</c>/<c>alt</c> attributes and the <c>font-family</c>/<c>background-color</c>
  /// CSS properties from the allow-lists, and makes sure <c>src</c>, <c>class</c>
  /// and <c>style</c> attributes are allowed.
  /// </summary>
  static HtmlTools()
  {
      // Per the HtmlSanitizer documentation this keeps the children of removed elements.
      Sanitizer.KeepChildNodes = true;

      foreach (var tag in new[] { "input", "button", "iframe", "frame", "textarea", "select", "form" })
      {
          Sanitizer.AllowedTags.Remove(tag);
      }

      foreach (var attribute in new[] { "id", "alt" })
      {
          Sanitizer.AllowedAttributes.Remove(attribute);
      }

      foreach (var attribute in new[] { "src", "class", "style" })
      {
          Sanitizer.AllowedAttributes.Add(attribute);
      }

      foreach (var cssProperty in new[] { "font-family", "background-color" })
      {
          Sanitizer.AllowedCssProperties.Remove(cssProperty);
      }
  }
  /// <summary>
  /// Sanitizes HTML against XSS with the class-wide, statically configured sanitizer.
  /// </summary>
  /// <param name="html">The HTML to sanitize.</param>
  /// <returns>The sanitized HTML produced by the sanitizer.</returns>
  public static string HtmlSantinizerStandard(this string html) => Sanitizer.Sanitize(html);
  /// <summary>
  /// Sanitizes HTML against XSS with extra, caller-supplied restrictions applied
  /// on top of the class-wide sanitizer configuration.
  /// </summary>
  /// <remarks>
  /// The restrictions apply to this call only. They are applied to a private copy
  /// of the sanitizer, so they neither leak into later calls nor race with
  /// concurrent callers.
  /// </remarks>
  /// <param name="html">The source HTML.</param>
  /// <param name="labels">Tag names to disallow in addition to the defaults; may be null.</param>
  /// <param name="attributes">Attribute names to disallow in addition to the defaults; may be null.</param>
  /// <param name="styles">CSS property names to disallow in addition to the defaults; may be null.</param>
  /// <returns>The sanitized HTML.</returns>
  public static string HtmlSantinizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
  {
      var sanitizer = CreateIsolatedSanitizer();

      RemoveEntries(sanitizer.AllowedTags, labels);
      RemoveEntries(sanitizer.AllowedAttributes, attributes);
      RemoveEntries(sanitizer.AllowedCssProperties, styles);

      return sanitizer.Sanitize(html);
  }

  /// <summary>
  /// Builds a new sanitizer whose tag, attribute and CSS-property allow-lists
  /// mirror the class-wide sanitizer, so it can be customised without side effects.
  /// </summary>
  private static HtmlSanitizer CreateIsolatedSanitizer()
  {
      var copy = new HtmlSanitizer { KeepChildNodes = true };

      ReplaceEntries(copy.AllowedTags, Sanitizer.AllowedTags);
      ReplaceEntries(copy.AllowedAttributes, Sanitizer.AllowedAttributes);
      ReplaceEntries(copy.AllowedCssProperties, Sanitizer.AllowedCssProperties);

      return copy;
  }

  /// <summary>
  /// Replaces the contents of <paramref name="target"/> with those of <paramref name="source"/>.
  /// </summary>
  private static void ReplaceEntries(ICollection<string> target, IEnumerable<string> source)
  {
      target.Clear();
      foreach (var entry in source)
      {
          target.Add(entry);
      }
  }

  /// <summary>
  /// Removes every name in <paramref name="names"/> from <paramref name="target"/>; a null list is a no-op.
  /// </summary>
  private static void RemoveEntries(ICollection<string> target, string[] names)
  {
      if (names == null)
      {
          return;
      }

      foreach (var name in names)
      {
          target.Remove(name);
      }
  }
  /// <summary>
  /// Strips the HTML markup from a string and optionally truncates the remaining text.
  /// </summary>
  /// <param name="html">The source HTML; null or empty yields an empty string.</param>
  /// <param name="length">
  /// Maximum number of characters to return. Values less than or equal to zero
  /// disable truncation.
  /// </param>
  /// <returns>The document's inner text, cut to <paramref name="length"/> characters when requested.</returns>
  public static string RemoveHtmlTag(this string html, int length = 0)
  {
      // Guard first: a null input would otherwise be handed straight to LoadHtml.
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      var doc = new HtmlDocument();
      doc.LoadHtml(html);
      var text = doc.DocumentNode.InnerText;

      var mustTruncate = length > 0 && text.Length > length;
      return mustTruncate ? text.Substring(0, length) : text;
  }
  /// <summary>
  /// Cleans up redundant markup left behind when a Word document is converted to HTML.
  /// </summary>
  /// <remarks>
  /// Only the content between the <c>&lt;body&gt;</c> tags is kept; if the input has
  /// no matching body element the result is an empty string.
  /// </remarks>
  /// <param name="html">The HTML produced by the conversion.</param>
  /// <returns>The cleaned body content.</returns>
  public static string ClearHtml(this string html)
  {
      // Step 1: drop inline background-color and font-family declarations.
      var withoutFontAndBackground = Regex.Replace(html, @"background-color:#?\w{3,7}|font-family:'?[\w|\(|\)]*'?;?", string.Empty);

      // Step 2: keep only what sits inside <body>...</body> and remove &#xa0; entities.
      var bodyMatch = Regex.Match(withoutFontAndBackground, @"<body[^>]*>([\s\S]*)<\/body>");
      var cleaned = bodyMatch.Groups[1].Value.Replace("&#xa0;", string.Empty);

      // Step 3: remaining clean-up passes, applied in this exact order.
      var cleanupPatterns = new[]
      {
          @"\w+-?\w+:0\w+;?", // redundant zero-valued declarations
          "alt=\"(.+?)\"",    // alt attributes
          @"-aw.+?\s",        // "-aw" attributes generated by Word
      };
      foreach (var pattern in cleanupPatterns)
      {
          cleaned = Regex.Replace(cleaned, pattern, string.Empty);
      }

      return cleaned;
  }
  /// <summary>
  /// Prefixes the <c>src</c> of every <c>&lt;img src="</c> occurrence with a base
  /// location, turning relative image paths into absolute ones.
  /// </summary>
  /// <param name="html">The HTML to rewrite.</param>
  /// <param name="imgDest">Base location inserted, followed by "/", in front of each image path.</param>
  /// <returns>The HTML with rewritten image sources.</returns>
  public static string ReplaceHtmlImgSource(this string html, string imgDest)
  {
      const string imgSrcPrefix = "<img src=\"";
      var absolutePrefix = imgSrcPrefix + imgDest + "/";
      return html.Replace(imgSrcPrefix, absolutePrefix);
  }
  /// <summary>
  /// Converts absolute image sources into site-relative ones by removing the
  /// scheme and host from every <c>&lt;img src="http(s)://host/...</c> occurrence.
  /// </summary>
  /// <param name="s">The HTML to rewrite.</param>
  /// <returns>The HTML with image sources starting at "/".</returns>
  public static string ConvertImgSrcToRelativePath(this string s)
  {
      // "https?" covers both schemes. The host part is limited to characters other
      // than '/' and '"' so that a URL without a path cannot make the match run
      // past the end of the attribute and swallow the markup that follows.
      return Regex.Replace(s, @"<img src=""https?://[^/""]+/", @"<img src=""/");
  }
  /// <summary>
  /// Parses the HTML and returns all of its <c>img</c> elements.
  /// </summary>
  /// <param name="html">The HTML to parse.</param>
  /// <returns>The <c>img</c> descendants of the parsed document node.</returns>
  public static IEnumerable<HtmlNode> MatchImgTags(this string html)
  {
      var document = new HtmlDocument();
      document.LoadHtml(html);
      return document.DocumentNode.Descendants("img");
  }
  /// <summary>
  /// Returns the <c>src</c> attribute value of every <c>img</c> element in the HTML.
  /// Images without a <c>src</c> attribute are skipped.
  /// </summary>
  /// <param name="html">The HTML to inspect.</param>
  /// <returns>The image source values, in the order the images are enumerated.</returns>
  public static IEnumerable<string> MatchImgSrcs(this string html)
  {
      return from img in MatchImgTags(html)
             where img.Attributes.Contains("src")
             select img.Attributes["src"].Value;
  }
  /// <summary>
  /// Returns the <c>src</c> of the first <c>img</c> element that has one.
  /// </summary>
  /// <param name="html">The HTML to inspect.</param>
  /// <returns>The first image source, or null when there is none.</returns>
  public static string MatchFirstImgSrc(this string html) => MatchImgSrcs(html).FirstOrDefault();
  /// <summary>
  /// Shared generator for <see cref="MatchRandomImgSrc"/>. A new <see cref="Random"/>
  /// per call is time-seeded on .NET Framework, so calls made in quick succession
  /// would all pick the same index. <see cref="Random"/> is not thread-safe, hence
  /// every access is done under a lock on this instance.
  /// </summary>
  private static readonly Random ImgSrcRandom = new Random();

  /// <summary>
  /// Returns the <c>src</c> of a randomly chosen <c>img</c> element of the HTML.
  /// </summary>
  /// <param name="html">The HTML to inspect.</param>
  /// <returns>A uniformly chosen image source, or null when the HTML has no image source.</returns>
  public static string MatchRandomImgSrc(this string html)
  {
      var srcs = MatchImgSrcs(html).ToList();
      if (srcs.Count == 0)
      {
          return default;
      }

      int index;
      lock (ImgSrcRandom)
      {
          index = ImgSrcRandom.Next(srcs.Count);
      }

      return srcs[index];
  }
  /// <summary>
  /// Picks an image <c>src</c> from the HTML through a weighted selection in which
  /// earlier images receive larger weights than later ones.
  /// </summary>
  /// <param name="html">The HTML to inspect.</param>
  /// <returns>The selected image source, or null when the HTML has no image source.</returns>
  public static string MatchSeqRandomImgSrc(this string html)
  {
      var sources = MatchImgSrcs(html).ToList();
      if (sources.Count == 0)
      {
          return default;
      }

      // The first image gets weight Count, the last one weight 1.
      // NOTE(review): the selection itself is done by the WeightedItem() extension,
      // presumably a random pick proportional to weight - confirm against RandomSelector.
      var weighted = sources.Select((src, position) => new WeightedItem<string>(src, sources.Count - position));
      return weighted.WeightedItem();
  }
  /// <summary>
  /// Replaces line breaks ("\r\n" and "\n") with the HTML line break <c>&lt;br /&gt;</c>.
  /// </summary>
  /// <param name="str">The text to convert.</param>
  /// <returns>The text with every line break replaced.</returns>
  public static string StrFormat(this string str)
  {
      const string htmlBreak = "<br />";

      // Handle Windows line endings first so their "\n" is not replaced a second time.
      var withoutCrLf = str.Replace("\r\n", htmlBreak);
      return withoutCrLf.Replace("\n", htmlBreak);
  }
  /// <summary>
  /// Replaces the characters <c>,</c>, <c>'</c> and <c>;</c> with the placeholder
  /// tokens <c>&amp;def</c>, <c>&amp;dot</c> and <c>&amp;dec</c> respectively.
  /// </summary>
  /// <param name="strHtml">The text to encode; null or empty yields an empty string.</param>
  /// <returns>The encoded text.</returns>
  public static string EncodeHtml(this string strHtml)
  {
      // IsNullOrEmpty instead of a comparison with "": a null input used to slip
      // past the guard and fail with a NullReferenceException on Replace.
      if (string.IsNullOrEmpty(strHtml))
      {
          return "";
      }

      return strHtml.Replace(",", "&def").Replace("'", "&dot").Replace(";", "&dec");
  }
  194. }
  195. }