HtmlTools.cs 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. using Ganss.XSS;
  2. using HtmlAgilityPack;
  3. using System;
  4. using System.Collections.Generic;
  5. using System.Linq;
  6. using System.Text.RegularExpressions;
  7. namespace Masuit.Tools.Html
  8. {
  9. /// <summary>
  10. /// HTML utility class.
  11. /// </summary>
  12. public static partial class HtmlTools
  13. {
  /// <summary>
  /// Sanitizes HTML against XSS using an <see cref="HtmlSanitizer"/> whose default
  /// allow-lists have been narrowed.
  /// </summary>
  /// <remarks>
  /// Removed from the sanitizer's allow-lists: the <c>id</c> and <c>alt</c> attributes, the
  /// <c>font-family</c> and <c>background-color</c> CSS properties, and the tags
  /// <c>input</c>, <c>button</c>, <c>iframe</c>, <c>frame</c>, <c>textarea</c>, <c>select</c>
  /// and <c>form</c>. <c>KeepChildNodes</c> is set to <c>true</c>.
  /// </remarks>
  /// <param name="html">The HTML markup to sanitize.</param>
  /// <returns>The value returned by <c>HtmlSanitizer.Sanitize</c> for <paramref name="html"/>.</returns>
  public static string HtmlSantinizerStandard(this string html)
  {
      var cleaner = new HtmlSanitizer { KeepChildNodes = true };

      foreach (var attribute in new[] { "id", "alt" })
      {
          cleaner.AllowedAttributes.Remove(attribute);
      }

      foreach (var cssProperty in new[] { "font-family", "background-color" })
      {
          cleaner.AllowedCssProperties.Remove(cssProperty);
      }

      foreach (var tag in new[] { "input", "button", "iframe", "frame", "textarea", "select", "form" })
      {
          cleaner.AllowedTags.Remove(tag);
      }

      return cleaner.Sanitize(html);
  }
  /// <summary>
  /// Sanitizes HTML against XSS using an <see cref="HtmlSanitizer"/> whose allow-lists
  /// are narrowed by the caller.
  /// </summary>
  /// <param name="html">The source HTML markup.</param>
  /// <param name="labels">Tag names to remove from the allowed tags; <c>null</c> removes none.</param>
  /// <param name="attributes">Attribute names to remove from the allowed attributes; <c>null</c> removes none.</param>
  /// <param name="styles">CSS property names to remove from the allowed CSS properties; <c>null</c> removes none.</param>
  /// <returns>The value returned by <c>HtmlSanitizer.Sanitize</c> for <paramref name="html"/>.</returns>
  public static string HtmlSantinizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
  {
      var cleaner = new HtmlSanitizer { KeepChildNodes = true };

      // A null list is treated as "nothing to remove".
      foreach (var tag in labels ?? Enumerable.Empty<string>())
      {
          cleaner.AllowedTags.Remove(tag);
      }

      foreach (var attribute in attributes ?? Enumerable.Empty<string>())
      {
          cleaner.AllowedAttributes.Remove(attribute);
      }

      foreach (var cssProperty in styles ?? Enumerable.Empty<string>())
      {
          cleaner.AllowedCssProperties.Remove(cssProperty);
      }

      return cleaner.Sanitize(html);
  }
  /// <summary>
  /// Strips the markup from an HTML string and optionally truncates the remaining text.
  /// </summary>
  /// <param name="html">The source HTML markup.</param>
  /// <param name="length">
  /// Maximum number of characters to return. Values less than or equal to zero disable truncation.
  /// </param>
  /// <returns>
  /// The <c>InnerText</c> of the parsed document, cut to <paramref name="length"/> characters
  /// when <paramref name="length"/> is positive and the text is longer than that.
  /// </returns>
  public static string RemoveHtmlTag(this string html, int length = 0)
  {
      var document = new HtmlDocument();
      document.LoadHtml(html);
      var plainText = document.DocumentNode.InnerText;

      var shouldTruncate = length > 0 && plainText.Length > length;
      return shouldTruncate ? plainText.Substring(0, length) : plainText;
  }
  /// <summary>
  /// Cleans up redundant tags and attributes in HTML produced by converting a Word document.
  /// </summary>
  /// <remarks>
  /// Only the content between <c>&lt;body ...&gt;</c> and <c>&lt;/body&gt;</c> is kept; when the
  /// input contains no such element the result is an empty string.
  /// </remarks>
  /// <param name="html">The HTML markup to clean.</param>
  /// <returns>The cleaned inner markup of the body element.</returns>
  public static string ClearHtml(this string html)
  {
      // Strip background-color and font-family declarations from the whole input.
      var withoutFontAndBackground = Regex.Replace(html, @"background-color:#?\w{3,7}|font-family:'?[\w|\(|\)]*'?;?", string.Empty);

      // Keep only the body content (capture group 1) and drop the "&#xa0;" entities in it.
      var bodyMatch = Regex.Match(withoutFontAndBackground, @"<body[^>]*>([\s\S]*)<\/body>");
      var cleaned = bodyMatch.Groups[1].Value.Replace("&#xa0;", string.Empty);

      // Remove redundant zero-valued style declarations.
      cleaned = Regex.Replace(cleaned, @"\w+-?\w+:0\w+;?", string.Empty);

      // Remove alt attributes.
      cleaned = Regex.Replace(cleaned, "alt=\"(.+?)\"", string.Empty);

      // Remove the "-aw" attributes generated by Word.
      cleaned = Regex.Replace(cleaned, @"-aw.+?\s", string.Empty);

      return cleaned;
  }
  /// <summary>
  /// Prefixes the <c>src</c> of img tags with a base location.
  /// </summary>
  /// <remarks>
  /// Every literal occurrence of <c>&lt;img src="</c> is followed by <paramref name="imgDest"/>
  /// and a <c>/</c>; img tags whose first attribute is not <c>src</c> are left untouched.
  /// </remarks>
  /// <param name="html">The HTML markup to rewrite.</param>
  /// <param name="imgDest">The base location inserted in front of each image path.</param>
  /// <returns>The markup with the prefixed image sources.</returns>
  public static string ReplaceHtmlImgSource(this string html, string imgDest)
  {
      const string imgSrcOpening = "<img src=\"";
      return html.Replace(imgSrcOpening, $"{imgSrcOpening}{imgDest}/");
  }
  /// <summary>
  /// Converts absolute img <c>src</c> URLs into root-relative paths.
  /// </summary>
  /// <remarks>
  /// The scheme and host of every <c>&lt;img src="http://host/...</c> or
  /// <c>&lt;img src="https://host/...</c> is replaced by a single leading <c>/</c>.
  /// URLs without a path (no <c>/</c> after the host) are left unchanged, and img tags whose
  /// first attribute is not <c>src</c> are not touched.
  /// </remarks>
  /// <param name="s">The HTML markup to rewrite.</param>
  /// <returns>The markup with root-relative image sources.</returns>
  public static string ConvertImgSrcToRelativePath(this string s)
  {
      // The host part may contain neither '/' nor '"', so a match can never run past the
      // end of the src attribute and swallow the markup that follows it.
      return Regex.Replace(s, @"<img src=""https?:\/\/[^\/""]+\/", @"<img src=""/");
  }
  /// <summary>
  /// Finds all img elements in an HTML string.
  /// </summary>
  /// <param name="html">The HTML markup to search.</param>
  /// <returns>The img descendants of the parsed document node.</returns>
  public static IEnumerable<HtmlNode> MatchImgTags(this string html)
  {
      var document = new HtmlDocument();
      document.LoadHtml(html);
      return document.DocumentNode.Descendants("img");
  }
  /// <summary>
  /// Collects the <c>src</c> attribute values of all img elements in an HTML string.
  /// </summary>
  /// <param name="html">The HTML markup to search.</param>
  /// <returns>The <c>src</c> values of the img elements that have a <c>src</c> attribute.</returns>
  public static IEnumerable<string> MatchImgSrcs(this string html)
  {
      return from imgNode in html.MatchImgTags()
             where imgNode.Attributes.Contains("src")
             select imgNode.Attributes["src"].Value;
  }
  /// <summary>
  /// Gets the <c>src</c> of the first img element in an HTML string.
  /// </summary>
  /// <param name="html">The HTML markup to search.</param>
  /// <returns>The first <c>src</c> value, or <c>null</c> when there is none.</returns>
  public static string MatchFirstImgSrc(this string html) => html.MatchImgSrcs().FirstOrDefault();
  /// <summary>
  /// Generator shared by <see cref="MatchRandomImgSrc"/>. Creating a new <see cref="Random"/>
  /// per call can yield identical values for calls made in quick succession on runtimes that
  /// seed from the system clock. Access is serialized because <see cref="Random"/> instances
  /// are not thread-safe.
  /// </summary>
  private static readonly Random ImgSrcRandom = new Random();

  /// <summary>
  /// Picks the <c>src</c> of a randomly chosen img element in an HTML string.
  /// </summary>
  /// <param name="html">The HTML markup to search.</param>
  /// <returns>
  /// One of the img <c>src</c> values, or <c>null</c> when the markup contains no img element
  /// with a <c>src</c> attribute.
  /// </returns>
  public static string MatchRandomImgSrc(this string html)
  {
      // Materialize once: MatchImgSrcs is lazily evaluated and re-parses the HTML on every enumeration.
      var sources = MatchImgSrcs(html).ToList();
      if (sources.Count == 0)
      {
          return null;
      }

      int index;
      lock (ImgSrcRandom)
      {
          index = ImgSrcRandom.Next(sources.Count);
      }

      return sources[index];
  }
  /// <summary>
  /// Replaces line breaks with HTML line-break tags.
  /// </summary>
  /// <remarks>
  /// Both <c>"\r\n"</c> and <c>"\n"</c> become <c>&lt;br /&gt;</c>; a lone <c>"\r"</c> is left as is.
  /// </remarks>
  /// <param name="str">The text to convert.</param>
  /// <returns>The text with its line breaks replaced.</returns>
  public static string StrFormat(this string str)
  {
      const string htmlLineBreak = "<br />";

      // CRLF pairs are handled first so that they produce one tag rather than "\r<br />".
      return str.Replace("\r\n", htmlLineBreak).Replace("\n", htmlLineBreak);
  }
  /// <summary>
  /// Replaces the characters <c>,</c> <c>'</c> and <c>;</c> with placeholder tokens.
  /// </summary>
  /// <remarks>
  /// <c>,</c> becomes <c>&amp;def</c>, <c>'</c> becomes <c>&amp;dot</c> and <c>;</c> becomes <c>&amp;dec</c>.
  /// </remarks>
  /// <param name="strHtml">The text to encode; may be <c>null</c>.</param>
  /// <returns>The encoded text, or an empty string when the input is <c>null</c> or empty.</returns>
  public static string EncodeHtml(this string strHtml)
  {
      // Treat null like empty input instead of failing with a NullReferenceException.
      if (string.IsNullOrEmpty(strHtml))
      {
          return "";
      }

      return strHtml
          .Replace(",", "&def")
          .Replace("'", "&dot")
          .Replace(";", "&dec");
  }
  184. }
  185. }