// HtmlSanitizer.cs
  1. using AngleSharp;
  2. using AngleSharp.Css;
  3. using AngleSharp.Css.Dom;
  4. using AngleSharp.Css.Parser;
  5. using AngleSharp.Dom;
  6. using AngleSharp.Html.Dom;
  7. using AngleSharp.Html.Parser;
  8. using System;
  9. using System.Collections.Generic;
  10. using System.Globalization;
  11. using System.IO;
  12. using System.Linq;
  13. using System.Text;
  14. using System.Text.RegularExpressions;
  15. namespace Ganss.Xss
  16. {
  17. /// <summary>
  18. /// Cleans HTML documents and fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
  19. /// </summary>
  20. /// <remarks>
  21. /// XSS attacks can occur at several levels within an HTML document or fragment:
  22. /// <list type="bullet">
  23. /// <item>HTML tags (e.g. the &lt;script&gt; tag)</item>
  24. /// <item>HTML attributes (e.g. the "onload" attribute)</item>
  25. /// <item>CSS styles (url property values)</item>
  26. /// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
  27. /// </list>
  28. /// <para>
  29. /// The HtmlSanitizer class addresses all of these possible attack vectors by using a sophisticated HTML parser (<a href="https://github.com/AngleSharp/AngleSharp">AngleSharp</a>).
  30. /// </para>
  31. /// <para>
  32. /// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
  33. /// <list type="bullet">
  34. /// <item>You can specify the allowed HTML tags through the property <see cref="AllowedTags"/>. All other tags will be stripped.</item>
  35. /// <item>You can specify the allowed HTML attributes through the property <see cref="AllowedAttributes"/>. All other attributes will be stripped.</item>
  36. /// <item>You can specify the allowed CSS property names through the property <see cref="AllowedCssProperties"/>. All other styles will be stripped.</item>
  37. /// <item>You can specify the allowed URI schemes through the property <see cref="AllowedSchemes"/>. All other URIs will be stripped.</item>
  38. /// <item>You can specify the HTML attributes that contain URIs (such as "src", "href" etc.) through the property <see cref="UriAttributes"/>.</item>
  39. /// </list>
  40. /// </para>
  41. /// </remarks>
  42. /// <example>
  43. /// <code>
  44. /// <![CDATA[
  45. /// var sanitizer = new HtmlSanitizer();
  46. /// var html = @"<script>alert('xss')</script><div onload=""alert('xss')"" style=""background-color: test"">Test<img src=""test.gif"" style=""background-image: url(javascript:alert('xss')); margin: 10px""></div>";
  47. /// var sanitized = sanitizer.Sanitize(html, "http://www.example.com");
  48. /// // -> "<div style="background-color: test">Test<img style="margin: 10px" src="http://www.example.com/test.gif"></div>"
  49. /// ]]>
  50. /// </code>
  51. /// </example>
  52. public class HtmlSanitizer : IHtmlSanitizer
  53. {
  // Pattern from http://genshi.edgewall.org/
  // Matches a CSS escape: a backslash followed by 1-6 hex digits (and at most one trailing
  // whitespace character), or a backslash followed by a single non-hex, non-special character.
  private static readonly Regex CssUnicodeEscapes = new Regex(@"\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'""{};:()#*])", RegexOptions.Compiled);

  // Matches a /* ... */ comment (non-greedy). No Singleline option is set, so '.' does not cross newlines.
  private static readonly Regex CssComments = new Regex(@"/\*.*?\*/", RegexOptions.Compiled);

  // IE6 <http://heideri.ch/jso/#80>
  // Matches the word "expression" spelled with ASCII letters of either case, fullwidth letters,
  // or the small-capital letter forms listed in the character classes.
  private static readonly Regex CssExpression = new Regex(@"[eE\uFF25\uFF45][xX\uFF38\uFF58][pP\uFF30\uFF50][rR\u0280\uFF32\uFF52][eE\uFF25\uFF45][sS\uFF33\uFF53]{2}[iI\u026A\uFF29\uFF49][oO\uFF2F\uFF4F][nN\u0274\uFF2E\uFF4E]", RegexOptions.Compiled);

  // Matches "url(" with an optional quote (group 1), the URL text (group 2) and an optional quote (group 3).
  private static readonly Regex CssUrl = new Regex(@"[Uu][Rr\u0280][Ll\u029F]\((['""]?)([^'"")]+)(['""]?)", RegexOptions.Compiled);

  // Matches any run of whitespace (including the empty run); used to strip whitespace from CSS values.
  private static readonly Regex WhitespaceRegex = new Regex(@"\s*", RegexOptions.Compiled);

  // AngleSharp configuration with CSS support that keeps unknown declarations and rules
  // and tolerates invalid selectors.
  private static readonly IConfiguration defaultConfiguration = Configuration.Default.WithCss(
      new CssParserOptions
      {
          IsIncludingUnknownDeclarations = true,
          IsIncludingUnknownRules = true,
          IsToleratingInvalidSelectors = true,
      });

  // Shared parser instance returned by the default parser factory; scripting is enabled in its options.
  private static readonly HtmlParser defaultHtmlParser = new HtmlParser(
      new HtmlParserOptions { IsScripting = true },
      BrowsingContext.New(defaultConfiguration));
  /// <summary>
  /// Creates a sanitizer whose allow-lists are copied from <see cref="HtmlSanitizerDefaults"/>.
  /// </summary>
  /// <remarks>
  /// Every set is a fresh copy, so changing it does not affect the defaults. All string sets
  /// except <see cref="AllowedClasses"/> compare case-insensitively (ordinal).
  /// </remarks>
  public HtmlSanitizer()
  {
      var ignoreCase = StringComparer.OrdinalIgnoreCase;

      AllowedTags = new HashSet<string>(HtmlSanitizerDefaults.AllowedTags, ignoreCase);
      AllowedSchemes = new HashSet<string>(HtmlSanitizerDefaults.AllowedSchemes, ignoreCase);
      AllowedAttributes = new HashSet<string>(HtmlSanitizerDefaults.AllowedAttributes, ignoreCase);
      UriAttributes = new HashSet<string>(HtmlSanitizerDefaults.UriAttributes, ignoreCase);
      AllowedCssProperties = new HashSet<string>(HtmlSanitizerDefaults.AllowedCssProperties, ignoreCase);

      // These two sets use the default equality comparer.
      AllowedAtRules = new HashSet<CssRuleType>(HtmlSanitizerDefaults.AllowedAtRules);
      AllowedClasses = new HashSet<string>(HtmlSanitizerDefaults.AllowedClasses);
  }
  /// <summary>
  /// Initializes a new instance of the <see cref="HtmlSanitizer"/> class
  /// with the given options.
  /// </summary>
  /// <remarks>
  /// Each allow-list in <paramref name="options"/> is copied into a new set, so later changes to
  /// the options object do not affect this sanitizer. All string sets compare case-insensitively (ordinal).
  /// </remarks>
  /// <param name="options">Options to control the sanitizing.</param>
  /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
  public HtmlSanitizer(HtmlSanitizerOptions options)
  {
      // Fail with a descriptive exception instead of a NullReferenceException on the first dereference.
      if (options is null)
      {
          throw new ArgumentNullException(nameof(options));
      }

      AllowedTags = new HashSet<string>(options.AllowedTags, StringComparer.OrdinalIgnoreCase);
      AllowedSchemes = new HashSet<string>(options.AllowedSchemes, StringComparer.OrdinalIgnoreCase);
      AllowedAttributes = new HashSet<string>(options.AllowedAttributes, StringComparer.OrdinalIgnoreCase);
      UriAttributes = new HashSet<string>(options.UriAttributes, StringComparer.OrdinalIgnoreCase);
      AllowedClasses = new HashSet<string>(options.AllowedCssClasses, StringComparer.OrdinalIgnoreCase);
      AllowedCssProperties = new HashSet<string>(options.AllowedCssProperties, StringComparer.OrdinalIgnoreCase);
      AllowedAtRules = new HashSet<CssRuleType>(options.AllowedAtRules);
  }
  /// <summary>
  /// Gets or sets the action applied to every comment node before the
  /// <see cref="RemovingComment"/> event is raised for it. The default action replaces
  /// <c>&lt;</c> and <c>&gt;</c> in the comment text with HTML entities.
  /// </summary>
  public Action<IComment> EncodeComment { get; set; } = DefaultEncodeComment;

  /// <summary>
  /// Gets or sets the action applied to non-style elements with literal text content whose
  /// inner HTML is not blank. The default action replaces <c>&lt;</c> and <c>&gt;</c> with HTML entities.
  /// </summary>
  public Action<IElement> EncodeLiteralTextElementContent { get; set; } = DefaultEncodeLiteralTextElementContent;

  /// <summary>
  /// Gets or sets the value new instances use for <see cref="KeepChildNodes"/>. Default is false.
  /// </summary>
  public static bool DefaultKeepChildNodes { get; set; }

  /// <summary>
  /// Gets or sets a value indicating whether to keep child nodes of elements that are removed.
  /// Initialized from <see cref="DefaultKeepChildNodes"/> when the instance is created.
  /// </summary>
  public bool KeepChildNodes { get; set; } = DefaultKeepChildNodes;

  /// <summary>
  /// Gets or sets the factory new instances use for <see cref="HtmlParserFactory"/>.
  /// The initial factory returns a single shared parser instance.
  /// </summary>
  public static Func<HtmlParser> DefaultHtmlParserFactory { get; set; } = () => defaultHtmlParser;

  /// <summary>
  /// Gets or sets the factory that creates the parser used for parsing the input.
  /// Initialized from <see cref="DefaultHtmlParserFactory"/> when the instance is created.
  /// </summary>
  public Func<HtmlParser> HtmlParserFactory { get; set; } = DefaultHtmlParserFactory;

  /// <summary>
  /// Gets or sets the formatter new instances use for <see cref="OutputFormatter"/>.
  /// Default is <see cref="HtmlFormatter.Instance"/>.
  /// </summary>
  public static IMarkupFormatter DefaultOutputFormatter { get; set; } = HtmlFormatter.Instance;

  /// <summary>
  /// Gets or sets the <see cref="IMarkupFormatter"/> used for generating output when no formatter
  /// is passed to the sanitize call. Initialized from <see cref="DefaultOutputFormatter"/>.
  /// </summary>
  public IMarkupFormatter OutputFormatter { get; set; } = DefaultOutputFormatter;

  /// <summary>
  /// Gets or sets the formatter new instances use for <see cref="StyleFormatter"/>.
  /// Default is <see cref="CssStyleFormatter.Instance"/>.
  /// </summary>
  public static IStyleFormatter DefaultStyleFormatter { get; set; } = CssStyleFormatter.Instance;

  /// <summary>
  /// Gets or sets the <see cref="IStyleFormatter"/> used for generating CSS output.
  /// Initialized from <see cref="DefaultStyleFormatter"/> when the instance is created.
  /// </summary>
  public IStyleFormatter StyleFormatter { get; set; } = DefaultStyleFormatter;
  /// <summary>
  /// Gets the allowed CSS at-rules such as "@media" and "@font-face".
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The allowed CSS at-rules.</value>
  public ISet<CssRuleType> AllowedAtRules { get; private set; }

  /// <summary>
  /// Gets the allowed URI schemes such as "http" and "https".
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The allowed URI schemes.</value>
  public ISet<string> AllowedSchemes { get; private set; }

  /// <summary>
  /// Gets the allowed HTML tag names such as "a" and "div".
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The allowed tag names.</value>
  public ISet<string> AllowedTags { get; private set; }

  /// <summary>
  /// Gets the allowed HTML attributes such as "href" and "alt".
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The allowed HTML attributes.</value>
  public ISet<string> AllowedAttributes { get; private set; }

  /// <summary>
  /// Gets or sets a value indicating whether every attribute whose name starts with
  /// <c>data-</c> (HTML5 data attributes) is allowed, regardless of <see cref="AllowedAttributes"/>.
  /// </summary>
  public bool AllowDataAttributes { get; set; }

  /// <summary>
  /// Gets the HTML attributes that can contain a URI such as "href".
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The URI attributes.</value>
  public ISet<string> UriAttributes { get; private set; }

  /// <summary>
  /// Gets the allowed CSS properties such as "font" and "margin".
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The allowed CSS properties.</value>
  public ISet<string> AllowedCssProperties { get; private set; }

  /// <summary>
  /// Gets or sets a regex that must not match for legal CSS property values.
  /// Defaults to <see cref="DefaultDisallowedCssPropertyValue"/>.
  /// </summary>
  /// <value>The regex.</value>
  public Regex DisallowCssPropertyValue { get; set; } = DefaultDisallowedCssPropertyValue;

  /// <summary>
  /// Gets the allowed CSS classes. If the set is empty, all classes will be allowed.
  /// The set itself is mutable; only the property setter is private.
  /// </summary>
  /// <value>The allowed CSS classes. An empty set means all classes are allowed.</value>
  public ISet<string> AllowedClasses { get; private set; }
  /// <summary>
  /// Raised once per sanitize run, after the document has been sanitized and per-node post processing is done.
  /// </summary>
  public event EventHandler<PostProcessDomEventArgs>? PostProcessDom;

  /// <summary>
  /// Raised for every node after sanitizing. Handlers may supply replacement nodes through the event arguments.
  /// </summary>
  public event EventHandler<PostProcessNodeEventArgs>? PostProcessNode;

  /// <summary>
  /// Raised before a tag is removed.
  /// </summary>
  public event EventHandler<RemovingTagEventArgs>? RemovingTag;

  /// <summary>
  /// Raised before an attribute is removed.
  /// </summary>
  public event EventHandler<RemovingAttributeEventArgs>? RemovingAttribute;

  /// <summary>
  /// Raised before a style is removed.
  /// </summary>
  public event EventHandler<RemovingStyleEventArgs>? RemovingStyle;

  /// <summary>
  /// Raised before an at-rule is removed.
  /// </summary>
  public event EventHandler<RemovingAtRuleEventArgs>? RemovingAtRule;

  /// <summary>
  /// Raised before a comment is removed. Setting <c>Cancel</c> on the event arguments keeps the comment.
  /// </summary>
  public event EventHandler<RemovingCommentEventArgs>? RemovingComment;

  /// <summary>
  /// Raised before a CSS class is removed.
  /// </summary>
  public event EventHandler<RemovingCssClassEventArgs>? RemovingCssClass;

  /// <summary>
  /// Raised when a URL is being sanitized.
  /// </summary>
  public event EventHandler<FilterUrlEventArgs>? FilterUrl;
  /// <summary>
  /// Invokes the handlers attached to <see cref="PostProcessDom"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="PostProcessDomEventArgs"/> passed to the handlers.</param>
  protected virtual void OnPostProcessDom(PostProcessDomEventArgs e) => PostProcessDom?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="PostProcessNode"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="PostProcessNodeEventArgs"/> passed to the handlers.</param>
  protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e) => PostProcessNode?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="RemovingTag"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingTagEventArgs"/> passed to the handlers.</param>
  protected virtual void OnRemovingTag(RemovingTagEventArgs e) => RemovingTag?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="RemovingAttribute"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingAttributeEventArgs"/> passed to the handlers.</param>
  protected virtual void OnRemovingAttribute(RemovingAttributeEventArgs e) => RemovingAttribute?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="RemovingStyle"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingStyleEventArgs"/> passed to the handlers.</param>
  protected virtual void OnRemovingStyle(RemovingStyleEventArgs e) => RemovingStyle?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="RemovingAtRule"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingAtRuleEventArgs"/> passed to the handlers.</param>
  protected virtual void OnRemovingAtRule(RemovingAtRuleEventArgs e) => RemovingAtRule?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="RemovingComment"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingCommentEventArgs"/> passed to the handlers.</param>
  protected virtual void OnRemovingComment(RemovingCommentEventArgs e) => RemovingComment?.Invoke(this, e);
  /// <summary>
  /// The default regex for disallowed CSS property values: matches any value containing
  /// <c>&lt;</c> or <c>&gt;</c>.
  /// </summary>
  public static readonly Regex DefaultDisallowedCssPropertyValue = new Regex(@"[<>]", RegexOptions.Compiled);
  /// <summary>
  /// Invokes the handlers attached to <see cref="RemovingCssClass"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingCssClassEventArgs"/> passed to the handlers.</param>
  protected virtual void OnRemovingCssClass(RemovingCssClassEventArgs e) => RemovingCssClass?.Invoke(this, e);
  /// <summary>
  /// Invokes the handlers attached to <see cref="FilterUrl"/>, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="FilterUrlEventArgs"/> passed to the handlers.</param>
  protected virtual void OnFilteringUrl(FilterUrlEventArgs e) => FilterUrl?.Invoke(this, e);
  /// <summary>
  /// Lazily enumerates every descendant of <paramref name="dom"/> in DOM (pre-order) order.
  /// The root node itself is not returned.
  /// </summary>
  /// <remarks>
  /// Uses an explicit stack rather than recursion. A node's children are read only after the
  /// node has been yielded and the enumeration resumes.
  /// </remarks>
  /// <param name="dom">The root node.</param>
  /// <returns>All nested subnodes.</returns>
  private static IEnumerable<INode> GetAllNodes(INode dom)
  {
      var pending = new Stack<INode>();
      PushChildrenInReverse(dom);

      while (pending.Count > 0)
      {
          var current = pending.Pop();
          yield return current;
          PushChildrenInReverse(current);
      }

      // Pushing last-to-first makes the first child the next node popped.
      void PushChildrenInReverse(INode parent)
      {
          for (var index = parent.ChildNodes.Length - 1; index >= 0; index--)
          {
              pending.Push(parent.ChildNodes[index]);
          }
      }
  }
  /// <summary>
  /// Sanitizes an HTML body fragment and returns the serialized children of the resulting body.
  /// If a full document is given, only the body part is returned.
  /// </summary>
  /// <param name="html">The HTML body fragment to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
  /// <returns>The sanitized HTML body fragment, or an empty string if the parsed document has no body.</returns>
  public string Sanitize(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
  {
      using var dom = SanitizeDom(html, baseUrl);

      var body = dom.Body;
      return body == null
          ? string.Empty
          : body.ChildNodes.ToHtml(outputFormatter ?? OutputFormatter);
  }
  /// <summary>
  /// Parses an HTML body fragment into a document and sanitizes its body.
  /// </summary>
  /// <remarks>
  /// The input is prefixed with <c>&lt;!doctype html&gt;&lt;html&gt;&lt;body&gt;</c> before parsing.
  /// The caller owns the returned document.
  /// </remarks>
  /// <param name="html">The HTML body fragment to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <returns>The sanitized HTML document.</returns>
  public IHtmlDocument SanitizeDom(string html, string baseUrl = "")
  {
      var document = HtmlParserFactory().ParseDocument("<!doctype html><html><body>" + html);

      var body = document.Body;
      if (body != null)
      {
          DoSanitize(document, body, baseUrl);
      }

      return document;
  }
  /// <summary>
  /// Sanitizes an already parsed HTML document in place.
  /// If the document has not been parsed with CSS support then all styles will be removed.
  /// </summary>
  /// <param name="document">The parsed HTML document.</param>
  /// <param name="context">The node within which to sanitize; the whole document is used when null.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <returns>The same <paramref name="document"/> instance, sanitized.</returns>
  public IHtmlDocument SanitizeDom(IHtmlDocument document, IHtmlElement? context = null, string baseUrl = "")
  {
      IParentNode root = document;
      if (context != null)
      {
          root = context;
      }

      DoSanitize(document, root, baseUrl);
      return document;
  }
  /// <summary>
  /// Parses the given HTML as a complete document, sanitizes it and serializes the whole document.
  /// Even if only a fragment is given, a whole document is returned.
  /// </summary>
  /// <param name="html">The HTML document to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
  /// <returns>The sanitized HTML document.</returns>
  public string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
  {
      using var document = HtmlParserFactory().ParseDocument(html);
      DoSanitize(document, document, baseUrl);
      return document.ToHtml(outputFormatter ?? OutputFormatter);
  }
  /// <summary>
  /// Parses the given stream as a complete HTML document, sanitizes it and serializes the whole document.
  /// Even if only a fragment is given, a whole document is returned.
  /// </summary>
  /// <param name="html">A stream containing the HTML document to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
  /// <returns>The sanitized HTML document.</returns>
  public string SanitizeDocument(Stream html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
  {
      using var document = HtmlParserFactory().ParseDocument(html);
      DoSanitize(document, document, baseUrl);
      return document.ToHtml(outputFormatter ?? OutputFormatter);
  }
  /// <summary>
  /// Encodes and then removes every comment node below <paramref name="context"/>.
  /// </summary>
  /// <remarks>
  /// Each comment is first passed to <see cref="EncodeComment"/>, then announced through
  /// <see cref="RemovingComment"/>. A comment whose removal is cancelled by a handler stays
  /// in the tree in its encoded form.
  /// </remarks>
  /// <param name="context">The node within which to remove comments.</param>
  private void RemoveComments(INode context)
  {
      // Snapshot first: nodes are removed from the tree while we iterate.
      var comments = GetAllNodes(context).OfType<IComment>().ToList();

      foreach (var comment in comments)
      {
          EncodeComment(comment);

          var args = new RemovingCommentEventArgs(comment);
          OnRemovingComment(args);
          if (args.Cancel)
          {
              continue;
          }

          comment.Remove();
      }
  }
  /// <summary>
  /// Default comment encoder: replaces <c>&lt;</c> and <c>&gt;</c> in the comment text with
  /// <c>&amp;lt;</c> and <c>&amp;gt;</c>. The text is only written back when it actually changed.
  /// </summary>
  /// <param name="comment">The comment node to encode in place.</param>
  private static void DefaultEncodeComment(IComment comment)
  {
      var encoded = comment.TextContent
          .Replace("<", "&lt;")
          .Replace(">", "&gt;");

      if (encoded != comment.TextContent)
      {
          comment.TextContent = encoded;
      }
  }
  /// <summary>
  /// Default literal-text encoder: replaces <c>&lt;</c> and <c>&gt;</c> in the element's inner HTML
  /// with <c>&amp;lt;</c> and <c>&amp;gt;</c>.
  /// </summary>
  /// <param name="tag">The element whose content is encoded in place.</param>
  private static void DefaultEncodeLiteralTextElementContent(IElement tag)
  {
      var encoded = tag.InnerHtml
          .Replace("<", "&lt;")
          .Replace(">", "&gt;");

      if (encoded != tag.InnerHtml)
      {
          tag.InnerHtml = encoded;
      }

      // setting InnerHtml does not work for noscript, so fall back to setting the text
      if (tag.InnerHtml != encoded)
      {
          tag.SetInnerText(encoded);
      }
  }
  /// <summary>
  /// Runs the full sanitizing pipeline on the elements below <paramref name="context"/>.
  /// </summary>
  /// <remarks>
  /// Steps, in order: remove disallowed tags, encode literal text content, sanitize the document's
  /// style sheets, clean the attributes of every remaining element, remove comments, and finally
  /// raise the post-processing events.
  /// </remarks>
  /// <param name="dom">The document being sanitized.</param>
  /// <param name="context">The node whose descendants are sanitized.</param>
  /// <param name="baseUrl">The base URL passed on to URL and style sanitizing.</param>
  private void DoSanitize(IHtmlDocument dom, IParentNode context, string baseUrl = "")
  {
      // remove disallowed tags (snapshot, since removal mutates the tree)
      var disallowedTags = context.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList();
      foreach (var disallowed in disallowedTags)
      {
          RemoveTag(disallowed, RemoveReason.NotAllowedTag);
      }

      // always encode text in raw data content
      var literalTextElements = context.QuerySelectorAll("*")
          .Where(t => t is not IHtmlStyleElement
              && t.Flags.HasFlag(NodeFlags.LiteralText)
              && !string.IsNullOrWhiteSpace(t.InnerHtml));
      foreach (var literal in literalTextElements)
      {
          EncodeLiteralTextElementContent(literal);
      }

      SanitizeStyleSheets(dom, baseUrl);

      // cleanup attributes
      foreach (var element in context.QuerySelectorAll("*").ToList())
      {
          CleanAttributes(element);
      }

      if (context is INode contextNode)
      {
          RemoveComments(contextNode);
      }

      DoPostProcess(dom, context as INode);

      // Removes or rewrites the attributes of a single element.
      void CleanAttributes(IElement tag)
      {
          // remove disallowed attributes
          foreach (var attr in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
          {
              RemoveAttribute(tag, attr, RemoveReason.NotAllowedAttribute);
          }

          // sanitize URLs in URL-marked attributes
          foreach (var attr in tag.Attributes.Where(IsUriAttribute).ToList())
          {
              var sanitizedUrl = SanitizeUrl(tag, attr.Value, baseUrl);
              if (sanitizedUrl != null)
              {
                  tag.SetAttribute(attr.Name, sanitizedUrl);
              }
              else
              {
                  RemoveAttribute(tag, attr, RemoveReason.NotAllowedUrlValue);
              }
          }

          // sanitize the style attribute; remember whether it had content beforehand
          var hadStyle = !string.IsNullOrEmpty(tag.GetAttribute("style"));
          SanitizeStyle(tag, baseUrl);

          // sanitize the value of the attributes
          foreach (var attr in tag.Attributes.ToList())
          {
              // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
              // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
              if (attr.Value.Contains("&{"))
              {
                  RemoveAttribute(tag, attr, RemoveReason.NotAllowedValue);
                  continue;
              }

              if (AllowedClasses.Any() && attr.Name == "class")
              {
                  // an empty allow-list means "allow all", so filtering only happens when it has entries
                  foreach (var cssClass in tag.ClassList.Except(AllowedClasses).ToArray())
                  {
                      RemoveCssClass(tag, cssClass, RemoveReason.NotAllowedCssClass);
                  }

                  if (!tag.ClassList.Any())
                  {
                      RemoveAttribute(tag, attr, RemoveReason.ClassAttributeEmpty);
                  }
              }
              else if (hadStyle && attr.Name == "style" && string.IsNullOrEmpty(attr.Value))
              {
                  // the style attribute only became empty through sanitizing
                  RemoveAttribute(tag, attr, RemoveReason.StyleAttributeEmpty);
              }
          }
      }
  }
  /// <summary>
  /// Sanitizes every CSS style sheet of the document and writes the result back into the owning element.
  /// </summary>
  /// <remarks>
  /// A top-level rule is removed when <see cref="SanitizeStyleRule"/> rejects it and
  /// <c>RemoveAtRule</c> returns true. In the serialized CSS every <c>&lt;</c> is replaced by
  /// the CSS escape <c>\3c </c>.
  /// </remarks>
  /// <param name="dom">The document whose style sheets are sanitized.</param>
  /// <param name="baseUrl">The base URL passed on to style sanitizing.</param>
  private void SanitizeStyleSheets(IHtmlDocument dom, string baseUrl)
  {
      foreach (var styleSheet in dom.StyleSheets.OfType<ICssStyleSheet>())
      {
          var styleTag = styleSheet.OwnerNode;

          var index = 0;
          while (index < styleSheet.Rules.Length)
          {
              var rule = styleSheet.Rules[index];
              var keepRule = SanitizeStyleRule(rule, styleTag, baseUrl) || !RemoveAtRule(styleTag, rule);

              if (keepRule)
              {
                  index++;
              }
              else
              {
                  // do not advance: the next rule moves into this slot
                  styleSheet.RemoveAt(index);
              }
          }

          styleTag.InnerHtml = styleSheet.ToCss(StyleFormatter).Replace("<", "\\3c ");
      }
  }
  /// <summary>
  /// Sanitizes a single CSS rule, recursing into grouping and keyframes rules.
  /// </summary>
  /// <param name="rule">The rule to sanitize.</param>
  /// <param name="styleTag">The element that owns the style sheet.</param>
  /// <param name="baseUrl">The base URL passed on to declaration sanitizing.</param>
  /// <returns>
  /// <c>false</c> if the rule's type is not contained in <see cref="AllowedAtRules"/>;
  /// otherwise <c>true</c>, after its declarations or child rules have been sanitized.
  /// </returns>
  private bool SanitizeStyleRule(ICssRule rule, IElement styleTag, string baseUrl)
  {
      if (!AllowedAtRules.Contains(rule.Type))
      {
          return false;
      }

      if (rule is ICssStyleRule styleRule)
      {
          SanitizeStyleDeclaration(styleTag, styleRule.Style, baseUrl);
      }
      else if (rule is ICssGroupingRule groupingRule)
      {
          var index = 0;
          while (index < groupingRule.Rules.Length)
          {
              var nested = groupingRule.Rules[index];
              var keepNested = SanitizeStyleRule(nested, styleTag, baseUrl) || !RemoveAtRule(styleTag, nested);

              if (keepNested)
              {
                  index++;
              }
              else
              {
                  // do not advance: the next rule moves into this slot
                  groupingRule.RemoveAt(index);
              }
          }
      }
      else if (rule is ICssPageRule pageRule)
      {
          SanitizeStyleDeclaration(styleTag, pageRule.Style, baseUrl);
      }
      else if (rule is ICssKeyframesRule keyFramesRule)
      {
          // snapshot, since keyframes are removed by key text while iterating
          var frames = keyFramesRule.Rules.OfType<ICssKeyframeRule>().ToList();
          foreach (var frame in frames)
          {
              if (!SanitizeStyleRule(frame, styleTag, baseUrl) && RemoveAtRule(styleTag, frame))
              {
                  keyFramesRule.Remove(frame.KeyText);
              }
          }
      }
      else if (rule is ICssKeyframeRule keyFrameRule)
      {
          SanitizeStyleDeclaration(styleTag, keyFrameRule.Style, baseUrl);
      }

      return true;
  }
  /// <summary>
  /// Raises the post-processing events after sanitizing.
  /// </summary>
  /// <remarks>
  /// When <see cref="PostProcessNode"/> has subscribers, the document is normalized and the event
  /// is raised for every node below <paramref name="context"/>. A node is replaced when a handler
  /// supplied replacement nodes. Afterwards <see cref="PostProcessDom"/> is raised once if it has subscribers.
  /// </remarks>
  /// <param name="dom">The HTML document.</param>
  /// <param name="context">The node within which to post process all nodes; per-node processing is skipped when null.</param>
  private void DoPostProcess(IHtmlDocument dom, INode? context)
  {
      if (PostProcessNode != null)
      {
          dom.Normalize();

          if (context != null)
          {
              // snapshot, since handlers may replace nodes while we iterate
              foreach (var node in GetAllNodes(context).ToList())
              {
                  var nodeArgs = new PostProcessNodeEventArgs(dom, node);
                  OnPostProcessNode(nodeArgs);

                  if (!nodeArgs.ReplacementNodes.Any())
                  {
                      continue;
                  }

                  ((IChildNode)node).Replace([.. nodeArgs.ReplacementNodes]);
              }
          }
      }

      if (PostProcessDom == null)
      {
          return;
      }

      OnPostProcessDom(new PostProcessDomEventArgs(dom));
  }
  /// <summary>
  /// Checks whether the attribute's name is listed in <see cref="UriAttributes"/>.
  /// </summary>
  /// <param name="attribute">The attribute.</param>
  /// <returns><c>true</c> if the attribute can contain a URI; otherwise, <c>false</c>.</returns>
  private bool IsUriAttribute(IAttr attribute) => UriAttributes.Contains(attribute.Name);
  /// <summary>
  /// Checks whether the element's node name is listed in <see cref="AllowedTags"/>.
  /// </summary>
  /// <param name="tag">The tag.</param>
  /// <returns><c>true</c> if the tag is allowed; otherwise, <c>false</c>.</returns>
  private bool IsAllowedTag(IElement tag) => AllowedTags.Contains(tag.NodeName);
  /// <summary>
  /// Checks whether the attribute may stay on its element: its name is listed in
  /// <see cref="AllowedAttributes"/>, or <see cref="AllowDataAttributes"/> is set and the name
  /// starts with <c>data-</c> (compared case-insensitively).
  /// </summary>
  /// <param name="attribute">The attribute.</param>
  /// <returns><c>true</c> if the attribute is allowed; otherwise, <c>false</c>.</returns>
  private bool IsAllowedAttribute(IAttr attribute)
  {
      if (AllowedAttributes.Contains(attribute.Name))
      {
          return true;
      }

      // test html5 data- attributes
      return AllowDataAttributes
          && attribute.Name != null
          && attribute.Name.StartsWith("data-", StringComparison.OrdinalIgnoreCase);
  }
  /// <summary>
  /// Sanitizes the inline <c>style</c> attribute of an element.
  /// </summary>
  /// <remarks>
  /// Does nothing when the element has no <c>style</c> attribute. When no style declaration is
  /// available for the element, the attribute is removed. Otherwise the attribute is rewritten
  /// from the parsed declaration using <see cref="StyleFormatter"/>, and the resulting
  /// declarations, if any, are sanitized.
  /// </remarks>
  /// <param name="element">The element.</param>
  /// <param name="baseUrl">The base URL.</param>
  protected void SanitizeStyle(IElement element, string baseUrl)
  {
      if (element.GetAttribute("style") == null)
      {
          return;
      }

      if (element.GetStyle() == null)
      {
          element.RemoveAttribute("style");
          return;
      }

      // filter out invalid CSS declarations by round-tripping through the parsed style
      // see https://github.com/AngleSharp/AngleSharp/issues/101
      element.SetAttribute("style", element.GetStyle().ToCss(StyleFormatter));

      // fetch the declaration again after the attribute has been rewritten
      var declarations = element.GetStyle();
      if (declarations != null && declarations.Length != 0)
      {
          SanitizeStyleDeclaration(element, declarations, baseUrl);
      }
  }
  /// <summary>
  /// Filters the declarations of a style declaration block: properties that are not allowed,
  /// values matching the disallowed patterns and values containing rejected URLs are removed;
  /// values whose <c>url(...)</c> parts were rewritten by URL sanitizing are updated.
  /// </summary>
  /// <param name="element">Element the declarations belong to; passed to URL sanitizing and to removal notifications.</param>
  /// <param name="styles">The declaration block that is inspected and modified in place.</param>
  /// <param name="baseUrl">The base URL handed to <c>SanitizeUrl</c> for every URL found in a value.</param>
  private void SanitizeStyleDeclaration(IElement element, ICssStyleDeclaration styles, string baseUrl)
  {
      // Changes are collected first and applied after the loop, so the declaration
      // block is not modified while it is being enumerated.
      var removeStyles = new List<(ICssProperty Style, RemoveReason Reason)>();
      var setStyles = new Dictionary<string, string>();

      foreach (var style in styles)
      {
          var key = DecodeCss(style.Name);
          var val = DecodeCss(style.Value);

          if (!AllowedCssProperties.Contains(key))
          {
              removeStyles.Add((style, RemoveReason.NotAllowedStyle));
              continue;
          }

          if (CssExpression.IsMatch(val) || DisallowCssPropertyValue.IsMatch(val))
          {
              removeStyles.Add((style, RemoveReason.NotAllowedValue));
              continue;
          }

          val = WhitespaceRegex.Replace(val, string.Empty);

          // Materialize the query exactly once. SanitizeUrl is virtual and notifies URL
          // filter handlers, so re-enumerating a deferred query would invoke those hooks
          // several times per URL and could yield different results between the check
          // below and the rebuild of the value.
          var urls = CssUrl.Matches(val)
              .Cast<Match>()
              .Select(m => (Match: m, Url: SanitizeUrl(element, m.Groups[2].Value, baseUrl)))
              .ToList();

          if (urls.Count == 0)
          {
              continue;
          }

          // A single rejected URL disqualifies the whole declaration.
          if (urls.Any(u => u.Url == null))
          {
              removeStyles.Add((style, RemoveReason.NotAllowedUrlValue));
              continue;
          }

          // Rebuild the value, replacing every url(...) match with its sanitized URL.
          // Groups 1 and 3 are re-emitted around the URL unchanged
          // (presumably the optional quotes - confirm against the CssUrl pattern).
          var sb = new StringBuilder();
          var ix = 0;
          foreach (var url in urls)
          {
              sb.Append(val, ix, url.Match.Index - ix);
              sb.Append("url(");
              sb.Append(url.Match.Groups[1].Value);
              sb.Append(url.Url);
              sb.Append(url.Match.Groups[3].Value);
              ix = url.Match.Index + url.Match.Length;
          }
          sb.Append(val, ix, val.Length - ix);

          var s = sb.ToString();
          if (s != val)
          {
              // The decoded name differs from the raw one: remove the raw property
              // and store the new value under the decoded name instead.
              if (key != style.Name)
              {
                  removeStyles.Add((style, RemoveReason.NotAllowedUrlValue));
              }
              setStyles[key] = s;
          }
      }

      foreach (var style in setStyles)
      {
          styles.SetProperty(style.Key, style.Value);
      }

      foreach (var (property, reason) in removeStyles)
      {
          RemoveStyle(element, styles, property, reason);
      }
  }
  /// <summary>
  /// Decodes CSS Unicode escapes and removes comments.
  /// </summary>
  /// <remarks>
  /// Hexadecimal escapes up to U+FFFF become the corresponding single UTF-16 code unit.
  /// Escapes in the supplementary range (U+10000 to U+10FFFF) become the matching surrogate
  /// pair instead of being truncated to 16 bits, and values above U+10FFFF become U+FFFD.
  /// </remarks>
  /// <param name="css">The CSS string.</param>
  /// <returns>The decoded CSS string.</returns>
  protected static string DecodeCss(string css)
  {
      var decoded = CssUnicodeEscapes.Replace(css, m =>
      {
          if (m.Groups[1].Success)
          {
              // Group 1 holds the hexadecimal digits of the escape; parse them culture-independently.
              var codePoint = int.Parse(m.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture);

              // Values that fit a UTF-16 code unit keep the plain cast.
              if (codePoint <= char.MaxValue)
              {
                  return ((char)codePoint).ToString();
              }

              // A bare (char) cast would silently drop the high bits of a supplementary
              // code point; emit the surrogate pair, or U+FFFD when outside Unicode.
              return codePoint <= 0x10FFFF ? char.ConvertFromUtf32(codePoint) : "\uFFFD";
          }

          // Group 2 holds a single escaped character; an escaped backslash stays
          // escaped, any other character is unescaped.
          var t = m.Groups[2].Value;
          return t == "\\" ? @"\\" : t;
      });

      return CssComments.Replace(decoded, string.Empty);
  }
  // Captures (group 1) the text from the start of a URL, as long as it contains no '/' or '#',
  // up to the first colon - written literally or as the character reference &#58 / &#x3a
  // with any number of leading zeros.
  private static readonly Regex SchemeRegex = new Regex(
      @"^([^\/#]*?)(?:\:|&#0*58|&#x0*3a)",
      RegexOptions.IgnoreCase | RegexOptions.Compiled);
  /// <summary>
  /// Builds an <see cref="Iri"/> from a string, refusing URLs whose scheme is not allowed.
  /// </summary>
  /// <param name="url">The URL; leading whitespace is ignored.</param>
  /// <returns>
  /// An <see cref="Iri"/> for URLs without a recognizable scheme or with a scheme contained in
  /// <c>AllowedSchemes</c> (compared case-insensitively); <c>null</c> for any other scheme.
  /// </returns>
  protected Iri? GetSafeIri(string url)
  {
      url = url.TrimStart();

      var schemeMatch = SchemeRegex.Match(url);
      if (!schemeMatch.Success)
      {
          // No scheme found: build the Iri without one.
          return new Iri(url);
      }

      var scheme = schemeMatch.Groups[1].Value;
      if (AllowedSchemes.Contains(scheme, StringComparer.OrdinalIgnoreCase))
      {
          return new Iri(url, scheme);
      }

      return null;
  }
  /// <summary>
  /// Sanitizes a URL and, when a base URL is given, resolves relative URLs against it.
  /// </summary>
  /// <param name="element">The tag containing the URL being sanitized.</param>
  /// <param name="url">The URL.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against (empty or null for no resolution).</param>
  /// <returns>The sanitized URL or <c>null</c> if no safe URL can be created.</returns>
  protected virtual string? SanitizeUrl(IElement element, string url, string baseUrl)
  {
      var iri = GetSafeIri(url);

      if (iri != null && !iri.IsAbsolute && !string.IsNullOrEmpty(baseUrl))
      {
          if (!Uri.TryCreate(baseUrl, UriKind.Absolute, out Uri baseUri))
          {
              // The base URL itself is not a usable absolute URI.
              iri = null;
          }
          else
          {
              try
              {
                  // A successfully resolved URL is returned directly,
                  // without passing through OnFilteringUrl below.
                  return new Uri(baseUri, iri.Value).AbsoluteUri;
              }
              catch (UriFormatException)
              {
                  iri = null;
              }
          }
      }

      var args = new FilterUrlEventArgs(element, url, iri?.Value);
      OnFilteringUrl(args);

      return args.SanitizedUrl;
  }
  /// <summary>
  /// Takes a tag out of the document unless the removal is cancelled via <c>OnRemovingTag</c>.
  /// </summary>
  /// <param name="tag">Tag to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveTag(IElement tag, RemoveReason reason)
  {
      var args = new RemovingTagEventArgs(tag, reason);
      OnRemovingTag(args);

      if (args.Cancel)
      {
          return;
      }

      if (!KeepChildNodes || !tag.HasChildNodes)
      {
          tag.Remove();
          return;
      }

      // Keep the content: put the tag's child nodes in its place.
      tag.Replace([.. tag.ChildNodes]);
  }
  /// <summary>
  /// Takes an attribute off a tag unless the removal is cancelled via <c>OnRemovingAttribute</c>.
  /// </summary>
  /// <param name="tag">Tag the attribute belongs to.</param>
  /// <param name="attribute">Attribute to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveAttribute(IElement tag, IAttr attribute, RemoveReason reason)
  {
      var args = new RemovingAttributeEventArgs(tag, attribute, reason);
      OnRemovingAttribute(args);

      if (args.Cancel)
      {
          return;
      }

      tag.RemoveAttribute(attribute.Name);
  }
  /// <summary>
  /// Takes a style property out of a declaration block unless the removal is cancelled
  /// via <c>OnRemovingStyle</c>.
  /// </summary>
  /// <param name="tag">Tag the style belongs to.</param>
  /// <param name="styles">Style rule that contains the style to be removed.</param>
  /// <param name="style">Style to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveStyle(IElement tag, ICssStyleDeclaration styles, ICssProperty style, RemoveReason reason)
  {
      var args = new RemovingStyleEventArgs(tag, style, reason);
      OnRemovingStyle(args);

      if (args.Cancel)
      {
          return;
      }

      styles.RemoveProperty(style.Name);
  }
  /// <summary>
  /// Announces the pending removal of an at-rule via <c>OnRemovingAtRule</c> and reports
  /// whether it may proceed. The rule itself is not modified here.
  /// </summary>
  /// <param name="tag">Tag the style belongs to.</param>
  /// <param name="rule">Rule to be removed.</param>
  /// <returns><c>true</c>, if the rule can be removed; <c>false</c>, otherwise.</returns>
  private bool RemoveAtRule(IElement tag, ICssRule rule)
  {
      var args = new RemovingAtRuleEventArgs(tag, rule);
      OnRemovingAtRule(args);

      var cancelled = args.Cancel;
      return !cancelled;
  }
  /// <summary>
  /// Takes a CSS class off a tag's class list unless the removal is cancelled
  /// via <c>OnRemovingCssClass</c>.
  /// </summary>
  /// <param name="tag">Tag the class belongs to.</param>
  /// <param name="cssClass">Class to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveCssClass(IElement tag, string cssClass, RemoveReason reason)
  {
      var args = new RemovingCssClassEventArgs(tag, cssClass, reason);
      OnRemovingCssClass(args);

      if (args.Cancel)
      {
          return;
      }

      tag.ClassList.Remove(cssClass);
  }
  828. }
  829. }