HtmlSanitizer.cs 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980
  1. using AngleSharp;
  2. using AngleSharp.Css;
  3. using AngleSharp.Css.Dom;
  4. using AngleSharp.Css.Parser;
  5. using AngleSharp.Dom;
  6. using AngleSharp.Html.Dom;
  7. using AngleSharp.Html.Parser;
  8. using System;
  9. using System.Collections.Generic;
  10. using System.Globalization;
  11. using System.IO;
  12. using System.Linq;
  13. using System.Text;
  14. using System.Text.RegularExpressions;
  15. namespace Ganss.Xss;
  16. /// <summary>
  17. /// Cleans HTML documents and fragments from constructs that can lead to <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
  18. /// </summary>
  19. /// <remarks>
  20. /// XSS attacks can occur at several levels within an HTML document or fragment:
  21. /// <list type="bullet">
  22. /// <item>HTML tags (e.g. the &lt;script&gt; tag)</item>
  23. /// <item>HTML attributes (e.g. the "onload" attribute)</item>
  24. /// <item>CSS styles (url property values)</item>
  25. /// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
  26. /// </list>
  27. /// <para>
  28. /// The HtmlSanitizer class addresses all of these possible attack vectors by using a sophisticated HTML parser (<a href="https://github.com/AngleSharp/AngleSharp">AngleSharp</a>).
  29. /// </para>
  30. /// <para>
  31. /// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
  32. /// <list type="bullet">
  33. /// <item>You can specify the allowed HTML tags through the property <see cref="AllowedTags"/>. All other tags will be stripped.</item>
  34. /// <item>You can specify the allowed HTML attributes through the property <see cref="AllowedAttributes"/>. All other attributes will be stripped.</item>
  35. /// <item>You can specify the allowed CSS property names through the property <see cref="AllowedCssProperties"/>. All other styles will be stripped.</item>
  36. /// <item>You can specify the allowed URI schemes through the property <see cref="AllowedSchemes"/>. All other URIs will be stripped.</item>
  37. /// <item>You can specify the HTML attributes that contain URIs (such as "src", "href" etc.) through the property <see cref="UriAttributes"/>.</item>
  38. /// </list>
  39. /// </para>
  40. /// </remarks>
  41. /// <example>
  42. /// <code>
  43. /// <![CDATA[
  44. /// var sanitizer = new HtmlSanitizer();
  45. /// var html = @"<script>alert('xss')</script><div onload=""alert('xss')"" style=""background-color: test"">Test<img src=""test.gif"" style=""background-image: url(javascript:alert('xss')); margin: 10px""></div>";
  46. /// var sanitized = sanitizer.Sanitize(html, "http://www.example.com");
  47. /// // -> "<div style="background-color: test">Test<img style="margin: 10px" src="http://www.example.com/test.gif"></div>"
  48. /// ]]>
  49. /// </code>
  50. /// </example>
  51. public class HtmlSanitizer : IHtmlSanitizer
  52. {
  // Name of the HTML attribute that carries inline CSS.
  private const string StyleAttributeName = "style";

  // from http://genshi.edgewall.org/
  // Matches a backslash followed by 1-6 hex digits (plus one optional whitespace character),
  // or a backslash followed by a single character outside the excluded set.
  private static readonly Regex CssUnicodeEscapes = new Regex(@"\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'""{};:()#*])", RegexOptions.Compiled);

  // Matches a /* ... */ comment non-greedily. RegexOptions.Singleline is not set,
  // so '.' does not match a line feed and a comment spanning lines is not matched.
  private static readonly Regex CssComments = new Regex(@"/\*.*?\*/", RegexOptions.Compiled);

  // IE6 <http://heideri.ch/jso/#80>
  // Matches the word "expression" in any letter case, including the full-width and
  // small-capital look-alike code points listed in each character class.
  private static readonly Regex CssExpression = new Regex(@"[eE\uFF25\uFF45][xX\uFF38\uFF58][pP\uFF30\uFF50][rR\u0280\uFF32\uFF52][eE\uFF25\uFF45][sS\uFF33\uFF53]{2}[iI\u026A\uFF29\uFF49][oO\uFF2F\uFF4F][nN\u0274\uFF2E\uFF4E]", RegexOptions.Compiled);

  // Matches "url(" (any letter case, small-capital R/L accepted), capturing the optional
  // opening quote, the value, and the optional closing quote as groups 1 to 3.
  private static readonly Regex CssUrl = new Regex(@"[Uu][Rr\u0280][Ll\u029F]\((['""]?)([^'"")]+)(['""]?)", RegexOptions.Compiled);

  // Matches any run of whitespace, including the empty run.
  private static readonly Regex WhitespaceRegex = new Regex(@"\s*", RegexOptions.Compiled);

  // AngleSharp configuration with CSS support that keeps unknown declarations and rules
  // and tolerates invalid selectors.
  private static readonly IConfiguration defaultConfiguration = Configuration.Default.WithCss(
      new CssParserOptions
      {
          IsIncludingUnknownDeclarations = true,
          IsIncludingUnknownRules = true,
          IsToleratingInvalidSelectors = true,
      });

  // Shared parser instance built on defaultConfiguration with scripting enabled.
  private static readonly HtmlParser defaultHtmlParser = new HtmlParser(
      new HtmlParserOptions { IsScripting = true },
      BrowsingContext.New(defaultConfiguration));
  /// <summary>
  /// Creates a sanitizer whose allow-lists are copied from <see cref="HtmlSanitizerDefaults"/>.
  /// </summary>
  /// <remarks>
  /// Tag, scheme, attribute, URI-attribute and CSS property names are stored in sets that compare
  /// with <see cref="StringComparer.OrdinalIgnoreCase"/>. The at-rule and CSS class sets use their
  /// default comparers.
  /// </remarks>
  public HtmlSanitizer()
  {
      var ignoreCase = StringComparer.OrdinalIgnoreCase;

      AllowedTags = new HashSet<string>(HtmlSanitizerDefaults.AllowedTags, ignoreCase);
      AllowedSchemes = new HashSet<string>(HtmlSanitizerDefaults.AllowedSchemes, ignoreCase);
      AllowedAttributes = new HashSet<string>(HtmlSanitizerDefaults.AllowedAttributes, ignoreCase);
      UriAttributes = new HashSet<string>(HtmlSanitizerDefaults.UriAttributes, ignoreCase);
      AllowedCssProperties = new HashSet<string>(HtmlSanitizerDefaults.AllowedCssProperties, ignoreCase);
      AllowedAtRules = new HashSet<CssRuleType>(HtmlSanitizerDefaults.AllowedAtRules);
      AllowedClasses = new HashSet<string>(HtmlSanitizerDefaults.AllowedClasses);
  }
  /// <summary>
  /// Initializes a new instance of the <see cref="HtmlSanitizer"/> class
  /// with the given options.
  /// </summary>
  /// <remarks>
  /// Every collection in <paramref name="options"/> is copied into a new set, so the sanitizer does
  /// not share collection instances with the options object. All string sets, including the CSS
  /// class set, compare with <see cref="StringComparer.OrdinalIgnoreCase"/>.
  /// </remarks>
  /// <param name="options">Options to control the sanitizing.</param>
  /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
  public HtmlSanitizer(HtmlSanitizerOptions options)
  {
      // Fail with a descriptive exception instead of a NullReferenceException on first use.
      if (options is null)
      {
          throw new ArgumentNullException(nameof(options));
      }

      AllowedTags = new HashSet<string>(options.AllowedTags, StringComparer.OrdinalIgnoreCase);
      AllowedSchemes = new HashSet<string>(options.AllowedSchemes, StringComparer.OrdinalIgnoreCase);
      AllowedAttributes = new HashSet<string>(options.AllowedAttributes, StringComparer.OrdinalIgnoreCase);
      UriAttributes = new HashSet<string>(options.UriAttributes, StringComparer.OrdinalIgnoreCase);
      AllowedClasses = new HashSet<string>(options.AllowedCssClasses, StringComparer.OrdinalIgnoreCase);
      AllowedCssProperties = new HashSet<string>(options.AllowedCssProperties, StringComparer.OrdinalIgnoreCase);
      AllowedAtRules = new HashSet<CssRuleType>(options.AllowedAtRules);
      AllowCssCustomProperties = options.AllowCssCustomProperties;
      AllowDataAttributes = options.AllowDataAttributes;
  }
  /// <summary>
  /// Gets or sets the action applied to each comment node before the
  /// <see cref="RemovingComment"/> event is raised for it. The default replaces
  /// <c>&lt;</c> and <c>&gt;</c> in the comment text with their HTML entities.
  /// </summary>
  public Action<IComment> EncodeComment { get; set; } = DefaultEncodeComment;

  /// <summary>
  /// Gets or sets the action applied to elements whose content is literal text
  /// (style elements excluded) and whose inner HTML is not blank. The default replaces
  /// <c>&lt;</c> and <c>&gt;</c> in the content with their HTML entities.
  /// </summary>
  public Action<IElement> EncodeLiteralTextElementContent { get; set; } = DefaultEncodeLiteralTextElementContent;

  /// <summary>
  /// Gets or sets the value new instances use for <see cref="KeepChildNodes"/>. Default is false.
  /// </summary>
  public static bool DefaultKeepChildNodes { get; set; }

  /// <summary>
  /// Gets or sets a value indicating whether to keep child nodes of elements that are removed.
  /// Initialized from <see cref="DefaultKeepChildNodes"/> when the instance is created.
  /// </summary>
  public bool KeepChildNodes { get; set; } = DefaultKeepChildNodes;

  /// <summary>
  /// Gets or sets the factory new instances use for <see cref="HtmlParserFactory"/>.
  /// The built-in factory returns a single shared <see cref="HtmlParser"/> instance.
  /// </summary>
  public static Func<HtmlParser> DefaultHtmlParserFactory { get; set; } = () => defaultHtmlParser;

  /// <summary>
  /// Gets or sets the factory that creates the parser used for parsing the input.
  /// Initialized from <see cref="DefaultHtmlParserFactory"/> when the instance is created.
  /// </summary>
  public Func<HtmlParser> HtmlParserFactory { get; set; } = DefaultHtmlParserFactory;

  /// <summary>
  /// Gets or sets the formatter new instances use for <see cref="OutputFormatter"/>.
  /// Default is <see cref="HtmlFormatter.Instance"/>.
  /// </summary>
  public static IMarkupFormatter DefaultOutputFormatter { get; set; } = HtmlFormatter.Instance;

  /// <summary>
  /// Gets or sets the <see cref="IMarkupFormatter"/> used for generating output when no formatter
  /// is passed to the sanitize call. Initialized from <see cref="DefaultOutputFormatter"/>.
  /// </summary>
  public IMarkupFormatter OutputFormatter { get; set; } = DefaultOutputFormatter;

  /// <summary>
  /// Gets or sets the formatter new instances use for <see cref="StyleFormatter"/>.
  /// Default is <see cref="CssStyleFormatter.Instance"/>.
  /// </summary>
  public static IStyleFormatter DefaultStyleFormatter { get; set; } = CssStyleFormatter.Instance;

  /// <summary>
  /// Gets or sets the <see cref="IStyleFormatter"/> used for generating CSS output.
  /// Initialized from <see cref="DefaultStyleFormatter"/> when the instance is created.
  /// </summary>
  public IStyleFormatter StyleFormatter { get; set; } = DefaultStyleFormatter;
  /// <summary>
  /// Gets the allowed CSS at-rules such as "@media" and "@font-face".
  /// </summary>
  /// <value>
  /// The mutable set of allowed CSS rule types; the set instance itself is assigned by the constructor.
  /// </value>
  public ISet<CssRuleType> AllowedAtRules { get; private set; }

  /// <summary>
  /// Gets the allowed URI schemes such as "http" and "https".
  /// </summary>
  /// <value>
  /// The mutable set of allowed URI schemes.
  /// </value>
  public ISet<string> AllowedSchemes { get; private set; }

  /// <summary>
  /// Gets the allowed HTML tag names such as "a" and "div".
  /// </summary>
  /// <value>
  /// The mutable set of allowed tag names. Elements with any other tag name are removed.
  /// </value>
  public ISet<string> AllowedTags { get; private set; }

  /// <summary>
  /// Gets the allowed HTML attributes such as "href" and "alt".
  /// </summary>
  /// <value>
  /// The mutable set of allowed attribute names.
  /// </value>
  public ISet<string> AllowedAttributes { get; private set; }

  /// <summary>
  /// Gets or sets a value indicating whether every attribute whose name starts with
  /// <c>data-</c> (HTML5 data attributes) is allowed in addition to <see cref="AllowedAttributes"/>.
  /// </summary>
  public bool AllowDataAttributes { get; set; }

  /// <summary>
  /// Gets the HTML attributes that can contain a URI such as "href".
  /// </summary>
  /// <value>
  /// The mutable set of attribute names whose values are treated as URIs.
  /// </value>
  public ISet<string> UriAttributes { get; private set; }

  /// <summary>
  /// Gets the allowed CSS properties such as "font" and "margin".
  /// </summary>
  /// <value>
  /// The mutable set of allowed CSS property names.
  /// </value>
  public ISet<string> AllowedCssProperties { get; private set; }

  /// <summary>
  /// Gets or sets a value indicating whether every custom CSS property (variable), i.e. a
  /// property whose name starts with <c>--</c>, is allowed in addition to
  /// <see cref="AllowedCssProperties"/>.
  /// </summary>
  public bool AllowCssCustomProperties { get; set; }

  /// <summary>
  /// Gets or sets a regex that must not match for legal CSS property values.
  /// </summary>
  /// <value>
  /// The regex. Defaults to <see cref="DefaultDisallowedCssPropertyValue"/>.
  /// </value>
  public Regex DisallowCssPropertyValue { get; set; } = DefaultDisallowedCssPropertyValue;

  /// <summary>
  /// Gets the allowed CSS classes. If the set is empty, all classes will be allowed.
  /// </summary>
  /// <value>
  /// The mutable set of allowed CSS classes. An empty set means all classes are allowed.
  /// </value>
  public ISet<string> AllowedClasses { get; private set; }
  /// <summary>
  /// Raised once per sanitize run, after the document has been sanitized and the
  /// per-node post processing has finished.
  /// </summary>
  public event EventHandler<PostProcessDomEventArgs>? PostProcessDom;

  /// <summary>
  /// Raised for every node below the sanitized context after sanitizing.
  /// Handlers may supply replacement nodes through the event arguments.
  /// </summary>
  public event EventHandler<PostProcessNodeEventArgs>? PostProcessNode;

  /// <summary>
  /// Raised before a tag is removed.
  /// </summary>
  public event EventHandler<RemovingTagEventArgs>? RemovingTag;

  /// <summary>
  /// Raised before an attribute is removed.
  /// </summary>
  public event EventHandler<RemovingAttributeEventArgs>? RemovingAttribute;

  /// <summary>
  /// Raised before a style is removed.
  /// </summary>
  public event EventHandler<RemovingStyleEventArgs>? RemovingStyle;

  /// <summary>
  /// Raised before an at-rule is removed.
  /// </summary>
  public event EventHandler<RemovingAtRuleEventArgs>? RemovingAtRule;

  /// <summary>
  /// Raised before a comment is removed. Setting <c>Cancel</c> on the event arguments
  /// keeps the comment in the document.
  /// </summary>
  public event EventHandler<RemovingCommentEventArgs>? RemovingComment;

  /// <summary>
  /// Raised before a CSS class is removed.
  /// </summary>
  public event EventHandler<RemovingCssClassEventArgs>? RemovingCssClass;

  /// <summary>
  /// Raised when a URL is being sanitized.
  /// </summary>
  public event EventHandler<FilterUrlEventArgs>? FilterUrl;
  /// <summary>
  /// Invokes the <see cref="PostProcessDom"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="PostProcessDomEventArgs"/> instance containing the event data.</param>
  protected virtual void OnPostProcessDom(PostProcessDomEventArgs e)
      => PostProcessDom?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="PostProcessNode"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance containing the event data.</param>
  protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e)
      => PostProcessNode?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="RemovingTag"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingTagEventArgs"/> instance containing the event data.</param>
  protected virtual void OnRemovingTag(RemovingTagEventArgs e)
      => RemovingTag?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="RemovingAttribute"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingAttributeEventArgs"/> instance containing the event data.</param>
  protected virtual void OnRemovingAttribute(RemovingAttributeEventArgs e)
      => RemovingAttribute?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="RemovingStyle"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingStyleEventArgs"/> instance containing the event data.</param>
  protected virtual void OnRemovingStyle(RemovingStyleEventArgs e)
      => RemovingStyle?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="RemovingAtRule"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingAtRuleEventArgs"/> instance containing the event data.</param>
  protected virtual void OnRemovingAtRule(RemovingAtRuleEventArgs e)
      => RemovingAtRule?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="RemovingComment"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingCommentEventArgs"/> instance containing the event data.</param>
  protected virtual void OnRemovingComment(RemovingCommentEventArgs e)
      => RemovingComment?.Invoke(this, e);
  /// <summary>
  /// The default regex for disallowed CSS property values. It matches any value that
  /// contains a <c>&lt;</c> or <c>&gt;</c> character.
  /// </summary>
  public static readonly Regex DefaultDisallowedCssPropertyValue = new Regex(@"[<>]", RegexOptions.Compiled);
  /// <summary>
  /// Invokes the <see cref="RemovingCssClass"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="RemovingCssClassEventArgs"/> instance containing the event data.</param>
  protected virtual void OnRemovingCssClass(RemovingCssClassEventArgs e)
      => RemovingCssClass?.Invoke(this, e);
  /// <summary>
  /// Invokes the <see cref="FilterUrl"/> handlers, if any, with this sanitizer as the sender.
  /// </summary>
  /// <param name="e">The <see cref="FilterUrlEventArgs"/> instance containing the event data.</param>
  protected virtual void OnFilteringUrl(FilterUrlEventArgs e)
      => FilterUrl?.Invoke(this, e);
  /// <summary>
  /// Lazily enumerates every descendant of <paramref name="dom"/> in DOM (pre-order) order.
  /// The root node itself is not returned.
  /// </summary>
  /// <remarks>
  /// The traversal uses an explicit stack instead of recursion. Children are pushed in reverse
  /// so that the first child is popped, and therefore yielded, first. The children of a node
  /// are read only after that node has been yielded.
  /// </remarks>
  /// <param name="dom">The root node.</param>
  /// <returns>All nested subnodes.</returns>
  private static IEnumerable<INode> GetAllNodes(INode dom)
  {
      var pending = new Stack<INode>();
      PushChildrenReversed(dom);

      while (pending.Count > 0)
      {
          var current = pending.Pop();
          yield return current;
          PushChildrenReversed(current);
      }

      // Pushes the children of parent last-to-first so they pop first-to-last.
      void PushChildrenReversed(INode parent)
      {
          var children = parent.ChildNodes;
          for (var index = children.Length - 1; index >= 0; index--)
          {
              pending.Push(children[index]);
          }
      }
  }
  /// <summary>
  /// Sanitizes an HTML body fragment and returns the sanitized markup. When a whole document
  /// is passed, only the content of its body is returned.
  /// </summary>
  /// <param name="html">The HTML body fragment to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
  /// <returns>The sanitized HTML body fragment, or an empty string when the parsed document has no body.</returns>
  public string Sanitize(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
  {
      // The parsed document is only needed for rendering, so dispose it on exit.
      using var dom = SanitizeDom(html, baseUrl);

      var body = dom.Body;
      return body is null
          ? string.Empty
          : body.ChildNodes.ToHtml(outputFormatter ?? OutputFormatter);
  }
  /// <summary>
  /// Parses the specified HTML body fragment and sanitizes the body of the resulting document.
  /// </summary>
  /// <remarks>
  /// The input is prefixed with a doctype plus opening <c>html</c> and <c>body</c> tags before
  /// parsing. The returned document is disposable; the caller is responsible for disposing it.
  /// </remarks>
  /// <param name="html">The HTML body fragment to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <returns>The sanitized HTML document.</returns>
  public IHtmlDocument SanitizeDom(string html, string baseUrl = "")
  {
      const string documentPrefix = "<!doctype html><html><body>";

      var dom = HtmlParserFactory().ParseDocument(documentPrefix + html);

      if (dom.Body is { } body)
      {
          DoSanitize(dom, body, baseUrl);
      }

      return dom;
  }
  /// <summary>
  /// Sanitizes an already parsed HTML document in place.
  /// If the document has not been parsed with CSS support then all styles will be removed.
  /// </summary>
  /// <param name="document">The parsed HTML document.</param>
  /// <param name="context">The node within which to sanitize. The whole document is sanitized when null.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <returns>The same <paramref name="document"/> instance, after sanitizing.</returns>
  public IHtmlDocument SanitizeDom(IHtmlDocument document, IHtmlElement? context = null, string baseUrl = "")
  {
      IParentNode root;
      if (context is null)
      {
          root = document;
      }
      else
      {
          root = context;
      }

      DoSanitize(document, root, baseUrl);
      return document;
  }
  /// <summary>
  /// Parses the given markup as a complete HTML document, sanitizes the whole document and
  /// renders it. Even if only a fragment is given, a whole document will be returned.
  /// </summary>
  /// <param name="html">The HTML document to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
  /// <returns>The sanitized HTML document.</returns>
  public string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
  {
      // The document is disposed once it has been rendered to a string.
      using var dom = HtmlParserFactory().ParseDocument(html);

      DoSanitize(dom, dom, baseUrl);

      return dom.ToHtml(outputFormatter ?? OutputFormatter);
  }
  /// <summary>
  /// Parses the given stream as a complete HTML document, sanitizes the whole document and
  /// renders it. Even if only a fragment is given, a whole document will be returned.
  /// </summary>
  /// <param name="html">A stream containing the HTML document to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  /// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
  /// <returns>The sanitized HTML document.</returns>
  public string SanitizeDocument(Stream html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
  {
      // The document is disposed once it has been rendered to a string.
      using var dom = HtmlParserFactory().ParseDocument(html);

      DoSanitize(dom, dom, baseUrl);

      return dom.ToHtml(outputFormatter ?? OutputFormatter);
  }
  /// <summary>
  /// Encodes and removes all comment nodes below the given node.
  /// </summary>
  /// <remarks>
  /// For every comment, <see cref="EncodeComment"/> is applied first, then the
  /// <see cref="RemovingComment"/> event is raised. The comment is removed unless a handler
  /// sets <c>Cancel</c> on the event arguments, in which case the encoded comment stays.
  /// </remarks>
  /// <param name="context">The node within which to remove comments.</param>
  private void RemoveComments(INode context)
  {
      // Snapshot the comments first: removing nodes while walking the live tree would
      // disturb the traversal.
      foreach (var comment in GetAllNodes(context).OfType<IComment>().ToList())
      {
          EncodeComment(comment);

          var e = new RemovingCommentEventArgs(comment);
          OnRemovingComment(e);

          if (!e.Cancel)
          {
              comment.Remove();
          }
      }
  }
  /// <summary>
  /// Replaces <c>&lt;</c> and <c>&gt;</c> in the comment's text with <c>&amp;lt;</c> and
  /// <c>&amp;gt;</c>. The text is only assigned back when the replacement changed it.
  /// </summary>
  /// <param name="comment">The comment node to encode.</param>
  private static void DefaultEncodeComment(IComment comment)
  {
      var originalText = comment.TextContent;
      var encodedText = originalText.Replace("<", "&lt;").Replace(">", "&gt;");

      if (encodedText == originalText)
      {
          return;
      }

      comment.TextContent = encodedText;
  }
  /// <summary>
  /// Replaces <c>&lt;</c> and <c>&gt;</c> in the element's inner HTML with <c>&amp;lt;</c> and
  /// <c>&amp;gt;</c>.
  /// </summary>
  /// <remarks>
  /// The encoded markup is first assigned through <c>InnerHtml</c>. If the element does not
  /// report the encoded value afterwards, it is assigned through <c>SetInnerText</c> instead.
  /// </remarks>
  /// <param name="tag">The literal-text element to encode.</param>
  private static void DefaultEncodeLiteralTextElementContent(IElement tag)
  {
      var rawHtml = tag.InnerHtml;
      var encodedHtml = rawHtml.Replace("<", "&lt;").Replace(">", "&gt;");

      if (encodedHtml != rawHtml)
      {
          tag.InnerHtml = encodedHtml;
      }

      // setting InnerHtml does not work for noscript
      var currentHtml = tag.InnerHtml;
      if (currentHtml != encodedHtml)
      {
          tag.SetInnerText(encodedHtml);
      }
  }
  /// <summary>
  /// Runs the sanitizing pipeline over all elements below <paramref name="context"/>.
  /// </summary>
  /// <remarks>
  /// The steps run in this order: remove disallowed tags; encode literal-text element content;
  /// sanitize the document's style sheets; clean up attributes (disallowed names, URL values,
  /// inline style, suspicious values, CSS classes); remove comments; post processing.
  /// </remarks>
  /// <param name="dom">The document that owns <paramref name="context"/>.</param>
  /// <param name="context">The node within which to sanitize.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
  private void DoSanitize(IHtmlDocument dom, IParentNode context, string baseUrl = "")
  {
      // remove disallowed tags (matches are snapshotted before anything is removed)
      var disallowedElements = context.QuerySelectorAll("*")
          .Where(candidate => !IsAllowedTag(candidate))
          .ToList();
      foreach (var element in disallowedElements)
      {
          RemoveTag(element, RemoveReason.NotAllowedTag);
      }

      // always encode text in raw data content
      foreach (var element in context.QuerySelectorAll("*"))
      {
          if (element is IHtmlStyleElement)
          {
              continue;
          }

          if (!element.Flags.HasFlag(NodeFlags.LiteralText))
          {
              continue;
          }

          if (string.IsNullOrWhiteSpace(element.InnerHtml))
          {
              continue;
          }

          EncodeLiteralTextElementContent(element);
      }

      SanitizeStyleSheets(dom, baseUrl);

      // cleanup attributes
      foreach (var element in context.QuerySelectorAll("*").ToList())
      {
          // remove disallowed attributes
          var disallowedAttributes = element.Attributes
              .Where(candidate => !IsAllowedAttribute(candidate))
              .ToList();
          foreach (var attr in disallowedAttributes)
          {
              RemoveAttribute(element, attr, RemoveReason.NotAllowedAttribute);
          }

          // sanitize URLs in URL-marked attributes
          foreach (var attr in element.Attributes.Where(IsUriAttribute).ToList())
          {
              var sanitizedUrl = SanitizeUrl(element, attr.Value, baseUrl);
              if (sanitizedUrl != null)
              {
                  element.SetAttribute(attr.Name, sanitizedUrl);
              }
              else
              {
                  RemoveAttribute(element, attr, RemoveReason.NotAllowedUrlValue);
              }
          }

          // sanitize the style attribute; remember whether it had content beforehand so that
          // only a style attribute emptied by sanitizing is removed below
          var hadStyleContent = !string.IsNullOrEmpty(element.GetAttribute(StyleAttributeName));
          SanitizeStyle(element, baseUrl);

          // sanitize the value of the attributes
          foreach (var attr in element.Attributes.ToList())
          {
              // The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
              // (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
              if (attr.Value.Contains("&{"))
              {
                  RemoveAttribute(element, attr, RemoveReason.NotAllowedValue);
                  continue;
              }

              if (AllowedClasses.Any() && attr.Name == "class")
              {
                  var rejectedClasses = element.ClassList.Except(AllowedClasses).ToArray();
                  foreach (var rejectedClass in rejectedClasses)
                  {
                      RemoveCssClass(element, rejectedClass, RemoveReason.NotAllowedCssClass);
                  }

                  if (element.ClassList.Length == 0)
                  {
                      RemoveAttribute(element, attr, RemoveReason.ClassAttributeEmpty);
                  }
              }
              else if (hadStyleContent && attr.Name == StyleAttributeName && string.IsNullOrEmpty(attr.Value))
              {
                  RemoveAttribute(element, attr, RemoveReason.StyleAttributeEmpty);
              }
          }
      }

      // Comments are only removed when the context is a node; post processing still runs
      // with a null context otherwise.
      var contextNode = context as INode;
      if (contextNode != null)
      {
          RemoveComments(contextNode);
      }

      DoPostProcess(dom, contextNode);
  }
  /// <summary>
  /// Sanitizes every CSS style sheet of the document and writes the result back into the
  /// element that owns the sheet.
  /// </summary>
  /// <remarks>
  /// A top-level rule is removed when <c>SanitizeStyleRule</c> rejects it and <c>RemoveAtRule</c>
  /// confirms the removal; otherwise it is kept. In the serialized CSS every <c>&lt;</c> is
  /// replaced by the CSS escape <c>\3c </c> before it is assigned as the owner's inner HTML.
  /// </remarks>
  /// <param name="dom">The document whose style sheets are sanitized.</param>
  /// <param name="baseUrl">The base URL passed on to the rule sanitizer.</param>
  private void SanitizeStyleSheets(IHtmlDocument dom, string baseUrl)
  {
      foreach (var styleSheet in dom.StyleSheets.OfType<ICssStyleSheet>())
      {
          var owner = styleSheet.OwnerNode;

          // The index only advances when the current rule is kept; a removal shifts the
          // next rule into the current slot.
          for (var index = 0; index < styleSheet.Rules.Length;)
          {
              var rule = styleSheet.Rules[index];
              var keepRule = SanitizeStyleRule(rule, owner, baseUrl) || !RemoveAtRule(owner, rule);

              if (keepRule)
              {
                  index++;
              }
              else
              {
                  styleSheet.RemoveAt(index);
              }
          }

          var css = styleSheet.ToCss(StyleFormatter);
          owner.InnerHtml = css.Replace("<", "\\3c ");
      }
  }
  /// <summary>
  /// Sanitizes a single CSS rule, recursing into nested rules where the rule type has them.
  /// </summary>
  /// <remarks>
  /// Style, page and keyframe rules have their declarations sanitized. Grouping and keyframes
  /// rules have each child sanitized, and a child is removed when it is rejected and
  /// <c>RemoveAtRule</c> confirms the removal. Rules of any other type are left unchanged.
  /// </remarks>
  /// <param name="rule">The rule to sanitize.</param>
  /// <param name="styleTag">The element that owns the style sheet.</param>
  /// <param name="baseUrl">The base URL passed on to the declaration sanitizer.</param>
  /// <returns>
  /// <c>false</c> if the rule's type is not contained in <see cref="AllowedAtRules"/>;
  /// otherwise, <c>true</c>.
  /// </returns>
  private bool SanitizeStyleRule(ICssRule rule, IElement styleTag, string baseUrl)
  {
      if (!AllowedAtRules.Contains(rule.Type))
      {
          return false;
      }

      if (rule is ICssStyleRule styleRule)
      {
          SanitizeStyleDeclaration(styleTag, styleRule.Style, baseUrl);
          return true;
      }

      if (rule is ICssGroupingRule groupingRule)
      {
          // The index only advances when the child is kept; a removal shifts the next
          // child into the current slot.
          for (var index = 0; index < groupingRule.Rules.Length;)
          {
              var nested = groupingRule.Rules[index];
              var keepNested = SanitizeStyleRule(nested, styleTag, baseUrl) || !RemoveAtRule(styleTag, nested);

              if (keepNested)
              {
                  index++;
              }
              else
              {
                  groupingRule.RemoveAt(index);
              }
          }

          return true;
      }

      if (rule is ICssPageRule pageRule)
      {
          SanitizeStyleDeclaration(styleTag, pageRule.Style, baseUrl);
          return true;
      }

      if (rule is ICssKeyframesRule keyFramesRule)
      {
          // Snapshot the keyframes because rejected ones are removed from the rule by key text.
          var frames = keyFramesRule.Rules.OfType<ICssKeyframeRule>().ToList();
          foreach (var frame in frames)
          {
              if (!SanitizeStyleRule(frame, styleTag, baseUrl) && RemoveAtRule(styleTag, frame))
              {
                  keyFramesRule.Remove(frame.KeyText);
              }
          }

          return true;
      }

      if (rule is ICssKeyframeRule keyFrameRule)
      {
          SanitizeStyleDeclaration(styleTag, keyFrameRule.Style, baseUrl);
      }

      return true;
  }
  /// <summary>
  /// Raises the post processing events after sanitizing.
  /// </summary>
  /// <remarks>
  /// When <see cref="PostProcessNode"/> has subscribers, the document is normalized and the event
  /// is raised for each node below <paramref name="context"/>; a node is replaced when its handler
  /// supplied replacement nodes. Afterwards <see cref="PostProcessDom"/> is raised once if it has
  /// subscribers.
  /// </remarks>
  /// <param name="dom">The HTML document.</param>
  /// <param name="context">The node within which to post process all nodes. No per-node processing if null.</param>
  private void DoPostProcess(IHtmlDocument dom, INode? context)
  {
      if (PostProcessNode is not null)
      {
          dom.Normalize();

          if (context is not null)
          {
              // Snapshot the nodes: handlers may replace nodes while the loop is running.
              foreach (var current in GetAllNodes(context).ToList())
              {
                  var nodeArgs = new PostProcessNodeEventArgs(dom, current);
                  OnPostProcessNode(nodeArgs);

                  if (nodeArgs.ReplacementNodes.Count > 0)
                  {
                      ((IChildNode)current).Replace([.. nodeArgs.ReplacementNodes]);
                  }
              }
          }
      }

      if (PostProcessDom is not null)
      {
          OnPostProcessDom(new PostProcessDomEventArgs(dom));
      }
  }
  /// <summary>
  /// Checks whether the attribute's name is listed in <see cref="UriAttributes"/>.
  /// </summary>
  /// <param name="attribute">The attribute.</param>
  /// <returns><c>true</c> if the attribute can contain a URI; otherwise, <c>false</c>.</returns>
  private bool IsUriAttribute(IAttr attribute) => UriAttributes.Contains(attribute.Name);
  /// <summary>
  /// Checks whether the element's node name is listed in <see cref="AllowedTags"/>.
  /// </summary>
  /// <param name="tag">The tag.</param>
  /// <returns><c>true</c> if the tag is allowed; otherwise, <c>false</c>.</returns>
  private bool IsAllowedTag(IElement tag) => AllowedTags.Contains(tag.NodeName);
  /// <summary>
  /// Checks whether the attribute may stay: its name is listed in <see cref="AllowedAttributes"/>,
  /// or <see cref="AllowDataAttributes"/> is set and the name starts with <c>data-</c>
  /// (compared ordinally, ignoring case).
  /// </summary>
  /// <param name="attribute">The attribute.</param>
  /// <returns><c>true</c> if the attribute is allowed; otherwise, <c>false</c>.</returns>
  private bool IsAllowedAttribute(IAttr attribute)
  {
      var name = attribute.Name;
      if (AllowedAttributes.Contains(name))
      {
          return true;
      }

      // test html5 data- attributes
      return AllowDataAttributes
          && name != null
          && name.StartsWith("data-", StringComparison.OrdinalIgnoreCase);
  }
  /// <summary>
  /// Sanitizes the inline style attribute of an element.
  /// </summary>
  /// <remarks>
  /// Does nothing when the element has no style attribute. Removes the attribute when no
  /// style declaration can be obtained for it. Otherwise the attribute is rewritten from the
  /// style declaration (formatted with <see cref="StyleFormatter"/>) and the resulting
  /// declaration, if non-empty, is sanitized property by property.
  /// </remarks>
  /// <param name="element">The element whose style attribute is sanitized.</param>
  /// <param name="baseUrl">The base URL passed on for sanitizing URLs found in style values.</param>
  protected void SanitizeStyle(IElement element, string baseUrl)
  {
      // filter out invalid CSS declarations
      // see https://github.com/AngleSharp/AngleSharp/issues/101
      if (element.GetAttribute(StyleAttributeName) == null)
      {
          return;
      }

      if (element.GetStyle() == null)
      {
          element.RemoveAttribute(StyleAttributeName);
          return;
      }

      // Round-trip the declaration through the formatter and write it back to the attribute.
      var formattedCss = element.GetStyle().ToCss(StyleFormatter);
      element.SetAttribute(StyleAttributeName, formattedCss);

      // Fetch the declaration again after the attribute has been rewritten.
      var declaration = element.GetStyle();
      if (declaration != null && declaration.Length != 0)
      {
          SanitizeStyleDeclaration(element, declaration, baseUrl);
      }
  }
  /// <summary>
  /// Verify if the given CSS property name is allowed. By default this will
  /// check if the property is in the <see cref="AllowedCssProperties"/> set,
  /// or if the property is a custom property and <see cref="AllowCssCustomProperties"/> is true.
  /// </summary>
  /// <remarks>
  /// A custom property is recognized by a leading <c>--</c>. The prefix is compared ordinally so
  /// that the result does not depend on the current culture or on characters that
  /// culture-sensitive comparison may ignore.
  /// </remarks>
  /// <param name="propertyName">The name of the CSS property.</param>
  /// <returns><c>true</c> if the property is allowed; otherwise, <c>false</c>.</returns>
  protected virtual bool IsAllowedCssProperty(string propertyName)
  {
      return AllowedCssProperties.Contains(propertyName)
          || (AllowCssCustomProperties
              && propertyName != null
              && propertyName.StartsWith("--", StringComparison.Ordinal));
  }
  /// <summary>
  /// Sanitizes every property of a style declaration: disallowed properties and values are
  /// removed, and URLs inside <c>url(...)</c> values are passed through <see cref="SanitizeUrl"/>.
  /// </summary>
  /// <remarks>
  /// Changes are collected first and applied after the enumeration so that the declaration is
  /// not modified while it is being iterated.
  /// </remarks>
  /// <param name="element">The element the style declaration belongs to.</param>
  /// <param name="styles">The style declaration to sanitize.</param>
  /// <param name="baseUrl">The base URL passed on to <see cref="SanitizeUrl"/>.</param>
  private void SanitizeStyleDeclaration(IElement element, ICssStyleDeclaration styles, string baseUrl)
  {
      var removeStyles = new List<Tuple<ICssProperty, RemoveReason>>();
      var setStyles = new Dictionary<string, string>();

      foreach (var style in styles)
      {
          var key = DecodeCss(style.Name);
          var val = DecodeCss(style.Value);

          if (!IsAllowedCssProperty(key))
          {
              removeStyles.Add(new Tuple<ICssProperty, RemoveReason>(style, RemoveReason.NotAllowedStyle));
              continue;
          }

          if (CssExpression.IsMatch(val) || DisallowCssPropertyValue.IsMatch(val))
          {
              removeStyles.Add(new Tuple<ICssProperty, RemoveReason>(style, RemoveReason.NotAllowedValue));
              continue;
          }

          val = WhitespaceRegex.Replace(val, string.Empty);

          // Materialize once: the projection calls SanitizeUrl, so leaving the query deferred
          // would re-run it on every enumeration below and could yield inconsistent results
          // between passes.
          var urls = CssUrl.Matches(val)
              .Cast<Match>()
              .Select(m => (Match: m, Url: SanitizeUrl(element, m.Groups[2].Value, baseUrl)))
              .ToList();

          if (urls.Count == 0)
          {
              continue;
          }

          if (urls.Any(u => u.Url == null))
          {
              // At least one URL could not be made safe: drop the whole property.
              removeStyles.Add(new Tuple<ICssProperty, RemoveReason>(style, RemoveReason.NotAllowedUrlValue));
              continue;
          }

          // Rebuild the value, substituting each url(...) with its sanitized counterpart.
          var sb = new StringBuilder();
          var ix = 0;
          foreach (var url in urls)
          {
              sb.Append(val, ix, url.Match.Index - ix);
              sb.Append("url(");
              sb.Append(url.Match.Groups[1].Value);
              sb.Append(url.Url);
              sb.Append(url.Match.Groups[3].Value);
              ix = url.Match.Index + url.Match.Length;
          }
          sb.Append(val, ix, val.Length - ix);

          var s = sb.ToString();
          if (s != val)
          {
              if (key != style.Name)
              {
                  // The decoded name differs from the raw one: remove the raw property,
                  // the rewritten value is stored under the decoded name below.
                  removeStyles.Add(new Tuple<ICssProperty, RemoveReason>(style, RemoveReason.NotAllowedUrlValue));
              }
              setStyles[key] = s;
          }
      }

      foreach (var style in setStyles)
      {
          styles.SetProperty(style.Key, style.Value);
      }

      foreach (var style in removeStyles)
      {
          RemoveStyle(element, styles, style.Item1, style.Item2);
      }
  }
  /// <summary>
  /// Decodes CSS Unicode escapes and removes comments.
  /// </summary>
  /// <remarks>
  /// Hexadecimal escapes above U+FFFF are decoded to a surrogate pair instead of being truncated
  /// to a single UTF-16 code unit; values above U+10FFFF become U+FFFD (replacement character).
  /// </remarks>
  /// <param name="css">The CSS string.</param>
  /// <returns>The decoded CSS string.</returns>
  protected static string DecodeCss(string css)
  {
      var decoded = CssUnicodeEscapes.Replace(css, m =>
      {
          if (m.Groups[1].Success)
          {
              // Group 1 holds the hexadecimal digits of a numeric escape.
              var codePoint = int.Parse(m.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture);

              if (codePoint <= char.MaxValue)
              {
                  // Fits in one UTF-16 code unit: keep the existing single-char behavior.
                  return ((char)codePoint).ToString();
              }

              // Supplementary plane: a plain (char) cast would silently drop the high bits
              // and produce an unrelated character.
              return codePoint <= 0x10FFFF ? char.ConvertFromUtf32(codePoint) : "\uFFFD";
          }

          // Group 2 holds the character of a non-hex escape; an escaped backslash stays escaped.
          var t = m.Groups[2].Value;
          return t == "\\" ? @"\\" : t;
      });

      return CssComments.Replace(decoded, m => "");
  }
  // Captures everything before the first colon (literal or written as a numeric HTML entity)
  // as the scheme candidate, as long as no '/' or '#' precedes it.
  private static readonly Regex SchemeRegex = new(@"^([^\/#]*?)(?:\:|&#0*58|&#x0*3a)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

  /// <summary>
  /// Attempts to build a safe <see cref="Iri"/> from a string.
  /// </summary>
  /// <remarks>
  /// Leading whitespace is trimmed first. If the URL carries a scheme, it is accepted only when
  /// that scheme is contained in <see cref="AllowedSchemes"/> (ordinal, case-insensitive);
  /// a URL without a scheme is always accepted.
  /// </remarks>
  /// <param name="url">The URL.</param>
  /// <returns>The <see cref="Iri"/> object or null if no safe <see cref="Iri"/> can be created.</returns>
  protected Iri? GetSafeIri(string url)
  {
      var trimmed = url.TrimStart();
      var match = SchemeRegex.Match(trimmed);

      if (!match.Success)
      {
          // No scheme present.
          return new Iri(trimmed);
      }

      var scheme = match.Groups[1].Value;
      if (AllowedSchemes.Contains(scheme, StringComparer.OrdinalIgnoreCase))
      {
          return new Iri(trimmed, scheme);
      }

      return null;
  }
  /// <summary>
  /// Sanitizes a URL and lets the URL filter hook (<c>OnFilteringUrl</c>) adjust the result.
  /// </summary>
  /// <remarks>
  /// A non-absolute URL is resolved against <paramref name="baseUrl"/> when one is supplied.
  /// If the base URL is not a valid absolute URI, or resolution fails with a
  /// <see cref="UriFormatException"/>, the hook is invoked with a <c>null</c> sanitized URL.
  /// </remarks>
  /// <param name="element">The tag containing the URL being sanitized.</param>
  /// <param name="url">The URL.</param>
  /// <param name="baseUrl">The base URL relative URLs are resolved against (empty or null for no resolution).</param>
  /// <returns>The sanitized URL or <c>null</c> if no safe URL can be created.</returns>
  protected virtual string? SanitizeUrl(IElement element, string url, string baseUrl)
  {
      var safeIri = GetSafeIri(url);

      if (safeIri != null && !safeIri.IsAbsolute && !string.IsNullOrEmpty(baseUrl))
      {
          // resolve relative URI
          if (!Uri.TryCreate(baseUrl, UriKind.Absolute, out Uri? baseUri))
          {
              safeIri = null;
          }
          else
          {
              try
              {
                  var resolved = new Uri(baseUri, safeIri.Value).AbsoluteUri;
                  var resolvedArgs = new FilterUrlEventArgs(element, url, resolved);
                  OnFilteringUrl(resolvedArgs);
                  return resolvedArgs.SanitizedUrl;
              }
              catch (UriFormatException)
              {
                  safeIri = null;
              }
          }
      }

      var args = new FilterUrlEventArgs(element, url, safeIri?.Value);
      OnFilteringUrl(args);
      return args.SanitizedUrl;
  }
  /// <summary>
  /// Removes a tag from the document unless the removal is cancelled through
  /// <c>OnRemovingTag</c>.
  /// </summary>
  /// <remarks>
  /// When <see cref="KeepChildNodes"/> is set and the tag has child nodes, the tag is replaced
  /// by its children; otherwise the tag is removed.
  /// </remarks>
  /// <param name="tag">Tag to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveTag(IElement tag, RemoveReason reason)
  {
      var args = new RemovingTagEventArgs(tag, reason);
      OnRemovingTag(args);

      if (args.Cancel)
      {
          return;
      }

      if (KeepChildNodes && tag.HasChildNodes)
      {
          tag.Replace([.. tag.ChildNodes]);
      }
      else
      {
          tag.Remove();
      }
  }
  /// <summary>
  /// Removes an attribute from its tag unless the removal is cancelled through
  /// <c>OnRemovingAttribute</c>.
  /// </summary>
  /// <param name="tag">Tag the attribute belongs to.</param>
  /// <param name="attribute">Attribute to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveAttribute(IElement tag, IAttr attribute, RemoveReason reason)
  {
      var args = new RemovingAttributeEventArgs(tag, attribute, reason);
      OnRemovingAttribute(args);

      if (args.Cancel)
      {
          return;
      }

      tag.RemoveAttribute(attribute.Name);
  }
  /// <summary>
  /// Removes a style property from a style declaration unless the removal is cancelled through
  /// <c>OnRemovingStyle</c>.
  /// </summary>
  /// <param name="tag">Tag the style belongs to.</param>
  /// <param name="styles">Style rule that contains the style to be removed.</param>
  /// <param name="style">Style to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveStyle(IElement tag, ICssStyleDeclaration styles, ICssProperty style, RemoveReason reason)
  {
      var args = new RemovingStyleEventArgs(tag, style, reason);
      OnRemovingStyle(args);

      if (args.Cancel)
      {
          return;
      }

      styles.RemoveProperty(style.Name);
  }
  /// <summary>
  /// Announces the removal of an at-rule through <c>OnRemovingAtRule</c> and reports whether
  /// the removal may proceed. The rule itself is not removed here.
  /// </summary>
  /// <param name="tag">Tag the style belongs to.</param>
  /// <param name="rule">Rule to be removed.</param>
  /// <returns><c>true</c>, if the rule can be removed; <c>false</c>, otherwise.</returns>
  private bool RemoveAtRule(IElement tag, ICssRule rule)
  {
      var args = new RemovingAtRuleEventArgs(tag, rule);
      OnRemovingAtRule(args);

      if (args.Cancel)
      {
          return false;
      }

      return true;
  }
  /// <summary>
  /// Removes a CSS class from the class list of a tag unless the removal is cancelled through
  /// <c>OnRemovingCssClass</c>.
  /// </summary>
  /// <param name="tag">Tag the class belongs to.</param>
  /// <param name="cssClass">Class to be removed.</param>
  /// <param name="reason">Reason for removal.</param>
  private void RemoveCssClass(IElement tag, string cssClass, RemoveReason reason)
  {
      var args = new RemovingCssClassEventArgs(tag, cssClass, reason);
      OnRemovingCssClass(args);

      if (args.Cancel)
      {
          return;
      }

      tag.ClassList.Remove(cssClass);
  }
  851. }