// IHtmlSanitizer.cs
  1. using AngleSharp;
  2. using AngleSharp.Css.Dom;
  3. using AngleSharp.Html.Dom;
  4. using AngleSharp.Html.Parser;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Text.RegularExpressions;
  8. namespace Ganss.Xss;
  /// <summary>
  /// Contract to be implemented by an HtmlSanitizer class: a component that cleans HTML documents
  /// and fragments from constructs that can lead to
  /// <a href="https://en.wikipedia.org/wiki/Cross-site_scripting">XSS attacks</a>.
  /// </summary>
  public interface IHtmlSanitizer
  {
      // ------------------------------------------------------------------
      // Configuration
      // ------------------------------------------------------------------

      /// <summary>
      /// Gets or sets a value indicating whether the child nodes of a removed element are kept.
      /// </summary>
      bool KeepChildNodes { get; set; }

      /// <summary>
      /// Gets or sets the factory delegate that creates the <see cref="HtmlParser"/>
      /// used for parsing the input.
      /// </summary>
      Func<HtmlParser> HtmlParserFactory { get; set; }

      /// <summary>
      /// Gets or sets the <see cref="IMarkupFormatter"/> used for generating output.
      /// </summary>
      IMarkupFormatter OutputFormatter { get; set; }

      /// <summary>
      /// Gets the CSS at-rules that are allowed, such as "@media" and "@font-face".
      /// </summary>
      /// <value>
      /// The set of allowed CSS at-rules.
      /// </value>
      ISet<CssRuleType> AllowedAtRules { get; }

      /// <summary>
      /// Gets the URI schemes that are allowed, such as "http" and "https".
      /// </summary>
      /// <value>
      /// The set of allowed URI schemes.
      /// </value>
      ISet<string> AllowedSchemes { get; }

      /// <summary>
      /// Gets the HTML tag names that are allowed, such as "a" and "div".
      /// </summary>
      /// <value>
      /// The set of allowed tag names.
      /// </value>
      ISet<string> AllowedTags { get; }

      /// <summary>
      /// Gets the HTML attributes that are allowed, such as "href" and "alt".
      /// </summary>
      /// <value>
      /// The set of allowed HTML attributes.
      /// </value>
      ISet<string> AllowedAttributes { get; }

      /// <summary>
      /// Gets or sets a value indicating whether all HTML5 data attributes
      /// (the attributes prefixed with <c>data-</c>) are allowed.
      /// </summary>
      bool AllowDataAttributes { get; set; }

      /// <summary>
      /// Gets the HTML attributes that can contain a URI, such as "href".
      /// </summary>
      /// <value>
      /// The set of URI attributes.
      /// </value>
      ISet<string> UriAttributes { get; }

      /// <summary>
      /// Gets the CSS properties that are allowed, such as "font" and "margin".
      /// </summary>
      /// <value>
      /// The set of allowed CSS properties.
      /// </value>
      ISet<string> AllowedCssProperties { get; }

      /// <summary>
      /// Gets or sets the regular expression that a CSS property value must not match
      /// in order to be legal.
      /// </summary>
      /// <value>
      /// The regex.
      /// </value>
      Regex DisallowCssPropertyValue { get; set; }

      /// <summary>
      /// Gets the CSS classes that are allowed. If the set is empty, all classes are allowed.
      /// </summary>
      /// <value>
      /// The set of allowed CSS classes. An empty set means all classes are allowed.
      /// </value>
      ISet<string> AllowedClasses { get; }

      // ------------------------------------------------------------------
      // Events
      // ------------------------------------------------------------------

      /// <summary>
      /// Raised after the document has been sanitized and its nodes have been post-processed.
      /// </summary>
      event EventHandler<PostProcessDomEventArgs> PostProcessDom;

      /// <summary>
      /// Raised for every node after sanitizing.
      /// </summary>
      event EventHandler<PostProcessNodeEventArgs> PostProcessNode;

      /// <summary>
      /// Raised before a tag is removed.
      /// </summary>
      event EventHandler<RemovingTagEventArgs> RemovingTag;

      /// <summary>
      /// Raised before an attribute is removed.
      /// </summary>
      event EventHandler<RemovingAttributeEventArgs> RemovingAttribute;

      /// <summary>
      /// Raised before a style is removed.
      /// </summary>
      event EventHandler<RemovingStyleEventArgs> RemovingStyle;

      /// <summary>
      /// Raised before an at-rule is removed.
      /// </summary>
      event EventHandler<RemovingAtRuleEventArgs> RemovingAtRule;

      /// <summary>
      /// Raised before a comment is removed.
      /// </summary>
      event EventHandler<RemovingCommentEventArgs> RemovingComment;

      /// <summary>
      /// Raised before a CSS class is removed.
      /// </summary>
      event EventHandler<RemovingCssClassEventArgs> RemovingCssClass;

      /// <summary>
      /// Raised when a URL is being sanitized.
      /// </summary>
      event EventHandler<FilterUrlEventArgs>? FilterUrl;

      // ------------------------------------------------------------------
      // Sanitizing operations
      // ------------------------------------------------------------------

      /// <summary>
      /// Sanitizes the given HTML.
      /// </summary>
      /// <param name="html">The HTML to sanitize.</param>
      /// <param name="baseUrl">The base URL that relative URLs are resolved against. No resolution takes place if empty.</param>
      /// <param name="outputFormatter">The formatter used to render the DOM. The default formatter is used if null.</param>
      /// <returns>The sanitized HTML.</returns>
      string Sanitize(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null);

      /// <summary>
      /// Sanitizes the given HTML body fragment. If a document is given, only the body part will be returned.
      /// </summary>
      /// <param name="html">The HTML body fragment to sanitize.</param>
      /// <param name="baseUrl">The base URL that relative URLs are resolved against. No resolution takes place if empty.</param>
      /// <returns>The sanitized HTML document.</returns>
      IHtmlDocument SanitizeDom(string html, string baseUrl = "");

      /// <summary>
      /// Sanitizes the given, already parsed HTML body fragment.
      /// If the document has not been parsed with CSS support then all styles will be removed.
      /// </summary>
      /// <param name="document">The parsed HTML document.</param>
      /// <param name="context">The node within which to sanitize.</param>
      /// <param name="baseUrl">The base URL that relative URLs are resolved against. No resolution takes place if empty.</param>
      /// <returns>The sanitized HTML document.</returns>
      IHtmlDocument SanitizeDom(IHtmlDocument document, IHtmlElement? context = null, string baseUrl = "");

      /// <summary>
      /// Sanitizes the given HTML document. Even if only a fragment is given, a whole document will be returned.
      /// </summary>
      /// <param name="html">The HTML document to sanitize.</param>
      /// <param name="baseUrl">The base URL that relative URLs are resolved against. No resolution takes place if empty.</param>
      /// <param name="outputFormatter">The formatter used to render the DOM. The <see cref="OutputFormatter"/> is used if null.</param>
      /// <returns>The sanitized HTML document.</returns>
      string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null);
  }