using AngleSharp;
using AngleSharp.Css;
using AngleSharp.Css.Dom;
using AngleSharp.Css.Parser;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace Ganss.Xss;
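/// <summary>
/// Cleans HTML documents and fragments from constructs that can lead to XSS attacks.
/// </summary>
/// <remarks>
/// XSS attacks can occur at several levels within an HTML document or fragment:
/// <list type="bullet">
/// <item>HTML tags (e.g. the &lt;script&gt; tag)</item>
/// <item>HTML attributes (e.g. the "onload" attribute)</item>
/// <item>CSS styles (url property values)</item>
/// <item>malformed HTML or HTML that exploits parser bugs in specific browsers</item>
/// </list>
/// <para>
/// The HtmlSanitizer class addresses all of these possible attack vectors by using a sophisticated HTML parser (AngleSharp).
/// </para>
/// <para>
/// In order to facilitate different use cases, HtmlSanitizer can be customized at the levels mentioned above:
/// <list type="bullet">
/// <item>You can specify the allowed HTML tags through the property <see cref="AllowedTags"/>. All other tags will be stripped.</item>
/// <item>You can specify the allowed HTML attributes through the property <see cref="AllowedAttributes"/>. All other attributes will be stripped.</item>
/// <item>You can specify the allowed CSS property names through the property <see cref="AllowedCssProperties"/>. All other styles will be stripped.</item>
/// <item>You can specify the allowed URI schemes through the property <see cref="AllowedSchemes"/>. All other URIs will be stripped.</item>
/// <item>You can specify the HTML attributes that contain URIs (such as "src", "href" etc.) through the property <see cref="UriAttributes"/>.</item>
/// </list>
/// </para>
/// </remarks>
/// <example>
/// <code>
/// <![CDATA[
/// var sanitizer = new HtmlSanitizer();
/// var html = @"<script>alert('xss')</script><div onload=""alert('xss')"" style=""background-color: test"">Test<img src=""test.gif"" style=""background-image: url(javascript:alert('xss')); margin: 10px""></div>";
/// var sanitized = sanitizer.Sanitize(html, "http://www.example.com");
/// // -> "<div style="background-color: test">Test<img style="margin: 10px" src="http://www.example.com/test.gif"></div>"
/// ]]>
/// </code>
/// </example>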
public class HtmlSanitizer : IHtmlSanitizer
{
// Name of the inline style attribute that SanitizeStyle reads and rewrites.
private const string StyleAttributeName = "style";
// from http://genshi.edgewall.org/
// Matches a CSS escape: a backslash followed by 1-6 hex digits and one optional
// whitespace character (group 1), or a backslash followed by a single character
// outside the excluded set (group 2). Used by DecodeCss.
private static readonly Regex CssUnicodeEscapes = new(@"\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'""{};:()#*])", RegexOptions.Compiled);
// Matches a /* ... */ comment (lazy). RegexOptions.Singleline is not set, so '.'
// does not match a newline and a comment spanning several lines is not matched.
private static readonly Regex CssComments = new(@"/\*.*?\*/", RegexOptions.Compiled);
// IE6
// Matches the word "expression" where each letter may be its ASCII form or one of
// the listed Unicode look-alike code points; checked against decoded property values.
private static readonly Regex CssExpression = new(@"[eE\uFF25\uFF45][xX\uFF38\uFF58][pP\uFF30\uFF50][rR\u0280\uFF32\uFF52][eE\uFF25\uFF45][sS\uFF33\uFF53]{2}[iI\u026A\uFF29\uFF49][oO\uFF2F\uFF4F][nN\u0274\uFF2E\uFF4E]", RegexOptions.Compiled);
// Matches "url(" plus its argument: group 1 = optional opening quote, group 2 = the
// URL text, group 3 = optional closing quote. The closing parenthesis is not part
// of the match.
private static readonly Regex CssUrl = new(@"[Uu][Rr\u0280][Ll\u029F]\((['""]?)([^'"")]+)(['""]?)", RegexOptions.Compiled);
// Matches any run of whitespace; used to strip all whitespace from a property value
// before URL extraction.
private static readonly Regex WhitespaceRegex = new(@"\s*", RegexOptions.Compiled);
// AngleSharp configuration with CSS support. Unknown declarations and rules are kept
// and invalid selectors tolerated, so they are present in the parsed CSS for the
// sanitizer to inspect.
private static readonly IConfiguration defaultConfiguration = Configuration.Default.WithCss(new CssParserOptions
{
IsIncludingUnknownDeclarations = true,
IsIncludingUnknownRules = true,
IsToleratingInvalidSelectors = true,
});
// Shared parser instance returned by the default parser factory; parses with
// scripting enabled and the CSS-aware configuration above.
private static readonly HtmlParser defaultHtmlParser = new(new HtmlParserOptions { IsScripting = true }, BrowsingContext.New(defaultConfiguration));
/// <summary>
/// Initializes a new instance of the <see cref="HtmlSanitizer"/> class
/// with the default options.
/// </summary>
/// <remarks>
/// Tags, schemes, attributes, URI attributes and CSS property names are stored in
/// case-insensitive sets. At-rules and CSS classes use the default comparer.
/// </remarks>
public HtmlSanitizer()
{
    AllowedTags = new HashSet<string>(HtmlSanitizerDefaults.AllowedTags, StringComparer.OrdinalIgnoreCase);
    AllowedSchemes = new HashSet<string>(HtmlSanitizerDefaults.AllowedSchemes, StringComparer.OrdinalIgnoreCase);
    AllowedAttributes = new HashSet<string>(HtmlSanitizerDefaults.AllowedAttributes, StringComparer.OrdinalIgnoreCase);
    UriAttributes = new HashSet<string>(HtmlSanitizerDefaults.UriAttributes, StringComparer.OrdinalIgnoreCase);
    AllowedCssProperties = new HashSet<string>(HtmlSanitizerDefaults.AllowedCssProperties, StringComparer.OrdinalIgnoreCase);
    AllowedAtRules = new HashSet<CssRuleType>(HtmlSanitizerDefaults.AllowedAtRules);
    AllowedClasses = new HashSet<string>(HtmlSanitizerDefaults.AllowedClasses);
}
/// <summary>
/// Initializes a new instance of the <see cref="HtmlSanitizer"/> class
/// with the given options.
/// </summary>
/// <param name="options">Options to control the sanitizing.</param>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
public HtmlSanitizer(HtmlSanitizerOptions options)
{
    // Fail with a descriptive exception instead of a NullReferenceException on first use.
    if (options is null)
        throw new ArgumentNullException(nameof(options));

    // Each collection is copied, so later changes to the options object do not
    // affect this sanitizer.
    AllowedTags = new HashSet<string>(options.AllowedTags, StringComparer.OrdinalIgnoreCase);
    AllowedSchemes = new HashSet<string>(options.AllowedSchemes, StringComparer.OrdinalIgnoreCase);
    AllowedAttributes = new HashSet<string>(options.AllowedAttributes, StringComparer.OrdinalIgnoreCase);
    UriAttributes = new HashSet<string>(options.UriAttributes, StringComparer.OrdinalIgnoreCase);
    AllowedClasses = new HashSet<string>(options.AllowedCssClasses, StringComparer.OrdinalIgnoreCase);
    AllowedCssProperties = new HashSet<string>(options.AllowedCssProperties, StringComparer.OrdinalIgnoreCase);
    AllowedAtRules = new HashSet<CssRuleType>(options.AllowedAtRules);
    AllowCssCustomProperties = options.AllowCssCustomProperties;
    AllowDataAttributes = options.AllowDataAttributes;
}
/// <summary>
/// Gets or sets the method that encodes comments. It is invoked for every comment
/// node before the <see cref="RemovingComment"/> event is raised.
/// </summary>
public Action<IComment> EncodeComment { get; set; } = DefaultEncodeComment;
/// <summary>
/// Gets or sets the method that encodes the content of literal text elements.
/// </summary>
public Action<IElement> EncodeLiteralTextElementContent { get; set; } = DefaultEncodeLiteralTextElementContent;
/// <summary>
/// Gets or sets the default value indicating whether to keep child nodes of elements that are removed. Default is false.
/// </summary>
/// <remarks>
/// The value is read once, when a sanitizer instance is constructed.
/// </remarks>
public static bool DefaultKeepChildNodes { get; set; } = false;
/// <summary>
/// Gets or sets a value indicating whether to keep child nodes of elements that are removed. Default is <see cref="DefaultKeepChildNodes"/>.
/// </summary>
public bool KeepChildNodes { get; set; } = DefaultKeepChildNodes;
/// <summary>
/// Gets or sets the default object that creates the parser used for parsing the input.
/// </summary>
public static Func<IHtmlParser> DefaultHtmlParserFactory { get; set; } = () => defaultHtmlParser;
/// <summary>
/// Gets or sets the object that creates the parser used for parsing the input.
/// Default is <see cref="DefaultHtmlParserFactory"/>.
/// </summary>
public Func<IHtmlParser> HtmlParserFactory { get; set; } = DefaultHtmlParserFactory;
/// <summary>
/// Gets or sets the default object used for generating output. Default is <see cref="HtmlFormatter.Instance"/>.
/// </summary>
public static IMarkupFormatter DefaultOutputFormatter { get; set; } = HtmlFormatter.Instance;
/// <summary>
/// Gets or sets the object used for generating output. Default is <see cref="DefaultOutputFormatter"/>.
/// </summary>
public IMarkupFormatter OutputFormatter { get; set; } = DefaultOutputFormatter;
/// <summary>
/// Gets or sets the default object used for generating CSS output. Default is <see cref="CssStyleFormatter.Instance"/>.
/// </summary>
public static IStyleFormatter DefaultStyleFormatter { get; set; } = CssStyleFormatter.Instance;
/// <summary>
/// Gets or sets the object used for generating CSS output. Default is <see cref="DefaultStyleFormatter"/>.
/// </summary>
public IStyleFormatter StyleFormatter { get; set; } = DefaultStyleFormatter;
/// <summary>
/// Gets the allowed CSS at-rules such as "@media" and "@font-face".
/// </summary>
/// <value>
/// The allowed CSS at-rules. The set itself can be modified; it cannot be replaced.
/// </value>
public ISet<CssRuleType> AllowedAtRules { get; private set; }
/// <summary>
/// Gets the allowed URI schemes such as "http" and "https".
/// </summary>
/// <value>
/// The allowed URI schemes.
/// </value>
public ISet<string> AllowedSchemes { get; private set; }
/// <summary>
/// Gets the allowed HTML tag names such as "a" and "div".
/// </summary>
/// <value>
/// The allowed tag names.
/// </value>
public ISet<string> AllowedTags { get; private set; }
/// <summary>
/// Gets the allowed HTML attributes such as "href" and "alt".
/// </summary>
/// <value>
/// The allowed HTML attributes.
/// </value>
public ISet<string> AllowedAttributes { get; private set; }
/// <summary>
/// Allow all HTML5 data attributes; the attributes prefixed with data-.
/// </summary>
public bool AllowDataAttributes { get; set; }
/// <summary>
/// Gets the HTML attributes that can contain a URI such as "href".
/// </summary>
/// <value>
/// The URI attributes.
/// </value>
public ISet<string> UriAttributes { get; private set; }
/// <summary>
/// Gets the allowed CSS properties such as "font" and "margin".
/// </summary>
/// <value>
/// The allowed CSS properties.
/// </value>
public ISet<string> AllowedCssProperties { get; private set; }
/// <summary>
/// Allow all custom CSS properties (variables) prefixed with --.
/// </summary>
public bool AllowCssCustomProperties { get; set; }
/// <summary>
/// Gets or sets a regex that must not match for legal CSS property values.
/// </summary>
/// <value>
/// The regex. Default is <see cref="DefaultDisallowedCssPropertyValue"/>.
/// </value>
public Regex DisallowCssPropertyValue { get; set; } = DefaultDisallowedCssPropertyValue;
/// <summary>
/// Gets the allowed CSS classes. If the set is empty, all classes will be allowed.
/// </summary>
/// <value>
/// The allowed CSS classes. An empty set means all classes are allowed.
/// </value>
public ISet<string> AllowedClasses { get; private set; }
/// <summary>
/// Occurs after sanitizing the document and post processing nodes.
/// </summary>
public event EventHandler<PostProcessDomEventArgs>? PostProcessDom;
/// <summary>
/// Occurs for every node after sanitizing.
/// </summary>
public event EventHandler<PostProcessNodeEventArgs>? PostProcessNode;
/// <summary>
/// Occurs before a tag is removed.
/// </summary>
public event EventHandler<RemovingTagEventArgs>? RemovingTag;
/// <summary>
/// Occurs before an attribute is removed.
/// </summary>
public event EventHandler<RemovingAttributeEventArgs>? RemovingAttribute;
/// <summary>
/// Occurs before a style is removed.
/// </summary>
public event EventHandler<RemovingStyleEventArgs>? RemovingStyle;
/// <summary>
/// Occurs before an at-rule is removed.
/// </summary>
public event EventHandler<RemovingAtRuleEventArgs>? RemovingAtRule;
/// <summary>
/// Occurs before a comment is removed.
/// </summary>
public event EventHandler<RemovingCommentEventArgs>? RemovingComment;
/// <summary>
/// Occurs before a CSS class is removed.
/// </summary>
public event EventHandler<RemovingCssClassEventArgs>? RemovingCssClass;
/// <summary>
/// Occurs when a URL is being sanitized.
/// </summary>
public event EventHandler<FilterUrlEventArgs>? FilterUrl;
/// <summary>
/// Raises the <see cref="PostProcessDom"/> event.
/// </summary>
/// <param name="e">The <see cref="PostProcessDomEventArgs"/> instance containing the event data.</param>
protected virtual void OnPostProcessDom(PostProcessDomEventArgs e) => PostProcessDom?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="PostProcessNode"/> event.
/// </summary>
/// <param name="e">The <see cref="PostProcessNodeEventArgs"/> instance containing the event data.</param>
protected virtual void OnPostProcessNode(PostProcessNodeEventArgs e) => PostProcessNode?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="RemovingTag"/> event.
/// </summary>
/// <param name="e">The <see cref="RemovingTagEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingTag(RemovingTagEventArgs e) => RemovingTag?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="RemovingAttribute"/> event.
/// </summary>
/// <param name="e">The <see cref="RemovingAttributeEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingAttribute(RemovingAttributeEventArgs e) => RemovingAttribute?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="RemovingStyle"/> event.
/// </summary>
/// <param name="e">The <see cref="RemovingStyleEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingStyle(RemovingStyleEventArgs e) => RemovingStyle?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="RemovingAtRule"/> event.
/// </summary>
/// <param name="e">The <see cref="RemovingAtRuleEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingAtRule(RemovingAtRuleEventArgs e) => RemovingAtRule?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="RemovingComment"/> event.
/// </summary>
/// <param name="e">The <see cref="RemovingCommentEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingComment(RemovingCommentEventArgs e) => RemovingComment?.Invoke(this, e);
/// <summary>
/// The default regex for disallowed CSS property values. It matches any value that
/// contains a '&lt;' or '&gt;' character.
/// </summary>
public static readonly Regex DefaultDisallowedCssPropertyValue = new(@"[<>]", RegexOptions.Compiled);
/// <summary>
/// Raises the <see cref="RemovingCssClass"/> event.
/// </summary>
/// <param name="e">The <see cref="RemovingCssClassEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingCssClass(RemovingCssClassEventArgs e) => RemovingCssClass?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="FilterUrl"/> event.
/// </summary>
/// <param name="e">The <see cref="FilterUrlEventArgs"/> instance containing the event data.</param>
protected virtual void OnFilteringUrl(FilterUrlEventArgs e) => FilterUrl?.Invoke(this, e);
/// <summary>
/// Return all nested subnodes of a node. The nodes are returned in DOM order.
/// The root node itself is not returned.
/// </summary>
/// <param name="dom">The root node.</param>
/// <returns>All nested subnodes.</returns>
private static IEnumerable<INode> GetAllNodes(INode dom)
{
    if (dom.ChildNodes.Length == 0) yield break;

    // Explicit stack instead of recursion. Children are pushed last-to-first so
    // that they are popped, and therefore yielded, first-to-last.
    var pending = new Stack<INode>();
    for (var i = dom.ChildNodes.Length - 1; i >= 0; i--)
    {
        pending.Push(dom.ChildNodes[i]);
    }

    while (pending.Count > 0)
    {
        var current = pending.Pop();
        yield return current;

        for (var i = current.ChildNodes.Length - 1; i >= 0; i--)
        {
            pending.Push(current.ChildNodes[i]);
        }
    }
}
/// <summary>
/// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned.
/// </summary>
/// <param name="html">The HTML body fragment to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
/// <returns>The sanitized HTML body fragment, or an empty string if the parsed document has no body.</returns>
public string Sanitize(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
{
    using var dom = SanitizeDom(html, baseUrl);

    var formatter = outputFormatter ?? OutputFormatter;
    return dom.Body == null
        ? string.Empty
        : dom.Body.ChildNodes.ToHtml(formatter);
}
/// <summary>
/// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned.
/// </summary>
/// <param name="html">The HTML body fragment to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <returns>The sanitized HTML document. The caller owns it and should dispose it.</returns>
public IHtmlDocument SanitizeDom(string html, string baseUrl = "")
{
    var parser = HtmlParserFactory();

    // Prefix the input with an explicit doctype/html/body so the parser is already
    // in body context when it reaches the caller's markup, as the summary promises.
    // Concatenation also turns a null input into an empty fragment.
    var dom = parser.ParseDocument("<!doctype html><html><body>" + html);

    if (dom.Body != null)
        DoSanitize(dom, dom.Body, baseUrl);

    return dom;
}
/// <summary>
/// Sanitizes the specified parsed HTML body fragment.
/// If the document has not been parsed with CSS support then all styles will be removed.
/// </summary>
/// <param name="document">The parsed HTML document.</param>
/// <param name="context">The node within which to sanitize. The whole document is used if null.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <returns>The sanitized HTML document (the same instance that was passed in).</returns>
public IHtmlDocument SanitizeDom(IHtmlDocument document, IHtmlElement? context = null, string baseUrl = "")
{
    IParentNode scope = document;
    if (context != null)
        scope = context;

    DoSanitize(document, scope, baseUrl);
    return document;
}
/// <summary>
/// Sanitizes the specified HTML document. Even if only a fragment is given, a whole document will be returned.
/// </summary>
/// <param name="html">The HTML document to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
/// <returns>The sanitized HTML document.</returns>
public string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
{
    using var dom = HtmlParserFactory().ParseDocument(html);

    // The document is both the owner and the sanitizing scope.
    DoSanitize(dom, dom, baseUrl);

    return dom.ToHtml(outputFormatter ?? OutputFormatter);
}
/// <summary>
/// Sanitizes the specified HTML document. Even if only a fragment is given, a whole document will be returned.
/// </summary>
/// <param name="html">A stream containing the HTML document to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
/// <returns>The sanitized HTML document.</returns>
public string SanitizeDocument(Stream html, string baseUrl = "", IMarkupFormatter? outputFormatter = null)
{
    using var dom = HtmlParserFactory().ParseDocument(html);

    // The document is both the owner and the sanitizing scope.
    DoSanitize(dom, dom, baseUrl);

    return dom.ToHtml(outputFormatter ?? OutputFormatter);
}
/// <summary>
/// Removes all comment nodes below the given node.
/// </summary>
/// <remarks>
/// Each comment is first passed to <see cref="EncodeComment"/>, so a comment that is
/// kept because a <see cref="RemovingComment"/> handler cancelled the removal has
/// still been encoded.
/// </remarks>
/// <param name="context">The node within which to remove comments.</param>
private void RemoveComments(INode context)
{
    // Materialize the list first: removing nodes while walking the tree would
    // invalidate the traversal.
    foreach (var comment in GetAllNodes(context).OfType<IComment>().ToList())
    {
        EncodeComment(comment);

        var e = new RemovingCommentEventArgs(comment);
        OnRemovingComment(e);

        if (!e.Cancel)
            comment.Remove();
    }
}
/// <summary>
/// Default comment encoder: replaces '&lt;' and '&gt;' in the comment text with the
/// entities "&amp;lt;" and "&amp;gt;".
/// </summary>
/// <param name="comment">The comment node to encode in place.</param>
private static void DefaultEncodeComment(IComment comment)
{
    var escapedText = comment.TextContent.Replace("<", "&lt;").Replace(">", "&gt;");

    // Only write back when something changed, to avoid a needless DOM mutation.
    if (escapedText != comment.TextContent)
        comment.TextContent = escapedText;
}
/// <summary>
/// Default encoder for literal text elements: replaces '&lt;' and '&gt;' in the
/// element's inner HTML with the entities "&amp;lt;" and "&amp;gt;".
/// </summary>
/// <param name="tag">The element whose content is encoded in place.</param>
private static void DefaultEncodeLiteralTextElementContent(IElement tag)
{
    var escapedHtml = tag.InnerHtml.Replace("<", "&lt;").Replace(">", "&gt;");

    if (escapedHtml != tag.InnerHtml)
        tag.InnerHtml = escapedHtml;

    if (tag.InnerHtml != escapedHtml) // setting InnerHtml does not work for noscript
        tag.SetInnerText(escapedHtml);
}
/// <summary>
/// Runs the sanitizing pipeline on <paramref name="context"/>: removes disallowed
/// tags, encodes literal text content, sanitizes style sheets, cleans up attributes
/// (allow list, URLs, inline style, "&amp;{" values, CSS classes), removes comments
/// and finally runs post processing.
/// </summary>
/// <param name="dom">The document that owns <paramref name="context"/>.</param>
/// <param name="context">The node within which elements are sanitized.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
private void DoSanitize(IHtmlDocument dom, IParentNode context, string baseUrl = "")
{
// remove disallowed tags
// (snapshot with ToList because RemoveTag mutates the tree being queried)
foreach (var tag in context.QuerySelectorAll("*").Where(t => !IsAllowedTag(t)).ToList())
{
RemoveTag(tag, RemoveReason.NotAllowedTag);
}
// always encode text in raw data content
// (style elements are skipped here; their content is rewritten by SanitizeStyleSheets)
foreach (var tag in context.QuerySelectorAll("*")
.Where(t => t is not IHtmlStyleElement
&& t.Flags.HasFlag(NodeFlags.LiteralText)
&& !string.IsNullOrWhiteSpace(t.InnerHtml)))
{
EncodeLiteralTextElementContent(tag);
}
// note: style sheets are taken from the whole document, not only from context
SanitizeStyleSheets(dom, baseUrl);
// cleanup attributes
foreach (var tag in context.QuerySelectorAll("*").ToList())
{
// remove disallowed attributes
foreach (var attribute in tag.Attributes.Where(a => !IsAllowedAttribute(a)).ToList())
{
RemoveAttribute(tag, attribute, RemoveReason.NotAllowedAttribute);
}
// sanitize URLs in URL-marked attributes
foreach (var attribute in tag.Attributes.Where(IsUriAttribute).ToList())
{
var url = SanitizeUrl(tag, attribute.Value, baseUrl);
if (url == null)
RemoveAttribute(tag, attribute, RemoveReason.NotAllowedUrlValue);
else
tag.SetAttribute(attribute.Name, url);
}
// sanitize the style attribute
// remember whether the style was empty before sanitizing, so that only a style
// attribute emptied by sanitizing is removed below
var oldStyleEmpty = string.IsNullOrEmpty(tag.GetAttribute(StyleAttributeName));
SanitizeStyle(tag, baseUrl);
// sanitize the value of the attributes
foreach (var attribute in tag.Attributes.ToList())
{
// The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
// (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
if (attribute.Value.Contains("&{"))
{
RemoveAttribute(tag, attribute, RemoveReason.NotAllowedValue);
}
else
{
// an empty AllowedClasses set means all classes are allowed, so filtering is skipped
if (AllowedClasses.Any() && attribute.Name == "class")
{
var removedClasses = tag.ClassList.Except(AllowedClasses).ToArray();
foreach (var removedClass in removedClasses)
RemoveCssClass(tag, removedClass, RemoveReason.NotAllowedCssClass);
// drop the class attribute entirely once no class is left
if (tag.ClassList.Length == 0)
RemoveAttribute(tag, attribute, RemoveReason.ClassAttributeEmpty);
}
else if (!oldStyleEmpty && attribute.Name == StyleAttributeName && string.IsNullOrEmpty(attribute.Value))
{
RemoveAttribute(tag, attribute, RemoveReason.StyleAttributeEmpty);
}
}
}
}
// comments are only removed when the context is also an INode
if (context is INode node)
{
RemoveComments(node);
}
DoPostProcess(dom, context as INode);
}
/// <summary>
/// Sanitizes every CSS style sheet of the document and writes the resulting CSS
/// back into the owning element.
/// </summary>
/// <param name="dom">The document whose style sheets are sanitized.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
private void SanitizeStyleSheets(IHtmlDocument dom, string baseUrl)
{
    foreach (var styleSheet in dom.StyleSheets.OfType<ICssStyleSheet>())
    {
        var styleTag = styleSheet.OwnerNode;

        // Index loop because rules are removed in place: the index only advances
        // when the current rule is kept.
        var i = 0;
        while (i < styleSheet.Rules.Length)
        {
            var rule = styleSheet.Rules[i];
            if (!SanitizeStyleRule(rule, styleTag, baseUrl) && RemoveAtRule(styleTag, rule))
                styleSheet.RemoveAt(i);
            else i++;
        }

        // Replace '<' with its CSS escape so the serialized CSS cannot introduce markup.
        styleTag.InnerHtml = styleSheet.ToCss(StyleFormatter).Replace("<", "\\3c ");
    }
}
/// <summary>
/// Sanitizes a single CSS rule, recursing into grouping and keyframes rules.
/// </summary>
/// <param name="rule">The rule to sanitize.</param>
/// <param name="styleTag">The element that owns the style sheet; passed on to event arguments.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <returns>
/// false if the rule's type is not in <see cref="AllowedAtRules"/> (the caller decides
/// whether to remove it); otherwise true.
/// </returns>
private bool SanitizeStyleRule(ICssRule rule, IElement styleTag, string baseUrl)
{
    if (!AllowedAtRules.Contains(rule.Type)) return false;

    if (rule is ICssStyleRule styleRule)
    {
        SanitizeStyleDeclaration(styleTag, styleRule.Style, baseUrl);
    }
    else if (rule is ICssGroupingRule groupingRule)
    {
        // Index loop because child rules are removed in place: the index only
        // advances when the current child is kept.
        var i = 0;
        while (i < groupingRule.Rules.Length)
        {
            var childRule = groupingRule.Rules[i];
            if (!SanitizeStyleRule(childRule, styleTag, baseUrl) && RemoveAtRule(styleTag, childRule))
                groupingRule.RemoveAt(i);
            else i++;
        }
    }
    else if (rule is ICssPageRule pageRule)
    {
        SanitizeStyleDeclaration(styleTag, pageRule.Style, baseUrl);
    }
    else if (rule is ICssKeyframesRule keyFramesRule)
    {
        // Snapshot the keyframes first; entries are removed by key text while iterating.
        foreach (var childRule in keyFramesRule.Rules.OfType<ICssKeyframeRule>().ToList())
        {
            if (!SanitizeStyleRule(childRule, styleTag, baseUrl) && RemoveAtRule(styleTag, childRule))
                keyFramesRule.Remove(childRule.KeyText);
        }
    }
    else if (rule is ICssKeyframeRule keyFrameRule)
    {
        SanitizeStyleDeclaration(styleTag, keyFrameRule.Style, baseUrl);
    }

    return true;
}
/// <summary>
/// Performs post processing on all nodes in the document.
/// </summary>
/// <remarks>
/// Node-level work (including normalizing the document) only happens when
/// <see cref="PostProcessNode"/> has subscribers; <see cref="PostProcessDom"/>
/// is raised afterwards if it has subscribers.
/// </remarks>
/// <param name="dom">The HTML document.</param>
/// <param name="context">The node within which to post process all nodes.</param>
private void DoPostProcess(IHtmlDocument dom, INode? context)
{
    if (PostProcessNode != null)
    {
        dom.Normalize();

        if (context != null)
        {
            // Snapshot the nodes: handlers may replace nodes while we iterate.
            foreach (var node in GetAllNodes(context).ToList())
            {
                var nodeArgs = new PostProcessNodeEventArgs(dom, node);
                OnPostProcessNode(nodeArgs);

                if (nodeArgs.ReplacementNodes.Count == 0)
                    continue;

                ((IChildNode)node).Replace([.. nodeArgs.ReplacementNodes]);
            }
        }
    }

    if (PostProcessDom != null)
        OnPostProcessDom(new PostProcessDomEventArgs(dom));
}
/// <summary>
/// Determines whether the specified attribute can contain a URI.
/// </summary>
/// <param name="attribute">The attribute.</param>
/// <returns>true if the attribute can contain a URI; otherwise, false.</returns>
private bool IsUriAttribute(IAttr attribute) => UriAttributes.Contains(attribute.Name);
/// <summary>
/// Determines whether the specified tag is allowed.
/// </summary>
/// <param name="tag">The tag.</param>
/// <returns>true if the tag is allowed; otherwise, false.</returns>
private bool IsAllowedTag(IElement tag) => AllowedTags.Contains(tag.NodeName);
/// <summary>
/// Determines whether the specified attribute is allowed.
/// </summary>
/// <param name="attribute">The attribute.</param>
/// <returns>true if the attribute is allowed; otherwise, false.</returns>
private bool IsAllowedAttribute(IAttr attribute)
{
    if (AllowedAttributes.Contains(attribute.Name))
        return true;

    // test html5 data- attributes
    return AllowDataAttributes
        && attribute.Name != null
        && attribute.Name.StartsWith("data-", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Sanitizes the inline style (the "style" attribute) of an element.
/// </summary>
/// <param name="element">The element.</param>
/// <param name="baseUrl">The base URL.</param>
protected void SanitizeStyle(IElement element, string baseUrl)
{
// filter out invalid CSS declarations
// see https://github.com/AngleSharp/AngleSharp/issues/101
var attribute = element.GetAttribute(StyleAttributeName);
// nothing to do for elements without a style attribute
if (attribute == null)
return;
// no style declaration available for the element: drop the attribute
// (silently, without raising RemovingAttribute)
if (element.GetStyle() == null)
{
element.RemoveAttribute(StyleAttributeName);
return;
}
// rewrite the attribute from the parsed declaration, rendered with StyleFormatter
element.SetAttribute(StyleAttributeName, element.GetStyle().ToCss(StyleFormatter));
// fetch the declaration again after the attribute has been rewritten
var styles = element.GetStyle();
if (styles == null || styles.Length == 0)
return;
SanitizeStyleDeclaration(element, styles, baseUrl);
}
/// <summary>
/// Verify if the given CSS property name is allowed. By default this will
/// check if the property is in the <see cref="AllowedCssProperties"/> set,
/// or if the property is a custom property and <see cref="AllowCssCustomProperties"/> is true.
/// </summary>
/// <param name="propertyName">The name of the CSS property.</param>
/// <returns>true if the property is allowed; otherwise, false.</returns>
protected virtual bool IsAllowedCssProperty(string propertyName)
{
    // Ordinal comparison: the "--" prefix is a syntactic marker, and a culture-sensitive
    // StartsWith may ignore certain characters (e.g. zero-width ones), which would let
    // names that do not literally start with "--" pass as custom properties.
    return AllowedCssProperties.Contains(propertyName)
        || (AllowCssCustomProperties
            && propertyName != null
            && propertyName.StartsWith("--", StringComparison.Ordinal));
}
/// <summary>
/// Sanitizes all properties of a CSS style declaration: removes properties that are
/// not allowed or have a disallowed value, and sanitizes URLs found in the values.
/// </summary>
/// <param name="element">The element the declaration belongs to; passed on to event arguments.</param>
/// <param name="styles">The style declaration to sanitize in place.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
private void SanitizeStyleDeclaration(IElement element, ICssStyleDeclaration styles, string baseUrl)
{
    // Changes are collected and applied after the loop so the declaration is not
    // modified while it is being enumerated.
    var removeStyles = new List<Tuple<ICssProperty, RemoveReason>>();
    var setStyles = new Dictionary<string, string>();

    foreach (var style in styles)
    {
        // Work on the decoded name/value so escapes and comments cannot hide content.
        var key = DecodeCss(style.Name);
        var val = DecodeCss(style.Value);

        if (!IsAllowedCssProperty(key))
        {
            removeStyles.Add(Tuple.Create(style, RemoveReason.NotAllowedStyle));
            continue;
        }

        if (CssExpression.IsMatch(val) || DisallowCssPropertyValue.IsMatch(val))
        {
            removeStyles.Add(Tuple.Create(style, RemoveReason.NotAllowedValue));
            continue;
        }

        val = WhitespaceRegex.Replace(val, string.Empty);

        // Materialize once: SanitizeUrl raises FilterUrl, so a deferred query would
        // sanitize (and raise the event for) the same URL on every enumeration.
        var urls = CssUrl.Matches(val).Cast<Match>()
            .Select(m => (Match: m, Url: SanitizeUrl(element, m.Groups[2].Value, baseUrl)))
            .ToList();

        if (urls.Count == 0)
            continue;

        if (urls.Any(u => u.Url == null))
        {
            removeStyles.Add(Tuple.Create(style, RemoveReason.NotAllowedUrlValue));
            continue;
        }

        // Rebuild the value with every url(...) argument replaced by its sanitized form.
        var sb = new StringBuilder();
        var ix = 0;
        foreach (var url in urls)
        {
            sb.Append(val, ix, url.Match.Index - ix);
            sb.Append("url(");
            sb.Append(url.Match.Groups[1].Value);
            sb.Append(url.Url);
            sb.Append(url.Match.Groups[3].Value);
            ix = url.Match.Index + url.Match.Length;
        }
        sb.Append(val, ix, val.Length - ix);

        var s = sb.ToString();
        if (s != val)
        {
            // The property is re-added under its decoded name; if that differs from
            // the raw name, the raw entry has to go.
            if (key != style.Name)
            {
                removeStyles.Add(Tuple.Create(style, RemoveReason.NotAllowedUrlValue));
            }
            setStyles[key] = s;
        }
    }

    foreach (var style in setStyles)
    {
        styles.SetProperty(style.Key, style.Value);
    }

    foreach (var style in removeStyles)
    {
        RemoveStyle(element, styles, style.Item1, style.Item2);
    }
}
/// <summary>
/// Decodes CSS Unicode escapes and removes comments.
/// </summary>
/// <param name="css">The CSS string.</param>
/// <returns>The decoded CSS string.</returns>
protected static string DecodeCss(string css)
{
    // Turns one escape match into its replacement text.
    static string Unescape(Match escape)
    {
        var hex = escape.Groups[1];
        if (hex.Success)
        {
            // Hex escape: the parsed value is narrowed to a single UTF-16 code unit.
            var codeUnit = (char)int.Parse(hex.Value, NumberStyles.HexNumber);
            return codeUnit.ToString();
        }

        // Single-character escape: keep the character, but keep an escaped
        // backslash escaped.
        var literal = escape.Groups[2].Value;
        return literal == "\\" ? @"\\" : literal;
    }

    var unescaped = CssUnicodeEscapes.Replace(css, Unescape);
    return CssComments.Replace(unescaped, string.Empty);
}
// Captures a URL's scheme in group 1: the shortest run of characters other than
// '/' and '#' at the start of the string that is followed by a colon. The colon may
// be a literal ':' or written as an HTML numeric character reference, decimal
// ("&#58", any number of leading zeros) or hexadecimal ("&#x3a").
private static readonly Regex SchemeRegex = new(@"^([^\/#]*?)(?:\:|&#0*58|&#x0*3a)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Tries to create a safe <see cref="Iri"/> object from a string.
/// </summary>
/// <param name="url">The URL.</param>
/// <returns>
/// The <see cref="Iri"/> object, or null if the URL has a scheme that is not in
/// <see cref="AllowedSchemes"/>.
/// </returns>
protected Iri? GetSafeIri(string url)
{
    var trimmed = url.TrimStart();

    var match = SchemeRegex.Match(trimmed);
    if (!match.Success)
    {
        // No scheme found: treated as a scheme-less (relative) reference.
        return new Iri(trimmed);
    }

    var scheme = match.Groups[1].Value;
    if (!AllowedSchemes.Contains(scheme, StringComparer.OrdinalIgnoreCase))
        return null;

    return new Iri(trimmed, scheme);
}
/// <summary>
/// Sanitizes a URL.
/// </summary>
/// <remarks>
/// The <see cref="FilterUrl"/> event is raised once per call, and the value returned
/// is whatever the event arguments hold afterwards.
/// </remarks>
/// <param name="element">The tag containing the URL being sanitized.</param>
/// <param name="url">The URL.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against (empty or null for no resolution).</param>
/// <returns>The sanitized URL or null if no safe URL can be created.</returns>
protected virtual string? SanitizeUrl(IElement element, string url, string baseUrl)
{
    var iri = GetSafeIri(url);

    if (iri != null && !iri.IsAbsolute && !string.IsNullOrEmpty(baseUrl))
    {
        // resolve relative URI
        if (!Uri.TryCreate(baseUrl, UriKind.Absolute, out Uri? baseUri))
        {
            // An unusable base URL makes the relative URL unsafe.
            iri = null;
        }
        else
        {
            try
            {
                var resolved = new Uri(baseUri, iri.Value).AbsoluteUri;
                var resolvedArgs = new FilterUrlEventArgs(element, url, resolved);
                OnFilteringUrl(resolvedArgs);
                return resolvedArgs.SanitizedUrl;
            }
            catch (UriFormatException)
            {
                iri = null;
            }
        }
    }

    var args = new FilterUrlEventArgs(element, url, iri?.Value);
    OnFilteringUrl(args);
    return args.SanitizedUrl;
}
/// <summary>
/// Removes a tag from the document.
/// </summary>
/// <remarks>
/// Raises <see cref="RemovingTag"/> first; a handler can cancel the removal. When
/// <see cref="KeepChildNodes"/> is set and the tag has children, the tag is replaced
/// by its children instead of being removed together with them.
/// </remarks>
/// <param name="tag">Tag to be removed.</param>
/// <param name="reason">Reason for removal.</param>
private void RemoveTag(IElement tag, RemoveReason reason)
{
    var args = new RemovingTagEventArgs(tag, reason);
    OnRemovingTag(args);
    if (args.Cancel)
        return;

    if (KeepChildNodes && tag.HasChildNodes)
        tag.Replace([.. tag.ChildNodes]);
    else
        tag.Remove();
}
/// <summary>
/// Removes an attribute from the document.
/// </summary>
/// <remarks>
/// Raises <see cref="RemovingAttribute"/> first; a handler can cancel the removal.
/// </remarks>
/// <param name="tag">Tag the attribute belongs to.</param>
/// <param name="attribute">Attribute to be removed.</param>
/// <param name="reason">Reason for removal.</param>
private void RemoveAttribute(IElement tag, IAttr attribute, RemoveReason reason)
{
    var args = new RemovingAttributeEventArgs(tag, attribute, reason);
    OnRemovingAttribute(args);
    if (args.Cancel)
        return;

    tag.RemoveAttribute(attribute.Name);
}
/// <summary>
/// Removes a style from the document.
/// </summary>
/// <remarks>
/// Raises <see cref="RemovingStyle"/> first; a handler can cancel the removal.
/// </remarks>
/// <param name="tag">Tag the style belongs to.</param>
/// <param name="styles">Style rule that contains the style to be removed.</param>
/// <param name="style">Style to be removed.</param>
/// <param name="reason">Reason for removal.</param>
private void RemoveStyle(IElement tag, ICssStyleDeclaration styles, ICssProperty style, RemoveReason reason)
{
    var args = new RemovingStyleEventArgs(tag, style, reason);
    OnRemovingStyle(args);
    if (args.Cancel)
        return;

    styles.RemoveProperty(style.Name);
}
/// <summary>
/// Asks the <see cref="RemovingAtRule"/> subscribers whether an at-rule may be removed.
/// The rule itself is not removed here; that is left to the caller.
/// </summary>
/// <param name="tag">Tag the style belongs to.</param>
/// <param name="rule">Rule to be removed.</param>
/// <returns>true, if the rule can be removed; false, otherwise.</returns>
private bool RemoveAtRule(IElement tag, ICssRule rule)
{
    var args = new RemovingAtRuleEventArgs(tag, rule);
    OnRemovingAtRule(args);

    var cancelled = args.Cancel;
    return !cancelled;
}
/// <summary>
/// Removes a CSS class from a class attribute.
/// </summary>
/// <remarks>
/// Raises <see cref="RemovingCssClass"/> first; a handler can cancel the removal.
/// </remarks>
/// <param name="tag">Tag the style belongs to.</param>
/// <param name="cssClass">Class to be removed.</param>
/// <param name="reason">Reason for removal.</param>
private void RemoveCssClass(IElement tag, string cssClass, RemoveReason reason)
{
    var args = new RemovingCssClassEventArgs(tag, cssClass, reason);
    OnRemovingCssClass(args);
    if (args.Cancel)
        return;

    tag.ClassList.Remove(cssClass);
}
}