瀏覽代碼

AngleSharp替代HtmlAgilityPack

懒得勤快 4 年之前
父節點
當前提交
73a3d3370f

+ 11 - 25
Masuit.Tools.Abstractions/Html/HtmlTools.cs

@@ -1,5 +1,6 @@
-using Ganss.XSS;
-using HtmlAgilityPack;
+using AngleSharp;
+using AngleSharp.Dom;
+using Ganss.XSS;
 using Masuit.Tools.RandomSelector;
 using System;
 using System.Collections.Generic;
@@ -89,9 +90,9 @@ namespace Masuit.Tools.Html
         /// <returns></returns>
         public static string RemoveHtmlTag(this string html, int length = 0)
         {
-            var doc = new HtmlDocument();
-            doc.LoadHtml(html);
-            var strText = doc.DocumentNode.InnerText;
+            var context = BrowsingContext.New(Configuration.Default);
+            var doc = context.OpenAsync(req => req.Content(html)).Result;
+            var strText = doc.Body.TextContent;
             if (length > 0 && strText.Length > length)
             {
                 return strText.Substring(0, length);
@@ -100,20 +101,6 @@ namespace Masuit.Tools.Html
             return strText;
         }
 
-        /// <summary>
-        /// 清理Word文档转html后的冗余标签属性
-        /// </summary>
-        /// <param name="html"></param>
-        /// <returns></returns>
-        public static string ClearHtml(this string html)
-        {
-            string s = Regex.Match(Regex.Replace(html, @"background-color:#?\w{3,7}|font-family:'?[\w|\(|\)]*'?;?", string.Empty), @"<body[^>]*>([\s\S]*)<\/body>").Groups[1].Value.Replace("&#xa0;", string.Empty);
-            s = Regex.Replace(s, @"\w+-?\w+:0\w+;?", string.Empty); //去除多余的零值属性
-            s = Regex.Replace(s, "alt=\"(.+?)\"", string.Empty); //除去alt属性
-            s = Regex.Replace(s, @"-aw.+?\s", string.Empty); //去除Word产生的-aw属性
-            return s;
-        }
-
         /// <summary>
         /// 替换html的img路径为绝对路径
         /// </summary>
@@ -137,12 +124,11 @@ namespace Masuit.Tools.Html
         /// </summary>
         /// <param name="html"></param>
         /// <returns></returns>
-        public static IEnumerable<HtmlNode> MatchImgTags(this string html)
+        public static IHtmlCollection<IElement> MatchImgTags(this string html)
         {
-            var doc = new HtmlDocument();
-            doc.LoadHtml(html);
-            var nodes = doc.DocumentNode.Descendants("img");
-            return nodes;
+            var context = BrowsingContext.New(Configuration.Default);
+            var doc = context.OpenAsync(req => req.Content(html)).Result;
+            return doc.Body.GetElementsByTagName("img");
         }
 
         /// <summary>
@@ -152,7 +138,7 @@ namespace Masuit.Tools.Html
         /// <returns></returns>
         public static IEnumerable<string> MatchImgSrcs(this string html)
         {
-            return MatchImgTags(html).Where(n => n.Attributes.Contains("src")).Select(n => n.Attributes["src"].Value);
+            return MatchImgTags(html).Where(n => n.HasAttribute("src")).Select(n => n.GetAttribute("src"));
         }
 
         /// <summary>

+ 0 - 1
Masuit.Tools.Abstractions/Masuit.Tools.Abstractions.csproj

@@ -45,7 +45,6 @@
 
   <ItemGroup>
     <PackageReference Include="DnsClient" Version="1.4.0" />
-    <PackageReference Include="HtmlAgilityPack" Version="1.11.32" />
     <PackageReference Include="HtmlSanitizer" Version="5.0.376" />
     <PackageReference Include="Microsoft.Win32.Registry" Version="4.7.0" />
     <PackageReference Include="Newtonsoft.Json" Version="13.0.1" />

+ 0 - 1
Masuit.Tools.Core/Masuit.Tools.Core.csproj

@@ -33,7 +33,6 @@
 
     <ItemGroup>
         <PackageReference Include="DnsClient" Version="1.4.0" />
-        <PackageReference Include="HtmlAgilityPack" Version="1.11.32" />
         <PackageReference Include="HtmlSanitizer" Version="5.0.376" />
         <PackageReference Include="Microsoft.AspNetCore.Mvc" Version="2.2.0" />
         <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="5.0" />

+ 0 - 4
Masuit.Tools/Masuit.Tools.csproj

@@ -388,7 +388,6 @@
     <None Include="package.nuspec" />
   </ItemGroup>
   <ItemGroup>
-    <Reference Include="Microsoft.CSharp" />
     <Reference Include="System" />
     <Reference Include="System.ComponentModel.DataAnnotations" />
     <Reference Include="System.Configuration" />
@@ -404,9 +403,6 @@
     <PackageReference Include="DnsClient">
       <Version>1.4.0</Version>
     </PackageReference>
-    <PackageReference Include="HtmlAgilityPack">
-      <Version>1.11.32</Version>
-    </PackageReference>
     <PackageReference Include="HtmlSanitizer">
       <Version>5.0.376</Version>
     </PackageReference>