| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938 |
- using System;
- using System.IO;
- using System.IO.Compression;
- using System.Net;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Threading;
- using System.Web;
- using Masuit.Tools.Win32;
- namespace Masuit.Tools.Core.Html
- {
- /// <summary>
- ///1、获取HTML<br/>
- ///1.1获取指定页面的HTML代码 GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)<br/>
- ///1.2获取HTMLGetHtml(string url, CookieContainer cookieContainer)<br/>
- ///2、获取字符流<br/>
- ///2.1获取字符流GetStream(string url, CookieContainer cookieContainer)<br/>
- ///3、清除HTML标记 <br/>
- ///3.1清除HTML标记 NoHTML(string Htmlstring)<br/>
- ///4、匹配页面的链接 <br/>
- ///4.1获取页面的链接正则 GetHref(string HtmlCode)<br/>
- ///5、匹配页面的图片地址<br/>
- /// 5.1匹配页面的图片地址 GetImgSrc(string HtmlCode, string imgHttp)<br/>
- ///5.2匹配<img src="" />中的图片路径实际链接 GetImg(string ImgString, string imgHttp)<br/>
- ///6、抓取远程页面内容<br/>
- /// 6.1以GET方式抓取远程页面内容 Get_Http(string tUrl)<br/>
- /// 6.2以POST方式抓取远程页面内容 Post_Http(string url, string postData, string encodeType)<br/>
- ///7、压缩HTML输出<br/>
- ///7.1压缩HTML输出 ZipHtml(string Html)<br/>
- ///8、过滤HTML标签<br/>
- /// 8.1过滤指定HTML标签 DelHtml(string s_TextStr, string html_Str) <br/>
- /// 8.2过滤HTML中的不安全标签 RemoveUnsafeHtml(string content)<br/>
- /// HTML转行成TEXT HtmlToTxt(string strHtml)<br/>
- /// 字符串转换为 HtmlStringToHtml(string str)<br/>
- /// html转换成字符串HtmlToString(string strHtml)<br/>
- /// 获取URL编码<br/>
- /// 判断URL是否有效<br/>
- /// 返回 HTML 字符串的编码解码结果
- /// </summary>
- public static partial class HtmlTools
- {
- #region 私有字段
// Cookie container shared by callers through the CookieContainer property.
private static CookieContainer cc = new CookieContainer();

// Content-Type header value applied to the requests built by this class.
private static string contentType = "application/x-www-form-urlencoded";

// Accept header value applied to the requests built by this class.
private static string accept = string.Join(", ", new[]
{
    "image/gif",
    "image/x-xbitmap",
    "image/jpeg",
    "image/pjpeg",
    "application/x-shockwave-flash",
    "application/x-silverlight",
    "application/vnd.ms-excel",
    "application/vnd.ms-powerpoint",
    "application/msword",
    "application/x-ms-application",
    "application/x-ms-xbap",
    "application/vnd.ms-xpsdocument",
    "application/xaml+xml",
    "application/x-silverlight-2-b1",
    "*/*"
});

// User-Agent header value applied to the requests built by this class.
private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

// Base delay in milliseconds; NetworkDelay returns a random value derived from it.
private static int delay = 1000;

// Retry counter shared by the request helpers.
private static int currentTry = 0;
- #endregion
- #region 公有属性
/// <summary>
/// The cookie container held by this class.
/// </summary>
public static CookieContainer CookieContainer => cc;

/// <summary>
/// Encoding used to decode page source when it is downloaded (initially UTF-8).
/// </summary>
public static Encoding Encoding { get; set; } = Encoding.GetEncoding("utf-8");
/// <summary>
/// Network delay in milliseconds. Reading returns a random value in
/// [delay, 2 * delay); writing stores the base delay.
/// </summary>
/// <remarks>
/// A negative base delay is treated as 0 and the upper bound is capped at
/// <see cref="int.MaxValue"/>, so the getter never throws
/// <see cref="ArgumentOutOfRangeException"/> and never overflows.
/// </remarks>
public static int NetworkDelay
{
    get
    {
        int lower = Math.Max(delay, 0);
        // lower * 2 would wrap around for values above int.MaxValue / 2.
        int upper = lower > int.MaxValue / 2 ? int.MaxValue : lower * 2;
        return new Random().Next(lower, upper);
    }
    set { delay = value; }
}

/// <summary>
/// Maximum number of retries used by the request helpers (initially 300).
/// </summary>
public static int MaxTry { get; set; } = 300;
- #endregion
- #region 1、获取HTML
/// <summary>
/// Removes HTML tags and character entities from a string and optionally truncates the result.
/// </summary>
/// <param name="html">Source HTML; null or empty yields an empty string.</param>
/// <param name="length">Maximum length of the result; 0 or less means no truncation.</param>
/// <returns>The text with tags and entities removed.</returns>
public static string RemoveHtmlTag(this string html, int length = 0)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    string text = Regex.Replace(html, "<[^>]+>", "");

    // Only well-formed entities (&name; &#123; &#x1F;) are removed. The previous
    // pattern "&[^;]+;" also deleted ordinary text such as "& Jerry;".
    text = Regex.Replace(text, @"&(#\d+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);", "");

    return length > 0 && text.Length > length ? text.Substring(0, length) : text;
}
/// <summary>
/// Downloads the HTML of a page, sending <paramref name="postData"/> with the request.
/// </summary>
/// <remarks>
/// Each attempt is preceded by a <see cref="NetworkDelay"/> pause. Network failures
/// (<see cref="WebException"/>, <see cref="IOException"/>) are retried up to
/// <see cref="MaxTry"/> times and the first successful body is returned. Earlier
/// versions retried recursively but discarded the retry's result, so a retried
/// call could only ever return an empty string.
/// When the server answers with an error status, the body of that error response is returned.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="url">Address of the page.</param>
/// <param name="postData">Data to send. When empty, the plain GET overload is used.</param>
/// <param name="isPost">
/// True to send the data as a POST body. False to issue a GET with the data appended
/// as a query string (a GET cannot carry a request body).
/// </param>
/// <param name="cookieContainer">Cookie container attached to the request.</param>
/// <returns>The response body, or an empty string when every attempt failed.</returns>
public static string GetHtml(this HttpWebRequest _, string url, string postData, bool isPost, CookieContainer cookieContainer)
{
    if (string.IsNullOrEmpty(postData))
    {
        return GetHtml(null, url, cookieContainer);
    }

    string requestUrl = url;
    if (!isPost && url != null)
    {
        requestUrl = url + (url.IndexOf('?') >= 0 ? "&" : "?") + postData;
    }

    for (int attempt = 0; attempt <= MaxTry; attempt++)
    {
        Thread.Sleep(NetworkDelay);
        HttpWebRequest request = null;
        try
        {
            request = (HttpWebRequest)WebRequest.Create(requestUrl);
            request.CookieContainer = cookieContainer;
            request.ContentType = contentType;
            request.ServicePoint.ConnectionLimit = MaxTry;
            request.Referer = url;
            request.Accept = accept;
            request.UserAgent = userAgent;
            request.Method = isPost ? "POST" : "GET";
            request.AllowAutoRedirect = false;

            if (isPost)
            {
                byte[] payload = Encoding.Default.GetBytes(postData);
                request.ContentLength = payload.Length;
                using (Stream body = request.GetRequestStream())
                {
                    body.Write(payload, 0, payload.Length);
                }
            }

            HttpWebResponse response;
            try
            {
                response = (HttpWebResponse)request.GetResponse();
            }
            catch (WebException ex)
            {
                // An HTTP error status still carries a readable response.
                response = ex.Response as HttpWebResponse;
                if (response == null)
                {
                    throw;
                }
            }

            using (response)
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (WebException)
        {
            // Transient network failure: try again.
        }
        catch (IOException)
        {
            // Connection dropped while transferring: try again.
        }
        catch (Exception)
        {
            // Anything else (for example a malformed URL) cannot be cured by retrying.
            return string.Empty;
        }
        finally
        {
            if (request != null)
            {
                request.Abort();
            }
        }
    }

    return string.Empty;
}
/// <summary>
/// Downloads the HTML of a page with a GET request.
/// </summary>
/// <remarks>
/// Each attempt is preceded by a <see cref="NetworkDelay"/> pause. Network failures
/// (<see cref="WebException"/>, <see cref="IOException"/>) are retried up to
/// <see cref="MaxTry"/> times and the first successful body is returned. Earlier
/// versions retried recursively but discarded the retry's result.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="url">Address of the page.</param>
/// <param name="cookieContainer">Cookie container attached to the request.</param>
/// <returns>The response body, or an empty string when every attempt failed.</returns>
public static string GetHtml(this HttpWebRequest _, string url, CookieContainer cookieContainer)
{
    for (int attempt = 0; attempt <= MaxTry; attempt++)
    {
        Thread.Sleep(NetworkDelay);
        HttpWebRequest request = null;
        try
        {
            request = (HttpWebRequest)WebRequest.Create(url);
            request.CookieContainer = cookieContainer;
            request.ContentType = contentType;
            request.ServicePoint.ConnectionLimit = MaxTry;
            request.Referer = url;
            request.Accept = accept;
            request.UserAgent = userAgent;
            request.Method = "GET";

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (WebException)
        {
            // Transient network failure: try again.
        }
        catch (IOException)
        {
            // Connection dropped while transferring: try again.
        }
        catch (Exception)
        {
            // Anything else (for example a malformed URL) cannot be cured by retrying.
            return string.Empty;
        }
        finally
        {
            if (request != null)
            {
                request.Abort();
            }
        }
    }

    return string.Empty;
}
- #endregion
- #region 2、获取字符流
/// <summary>
/// Opens the response stream of a GET request.
/// </summary>
/// <remarks>
/// Example:
/// <code>
/// System.Net.CookieContainer cookie = new System.Net.CookieContainer();
/// Stream s = HttpHelper.GetStream("http://www.baidu.com", cookie);
/// picVerify.Image = Image.FromStream(s);
/// </code>
/// Network failures are retried up to <see cref="MaxTry"/> times with a
/// <see cref="NetworkDelay"/> pause between attempts. Earlier versions "retried" by
/// calling GetHtml and throwing its result away, so the method returned null after
/// any first failure. The caller owns the returned stream and must dispose it.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="url">Address to request.</param>
/// <param name="cookieContainer">Cookie container attached to the request.</param>
/// <returns>The open response stream, or null when every attempt failed.</returns>
public static Stream GetStream(this HttpWebRequest _, string url, CookieContainer cookieContainer)
{
    for (int attempt = 0; attempt <= MaxTry; attempt++)
    {
        if (attempt > 0)
        {
            Thread.Sleep(NetworkDelay);
        }

        HttpWebRequest request = null;
        HttpWebResponse response = null;
        try
        {
            request = (HttpWebRequest)WebRequest.Create(url);
            request.CookieContainer = cookieContainer;
            request.ContentType = contentType;
            request.ServicePoint.ConnectionLimit = MaxTry;
            request.Referer = url;
            request.Accept = accept;
            request.UserAgent = userAgent;
            request.Method = "GET";
            response = (HttpWebResponse)request.GetResponse();
            return response.GetResponseStream();
        }
        catch (Exception ex)
        {
            if (response != null)
            {
                response.Close();
            }
            if (request != null)
            {
                request.Abort();
            }
            // Only network-level failures are worth another attempt.
            if (!(ex is WebException) && !(ex is IOException))
            {
                return null;
            }
        }
    }

    return null;
}
- #endregion
- #region 3、清除HTML标记
/// <summary>
/// Cleans redundant tags and attributes from HTML produced by converting a Word document.
/// </summary>
/// <remarks>
/// Only the content of the body element is kept; when there is no body element the
/// result is empty. Non-breaking spaces are removed, whether written as an entity or
/// as the U+00A0 character. Ordinary spaces are kept: removing them would fuse tag
/// names with their attributes and defeat the whitespace-terminated "-aw" pattern below.
/// </remarks>
/// <param name="html">HTML exported from Word.</param>
/// <returns>The cleaned body markup.</returns>
public static string ClearHtml(this string html)
{
    string withoutFonts = Regex.Replace(html, @"background-color:#?\w{3,7}|font-family:'?[\w|\(|\)]*'?;?", string.Empty);
    string s = Regex.Match(withoutFonts, @"<body[^>]*>([\s\S]*)<\/body>").Groups[1].Value;
    s = Regex.Replace(s, @"&nbsp;|&#160;|&#xa0;|\u00A0", string.Empty, RegexOptions.IgnoreCase);
    s = Regex.Replace(s, @"\w+-?\w+:0\w+;?", string.Empty); // drop redundant zero-valued style properties
    s = Regex.Replace(s, "alt=\"(.+?)\"", string.Empty); // drop alt attributes
    s = Regex.Replace(s, @"-aw.+?\s", string.Empty); // drop the -aw attributes generated by Word
    return s;
}
/// <summary>
/// Removes HTML markup from a string and decodes a few common character entities.
/// </summary>
/// <remarks>
/// Fixes over the earlier implementation:
/// script blocks spanning several lines are now removed;
/// leftover angle brackets and CR/LF pairs are really stripped (the results of
/// <c>string.Replace</c> used to be discarded), and this happens before entity decoding
/// so a decoded &amp;lt; or &amp;gt; survives;
/// &amp;amp; is decoded last so "&amp;amp;lt;" becomes "&amp;lt;" rather than "&lt;".
/// </remarks>
/// <param name="htmlstring">Source containing HTML; null or empty yields an empty string.</param>
/// <returns>The text with markup removed.</returns>
public static string RemoveHtml(this string htmlstring)
{
    if (string.IsNullOrEmpty(htmlstring))
    {
        return string.Empty;
    }

    // Scripts first; Singleline lets '.' cross the line breaks inside the block.
    string text = Regex.Replace(htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Tags, line breaks followed by whitespace, and comment remnants.
    text = Regex.Replace(text, "<.+?>", "");
    text = Regex.Replace(text, @"<(.[^>]*)>", "");
    text = Regex.Replace(text, @"([\r\n])[\s]+", "");
    text = text.Replace("-->", "");
    text = Regex.Replace(text, @"<!--.*", "");
    text = text.Replace("<", "").Replace(">", "").Replace("\r\n", "");

    // Entity decoding; &amp; goes last to avoid decoding twice.
    text = Regex.Replace(text, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

    // Any remaining numeric entity is dropped.
    text = Regex.Replace(text, @"&#(\d+);", "");
    return text;
}
- #endregion
- #region 4、匹配页面的链接
- #region 4.1获取页面的链接正则
/// <summary>
/// Collects the href attributes found in a piece of HTML.
/// </summary>
/// <param name="HtmlCode">HTML source.</param>
/// <returns>
/// Every match, lower-cased with the text "href=" removed and trimmed,
/// each one followed by a '|' separator.
/// </returns>
public static string GetHref(this string HtmlCode)
{
    const string hrefPattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";
    StringBuilder collected = new StringBuilder();
    foreach (Match hit in Regex.Matches(HtmlCode, hrefPattern))
    {
        string link = hit.Value.ToLower().Replace("href=", "").Trim();
        collected.Append(link).Append("|");
    }
    return collected.ToString();
}
- #endregion
- #region 4.2取得所有链接URL
/// <summary>
/// Extracts the URL of every anchor of the form &lt;a href=...&gt;...&lt;/a&gt;.
/// </summary>
/// <remarks>
/// The input is lower-cased before matching, so the returned URLs are lower case.
/// The earlier loop discarded the result of <c>Match.NextMatch()</c> and therefore
/// never terminated once a single anchor was found.
/// </remarks>
/// <param name="html">HTML source.</param>
/// <returns>The captured href values, one per line.</returns>
public static string GetAllUrl(this string html)
{
    StringBuilder urls = new StringBuilder();
    foreach (Match anchor in Regex.Matches(html.ToLower(), "<a href=(.*?)>.*?</a>"))
    {
        urls.AppendLine(anchor.Groups[1].Value);
    }
    return urls.ToString();
}
- #endregion
- #region 4.3获取所有连接文本
/// <summary>
/// Extracts the text of every anchor of the form &lt;a href=...&gt;text&lt;/a&gt;.
/// </summary>
/// <remarks>
/// The input is lower-cased before matching. Only link texts of 1 to 100 characters
/// are captured. Two defects were fixed: the capture group was written as the literal
/// "(1,100})" instead of a bounded quantifier, and the loop discarded the result of
/// <c>Match.NextMatch()</c>, so it never terminated after a first match.
/// </remarks>
/// <param name="html">HTML source.</param>
/// <returns>The captured link texts, one per line.</returns>
public static string GetAllLinkText(this string html)
{
    StringBuilder texts = new StringBuilder();
    foreach (Match anchor in Regex.Matches(html.ToLower(), "<a href=.*?>(.{1,100}?)</a>"))
    {
        texts.AppendLine(anchor.Groups[1].Value);
    }
    return texts.ToString();
}
- #endregion
- #endregion
- #region 5、匹配页面的图片地址
/// <summary>
/// Prefixes the src of img tags with a base address.
/// </summary>
/// <param name="html">HTML source.</param>
/// <param name="imgDest">Base address inserted, followed by '/', in front of each src value.</param>
/// <returns>The HTML with every occurrence of &lt;img src=" rewritten.</returns>
public static string ReplaceHtmlImgSource(this string html, string imgDest)
{
    const string marker = "<img src=\"";
    string rewritten = marker + imgDest + "/";
    return html.Replace(marker, rewritten);
}
/// <summary>
/// Collects the image addresses of every img tag in a piece of HTML.
/// </summary>
/// <param name="htmlCode">HTML source; it is lower-cased before matching.</param>
/// <param name="imgHttp">Address prefix handed to GetImg for each tag.</param>
/// <returns>The GetImg result for each img tag, each followed by a '|' separator.</returns>
public static string GetImgSrc(this string htmlCode, string imgHttp)
{
    StringBuilder sources = new StringBuilder();
    MatchCollection tags = Regex.Matches(htmlCode.ToLower(), @"<img.+?>");
    foreach (Match tag in tags)
    {
        string tagText = tag.Value.ToLower().Trim();
        sources.Append(GetImg(tagText, imgHttp) + "|");
    }
    return sources.ToString();
}
/// <summary>
/// Turns absolute img src addresses into root-relative ones by removing scheme and host.
/// </summary>
/// <remarks>
/// Both http and https addresses are handled. The host part may not contain a quote
/// or a slash, so an address without a path (for example src="http://a.com") is left
/// alone instead of the match running on into the markup that follows.
/// </remarks>
/// <param name="s">HTML source.</param>
/// <returns>The HTML with &lt;img src="http(s)://host/ replaced by &lt;img src="/.</returns>
public static string ConvertImgSrcToRelativePath(this string s)
{
    return Regex.Replace(s, @"<img src=""https?:\/\/[^/""]+/", @"<img src=""/");
}
/// <summary>
/// Finds every img tag whose first attribute is src.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>All matches; the address is available in the named group "src".</returns>
public static MatchCollection MatchImgTags(this string html)
{
    const string imgPattern = @"<img[\s]+src[\s]*=[\s]*((['""](?<src>[^'""]*)[\'""])|(?<src>[^\s]*))";
    return Regex.Matches(html, imgPattern);
}
/// <summary>
/// Finds the first img tag whose first attribute is src.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>The first match; the address is available in the named group "src".</returns>
public static Match MatchImgTag(this string html)
{
    const string imgPattern = @"<img[\s]+src[\s]*=[\s]*((['""](?<src>[^'""]*)[\'""])|(?<src>[^\s]*))";
    return Regex.Match(html, imgPattern);
}
/// <summary>
/// Returns the src of the first img tag in a piece of HTML.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>
/// The captured address, cut at the first double quote if one occurs after position 0;
/// an empty string when no img tag matches.
/// </returns>
public static string MatchFirstImgSrc(this string html)
{
    const string imgPattern = @"<img\s+[^>]*\s*src\s*=\s*['""]?(\S+\.\w{3,4})['""]?[^>]*>";
    string candidate = Regex.Match(html, imgPattern).Groups[1].Value;
    int quoteAt = candidate.IndexOf("\"", StringComparison.Ordinal);
    return quoteAt > 0 ? candidate.Substring(0, quoteAt) : candidate;
}
/// <summary>
/// Returns the src of a randomly chosen img tag in a piece of HTML.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>
/// The captured address of the chosen tag, cut at the first double quote if one occurs
/// after position 0; an empty string when no img tag matches.
/// </returns>
public static string MatchRandomImgSrc(this string html)
{
    const string imgPattern = @"<img\s+[^>]*\s*src\s*=\s*['""]?(\S+\.\w{3,4})['""]?[^>]*>";
    MatchCollection images = Regex.Matches(html, imgPattern);
    if (images.Count <= 0)
    {
        return String.Empty;
    }

    int pick = new Random().StrictNext(images.Count);
    string candidate = images[pick].Groups[1].Value;
    int quoteAt = candidate.IndexOf("\"", StringComparison.Ordinal);
    return quoteAt > 0 ? candidate.Substring(0, quoteAt) : candidate;
}
/// <summary>
/// Extracts the image address from an &lt;img src="" /&gt; string.
/// </summary>
/// <param name="imgString">The img tag text; it is lower-cased before matching.</param>
/// <param name="imgHttp">Prefix added when the address contains none of the known domain suffixes.</param>
/// <returns>
/// The matched address unchanged when it contains .net, .com, .org, .cn, .cc, .info,
/// .biz or .tv; otherwise <paramref name="imgHttp"/> followed by the matched address.
/// </returns>
public static string GetImg(this string imgString, string imgHttp)
{
    StringBuilder address = new StringBuilder();
    foreach (Match hit in Regex.Matches(imgString.ToLower(), @"src=.+\.(bmp|jpg|gif|png|)"))
    {
        address.Append(hit.Value.ToLower().Trim().Replace("src=", ""));
    }

    string matchVale = address.ToString();
    string[] domainMarkers = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
    foreach (string marker in domainMarkers)
    {
        if (matchVale.IndexOf(marker) != -1)
        {
            return matchVale;
        }
    }
    return imgHttp + matchVale;
}
- #endregion
- #region 6、抓取远程页面内容
/// <summary>
/// Fetches a remote page with a GET request.
/// </summary>
/// <remarks>
/// The body is decoded with <c>Encoding.Default</c> and returned line by line, each
/// line terminated with "\r\n". Lines are read until <c>ReadLine</c> returns null;
/// the earlier <c>Peek()</c> loop could stop early on a network stream and truncate
/// the page. Response, stream and reader are now disposed on every path.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="tUrl">URL to fetch.</param>
/// <returns>The page content; on failure, the message of the exception that occurred.</returns>
public static string Get_Http(this HttpWebRequest _, string tUrl)
{
    try
    {
        var request = (HttpWebRequest)WebRequest.Create(tUrl);
        request.Timeout = 19600;
        using (var response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (var reader = new StreamReader(body, Encoding.Default))
        {
            var page = new StringBuilder();
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                page.Append(line).Append("\r\n");
            }
            return page.ToString();
        }
    }
    catch (Exception ee)
    {
        // Existing contract: the error text is returned in place of the content.
        return ee.Message;
    }
}
/// <summary>
/// Fetches a remote page with a POST request.
/// </summary>
/// <remarks>
/// The request body is encoded with <paramref name="encodeType"/>; the response is
/// decoded with <c>Encoding.Default</c>. The response and its reader are now disposed,
/// which the earlier implementation never did.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="url">URL to post to.</param>
/// <param name="postData">Form data sent as application/x-www-form-urlencoded.</param>
/// <param name="encodeType">Name of the encoding used for the request body.</param>
/// <returns>The response content; on failure, the message of the exception that occurred.</returns>
public static string Post_Http(this HttpWebRequest _, string url, string postData, string encodeType)
{
    try
    {
        byte[] payload = Encoding.GetEncoding(encodeType).GetBytes(postData);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = payload.Length;
        using (Stream body = request.GetRequestStream())
        {
            body.Write(payload, 0, payload.Length);
        }

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Existing contract: the error text is returned in place of the content.
        return ex.Message;
    }
}
- #endregion
- #region 7、压缩HTML输出
/// <summary>
/// Compresses HTML output by removing whitespace between tags and after line breaks.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>The compacted HTML.</returns>
public static string ZipHtml(this string html)
{
    // Whitespace between a closing '>' and the next '<'.
    string compact = Regex.Replace(html, @">\s+?<", "><");
    // CR/LF pairs together with the indentation that follows them.
    compact = Regex.Replace(compact, @"\r\n\s*", "");
    // Rewrites each body element from its own captured parts.
    return Regex.Replace(compact, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
}
- #endregion
- #region 8、过滤HTML标签
- #region 8.1过滤指定HTML标签
/// <summary>
/// Removes the opening and closing tags of one HTML element, keeping the content between them.
/// </summary>
/// <remarks>
/// The tag name must now end at a name boundary, so filtering "a" no longer strips
/// &lt;abbr&gt; or &lt;article&gt;, and "p" no longer strips &lt;pre&gt;.
/// <paramref name="htmlStr"/> is still inserted as a regular-expression fragment
/// (wrapped in a group), so an alternation such as "a|img" works.
/// When <paramref name="htmlStr"/> is null or empty, the previous patterns are used unchanged.
/// </remarks>
/// <param name="sTextStr">Text to filter.</param>
/// <param name="htmlStr">Tag name to remove, for example a, img, p or div.</param>
/// <returns>The filtered text; an empty string when <paramref name="sTextStr"/> is null or empty.</returns>
public static string DelHtml(this string sTextStr, string htmlStr)
{
    if (string.IsNullOrEmpty(sTextStr))
    {
        return "";
    }

    bool hasName = !string.IsNullOrEmpty(htmlStr);
    string tag = hasName ? "(?:" + htmlStr + ")" : "";
    // The lookahead stops "a" from matching the beginning of a longer tag name.
    string boundary = hasName ? @"(?![\w:-])" : "";

    string result = Regex.Replace(sTextStr, "<" + tag + boundary + "[^>]*>", "", RegexOptions.IgnoreCase);
    return Regex.Replace(result, "</" + tag + ">", "", RegexOptions.IgnoreCase);
}
- #endregion
- #region 8.2过滤HTML中的不安全标签
/// <summary>
/// Defuses unsafe constructs in HTML.
/// </summary>
/// <remarks>
/// Two rewrites are applied, both case-insensitive:
/// an attribute name that starts with "o" and directly follows '&lt;' or whitespace loses
/// that leading "o" (onclick= becomes nclick=);
/// the words script, frame, form, meta, behavior and style get a '.' inserted after them
/// when they are followed by whitespace, ':', '|' or '&gt;'.
/// </remarks>
/// <param name="content">HTML source.</param>
/// <returns>The rewritten content.</returns>
public static string RemoveUnsafeHtml(this string content)
{
    const string handlerAttribute = @"(\<|\s+)o([a-z]+\s?=)";
    const string riskyKeyword = @"(script|frame|form|meta|behavior|style)([\s|:|>])+";

    string withoutHandlers = Regex.Replace(content, handlerAttribute, "$1$2", RegexOptions.IgnoreCase);
    return Regex.Replace(withoutHandlers, riskyKeyword, "$1.$2", RegexOptions.IgnoreCase);
}
- #endregion
- #endregion
- #region 转换HTML操作
- #region HTML转行成TEXT
/// <summary>
/// Converts HTML to plain text by deleting scripts, tags, comments and character entities.
/// </summary>
/// <remarks>
/// Entities are deleted, not decoded. Two defects were fixed: script blocks spanning
/// several lines are now removed, and the final clean-up of stray '&lt;', '&gt;' and CR/LF
/// really takes effect (the results of <c>string.Replace</c> used to be discarded).
/// </remarks>
/// <param name="strHtml">HTML source.</param>
/// <returns>The plain text.</returns>
public static string HtmlToTxt(this string strHtml)
{
    // Scripts are handled separately so that only this pattern lets '.' cross line breaks.
    string text = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);

    string[] patterns =
    {
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };
    foreach (string pattern in patterns)
    {
        text = Regex.Replace(text, pattern, string.Empty, RegexOptions.IgnoreCase);
    }

    return text.Replace("<", "").Replace(">", "").Replace("\r\n", "");
}
- #endregion
- #region 字符串转换为 Html
/// <summary>
/// Converts plain text to HTML by escaping special characters and turning line breaks into &lt;br /&gt;.
/// </summary>
/// <remarks>
/// The entity literals (&amp;amp;, &amp;nbsp;, &amp;quot;, &amp;lt;, &amp;gt;) are restored; they had
/// been replaced by their decoded characters, which made every replacement a no-op and
/// left an unterminated string literal. "\r\n" is now handled before the single
/// characters so a Windows line break yields one &lt;br /&gt; instead of two.
/// A single quote is doubled ('' ), as before.
/// </remarks>
/// <param name="str">Plain text.</param>
/// <returns>The HTML-escaped text.</returns>
public static string StringToHtml(this string str)
{
    // '&' must go first so the entities inserted below are not escaped again.
    str = str.Replace("&", "&amp;");
    str = str.Replace(" ", "&nbsp;");
    str = str.Replace("'", "''");
    str = str.Replace("\"", "&quot;");
    str = str.Replace("<", "&lt;");
    str = str.Replace(">", "&gt;");
    // Line breaks last: the inserted markup contains '<', '>' and a space.
    str = str.Replace("\r\n", "<br />");
    str = str.Replace("\n", "<br />");
    str = str.Replace("\r", "<br />");
    return str;
}
- #endregion
- #region Html转换成字符串
/// <summary>
/// Converts HTML to a plain string: br tags become line breaks, a few entities are
/// decoded and every remaining tag is removed.
/// </summary>
/// <remarks>
/// The entity literals (&amp;gt;, &amp;lt;, &amp;nbsp;, &amp;quot;) are restored; they had been
/// replaced by their decoded characters, which made the replacements no-ops and left an
/// unterminated string literal.
/// NOTE(review): entities are decoded before tags are stripped, so text that was
/// escaped as &amp;lt;tag&amp;gt; is removed as well. That is presumably intended, to keep
/// the result free of markup; confirm before reordering.
/// </remarks>
/// <param name="strHtml">HTML source.</param>
/// <returns>The tag-free string.</returns>
public static string HtmlToString(this string strHtml)
{
    strHtml = strHtml.Replace("<br>", "\r\n");
    strHtml = strHtml.Replace(@"<br />", "\r\n");
    strHtml = strHtml.Replace(@"<br/>", "\r\n");
    strHtml = strHtml.Replace("&gt;", ">");
    strHtml = strHtml.Replace("&lt;", "<");
    strHtml = strHtml.Replace("&nbsp;", " ");
    strHtml = strHtml.Replace("&quot;", "\"");
    strHtml = Regex.Replace(strHtml, @"<\/?[^>]+>", "", RegexOptions.IgnoreCase);
    return strHtml;
}
- #endregion
- #endregion
- #region 获取URL编码
/// <summary>
/// Determines the character encoding declared by the page at a URL.
/// </summary>
/// <remarks>
/// The page is read when the status is 200 OK and the reported content length is below
/// 1 MiB. A charset declaration found in the page wins; otherwise the character set of
/// the response is used; otherwise the body name of <c>Encoding.Default</c>.
/// The charset pattern now skips an optional opening quote and captures only the
/// encoding token. Previously <c>charset="utf-8"</c> produced an empty string, and an
/// unquoted value captured everything up to the next double quote. A null or empty
/// response character set also falls through to the default now.
/// Exceptions raised by the request propagate to the caller, as before.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="url">URL of the page.</param>
/// <returns>The encoding name.</returns>
public static string GetEncoding(this HttpWebRequest _, string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Timeout = 20000;
    request.AllowAutoRedirect = false;

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
        {
            return Encoding.Default.BodyName;
        }

        Stream body = response.GetResponseStream();
        bool gzipped = "gzip".Equals(response.ContentEncoding, StringComparison.InvariantCultureIgnoreCase);
        using (StreamReader reader = gzipped
            ? new StreamReader(new GZipStream(body, CompressionMode.Decompress))
            : new StreamReader(body, Encoding.ASCII))
        {
            string html = reader.ReadToEnd();
            Match declared = Regex.Match(html, @"charset\s*=\s*[""']?(?<charset>[\w.:-]+)", RegexOptions.IgnoreCase);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }
            if (!string.IsNullOrEmpty(response.CharacterSet))
            {
                return response.CharacterSet;
            }
            return Encoding.Default.BodyName;
        }
    }
}
- #endregion
- #region 判断URL是否有效
/// <summary>
/// Checks whether a URL can be fetched.
/// </summary>
/// <remarks>
/// For an HTTP error response, the status code now comes from the response itself.
/// Previously it was searched for in the exception message, which is localised, and
/// every status other than 500, 401 and 404 (for example 403 or 503) was reported as 200.
/// NOTE(review): failures without an HTTP response (DNS error, timeout, refused
/// connection) still return 200, exactly as before; that looks unintended but callers
/// may rely on it, so confirm before changing.
/// Side effect kept from the original: <c>ServicePointManager.Expect100Continue</c> is set to false.
/// </remarks>
/// <param name="_">Unused; only enables extension-method syntax.</param>
/// <param name="url">URL to check; may point to a page, an image, and so on.</param>
/// <returns>
/// 200 when the request succeeded, the HTTP status code for an error response,
/// 401 for any non-web exception (for example a malformed URL).
/// </returns>
public static int GetUrlError(this HttpWebRequest _, string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
        ServicePointManager.Expect100Continue = false;
        using (request.GetResponse())
        {
        }
        return 200;
    }
    catch (WebException exception)
    {
        using (HttpWebResponse failure = exception.Response as HttpWebResponse)
        {
            if (exception.Status != WebExceptionStatus.ProtocolError || failure == null)
            {
                return 200;
            }
            return (int)failure.StatusCode;
        }
    }
    catch
    {
        return 401;
    }
}
- #endregion
- #region 返回 HTML 字符串的编码解码结果
/// <summary>
/// HTML-encodes a string by delegating to <c>HttpUtility.HtmlEncode</c>.
/// </summary>
/// <param name="inputData">Text to encode.</param>
/// <returns>The encoded text.</returns>
public static string HtmlEncode(string inputData) => HttpUtility.HtmlEncode(inputData);
/// <summary>
/// HTML-decodes a string by delegating to <c>HttpUtility.HtmlDecode</c>.
/// </summary>
/// <param name="str">Text to decode.</param>
/// <returns>The decoded text.</returns>
public static string HtmlDecode(string str) => HttpUtility.HtmlDecode(str);
- #endregion
/// <summary>
/// Parses a cookie string and adds the cookies it describes to a collection.
/// </summary>
/// <remarks>
/// Expected format, entries separated by commas:
/// <c>name=value;Domain=host;Path=/path</c>, for example
/// <c>SID=abc;Domain=.google.com;Path=/,LSID=def;Domain=www.google.com;Path=/accounts</c>.
/// The cookie path is now taken from the Path capture; previously the Domain capture
/// was passed for both path and domain. The name capture no longer accepts '=', so a
/// value containing '=' cannot leak into the name.
/// </remarks>
/// <param name="cookie">Collection that receives the parsed cookies.</param>
/// <param name="cookieString">Cookie string to parse.</param>
/// <returns>The same collection, after the cookies were added.</returns>
public static CookieCollection GetCookieCollection(this CookieCollection cookie, string cookieString)
{
    Regex entry = new Regex("([^;,=]+)=([^;,]+);Domain=([^;,]+);Path=([^;,]+)", RegexOptions.IgnoreCase);
    foreach (Match m in entry.Matches(cookieString))
    {
        string name = m.Groups[1].Value;
        string value = m.Groups[2].Value;
        string domain = m.Groups[3].Value;
        string path = m.Groups[4].Value;
        // The Cookie constructor takes (name, value, path, domain).
        cookie.Add(new Cookie(name, value, path, domain));
    }
    return cookie;
}
- #region 从HTML中获取文本,保留br,p,img
/// <summary>
/// Strips the tags from HTML while keeping br, p and img tags.
/// </summary>
/// <param name="HTML">HTML source.</param>
/// <returns>The text together with the br, p and img tags that were kept.</returns>
public static string GetTextFromHTML(this string HTML)
{
    // The negative lookahead spares any tag whose name starts with br, p or img.
    const string droppedTags = @"</?(?!br|/?p|img)[^>]*>";
    return Regex.Replace(HTML, droppedTags, "", RegexOptions.IgnoreCase);
}
- #endregion
- #region 获取HTML页面内制定Key的Value内容
/// <summary>
/// Reads the value of a hidden input field with a given name from HTML.
/// </summary>
/// <remarks>
/// The attributes must appear in the order type, name, value.
/// <paramref name="key"/> is now escaped before it is placed in the pattern, so
/// characters such as '.', '$', '#' or spaces in a field name are matched literally.
/// Previously "a.b" also matched a field called "axb", and '#' started a pattern
/// comment because the pattern is compiled with IgnorePatternWhitespace.
/// </remarks>
/// <param name="html">HTML source.</param>
/// <param name="key">Name of the hidden field.</param>
/// <returns>The field's value, or an empty string when the field is not found.</returns>
public static string GetHiddenKeyValue(this string html, string key)
{
    string pattern = string.Format(
        "<input\\s*type=\"hidden\".*?name=\"{0}\".*?\\s*value=[\"|'](?<value>.*?)[\"|'^/]",
        Regex.Escape(key));
    Regex finder = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    Match found = finder.Match(html);
    return found.Success ? found.Groups["value"].Value : "";
}
- #endregion
/// <summary>
/// Replaces line breaks ("\r\n" and "\n") with the HTML line break &lt;br /&gt;.
/// </summary>
/// <param name="str">Text to convert.</param>
/// <returns>The converted text; an empty string when <paramref name="str"/> is null.</returns>
public static string StrFormat(this string str)
{
    if (str == null)
    {
        return "";
    }
    return str.Replace("\r\n", "<br />").Replace("\n", "<br />");
}
/// <summary>
/// Replaces the characters ',', ''' and ';' with the placeholders &amp;def, &amp;dot and &amp;dec.
/// </summary>
/// <param name="strHtml">Text to encode.</param>
/// <returns>
/// The encoded text; an empty string when <paramref name="strHtml"/> is null or empty
/// (null previously caused a <see cref="NullReferenceException"/>).
/// </returns>
public static string EncodeHtml(this string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return "";
    }
    return strHtml
        .Replace(",", "&def")
        .Replace("'", "&dot")
        .Replace(";", "&dec");
}
/// <summary>
/// Escapes backslashes, single quotes and double quotes with a leading backslash
/// so the text can be placed in a script string.
/// </summary>
/// <param name="str">Text to escape.</param>
/// <returns>The escaped text.</returns>
[Obsolete("不建议使用", true)]
public static string ReplaceStrToScript(string str)
{
    // Backslashes first, so the ones added for the quotes are not doubled.
    return str
        .Replace("\\", "\\\\")
        .Replace("'", "\\'")
        .Replace("\"", "\\\"");
}
- }
- }
|