| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632 |
- using System;
- using System.Collections;
- using System.Collections.Generic;
- using System.Data;
- using System.IO;
- using System.Net;
- using System.Net.Sockets;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Web;
- using System.Xml;
- using Ganss.XSS;
- using Masuit.Tools.Logging;
- namespace Masuit.Tools.Html
- {
- /// <summary>
- /// html工具类
- /// </summary>
- public static partial class HtmlTools
- {
- #region 防止html的xss净化器
/// <summary>
/// Sanitizes HTML with a fixed rule set intended to guard against XSS.
/// </summary>
/// <param name="html">The HTML to sanitize.</param>
/// <returns>The result of <c>HtmlSanitizer.Sanitize</c> after the allow-lists below have been narrowed.</returns>
public static string HtmlSantinizerStandard(this string html)
{
    var cleaner = new HtmlSanitizer { KeepChildNodes = true };

    // Attributes that are dropped from the allow-list.
    foreach (string attribute in new[] { "id", "alt" })
    {
        cleaner.AllowedAttributes.Remove(attribute);
    }

    // CSS properties that are dropped from the allow-list.
    foreach (string property in new[] { "font-family", "background-color" })
    {
        cleaner.AllowedCssProperties.Remove(property);
    }

    // Form controls and frames are dropped from the allow-list.
    foreach (string tag in new[] { "input", "button", "iframe", "frame", "textarea", "select", "form" })
    {
        cleaner.AllowedTags.Remove(tag);
    }

    return cleaner.Sanitize(html);
}
/// <summary>
/// Sanitizes HTML after removing caller-chosen entries from the sanitizer's allow-lists.
/// </summary>
/// <param name="html">The source HTML.</param>
/// <param name="labels">Tag names to remove from the allowed tags; <c>null</c> removes none.</param>
/// <param name="attributes">Attribute names to remove from the allowed attributes; <c>null</c> removes none.</param>
/// <param name="styles">CSS property names to remove from the allowed properties; <c>null</c> removes none.</param>
/// <returns>The result of <c>HtmlSanitizer.Sanitize</c>.</returns>
public static string HtmlSantinizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
{
    var cleaner = new HtmlSanitizer();

    foreach (string tag in labels ?? new string[0])
    {
        cleaner.AllowedTags.Remove(tag);
    }

    foreach (string attribute in attributes ?? new string[0])
    {
        cleaner.AllowedAttributes.Remove(attribute);
    }

    foreach (string property in styles ?? new string[0])
    {
        cleaner.AllowedCssProperties.Remove(property);
    }

    cleaner.KeepChildNodes = true;
    return cleaner.Sanitize(html);
}
- #endregion
- #region BaseMethod
/// <summary>
/// Collects every regex match found in the input.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Regular expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="iGroupIndex">Capture group number, starting at 1; 0 or less selects the whole match.</param>
/// <returns>One entry per match, in match order.</returns>
public static List<string> GetList(string sInput, string sRegex, int iGroupIndex)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    var pattern = new Regex(sRegex, options);
    bool wholeMatch = iGroupIndex <= 0;

    var found = new List<string>();
    foreach (Match hit in pattern.Matches(sInput))
    {
        found.Add(wholeMatch ? hit.Value : hit.Groups[iGroupIndex].Value);
    }

    return found;
}
/// <summary>
/// Collects every regex match found in the input, selecting a named group.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Regular expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="sGroupName">Name of the capture group; the empty string selects the whole match.</param>
/// <returns>One entry per match, in match order.</returns>
public static List<string> GetList(string sInput, string sRegex, string sGroupName)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    var pattern = new Regex(sRegex, options);

    var found = new List<string>();
    foreach (Match hit in pattern.Matches(sInput))
    {
        found.Add(sGroupName == "" ? hit.Value : hit.Groups[sGroupName].Value);
    }

    return found;
}
/// <summary>
/// Returns the first regex match found in the input.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Regular expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="iGroupIndex">Capture group number, starting at 1; 0 or less selects the whole match.</param>
/// <returns>The matched text, or the empty string when nothing matches.</returns>
public static string GetText(string sInput, string sRegex, int iGroupIndex)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    Match first = new Regex(sRegex, options).Match(sInput);
    if (!first.Success)
    {
        return "";
    }

    return iGroupIndex > 0 ? first.Groups[iGroupIndex].Value : first.Value;
}
/// <summary>
/// Returns the first regex match found in the input, selecting a named group.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Regular expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="sGroupName">Name of the capture group; the empty string selects the whole match.</param>
/// <returns>The matched text, or the empty string when nothing matches.</returns>
public static string GetText(string sInput, string sRegex, string sGroupName)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    Match first = new Regex(sRegex, options).Match(sInput);
    if (!first.Success)
    {
        return "";
    }

    return sGroupName == "" ? first.Value : first.Groups[sGroupName].Value;
}
/// <summary>
/// Replaces the text captured by each regex match with a literal replacement.
/// </summary>
/// <remarks>
/// The replacement is applied at the position of each match (or of the selected group inside
/// the match). Matches whose target is empty or did not participate are left untouched.
/// <paramref name="sReplace"/> is inserted verbatim; <c>$1</c>-style substitutions are not expanded.
/// </remarks>
/// <param name="sInput">Text to process.</param>
/// <param name="sRegex">Regular expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="sReplace">Literal replacement text.</param>
/// <param name="iGroupIndex">Capture group number to replace; 0 or less replaces the whole match.</param>
/// <returns>The input with every targeted span replaced.</returns>
public static string Replace(string sInput, string sRegex, string sReplace, int iGroupIndex)
{
    Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    StringBuilder output = new StringBuilder(sInput.Length);
    int copiedUpTo = 0;

    foreach (Match mc in re.Matches(sInput))
    {
        Group target = iGroupIndex > 0 ? mc.Groups[iGroupIndex] : mc;

        // Skip groups that did not capture, empty spans, and spans overlapping text already emitted
        // (possible when the group sits inside a look-around).
        if (!target.Success || target.Length == 0 || target.Index < copiedUpTo)
        {
            continue;
        }

        output.Append(sInput, copiedUpTo, target.Index - copiedUpTo);
        output.Append(sReplace);
        copiedUpTo = target.Index + target.Length;
    }

    output.Append(sInput, copiedUpTo, sInput.Length - copiedUpTo);
    return output.ToString();
}
/// <summary>
/// Replaces the text captured by a named group of each regex match with a literal replacement.
/// </summary>
/// <remarks>
/// The replacement is applied at the position of each match (or of the named group inside
/// the match). Matches whose target is empty or did not participate are left untouched.
/// <paramref name="sReplace"/> is inserted verbatim; <c>$1</c>-style substitutions are not expanded.
/// </remarks>
/// <param name="sInput">Text to process.</param>
/// <param name="sRegex">Regular expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="sReplace">Literal replacement text.</param>
/// <param name="sGroupName">Name of the group to replace; null or empty replaces the whole match.</param>
/// <returns>The input with every targeted span replaced.</returns>
public static string Replace(string sInput, string sRegex, string sReplace, string sGroupName)
{
    Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    bool wholeMatch = string.IsNullOrEmpty(sGroupName);
    StringBuilder output = new StringBuilder(sInput.Length);
    int copiedUpTo = 0;

    foreach (Match mc in re.Matches(sInput))
    {
        Group target = wholeMatch ? mc : mc.Groups[sGroupName];

        // Skip groups that did not capture, empty spans, and spans overlapping text already emitted
        // (possible when the group sits inside a look-around).
        if (!target.Success || target.Length == 0 || target.Index < copiedUpTo)
        {
            continue;
        }

        output.Append(sInput, copiedUpTo, target.Index - copiedUpTo);
        output.Append(sReplace);
        copiedUpTo = target.Index + target.Length;
    }

    output.Append(sInput, copiedUpTo, sInput.Length - copiedUpTo);
    return output.ToString();
}
/// <summary>
/// Splits the input on a regex and keeps the trimmed pieces that are long enough.
/// </summary>
/// <param name="sInput">Text to split.</param>
/// <param name="sRegex">Separator expression (compiled with IgnoreCase, IgnorePatternWhitespace and Multiline).</param>
/// <param name="iStrLen">Minimum trimmed length a piece needs in order to be kept.</param>
/// <returns>The trimmed pieces, in their original order.</returns>
public static List<string> Split(string sInput, string sRegex, int iStrLen)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    var kept = new List<string>();

    foreach (string piece in new Regex(sRegex, options).Split(sInput))
    {
        string trimmed = piece.Trim();
        if (trimmed.Length >= iStrLen)
        {
            kept.Add(trimmed);
        }
    }

    return kept;
}
- #endregion BaseMethod
- #region 获得特定内容
/// <summary>
/// Extracts the href value of every anchor tag found in the input.
/// </summary>
/// <param name="sInput">HTML to search.</param>
/// <returns>The <c>href</c> group of each match, as returned by <c>GetList</c>.</returns>
public static List<string> GetLinks(string sInput)
    => GetList(sInput, @"<a[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href");
/// <summary>
/// Extracts the href value of the first anchor tag found in the input.
/// </summary>
/// <param name="sInput">HTML to search.</param>
/// <returns>The <c>href</c> group of the first match, as returned by <c>GetText</c>.</returns>
public static string GetLinkHelp(string sInput)
    => GetText(sInput, @"<a[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href");
/// <summary>
/// Extracts every complete img tag that carries a src attribute.
/// </summary>
/// <param name="sInput">HTML to search.</param>
/// <returns>The whole matched tag text of each match, as returned by <c>GetList</c>.</returns>
public static List<string> GetImgTag(string sInput)
    => GetList(sInput, "<img[^>]+src=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", "");
/// <summary>
/// Extracts the src value of the first img tag found in the input.
/// </summary>
/// <param name="sInput">HTML to search.</param>
/// <returns>The <c>src</c> group of the first match, as returned by <c>GetText</c>.</returns>
public static string GetImgSrc(string sInput)
    => GetText(sInput, "<img[^>]+src=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", "src");
/// <summary>
/// Extracts the scheme-and-host prefix (for example <c>http://www.example.com</c>) from a URL.
/// </summary>
/// <param name="sInput">Text containing the URL.</param>
/// <returns>The first match of the http/https host pattern, as returned by <c>GetText</c>.</returns>
public static string GetDomain(string sInput)
    => GetText(sInput, @"http(s)?://([\w-]+\.)+(\w){2,}", 0);
- #endregion 获得特定内容
- #region 根据表达式,获得文章内容
/// <summary>
/// Extracts an article title using a caller-supplied expression.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Expression containing a group named <c>Title</c>.</param>
/// <returns>The captured title passed through <c>ClearTag</c> and cut to at most 99 characters.</returns>
public static string GetTitle(string sInput, string sRegex)
{
    string title = ClearTag(GetText(sInput, sRegex, "Title"));
    return title.Length > 99 ? title.Substring(0, 99) : title;
}
/// <summary>
/// Extracts the content of the page's title element.
/// </summary>
/// <param name="sInput">HTML to search.</param>
/// <returns>The <c>Title</c> group of the first match; the pattern requires at least 10 characters of content.</returns>
public static string GetTitle(string sInput)
    => GetText(sInput, @"<Title[^>]*>(?<Title>[\s\S]{10,})</Title>", "Title");
/// <summary>
/// Blanks out the text captured in front of each '&lt;' character.
/// </summary>
/// <param name="sInput">Text to process.</param>
/// <returns>The result of calling <c>Replace</c> on the <c>Head</c> group with an empty replacement.</returns>
public static string GetHtml(string sInput)
    => Replace(sInput, @"(?<Head>[^<]+)<", "", "Head");
/// <summary>
/// Extracts the content of the page's body element.
/// </summary>
/// <param name="sInput">HTML to search.</param>
/// <returns>The <c>Body</c> group of the first match; the pattern requires at least 10 characters of content.</returns>
public static string GetBodyHelp(string sInput)
    => GetText(sInput, @"<Body[^>]*>(?<Body>[\s\S]{10,})</body>", "Body");
/// <summary>
/// Extracts body content using a caller-supplied expression.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Expression containing a group named <c>Body</c>.</param>
/// <returns>The <c>Body</c> group of the first match, as returned by <c>GetText</c>.</returns>
public static string GetBody(string sInput, string sRegex)
    => GetText(sInput, sRegex, "Body");
/// <summary>
/// Extracts the article source using a caller-supplied expression.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Expression containing a group named <c>Source</c>.</param>
/// <returns>The captured source passed through <c>ClearTag</c> and cut to at most 99 characters.</returns>
public static string GetSource(string sInput, string sRegex)
{
    string source = ClearTag(GetText(sInput, sRegex, "Source"));
    return source.Length > 99 ? source.Substring(0, 99) : source;
}
/// <summary>
/// Extracts the author name using a caller-supplied expression.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Expression containing a group named <c>Author</c>.</param>
/// <returns>The captured author passed through <c>ClearTag</c> and cut to at most 99 characters.</returns>
public static string GetAuthor(string sInput, string sRegex)
{
    string author = ClearTag(GetText(sInput, sRegex, "Author"));
    return author.Length > 99 ? author.Substring(0, 99) : author;
}
/// <summary>
/// Extracts pagination link addresses using a caller-supplied expression.
/// </summary>
/// <param name="sInput">Text to search.</param>
/// <param name="sRegex">Expression containing a group named <c>href</c>.</param>
/// <returns>The <c>href</c> group of every match, as returned by <c>GetList</c>.</returns>
public static List<string> GetPageLinks(string sInput, string sRegex)
    => GetList(sInput, sRegex, "href");
/// <summary>
/// Resolves a link found on a page into an absolute URL.
/// </summary>
/// <remarks>
/// Handles absolute http/https links, root-relative links ("/x"), parent-relative links ("../x"),
/// current-directory links ("./x") and plain relative links ("x"). Climbing with "../" never goes
/// above the host part of the base URL.
/// </remarks>
/// <param name="sInput">Address of the page the link was found on.</param>
/// <param name="sRelativeUrl">The link as written in the page.</param>
/// <returns>The absolute URL, or the empty string when the link is null or blank.</returns>
public static string GetUrl(string sInput, string sRelativeUrl)
{
    // Directory of the page, always ending in "/", e.g. http://www.163.com/news/
    string baseUrl = _GetStandardUrlDepth(sInput);

    // Trim once so every branch sees the same text.
    string relative = (sRelativeUrl ?? "").Trim();
    if (relative.Length == 0)
    {
        return "";
    }

    // Require the full scheme prefix so relative paths such as "httpd/index.html" are not mistaken for absolute URLs.
    if (relative.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || relative.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return relative;
    }

    if (relative.StartsWith("/", StringComparison.Ordinal))
    {
        return GetDomain(sInput) + relative;
    }

    if (relative.StartsWith("../", StringComparison.Ordinal))
    {
        string current = baseUrl.Substring(0, baseUrl.Length - 1);
        int hostStart = current.IndexOf("://", StringComparison.Ordinal) + 3;

        // Only leading "../" segments climb; a "../" later in the path is left as written.
        while (relative.StartsWith("../", StringComparison.Ordinal))
        {
            int lastSlash = current.LastIndexOf('/');

            // A slash inside "scheme://" means we are already at the host: some pages contain more
            // "../" than the URL has levels, and browsers simply stay at the root.
            if (lastSlash >= hostStart)
            {
                current = current.Substring(0, lastSlash);
            }

            relative = relative.Substring(3);
        }

        return current + "/" + relative.Trim();
    }

    if (relative.StartsWith("./", StringComparison.Ordinal))
    {
        return baseUrl + relative.Substring(2);
    }

    // Plain relative path, e.g. 2007images/modecss.css
    return baseUrl + relative;
}
/// <summary>
/// Normalizes a URL to the directory it lives in.
/// </summary>
/// <remarks>
/// The whole URL is lower-cased (including the path), a missing scheme defaults to http, and the
/// result always ends with "/". A last segment containing a dot is treated as a file name and removed.
/// </remarks>
/// <param name="url">The URL to normalize.</param>
/// <returns>A URL of the form http://www.163.com/ or http://www.163.com/news/.</returns>
private static string _GetStandardUrlDepth(string url)
{
    string rest = url.Trim().ToLowerInvariant();
    string header = "http://";

    // Only a scheme at the very start counts; a URL embedded later (for example in a query string) is left alone.
    if (rest.StartsWith("https://", StringComparison.Ordinal))
    {
        header = "https://";
        rest = rest.Substring(header.Length);
    }
    else if (rest.StartsWith("http://", StringComparison.Ordinal))
    {
        rest = rest.Substring(header.Length);
    }

    int lastSlash = rest.LastIndexOf('/');
    if (lastSlash == -1)
    {
        // Bare host: www.163.com
        rest += "/";
    }
    else if (lastSlash < rest.Length - 1)
    {
        // Something follows the last slash: a file name (news/hello.htm) or a directory without its slash (news).
        bool looksLikeFile = rest.IndexOf('.', lastSlash) != -1;
        rest = looksLikeFile ? rest.Substring(0, lastSlash + 1) : rest + "/";
    }

    // Otherwise the URL already ends with "/", e.g. http://www.163.com/news/
    return header + rest;
}
/// <summary>
/// Derives a comma-separated keyword list from a piece of text.
/// </summary>
/// <remarks>
/// The text is split on ASCII and full-width punctuation. Inside each segment, runs of Latin
/// letters longer than two characters become keywords, and the text between those runs becomes
/// a keyword when its trimmed length exceeds two. Duplicates are dropped, keeping the first occurrence.
/// </remarks>
/// <param name="sInput">Text to analyse.</param>
/// <returns>
/// The keywords joined with ",", or <paramref name="sInput"/> itself when no keyword was found;
/// in both cases cut to at most 99 characters.
/// </returns>
public static string GetKeyWord(string sInput)
{
    // Separators: , + ; : _ ( ) plus their full-width forms, the ideographic full stop and comma,
    // and curly double quotes. Written as one character class so "+", "(" and ")" need no escaping.
    const string separators = "[,\uFF0C+\uFF0B\u3002;\uFF1B:\uFF1A\u201C\u201D\u3001_(\uFF08)\uFF09]";

    Regex latinWord = new Regex(@"[a-zA-Z]+", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);
    HashSet<string> seen = new HashSet<string>();
    List<string> keywords = new List<string>();

    foreach (string segment in Split(sInput, separators, 2))
    {
        string remainder = segment;
        foreach (Match word in latinWord.Matches(segment))
        {
            if (word.Value.Length > 2 && seen.Add(word.Value))
            {
                keywords.Add(word.Value);
            }

            // Turn the Latin run into a separator so the surrounding text can be split on it below.
            remainder = remainder.Replace(word.Value, ",");
        }

        foreach (string part in remainder.Split(','))
        {
            if (part.Trim().Length > 2 && seen.Add(part))
            {
                keywords.Add(part);
            }
        }
    }

    string result = keywords.Count > 0 ? string.Join(",", keywords.ToArray()) : sInput;
    return result.Length > 99 ? result.Substring(0, 99) : result;
}
/// <summary>
/// Cleans scraped HTML and rewrites the URLs it contains to absolute addresses.
/// </summary>
/// <remarks>
/// Steps, in order: strip script/iframe blocks and form-related tags; apply the caller's removal
/// patterns; make img sources absolute; apply the anti-hotlinking rules for the page's domain;
/// make anchor, stylesheet, background, flash and (for XML input) xml-stylesheet URLs absolute.
/// </remarks>
/// <param name="sOriContent">The original HTML.</param>
/// <param name="sOtherRemoveReg">Extra removal patterns, one regular expression per line; null or empty for none.</param>
/// <param name="sPageUrl">Address of the page the content came from.</param>
/// <param name="dtAntiLink">
/// Anti-hotlinking rules; the code reads the columns <c>Domain</c>, <c>Type</c> and <c>imgUrl</c>.
/// Null skips this step.
/// </param>
/// <returns>The processed content.</returns>
public static string GetContent(string sOriContent, string sOtherRemoveReg, string sPageUrl, DataTable dtAntiLink)
{
    if (string.IsNullOrEmpty(sOriContent))
    {
        return sOriContent;
    }

    const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase;
    const string proxyPrefix = "http://stat.580k.com/t.asp?url=";
    string sFormartted = sOriContent;

    // Remove dangerous markup.
    sFormartted = Regex.Replace(sFormartted, @"<script[\s\S]*?</script>", "", options);
    sFormartted = Regex.Replace(sFormartted, @"<iframe[^>]*>[\s\S]*?</iframe>", "", options);
    sFormartted = Regex.Replace(sFormartted, @"<input[\s\S]+?>|<form[\s\S]+?>|</form[\s\S]*?>|<select[\s\S]+?>?</select>|<textarea[\s\S]*?>?</textarea>|<file[\s\S]*?>|<noscript>|</noscript>", "", RegexOptions.IgnoreCase);

    // Caller-supplied removal patterns, one per line. Bare "\n" is accepted as well, because with
    // IgnorePatternWhitespace an unsplit newline would silently merge two patterns into one.
    if (!string.IsNullOrEmpty(sOtherRemoveReg))
    {
        string[] sOtherReg = sOtherRemoveReg.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string sRemoveReg in sOtherReg)
        {
            sFormartted = Replace(sFormartted, sRemoveReg, "", 0);
        }
    }

    // Image sources.
    sFormartted = _ReplaceUrl("<img[\\s\\S]+?src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", "src", sFormartted, sPageUrl);

    // Anti-hotlinking rules for this page's domain.
    if (dtAntiLink != null)
    {
        // GetDomain only returns word characters, '-', '.', ':' and '/', so the value cannot break out of the quoted filter.
        string domain = GetDomain(sPageUrl);
        foreach (DataRow dr in dtAntiLink.Select("Domain='" + domain + "'"))
        {
            string imgUrl = dr["imgUrl"].ToString();
            if (imgUrl.Length == 0)
            {
                continue; // string.Replace rejects an empty search value
            }

            switch (Convert.ToInt32(dr["Type"]))
            {
                case 1: // substitute the proxy address for the image URL
                    sFormartted = sFormartted.Replace(imgUrl, proxyPrefix);
                    break;
                default: // prepend the proxy address to the image URL
                    sFormartted = sFormartted.Replace(imgUrl, proxyPrefix + imgUrl);
                    break;
            }
        }
    }

    // Anchor links.
    sFormartted = _ReplaceUrl(@"<a[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);
    // Stylesheets.
    sFormartted = _ReplaceUrl(@"<link[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);
    // background attribute.
    sFormartted = _ReplaceUrl(@"background\s*=\s*(?:'(?<img>[^']+)'|""(?<img>[^""]+)""|(?<img>[^>\s]+))", "img", sFormartted, sPageUrl);
    // Style-based backgrounds: background-image:url(...)
    sFormartted = _ReplaceUrl(@"background-image\s*:\s*url\s*\x28(?<img>[^\x29]+)\x29", "img", sFormartted, sPageUrl);
    // Flash movies.
    sFormartted = _ReplaceUrl(@"<param\s[^>]+""movie""[^>]+value\s*=\s*""(?<flash>[^"">]+\x2eswf)""[^>]*>", "flash", sFormartted, sPageUrl);

    // XSL stylesheets, only for XML documents.
    if (IsXml(sFormartted))
    {
        sFormartted = _ReplaceUrl(@"<\x3fxml-stylesheet\s+[^\x3f>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)"")\s*[^\x3f>]*\x3f>", "href", sFormartted, sPageUrl);
    }

    return sFormartted;
}
/// <summary>
/// Rewrites the URL captured by a named group of every match to an absolute address.
/// </summary>
/// <remarks>
/// Only the span of the captured group is replaced, so text elsewhere in the tag that happens to
/// equal the URL (a tag name, another attribute) is not affected.
/// </remarks>
/// <param name="strRe">Expression locating the tag or declaration (compiled with Multiline, IgnorePatternWhitespace and IgnoreCase).</param>
/// <param name="subMatch">Name of the group that captures the URL.</param>
/// <param name="sFormartted">Content to rewrite.</param>
/// <param name="sPageUrl">Address of the page, passed to <c>GetUrl</c> as the base.</param>
/// <returns>The content with every captured URL replaced by the result of <c>GetUrl</c>.</returns>
private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
{
    Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
    StringBuilder output = new StringBuilder(sFormartted.Length);
    int copiedUpTo = 0;

    foreach (Match mc in re.Matches(sFormartted))
    {
        Group url = mc.Groups[subMatch];

        // Nothing to rewrite when the group did not capture or captured nothing.
        if (!url.Success || url.Length == 0 || url.Index < copiedUpTo)
        {
            continue;
        }

        output.Append(sFormartted, copiedUpTo, url.Index - copiedUpTo);
        output.Append(GetUrl(sPageUrl, url.Value));
        copiedUpTo = url.Index + url.Length;
    }

    output.Append(sFormartted, copiedUpTo, sFormartted.Length - copiedUpTo);
    return output.ToString();
}
/// <summary>
/// Tells whether the content contains an XML declaration or processing instruction starting with "&lt;?xml ".
/// </summary>
/// <param name="sFormartted">Content to inspect.</param>
/// <returns><c>true</c> when "&lt;?xml" followed by whitespace occurs anywhere in the content (case-insensitive).</returns>
public static bool IsXml(string sFormartted)
    => new Regex(@"<\x3fxml\s+", RegexOptions.IgnoreCase).IsMatch(sFormartted);
- #endregion 根据表达式,获得文章内容
- #region HTML相关操作
/// <summary>
/// Strips HTML tags, common entities and line-control characters from a string.
/// </summary>
/// <remarks>
/// Removes tags, the entities &amp;nbsp; &amp;gt; &amp;lt; &amp;amp;, any remaining '&lt;', '&gt;' and '&amp;'
/// characters, and carriage returns, line feeds and tabs.
/// </remarks>
/// <param name="sHtml">HTML to clean; may be null.</param>
/// <returns>The cleaned text, or the empty string when the input is null or empty.</returns>
public static string ClearTag(string sHtml)
{
    if (string.IsNullOrEmpty(sHtml))
    {
        return "";
    }

    // Entities come before the bare characters so "&nbsp;" disappears as a unit instead of leaving "nbsp;" behind.
    Regex re = new Regex(
        @"(<[^>\s]*\b(\w)+\b[^>]*>)|(<>)|(&nbsp;)|(&gt;)|(&lt;)|(&amp;)|(>)|(<)|(&)|\r|\n|\t",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
    return re.Replace(sHtml, "");
}
/// <summary>
/// Removes everything matched by a caller-supplied expression.
/// </summary>
/// <param name="sHtml">HTML to clean.</param>
/// <param name="sRegex">Expression describing what to remove (compiled with IgnoreCase, Multiline and IgnorePatternWhitespace).</param>
/// <returns>The input with every match deleted.</returns>
public static string ClearTag(string sHtml, string sRegex)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;
    var eraser = new Regex(sRegex, options);
    return eraser.Replace(sHtml, string.Empty);
}
/// <summary>
/// Converts HTML into JavaScript that writes it out line by line with <c>document.writeln</c>.
/// </summary>
/// <remarks>
/// Each input line (separated by CRLF, CR or LF) becomes one <c>document.writeln("...");</c>
/// statement terminated by CRLF. Backslashes and double quotes are escaped so the generated
/// string literals reproduce the original text.
/// </remarks>
/// <param name="sHtml">HTML to convert; null is treated as empty.</param>
/// <returns>The generated JavaScript.</returns>
public static string ConvertToJavascript(string sHtml)
{
    StringBuilder sText = new StringBuilder();

    // Split on every newline form: a raw CR or LF inside a JavaScript string literal is a syntax error.
    string[] lines = Regex.Split(sHtml ?? "", @"\r\n|\r|\n");
    foreach (string line in lines)
    {
        // Backslashes first, otherwise the backslashes added for the quotes would be doubled.
        string escaped = line.Replace("\\", "\\\\").Replace("\"", "\\\"");
        sText.Append("document.writeln(\"").Append(escaped).Append("\");\r\n");
    }

    return sText.ToString();
}
/// <summary>
/// Removes a given tag from HTML, optionally together with its content.
/// </summary>
/// <remarks>
/// The tag name is inserted into a regular expression unescaped, so it may itself be an
/// expression. Matching is case-insensitive and stops at the end of the tag name, so removing
/// "b" does not touch "br" or "body".
/// </remarks>
/// <param name="str">HTML to process.</param>
/// <param name="tag">Tag name to remove; null or blank leaves the input unchanged.</param>
/// <param name="isContent"><c>true</c> to remove the element including its content; <c>false</c> to remove only the opening and closing tags.</param>
/// <returns>The processed HTML.</returns>
public static string DelTag(string str, string tag, bool isContent)
{
    if (string.IsNullOrWhiteSpace(tag))
    {
        return str;
    }

    tag = tag.Trim();

    // (?![\w-]) ends the tag name, so a longer name sharing the same prefix is not matched.
    if (isContent)
    {
        return Regex.Replace(str, string.Format("<({0})(?![\\w-])[^>]*>[\\s\\S]*?</\\1\\s*>", tag), "", RegexOptions.IgnoreCase);
    }

    // Opening and closing tags alike; the final '>' stays optional so an unterminated tag at the end is removed too.
    return Regex.Replace(str, string.Format("</?(?:{0})(?![\\w-])[^>]*>?", tag), "", RegexOptions.IgnoreCase);
}
/// <summary>
/// Removes several tags from HTML by calling <c>DelTag</c> once per tag.
/// </summary>
/// <param name="str">HTML to process.</param>
/// <param name="tagA">Comma-separated tag names.</param>
/// <param name="isContent">Passed through to <c>DelTag</c> for every tag.</param>
/// <returns>The HTML after all tags have been processed in the order given.</returns>
public static string DelTagArray(string str, string tagA, bool isContent)
{
    string[] tags = tagA.Split(',');
    string result = str;

    // Each pass works on the output of the previous one.
    for (int i = 0; i < tags.Length; i++)
    {
        result = DelTag(result, tags[i], isContent);
    }

    return result;
}
- #endregion HTML相关操作
- #region 根据内容获得链接
/// <summary>
/// Extracts the link target from a fragment of HTML.
/// </summary>
/// <remarks>
/// A link embedded in a javascript href/onclick attribute (quoted with ' or with the entity
/// &amp;quot;) takes precedence. Otherwise the href of the first anchor tag is HTML-decoded and
/// then cleaned of ";..." segments and "javascript:..." remainders via <c>RemoveByReg</c>.
/// </remarks>
/// <param name="sContent">HTML fragment.</param>
/// <returns>The link, or the empty string when none is found.</returns>
public static string GetLink(string sContent)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;

    // Link inside a javascript handler (heuristic). The second alternative is written with &quot;
    // because a bare double quote would terminate this verbatim string literal.
    Regex js = new Regex(@"(href|onclick)=[^>]+javascript[^>]+(('(?<href>[\w\d/-]+\.[^']*)')|(&quot;(?<href>[\w\d/-]+\.[^;]*)&quot;))[^>]*>", options);
    Match viaScript = js.Match(sContent);
    if (viaScript.Success)
    {
        return viaScript.Groups["href"].Value;
    }

    Regex re = new Regex(@"<a\s+[^>]*href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", options);
    Match viaAnchor = re.Match(sContent);
    if (!viaAnchor.Success)
    {
        return "";
    }

    string href = System.Web.HttpUtility.HtmlDecode(viaAnchor.Groups["href"].Value);
    return RemoveByReg(href, @";[^?&]*|javascript:.*");
}
/// <summary>
/// Extracts the text enclosed by an anchor tag.
/// </summary>
/// <param name="sContent">HTML containing the link.</param>
/// <returns>
/// Everything between the first opening anchor tag and the last closing one; the empty string
/// when there is no anchor or when the content contains a mailto link.
/// </returns>
public static string GetTextByLink(string sContent)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    var anchor = new Regex(@"<a(?:\s+[^>]*)?>([\s\S]*)?</a>", options);
    var mailLink = new Regex(@"(href|onclick)=[^>]+mailto[^>]+@[^>]+>", options);

    // E-mail links never yield text.
    if (mailLink.IsMatch(sContent))
    {
        return "";
    }

    Match hit = anchor.Match(sContent);
    return hit.Success ? hit.Groups[1].Value : "";
}
/// <summary>
/// Collects the meaningful links of a page into a dictionary of absolute URL to link text.
/// </summary>
/// <remarks>
/// Anchors are processed from last to first. A link is skipped when its address is only an
/// "http.../" style address, when its text is too short after the navigation/advertising words
/// have been removed, when the text contains quotes, when the absolute address is no longer than
/// 18 characters, or when it points at the bare domain. Addresses and texts already present in
/// <paramref name="lisA"/> are not added again.
/// </remarks>
/// <param name="sContent">HTML of the page.</param>
/// <param name="sUrl">Address of the page, used to resolve relative links.</param>
/// <param name="lisA">Receives the links (key: absolute URL, value: link text).</param>
private static void _GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisA)
{
    // Words that mark navigation, advertising or otherwise meaningless links. Brackets are matched
    // literally in both ASCII and full-width form (unescaped they would be regex syntax).
    const string sFilter =
        @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采"
        + @"|登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入"
        + @"|更多|more|专题|精选|热卖|热销|推荐|精彩"
        + @"|加盟|联盟|友情|链接|相关"
        + @"|订阅|阅读器|RSS"
        + @"|免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私"
        + @"|〖|〗|【|】|\(|\)|\uFF08|\uFF09|\[|\]|\uFF3B|\uFF3D|『|』|\.";

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    Regex re = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
    Regex re2 = new Regex(@"""|'", options);
    MatchCollection mcs = re.Matches(sContent);

    for (int i = mcs.Count - 1; i >= 0; i--)
    {
        Match mc = mcs[i];
        string strHref = GetLink(mc.Value).Trim();

        // Links written out by JavaScript carry escaped quotes.
        strHref = strHref.Replace("\\\"", "");
        strHref = strHref.Replace("\\\'", "");

        // Skip addresses that start with "http" and end with "/".
        string strTemp = RemoveByReg(strHref, @"^http.*/$");
        if (strTemp.Length < 2)
        {
            continue;
        }

        // Filter out advertising or meaningless links.
        string strText = ClearTag(GetTextByLink(mc.Value)).Trim();
        strTemp = RemoveByReg(strText, sFilter);

        // NOTE(review): the byte count depends on the platform's default encoding.
        if (Encoding.Default.GetBytes(strTemp).Length < 9)
        {
            continue;
        }

        if (re2.IsMatch(strText))
        {
            continue;
        }

        // Switch to the absolute address.
        strHref = GetUrlByRelative(sUrl, strHref);
        if (strHref.Length <= 18) // e.g. http://www.163.com is 18 characters
        {
            continue;
        }

        // Drop the fragment ("#...") part.
        int charIndex = strHref.IndexOf('#');
        if (charIndex > -1)
        {
            strHref = strHref.Substring(0, charIndex);
        }

        // Skip links that point at the bare domain.
        strHref = strHref.Trim(new char[] { '/', '\\' });
        string tmpDomainURL = GetDomain(strHref);
        if (strHref.Equals(tmpDomainURL, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!lisA.ContainsKey(strHref) && !lisA.ContainsValue(strText))
        {
            lisA.Add(strHref, strText);
        }
    }
}
/// <summary>
/// Tells whether the HTML contains a script tag that loads an external file.
/// </summary>
/// <param name="sHtml">HTML to inspect.</param>
/// <returns><c>true</c> when a script tag with a src attribute is present.</returns>
public static bool IsExistsScriptLink(string sHtml)
{
    const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase;
    var scriptWithSrc = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", options);
    bool found = scriptWithSrc.IsMatch(sHtml);
    return found;
}
/// <summary>
/// Reads the entries of a feed, discarding the entry descriptions.
/// </summary>
/// <param name="sContent">Feed XML.</param>
/// <param name="sUrl">Address of the feed, used to resolve relative links.</param>
/// <returns>The result of the three-argument overload: a dictionary of link to title.</returns>
public static Dictionary<string, string> GetLinksFromRss(string sContent, string sUrl)
{
    // The descriptions are collected into a throw-away dictionary.
    var ignoredDescriptions = new Dictionary<string, string>();
    Dictionary<string, string> links = GetLinksFromRss(sContent, sUrl, ref ignoredDescriptions);
    return links;
}
/// <summary>
/// Reads the entries of a feed.
/// </summary>
/// <remarks>
/// The formats are tried in this order and the first one that yields items wins: RSS 2.0,
/// RSS 1.0 (RDF), Atom 0.3, Atom 1.0. Entries are read from last to first. Failures are logged
/// through <c>LogManager.Error</c> and never thrown; an entry that fails part-way (duplicate
/// link, missing element) may appear in the result without a description.
/// </remarks>
/// <param name="sContent">Feed XML.</param>
/// <param name="sUrl">Address of the feed, used to resolve relative links.</param>
/// <param name="lisDes">Receives link to description (or Atom content) for every entry read.</param>
/// <returns>A dictionary of link to title; empty when the content cannot be parsed or has no entries.</returns>
public static Dictionary<string, string> GetLinksFromRss(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
{
    Dictionary<string, string> listResult = new Dictionary<string, string>();
    XmlDocument xml = new XmlDocument();

    // Feeds are untrusted input: never resolve external entities or DTDs.
    xml.XmlResolver = null;

    try
    {
        xml.LoadXml(sContent.Trim());
    }
    catch (Exception e)
    {
        LogManager.Error(e);
        return listResult;
    }

    XmlNamespaceManager nsMgr = new XmlNamespaceManager(xml.NameTable);
    nsMgr.AddNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
    nsMgr.AddNamespace("rss", "http://purl.org/rss/1.0/");
    nsMgr.AddNamespace("atom", "http://purl.org/atom/ns#");
    nsMgr.AddNamespace("atom10", "http://www.w3.org/2005/Atom");

    // RSS 2.0
    if (_ReadFeedItems(xml, nsMgr, "/rss/channel/item", "link", "title", "description", false, sUrl, listResult, lisDes))
    {
        return listResult;
    }

    // RSS 1.0 (RDF)
    if (_ReadFeedItems(xml, nsMgr, "/rdf:RDF//rss:item", "rss:link", "rss:title", "rss:description", false, sUrl, listResult, lisDes))
    {
        return listResult;
    }

    // Atom 0.3
    if (_ReadFeedItems(xml, nsMgr, "/atom:feed/atom:entry", "atom:link", "atom:title", "atom:content", true, sUrl, listResult, lisDes))
    {
        return listResult;
    }

    // Atom 1.0
    _ReadFeedItems(xml, nsMgr, "/atom10:feed/atom10:entry", "atom10:link", "atom10:title", "atom10:content", true, sUrl, listResult, lisDes);
    return listResult;
}

/// <summary>
/// Reads all items selected by <paramref name="itemPath"/> into the two dictionaries.
/// </summary>
/// <returns><c>true</c> when at least one item node was found, whether or not every item could be read.</returns>
private static bool _ReadFeedItems(XmlDocument xml, XmlNamespaceManager nsMgr, string itemPath, string linkPath, string titlePath, string bodyPath, bool linkInHrefAttribute, string sUrl, Dictionary<string, string> titles, Dictionary<string, string> bodies)
{
    try
    {
        XmlNodeList nodes = xml.SelectNodes(itemPath, nsMgr);
        if (nodes.Count == 0)
        {
            return false;
        }

        for (int i = nodes.Count - 1; i >= 0; i--)
        {
            try
            {
                XmlNode linkNode = nodes[i].SelectSingleNode(linkPath, nsMgr);
                string rawLink = linkInHrefAttribute ? linkNode.Attributes["href"].InnerText : linkNode.InnerText;
                string sLink = GetUrlByRelative(sUrl, rawLink);
                titles.Add(sLink, nodes[i].SelectSingleNode(titlePath, nsMgr).InnerText);
                bodies.Add(sLink, nodes[i].SelectSingleNode(bodyPath, nsMgr).InnerText);
            }
            catch (Exception e)
            {
                // One bad entry must not stop the remaining ones.
                LogManager.Error(e);
            }
        }

        return true;
    }
    catch (Exception e)
    {
        LogManager.Error(e);
        return false;
    }
}
/// <summary>
/// Reads the feed-level title from an RSS 2.0, RSS 1.0 (RDF) or Atom document.
/// </summary>
/// <param name="sContent">Feed XML.</param>
/// <returns>The channel/feed title, or an empty string when the content cannot be parsed or has no title.</returns>
public static string GetTitleFromRss(string sContent)
{
    if (string.IsNullOrEmpty(sContent))
    {
        return "";
    }

    try
    {
        XmlDocument xml = new XmlDocument();
        // Feed text is untrusted network input: never resolve external DTDs or entities.
        xml.XmlResolver = null;
        xml.LoadXml(sContent.Trim());

        XmlNamespaceManager nsMgr = new XmlNamespaceManager(xml.NameTable);
        nsMgr.AddNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
        nsMgr.AddNamespace("rss", "http://purl.org/rss/1.0/");
        nsMgr.AddNamespace("atom03", "http://purl.org/atom/ns#");
        nsMgr.AddNamespace("atom", "http://www.w3.org/2005/Atom");

        // Probed in order: RSS 2.0, RSS 1.0 (RDF), Atom 0.3, Atom 1.0.
        string[] titlePaths =
        {
            "/rss/channel/title",
            "/rdf:RDF/rss:channel/rss:title",
            "/atom03:feed/atom03:title",
            "/atom:feed/atom:title"
        };
        foreach (string path in titlePaths)
        {
            XmlNode node = xml.SelectSingleNode(path, nsMgr);
            if (node != null)
            {
                return node.InnerText;
            }
        }
    }
    catch (Exception e)
    {
        LogManager.Error(e);
    }
    return "";
}
/// <summary>
/// Removes every match of a regular expression from the text.
/// </summary>
/// <param name="sContent">Text (typically HTML) to clean.</param>
/// <param name="sRegex">Pattern; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
/// <returns>The text with all matches removed.</returns>
public static string RemoveByReg(string sContent, string sRegex)
{
    Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    // Replace at the matched positions only. Deleting the matched *text* with
    // string.Replace would also hit identical text that did not match in context
    // (anchors, look-arounds) and throws on an empty match.
    return re.Replace(sContent, "");
}
/// <summary>
/// Replaces every match of a regular expression with the given replacement.
/// </summary>
/// <param name="sContent">Text (typically HTML) to process.</param>
/// <param name="sReplace">Replacement pattern inserted for each match.</param>
/// <param name="sRegex">Pattern; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
/// <returns>The text after replacement.</returns>
public static string ReplaceByReg(string sContent, string sReplace, string sRegex)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    return new Regex(sRegex, options).Replace(sContent, sReplace);
}
/// <summary>
/// Returns what lies inside the body element of an HTML document.
/// </summary>
/// <param name="sContent">HTML source.</param>
/// <returns>The source with everything up to and including the opening body tag, and the closing body/html tag pair, removed.</returns>
public static string GetBody(string sContent)
{
    const RegexOptions common = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace;

    // Drop the document prefix through the opening <body ...> tag.
    string afterOpeningTag = Regex.Replace(sContent, @"[\s\S]*?<\bbody\b[^>]*>", "", common);

    // Drop the closing </body></html> pair, scanning from the end of the text.
    return Regex.Replace(afterOpeningTag, @"</\bbody\b[^>]*>\s*</html>", "", common | RegexOptions.RightToLeft);
}
- #endregion 根据超链接地址获取页面内容
- #region 根据内容作字符串分析
/// <summary>
/// Returns the first match of a regular expression, with trailing underscores trimmed.
/// </summary>
/// <param name="sContent">Text (typically HTML) to search.</param>
/// <param name="sRegex">Pattern; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
/// <returns>The matched text without trailing '_' characters, or an empty string when nothing matches.</returns>
public static string GetTextByReg(string sContent, string sRegex)
{
    Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    Match mc = re.Match(sContent);
    if (!mc.Success)
    {
        return "";
    }
    // Trim only at the end; underscores inside the match are part of the text.
    return mc.Value.TrimEnd('_');
}
- // charset=[\s]*(?<Coding>[^'"]+)[\s]*['"]?[\s]*[/]?>
/// <summary>
/// Returns the text captured by a named group of the first regular-expression match.
/// </summary>
/// <param name="sContent">Text (typically HTML) to search.</param>
/// <param name="sRegex">Pattern; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
/// <param name="sGroupName">Name of the capture group to return.</param>
/// <returns>The group's value, or an empty string when the pattern does not match.</returns>
public static string GetTextByReg(string sContent, string sRegex, string sGroupName)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    Match found = new Regex(sRegex, options).Match(sContent);
    return found.Success ? found.Groups[sGroupName].Value : "";
}
/// <summary>
/// Resolves a (possibly relative) link against the URL of the page it was found on.
/// </summary>
/// <param name="sUrl">Absolute URL of the containing page.</param>
/// <param name="sRUrl">Relative or absolute link.</param>
/// <returns>The absolute URL, or <paramref name="sUrl"/> unchanged when either value cannot be parsed.</returns>
public static string GetUrlByRelative(string sUrl, string sRUrl)
{
    try
    {
        Uri baseUri = new Uri(sUrl);
        string[] segments = baseUri.Segments;
        if (segments.Length > 1)
        {
            string lastSegment = segments[segments.Length - 1];
            // A last path segment without an extension (no '.', or only a leading '.')
            // is treated as a directory, so relative links resolve beneath it.
            bool looksLikeDirectory = !lastSegment.EndsWith("/", StringComparison.Ordinal)
                                      && lastSegment.IndexOf('.') < 1;
            if (looksLikeDirectory)
            {
                // Append the slash to the path itself; appending to the raw URL would
                // put it into the query string or fragment when one is present.
                baseUri = new Uri(baseUri.GetLeftPart(UriPartial.Path) + "/" + baseUri.Query);
            }
        }
        return new Uri(baseUri, sRUrl).AbsoluteUri;
    }
    catch (Exception)
    {
        // Best effort: unparsable input falls back to the page URL.
        return sUrl;
    }
}
/// <summary>
/// Collects the "href" capture group of every regular-expression match.
/// </summary>
/// <param name="sContent">Text (typically HTML) to search.</param>
/// <param name="sRegex">Pattern containing a group named "href"; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
/// <returns>One entry per match, in document order.</returns>
public static List<string> GetListByReg(string sContent, string sRegex)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
    List<string> hrefs = new List<string>();
    Regex matcher = new Regex(sRegex, options);
    for (Match current = matcher.Match(sContent); current.Success; current = current.NextMatch())
    {
        hrefs.Add(current.Groups["href"].Value);
    }
    return hrefs;
}
/// <summary>
/// Reduces a URL to its scheme and authority (host plus any non-default port).
/// </summary>
/// <param name="sUrl">Absolute URL.</param>
/// <returns>"scheme://authority", or <paramref name="sUrl"/> unchanged when it cannot be parsed.</returns>
public static string GetDomainUrl(string sUrl)
{
    try
    {
        Uri parsed = new Uri(sUrl);
        return string.Concat(parsed.Scheme, "://", parsed.Authority);
    }
    catch
    {
        // Best effort: unparsable input is handed back as is.
        return sUrl;
    }
}
- #endregion
- #region 杂项
/// <summary>
/// Strips markup from HTML and returns the remaining text with line breaks removed.
/// </summary>
/// <param name="sHtml">HTML source.</param>
/// <returns>
/// Plain text. head, script, iframe, style, title, meta, option and anchor elements are
/// dropped together with their content; every other tag is removed but its text is kept.
/// </returns>
public static string GetTxtFromHtml(this string sHtml)
{
    // Whole <head> section.
    string del = @"<head[^>]*>[\s\S]*?</head>";
    string content = RemoveByReg(sHtml, del);

    // Elements whose content is never visible text.
    del = @"(<script[^>]*>[\s\S]*?</script>)|(<IFRAME[^>]*>[\s\S]*?</IFRAME>)|(<style[^>]*>[\s\S]*?</style>|<title[^>]*>[\s\S]*?</title>|<meta[^>]*>|<option[^>]*>[\s\S]*?</option>)";
    content = RemoveByReg(content, del);

    // Non-breaking spaces (entity or raw character), newlines and tabs. The entity is
    // spelled out because the helpers compile with IgnorePatternWhitespace: a literal
    // blank inside the group would be discarded, leaving an empty group that matches
    // everywhere.
    del = @"(&nbsp;)|(\u00A0)|([\n\t]+)";
    content = RemoveByReg(content, del);

    // Structural tags. "(\s[^>]*)?" accepts the same tags as a nested "(\s+[^>]*)*"
    // but cannot backtrack exponentially on an unterminated tag.
    string re = @"(<table(\s[^>]*)?>)|(<td(\s[^>]*)?>)|(<tr(\s[^>]*)?>)|(<p(\s[^>]*)?>)|(<div(\s[^>]*)?>)|(<ul(\s[^>]*)?>)|(<li(\s[^>]*)?>)|</table>|</td>|</tr>|</p>|<br>|</div>|</li>|</ul>|<p />|<br />";
    content = ReplaceByReg(content, "", re);
    content = ReplaceByReg(content, "", @"[\f\n\r\v]+");

    // Links are dropped including their text.
    content = RemoveByReg(content, @"<a(\s[^>]*)?>[\s\S]*?</a>");

    // Any remaining tag: keep only the text between tags.
    content = RemoveByReg(content, "<[^>]+>");
    content = content.Replace("\n", "");
    content = content.Replace("\r", "");
    content = content.Trim();
    return content;
}
/// <summary>
/// Strips markup from HTML like <c>GetTxtFromHtml</c>, but keeps line breaks in the text.
/// </summary>
/// <param name="sHtml">HTML source.</param>
/// <returns>
/// Plain text with line breaks preserved. head, script, iframe, style, title, meta, option
/// and anchor elements are dropped together with their content; every other tag is removed
/// but its text is kept.
/// </returns>
public static string GetTxtFromHtml2(this string sHtml)
{
    // Whole <head> section.
    string del = @"<head[^>]*>[\s\S]*?</head>";
    string content = RemoveByReg(sHtml, del);

    // Elements whose content is never visible text.
    del = @"(<script[^>]*>[\s\S]*?</script>)|(<IFRAME[^>]*>[\s\S]*?</IFRAME>)|(<style[^>]*>[\s\S]*?</style>|<title[^>]*>[\s\S]*?</title>|<meta[^>]*>|<option[^>]*>[\s\S]*?</option>)";
    content = RemoveByReg(content, del);

    // Non-breaking spaces (entity or raw character) and tabs; newlines stay. The entity
    // is spelled out because the helpers compile with IgnorePatternWhitespace: a literal
    // blank inside the group would be discarded, leaving an empty group that matches
    // everywhere.
    del = @"(&nbsp;)|(\u00A0)|([\t]+)";
    content = RemoveByReg(content, del);

    // Structural tags. "(\s[^>]*)?" accepts the same tags as a nested "(\s+[^>]*)*"
    // but cannot backtrack exponentially on an unterminated tag.
    string re = @"(<table(\s[^>]*)?>)|(<td(\s[^>]*)?>)|(<tr(\s[^>]*)?>)|(<p(\s[^>]*)?>)|(<div(\s[^>]*)?>)|(<ul(\s[^>]*)?>)|(<li(\s[^>]*)?>)|</table>|</td>|</tr>|</p>|<br>|</div>|</li>|</ul>|<p />|<br />";
    content = ReplaceByReg(content, "", re);

    // Links are dropped including their text.
    content = RemoveByReg(content, @"<a(\s[^>]*)?>[\s\S]*?</a>");

    // Any remaining tag: keep only the text between tags.
    content = RemoveByReg(content, "<[^>]+>");
    content = content.Trim();
    return content;
}
- #endregion
/// <summary>
/// Removes one trailing occurrence of a suffix.
/// </summary>
/// <param name="sOrg">Original text.</param>
/// <param name="sEnd">Suffix to strip.</param>
/// <returns>
/// <paramref name="sOrg"/> without the suffix when it ends with it (ordinal comparison);
/// otherwise <paramref name="sOrg"/> unchanged. Null or empty arguments are returned as is.
/// </returns>
public static string RemoveEndWith(string sOrg, string sEnd)
{
    if (string.IsNullOrEmpty(sOrg) || string.IsNullOrEmpty(sEnd))
    {
        return sOrg;
    }
    if (!sOrg.EndsWith(sEnd, StringComparison.Ordinal))
    {
        return sOrg;
    }
    // Cut at the end of the string; IndexOf would locate the first occurrence instead.
    return sOrg.Substring(0, sOrg.Length - sEnd.Length);
}
- #region 根据超链接地址获取页面内容
/// <summary>
/// Downloads a page and decodes it with automatic encoding detection.
/// </summary>
/// <param name="sUrl">URL to fetch.</param>
/// <returns>Page content as returned by the two-argument overload called with "auto".</returns>
public static string GetHtmlByUrl(string sUrl) => GetHtmlByUrl(sUrl, "auto");
/// <summary>
/// Downloads a page and decodes it with the given encoding.
/// </summary>
/// <param name="sUrl">URL to fetch.</param>
/// <param name="sCoding">Encoding name passed through to the by-reference overload.</param>
/// <returns>Page content as returned by the by-reference overload.</returns>
public static string GetHtmlByUrl(string sUrl, string sCoding)
{
    // The by-reference overload rewrites the URL it is given; that change stays
    // local here because the argument is a by-value copy.
    string result = GetHtmlByUrl(ref sUrl, sCoding);
    return result;
}
/// <summary>
/// Downloads a page and decodes it to text. <paramref name="sUrl"/> is updated to the
/// URI that actually answered the request.
/// </summary>
/// <param name="sUrl">URL to fetch; replaced with the response URI.</param>
/// <param name="sCoding">
/// Encoding name, or null/empty/"auto" to detect it: first from the Content-Type response
/// header, then from a meta tag or XML declaration inside the document.
/// </param>
/// <returns>
/// The decoded text. gb2312 is used when no charset can be detected or the detected name is
/// not a known encoding. An empty string is returned when the request fails.
/// </returns>
public static string GetHtmlByUrl(ref string sUrl, string sCoding)
{
    try
    {
        using (HttpWebResponse response = _MyGetResponse(sUrl))
        {
            if (response == null)
            {
                return "";
            }
            sUrl = response.ResponseUri.AbsoluteUri;
            string contentType = response.GetResponseHeader("Content-Type");

            byte[] buffer;
            using (Stream stream = response.GetResponseStream())
            {
                buffer = GetContent(stream);
            }

            string charset;
            if (string.IsNullOrEmpty(sCoding) || string.Equals(sCoding, "auto", StringComparison.OrdinalIgnoreCase))
            {
                // 1) Response header. Accepts quoted values and stops at ';' or blanks.
                Match m = Regex.Match(contentType, "charset\\s*=\\s*[\"']?(?<charset>[^\\s;\"']+)", RegexOptions.IgnoreCase);
                charset = m.Success ? m.Groups["charset"].Value : "";
                if (charset == "-8") charset = "utf-8";

                if (charset.Length == 0)
                {
                    // 2) The document itself, read as gb2312 just to find the declaration.
                    // Covers <meta ... content="...; charset=x">, <meta charset="x"> and
                    // an XML declaration's encoding attribute.
                    string provisional = System.Text.Encoding.GetEncoding("gb2312").GetString(buffer);
                    m = Regex.Match(
                        provisional,
                        "<meta[^>]*charset\\s*=\\s*[\"']?(?<charset>[^\\s\"'>/;]+)|xml[^>]+encoding\\s*=\\s*[\"']?(?<charset>[^\\s\"'>?]+)",
                        RegexOptions.IgnoreCase);
                    if (!m.Success)
                    {
                        // No declaration anywhere: the gb2312 reading is the best available.
                        return provisional;
                    }
                    charset = m.Groups["charset"].Value;
                }
            }
            else
            {
                charset = sCoding.ToLowerInvariant();
            }

            try
            {
                return System.Text.Encoding.GetEncoding(charset).GetString(buffer);
            }
            catch (ArgumentException)
            {
                // Unknown encoding name.
                return System.Text.Encoding.GetEncoding("gb2312").GetString(buffer);
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: callers receive an empty page, and the cause is logged.
        LogManager.Error(e);
        return "";
    }
}
/// <summary>
/// Issues an HTTP GET and returns the response. A request that fails with a
/// WebException is retried once with a cookie container attached.
/// </summary>
/// <param name="sUrl">Absolute URL; an invalid URL makes the Uri constructor throw.</param>
/// <returns>The response, or null when both attempts fail or any other exception occurs.</returns>
private static HttpWebResponse _MyGetResponse(string sUrl)
{
    const int requestTimeoutMs = 10000;
    Uri target = new Uri(sUrl);

    // Attempt 0: plain request. Attempt 1: same request with cookies enabled.
    for (int attempt = 0; attempt < 2; attempt++)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
            request.ReadWriteTimeout = 120000; // 120 seconds for stream reads/writes
            request.Timeout = requestTimeoutMs;
            request.MaximumAutomaticRedirections = 50;
            request.MaximumResponseHeadersLength = 5;
            request.AllowAutoRedirect = true;
            if (attempt > 0)
            {
                request.CookieContainer = new CookieContainer();
            }
            request.UserAgent = "Mozilla/6.0 (compatible; MSIE 6.0; Windows NT 5.1)";
            return (HttpWebResponse)request.GetResponse();
        }
        catch (WebException)
        {
            // Fall through to the next attempt, if any.
        }
        catch
        {
            return null;
        }
    }
    return null;
}
/// <summary>
/// Reads a stream to its end and returns the bytes.
/// </summary>
/// <param name="stream">Stream to drain.</param>
/// <returns>
/// All bytes read. If reading fails part-way, the exception is logged and the bytes
/// read up to that point are returned.
/// </returns>
private static byte[] GetContent(Stream stream)
{
    const int BUFFSIZE = 4096;
    // MemoryStream grows a single byte buffer; collecting into an ArrayList would box
    // every byte individually.
    using (MemoryStream collected = new MemoryStream())
    {
        try
        {
            byte[] buffer = new byte[BUFFSIZE];
            int count;
            while ((count = stream.Read(buffer, 0, BUFFSIZE)) > 0)
            {
                collected.Write(buffer, 0, count);
            }
        }
        catch (Exception e)
        {
            LogManager.Error(e);
        }
        return collected.ToArray();
    }
}
/// <summary>
/// Requests a URL and returns the response headers as text.
/// </summary>
/// <param name="sUrl">Absolute URL; an invalid URL makes the Uri constructor throw.</param>
/// <returns>
/// One "name:value" line per header, each terminated by CRLF. Empty when the request
/// fails (the exception is logged).
/// </returns>
public static string GetHttpHead(string sUrl)
{
    StringBuilder sHead = new StringBuilder();
    Uri uri = new Uri(sUrl);
    try
    {
        WebRequest req = WebRequest.Create(uri);
        // Dispose the response so the underlying connection is released promptly.
        using (WebResponse resp = req.GetResponse())
        {
            WebHeaderCollection headers = resp.Headers;
            foreach (string sKey in headers.AllKeys)
            {
                sHead.Append(sKey).Append(':').Append(headers[sKey]).Append("\r\n");
            }
        }
    }
    catch (Exception e)
    {
        LogManager.Error(e);
    }
    return sHead.ToString();
}
/// <summary>
/// Finds the frame elements of a page and returns their sources.
/// </summary>
/// <param name="url">URL of the page, passed on for resolving each src value.</param>
/// <param name="content">HTML of the page.</param>
/// <returns>One entry per frame tag that has a src attribute; empty when there is none.</returns>
public static string[] DealWithFrame(string url, string content)
{
    // src may be double-quoted, single-quoted or unquoted.
    const string framePattern = @"<frame\s+[^>]*src\s*=\s*(?:""(?<src>[^""]+)""|'(?<src>[^']+)'|(?<src>[^\s>""']+))[^>]*>";
    return DealWithFrame(framePattern, url, content);
}
/// <summary>
/// Finds the iframe elements of a page and returns their sources.
/// </summary>
/// <param name="url">URL of the page, passed on for resolving each src value.</param>
/// <param name="content">HTML of the page.</param>
/// <returns>One entry per iframe tag that has a src attribute; empty when there is none.</returns>
public static string[] DealWithIFrame(string url, string content)
{
    // src may be double-quoted, single-quoted or unquoted.
    const string iframePattern = @"<iframe\s+[^>]*src\s*=\s*(?:""(?<src>[^""]+)""|'(?<src>[^']+)'|(?<src>[^\s>""']+))[^>]*>";
    return DealWithFrame(iframePattern, url, content);
}
/// <summary>
/// Applies a tag pattern to the page and passes every captured "src" value through GetUrl.
/// </summary>
/// <param name="strReg">Pattern with a group named "src"; matched case-insensitively.</param>
/// <param name="url">URL of the page, handed to GetUrl together with each src value.</param>
/// <param name="content">HTML of the page.</param>
/// <returns>The GetUrl results in document order.</returns>
private static string[] DealWithFrame(string strReg, string url, string content)
{
    List<string> sources = new List<string>();
    Regex tagPattern = new Regex(strReg, RegexOptions.IgnoreCase);
    foreach (Match hit in tagPattern.Matches(content))
    {
        sources.Add(GetUrl(url, hit.Groups["src"].Value));
    }
    return sources.ToArray();
}
- #endregion 根据超链接地址获取页面内容
- #region 获得多个页面
/// <summary>
/// Fetches several pages over one raw TCP connection to the host of the first URL.
/// </summary>
/// <param name="listUrl">Pairs of caller-defined key and URL; the connection is opened to the host and port of the first URL.</param>
/// <param name="sCoding">Encoding name used both for the request bytes and for decoding the received bytes.</param>
/// <returns>
/// Pairs of key and raw received text (everything read from the socket for that request).
/// Contains only the entries completed before a failure; empty when <paramref name="listUrl"/>
/// is null or empty.
/// </returns>
public static List<KeyValuePair<int, string>> GetHtmlByUrlList(List<KeyValuePair<int, string>> listUrl, string sCoding)
{
    int iTimeOut = 120000;
    List<KeyValuePair<int, string>> listResult = new List<KeyValuePair<int, string>>();
    if (listUrl == null || listUrl.Count == 0)
    {
        return listResult;
    }

    Socket sock = null;
    try
    {
        // Set-up: resolve the first URL's host and connect once.
        Uri site = new Uri(listUrl[0].Value);
        var ipHostInfo = Dns.GetHostEntry(site.Host);
        IPAddress ipAddress = ipHostInfo.AddressList[0];
        IPEndPoint remoteEP = new IPEndPoint(ipAddress, site.Port);
        // Match the socket family to the resolved address; the first address may be IPv6.
        sock = new Socket(ipAddress.AddressFamily, SocketType.Stream, ProtocolType.Tcp) { SendTimeout = iTimeOut, ReceiveTimeout = iTimeOut };
        sock.Connect(remoteEP);

        foreach (KeyValuePair<int, string> kvUrl in listUrl)
        {
            site = new Uri(kvUrl.Value);
            // NOTE(review): the request ends with a NUL character and sends a decoded path;
            // both look unintended but are kept, since servers may already depend on them.
            string sendMsg = "GET " + HttpUtility.UrlDecode(site.PathAndQuery) + " HTTP/1.1\r\n" +
                "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/msword, application/vnd.ms-powerpoint, */*\r\n" +
                "Accept-Language:en-us\r\n" +
                "Accept-Encoding:gb2312, deflate\r\n" +
                "User-Agent: Mozilla/4.0\r\n" +
                "Host: " + site.Host + "\r\n\r\n" + '\0';

            // Send.
            byte[] msg = Encoding.GetEncoding(sCoding).GetBytes(sendMsg);
            int nBytes;
            if ((nBytes = sock.Send(msg)) == 0)
            {
                // Nothing was sent; the finally block releases the socket.
                return listResult;
            }

            // Receive until the peer closes the connection or a receive fails or times out.
            StringBuilder sbHtml = new StringBuilder();
            byte[] bytes = new byte[2048];
            byte bt = Convert.ToByte('\x7f');
            do
            {
                int count = 0;
                try
                {
                    nBytes = sock.Receive(bytes, bytes.Length - 1, 0);
                }
                catch (Exception)
                {
                    // A failed or timed-out receive ends this response.
                    nBytes = -1;
                }
                if (nBytes <= 0) break;

                if (bytes[nBytes - 1] > bt)
                {
                    // The chunk ends in bytes above 0x7F. An odd run of them suggests a
                    // two-byte character was split, so one more byte is read before decoding.
                    for (int i = nBytes - 1; i >= 0; i--)
                    {
                        if (bytes[i] > bt)
                            count++;
                        else
                            break;
                    }
                    if (count % 2 == 1)
                    {
                        count = sock.Receive(bytes, nBytes, 1, 0);
                        if (count < 0)
                            break;
                        nBytes += count;
                    }
                }
                else
                {
                    bytes[nBytes] = (byte)'\0';
                }
                string s = Encoding.GetEncoding(sCoding).GetString(bytes, 0, nBytes);
                sbHtml.Append(s);
            } while (nBytes > 0);

            listResult.Add(new KeyValuePair<int, string>(kvUrl.Key, sbHtml.ToString()));
        }
    }
    catch (Exception ex)
    {
        // Results gathered so far are still returned.
        LogManager.Error(ex);
    }
    finally
    {
        // Single clean-up point: the socket may never have been created or connected.
        if (sock != null)
        {
            try
            {
                if (sock.Connected)
                {
                    sock.Shutdown(SocketShutdown.Both);
                }
            }
            catch (Exception e)
            {
                LogManager.Error(e);
            }
            finally
            {
                sock.Close();
            }
        }
    }
    return listResult;
}
- #endregion 根据超链接地址获取页面内容
/// <summary>
/// Kind of content found at a URL.
/// </summary>
public enum PageType
{
    /// <summary>
    /// HTML page.
    /// </summary>
    HTML = 0,

    /// <summary>
    /// RSS feed.
    /// </summary>
    RSS = 1
}
/// <summary>
/// Decides whether a page is (or points to) a feed.
/// </summary>
/// <param name="sUrl">URL of the page, used to resolve a relative feed link.</param>
/// <param name="sHtml">
/// Page content. When the page advertises a feed through a link tag and that feed can be
/// downloaded, this is replaced with the feed content.
/// </param>
/// <returns>
/// <see cref="PageType.RSS"/> when a linked feed was downloaded or the content itself is an
/// RSS, Atom or RDF document; otherwise <see cref="PageType.HTML"/>.
/// </returns>
public static PageType GetPageType(string sUrl, ref string sHtml)
{
    if (string.IsNullOrEmpty(sHtml))
    {
        return PageType.HTML;
    }

    // A <link> tag advertising a feed; the type value may be double-quoted, single-quoted or bare.
    string regRss = @"<link\s+[^>]*type\s*=\s*[""']?application/(?:rss|atom)\+xml[""']?[^>]*>";
    Match linkTag = Regex.Match(sHtml, regRss, RegexOptions.IgnoreCase);
    if (linkTag.Success)
    {
        string regHref = @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))";
        Match href = Regex.Match(linkTag.Value, regHref, RegexOptions.IgnoreCase);
        if (href.Success)
        {
            // The link may be relative; resolve it against the page URL.
            string rssFile = GetUrl(sUrl, href.Groups["href"].Value);
            string feed = GetHtmlByUrl(rssFile);
            if (!string.IsNullOrEmpty(feed))
            {
                sHtml = feed;
                return PageType.RSS;
            }
            // The feed could not be downloaded: keep the page instead of discarding it.
        }
        return PageType.HTML;
    }

    // No feed link: the document itself may be a feed (RSS, Atom or RDF root element).
    if (Regex.IsMatch(sHtml, @"<(?:rss|feed|rdf:RDF)\s+[^>]*>", RegexOptions.IgnoreCase))
    {
        return PageType.RSS;
    }
    return PageType.HTML;
}
- }
- }
|