// HtmlHelper.cs (36 KB)
  1. using System;
  2. using System.IO;
  3. using System.IO.Compression;
  4. using System.Net;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using System.Threading;
  8. using System.Web;
  9. using Masuit.Tools.Win32;
  10. namespace Masuit.Tools.Core.Html
  11. {
  12. /// <summary>
  13. ///1、获取HTML<br/>
  14. ///1.1获取指定页面的HTML代码 GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)<br/>
  15. ///1.2获取HTMLGetHtml(string url, CookieContainer cookieContainer)<br/>
  16. ///2、获取字符流<br/>
  17. ///2.1获取字符流GetStream(string url, CookieContainer cookieContainer)<br/>
  18. ///3、清除HTML标记 <br/>
  19. ///3.1清除HTML标记 NoHTML(string Htmlstring)<br/>
  20. ///4、匹配页面的链接 <br/>
  21. ///4.1获取页面的链接正则 GetHref(string HtmlCode)<br/>
  22. ///5、匹配页面的图片地址<br/>
  23. /// 5.1匹配页面的图片地址 GetImgSrc(string HtmlCode, string imgHttp)<br/>
  24. ///5.2匹配<img src="" />中的图片路径实际链接 GetImg(string ImgString, string imgHttp)<br/>
  25. ///6、抓取远程页面内容<br/>
  26. /// 6.1以GET方式抓取远程页面内容 Get_Http(string tUrl)<br/>
  27. /// 6.2以POST方式抓取远程页面内容 Post_Http(string url, string postData, string encodeType)<br/>
  28. ///7、压缩HTML输出<br/>
  29. ///7.1压缩HTML输出 ZipHtml(string Html)<br/>
  30. ///8、过滤HTML标签<br/>
  31. /// 8.1过滤指定HTML标签 DelHtml(string s_TextStr, string html_Str) <br/>
  32. /// 8.2过滤HTML中的不安全标签 RemoveUnsafeHtml(string content)<br/>
  33. /// HTML转行成TEXT HtmlToTxt(string strHtml)<br/>
  34. /// 字符串转换为 HtmlStringToHtml(string str)<br/>
  35. /// html转换成字符串HtmlToString(string strHtml)<br/>
  36. /// 获取URL编码<br/>
  37. /// 判断URL是否有效<br/>
  38. /// 返回 HTML 字符串的编码解码结果
  39. /// </summary>
  40. public static partial class HtmlTools
  41. {
  42. #region 私有字段
  /// <summary>Cookie container exposed through the CookieContainer property.</summary>
  private static CookieContainer cc = new CookieContainer();

  /// <summary>Content-Type header value used by the request helpers in this class.</summary>
  private static string contentType = "application/x-www-form-urlencoded";

  /// <summary>Accept header value used by the request helpers in this class.</summary>
  private static string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg," +
      " application/x-shockwave-flash, application/x-silverlight, " +
      "application/vnd.ms-excel, application/vnd.ms-powerpoint, " +
      "application/msword, application/x-ms-application," +
      " application/x-ms-xbap," +
      " application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";

  /// <summary>User-Agent header value used by the request helpers in this class.</summary>
  private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;" +
      " .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

  /// <summary>Base delay in milliseconds; NetworkDelay returns a value in [delay, 2 * delay).</summary>
  private static int delay = 1000;

  /// <summary>Retry counter shared by the request helpers.</summary>
  private static int currentTry = 0;

  /// <summary>
  /// Single random source for NetworkDelay. Creating a new Random per call can yield
  /// identical values for calls made in quick succession on runtimes that seed from the clock.
  /// </summary>
  private static readonly Random delayRandom = new Random();

  /// <summary>Guards <see cref="delayRandom"/>, because Random is not thread-safe.</summary>
  private static readonly object delayRandomLock = new object();
  #endregion
  #region Public properties
  /// <summary>
  /// Gets the cookie container held by this class.
  /// </summary>
  public static CookieContainer CookieContainer
  {
      get { return cc; }
  }

  /// <summary>
  /// Gets or sets the encoding used to decode downloaded page source. Defaults to UTF-8.
  /// </summary>
  public static Encoding Encoding { get; set; } = Encoding.GetEncoding("utf-8");

  /// <summary>
  /// Gets a random delay in milliseconds in the range [base, 2 * base), or sets the base delay.
  /// </summary>
  /// <exception cref="ArgumentOutOfRangeException">
  /// Thrown by the getter when the base delay has been set to a negative value.
  /// </exception>
  public static int NetworkDelay
  {
      get
      {
          // Compute the exclusive upper bound in 64 bits so a large base delay cannot overflow.
          int upperBound = (int)Math.Min((long)delay * 2, int.MaxValue);
          lock (delayRandomLock)
          {
              return delayRandom.Next(delay, upperBound);
          }
      }
      set { delay = value; }
  }

  /// <summary>
  /// Gets or sets the maximum number of retries. Defaults to 300.
  /// </summary>
  public static int MaxTry { get; set; } = 300;
  85. #endregion
  86. #region 1、获取HTML
  /// <summary>
  /// Removes HTML tags and character entities from a string and optionally truncates the result.
  /// </summary>
  /// <param name="html">Source HTML.</param>
  /// <param name="length">Maximum length of the result; values of 0 or less disable truncation.</param>
  /// <returns>The stripped text, cut to <paramref name="length"/> characters when it is longer than that.</returns>
  public static string RemoveHtmlTag(this string html, int length = 0)
  {
      string withoutTags = Regex.Replace(html, "<[^>]+>", "");
      string plainText = Regex.Replace(withoutTags, "&[^;]+;", "");
      bool mustTruncate = length > 0 && plainText.Length > length;
      return mustTruncate ? plainText.Substring(0, length) : plainText;
  }
  /// <summary>
  /// Downloads the HTML of a page, sending <paramref name="postData"/> when the request is a POST.
  /// Waits NetworkDelay milliseconds before each attempt and retries on failure while the
  /// shared retry counter does not exceed MaxTry.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="url">Address of the page; also sent as the Referer header.</param>
  /// <param name="postData">Form data to submit. When null or empty the GET overload is used instead.</param>
  /// <param name="isPost">True to send a POST; false to send a GET (the data is then not transmitted).</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response body, or an empty string when every attempt failed.</returns>
  public static string GetHtml(this HttpWebRequest _, string url, string postData, bool isPost, CookieContainer cookieContainer)
  {
      if (string.IsNullOrEmpty(postData))
      {
          return GetHtml(null, url, cookieContainer);
      }

      Thread.Sleep(NetworkDelay);
      currentTry++;
      HttpWebRequest httpWebRequest = null;
      try
      {
          httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
          httpWebRequest.CookieContainer = cookieContainer;
          httpWebRequest.ContentType = contentType;
          httpWebRequest.ServicePoint.ConnectionLimit = MaxTry;
          httpWebRequest.Referer = url;
          httpWebRequest.Accept = accept;
          httpWebRequest.UserAgent = userAgent;
          httpWebRequest.Method = isPost ? "POST" : "GET";
          httpWebRequest.AllowAutoRedirect = false;

          if (isPost)
          {
              // HttpWebRequest rejects a request body on GET, so the body is written for POST only.
              // Previously the GET path always threw here and ran through the whole retry loop.
              byte[] byteRequest = Encoding.Default.GetBytes(postData);
              httpWebRequest.ContentLength = byteRequest.Length;
              using (Stream stream = httpWebRequest.GetRequestStream())
              {
                  stream.Write(byteRequest, 0, byteRequest.Length);
              }
          }

          HttpWebResponse httpWebResponse;
          try
          {
              httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
          }
          catch (WebException ex) when (ex.Response is HttpWebResponse)
          {
              // Error statuses still carry a body (for example an error page); read it like a normal response.
              httpWebResponse = (HttpWebResponse)ex.Response;
          }

          using (httpWebResponse)
          using (Stream responseStream = httpWebResponse.GetResponseStream())
          using (StreamReader streamReader = new StreamReader(responseStream, Encoding))
          {
              return streamReader.ReadToEnd();
          }
      }
      catch (Exception)
      {
          if (httpWebRequest != null)
          {
              httpWebRequest.Abort();
          }

          // Return what the retry produced; the original discarded it and always returned "".
          return currentTry <= MaxTry
              ? GetHtml(null, url, postData, isPost, cookieContainer)
              : string.Empty;
      }
      finally
      {
          // One increment on entry, one decrement on exit keeps the counter balanced across retries.
          currentTry--;
      }
  }
  /// <summary>
  /// Downloads the HTML of a page with a GET request. Waits NetworkDelay milliseconds before
  /// each attempt and retries on failure while the shared retry counter does not exceed MaxTry.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="url">Address of the page; also sent as the Referer header.</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response body, or an empty string when every attempt failed.</returns>
  public static string GetHtml(this HttpWebRequest _, string url, CookieContainer cookieContainer)
  {
      Thread.Sleep(NetworkDelay);
      currentTry++;
      HttpWebRequest httpWebRequest = null;
      try
      {
          httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
          httpWebRequest.CookieContainer = cookieContainer;
          httpWebRequest.ContentType = contentType;
          httpWebRequest.ServicePoint.ConnectionLimit = MaxTry;
          httpWebRequest.Referer = url;
          httpWebRequest.Accept = accept;
          httpWebRequest.UserAgent = userAgent;
          httpWebRequest.Method = "GET";

          using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
          using (Stream responseStream = httpWebResponse.GetResponseStream())
          using (StreamReader streamReader = new StreamReader(responseStream, Encoding))
          {
              return streamReader.ReadToEnd();
          }
      }
      catch (Exception)
      {
          if (httpWebRequest != null)
          {
              httpWebRequest.Abort();
          }

          // Return what the retry produced; the original discarded it and always returned "".
          return currentTry <= MaxTry
              ? GetHtml(null, url, cookieContainer)
              : string.Empty;
      }
      finally
      {
          currentTry--;
      }
  }
  211. #endregion
  212. #region 2、获取字符流
  /// <summary>
  /// Opens the response stream of a GET request, for example to load an image.
  /// Retries on failure while the shared retry counter does not exceed MaxTry.
  /// </summary>
  /// <example>
  /// System.Net.CookieContainer cookie = new System.Net.CookieContainer();
  /// Stream s = HttpHelper.GetStream("http://www.baidu.com", cookie);
  /// picVerify.Image = Image.FromStream(s);
  /// </example>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="url">Address to request; also sent as the Referer header.</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>
  /// The open response stream, which the caller must dispose, or null when every attempt failed.
  /// </returns>
  public static Stream GetStream(this HttpWebRequest _, string url, CookieContainer cookieContainer)
  {
      currentTry++;
      HttpWebRequest httpWebRequest = null;
      HttpWebResponse httpWebResponse = null;
      try
      {
          httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
          httpWebRequest.CookieContainer = cookieContainer;
          httpWebRequest.ContentType = contentType;
          httpWebRequest.ServicePoint.ConnectionLimit = MaxTry;
          httpWebRequest.Referer = url;
          httpWebRequest.Accept = accept;
          httpWebRequest.UserAgent = userAgent;
          httpWebRequest.Method = "GET";
          httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();

          // The response is intentionally left open: closing it would close the returned stream.
          return httpWebResponse.GetResponseStream();
      }
      catch (Exception)
      {
          if (httpWebResponse != null)
          {
              httpWebResponse.Close();
          }
          if (httpWebRequest != null)
          {
              httpWebRequest.Abort();
          }
          if (currentTry > MaxTry)
          {
              return null;
          }

          // The original retried through GetHtml and threw its result away, so a retry could
          // never yield a stream. Retry this method instead, keeping the pause that the
          // GetHtml-based retry used to provide.
          Thread.Sleep(NetworkDelay);
          return GetStream(null, url, cookieContainer);
      }
      finally
      {
          currentTry--;
      }
  }
  263. #endregion
  264. #region 3、清除HTML标记
  /// <summary>
  /// Cleans up redundant tags and attributes in HTML exported from a Word document.
  /// Only the content of the body element is kept.
  /// </summary>
  /// <param name="html">HTML produced from a Word document.</param>
  /// <returns>The cleaned body content, or an empty string when no body element is found.</returns>
  public static string ClearHtml(this string html)
  {
      // Drop background-color and font-family declarations before extracting the body.
      string withoutFontAndBackground = Regex.Replace(html, @"background-color:#?\w{3,7}|font-family:'?[\w|\(|\)]*'?;?", string.Empty);
      Match body = Regex.Match(withoutFontAndBackground, @"<body[^>]*>([\s\S]*)<\/body>");
      string cleaned = body.Groups[1].Value.Replace("&#xa0;", string.Empty);

      // Remove declarations whose value starts with zero (for example margin-top:0pt).
      cleaned = Regex.Replace(cleaned, @"\w+-?\w+:0\w+;?", string.Empty);
      // Remove alt attributes.
      cleaned = Regex.Replace(cleaned, "alt=\"(.+?)\"", string.Empty);
      // Remove the -aw... properties emitted by the Word export.
      cleaned = Regex.Replace(cleaned, @"-aw.+?\s", string.Empty);
      return cleaned;
  }
  /// <summary>
  /// Converts HTML to plain text: removes script blocks, tags and comment markers,
  /// decodes a small set of common entities and drops remaining numeric entities.
  /// </summary>
  /// <param name="htmlstring">HTML source.</param>
  /// <returns>The text with markup removed.</returns>
  public static string RemoveHtml(this string htmlstring)
  {
      // Singleline lets '.' cross line breaks so multi-line script blocks are removed
      // together with their content instead of leaving the script text behind.
      htmlstring = Regex.Replace(htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

      // Remove tags, then line breaks followed by whitespace, then comment remnants.
      htmlstring = Regex.Replace(htmlstring, "<.+?>", "", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"-->", "", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

      // The original called Replace("<", "") / Replace(">", "") and discarded the results,
      // so stray brackets were never removed. They are stripped here, before entity decoding,
      // so that text written as &lt; / &gt; still decodes to literal characters below.
      htmlstring = htmlstring.Replace("<", "").Replace(">", "");

      htmlstring = Regex.Replace(htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
      htmlstring = Regex.Replace(htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

      // Any other numeric entity is dropped.
      htmlstring = Regex.Replace(htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
      return htmlstring;
  }
  309. #endregion
  310. #region 4、匹配页面的链接
  311. #region 4.1获取页面的链接正则
  /// <summary>
  /// Collects the href attribute values found in HTML.
  /// </summary>
  /// <param name="HtmlCode">HTML source.</param>
  /// <returns>
  /// The lower-cased matches with the "href=" prefix removed, each followed by a '|' separator;
  /// an empty string when nothing matches.
  /// </returns>
  public static string GetHref(this string HtmlCode)
  {
      const string hrefPattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";
      var joined = new StringBuilder();
      foreach (Match hit in Regex.Matches(HtmlCode, hrefPattern))
      {
          string value = hit.Value.ToLower().Replace("href=", "").Trim();
          joined.Append(value).Append("|");
      }
      return joined.ToString();
  }
  326. #endregion
  327. #region 4.2取得所有链接URL
  /// <summary>
  /// Extracts the href value of every anchor written as &lt;a href=...&gt;...&lt;/a&gt;.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>
  /// One line per anchor containing the raw text between "href=" and the closing '&gt;'
  /// of the opening tag (including any quotes); an empty string when there are no anchors.
  /// </returns>
  public static string GetAllUrl(this string html)
  {
      StringBuilder sb = new StringBuilder();

      // The original advanced with "m.NextMatch();" without using the returned Match, so the
      // loop never terminated once a single anchor was found. Enumerating all matches fixes that.
      // IgnoreCase replaces html.ToLower(), which altered the case of the extracted URLs.
      foreach (Match m in Regex.Matches(html, "<a href=(.*?)>.*?</a>", RegexOptions.IgnoreCase))
      {
          sb.AppendLine(m.Groups[1].Value);
      }
      return sb.ToString();
  }
  344. #endregion
  345. #region 4.3获取所有连接文本
  /// <summary>
  /// Extracts the inner text of every anchor written as &lt;a href=...&gt;text&lt;/a&gt;.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>
  /// One line per anchor containing its inner text (1 to 100 characters);
  /// an empty string when there are no matching anchors.
  /// </returns>
  public static string GetAllLinkText(this string html)
  {
      StringBuilder sb = new StringBuilder();

      // The original pattern "(1,100})" was a damaged quantifier that only matched the literal
      // text "1,100}", and the loop ignored the result of NextMatch() and could not terminate.
      // The capture is lazy so that adjacent anchors are not merged into one match.
      foreach (Match m in Regex.Matches(html, "<a href=.*?>(.{1,100}?)</a>", RegexOptions.IgnoreCase))
      {
          sb.AppendLine(m.Groups[1].Value);
      }
      return sb.ToString();
  }
  362. #endregion
  363. #endregion
  364. #region 5、匹配页面的图片地址
  /// <summary>
  /// Prefixes the src of img tags with a base address, turning relative paths into absolute ones.
  /// Only tags written exactly as &lt;img src="... are affected.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <param name="imgDest">Base address to put in front of each src, without a trailing slash.</param>
  /// <returns>The HTML with every matching src prefixed by the base address and a slash.</returns>
  public static string ReplaceHtmlImgSource(this string html, string imgDest)
  {
      const string srcOpening = "<img src=\"";
      string prefixedOpening = srcOpening + imgDest + "/";
      return html.Replace(srcOpening, prefixedOpening);
  }
  /// <summary>
  /// Collects the image addresses of all img tags in HTML.
  /// </summary>
  /// <param name="htmlCode">HTML source; it is lower-cased before matching.</param>
  /// <param name="imgHttp">Base address handed to GetImg for each tag.</param>
  /// <returns>The GetImg result for each img tag, each followed by a '|' separator.</returns>
  public static string GetImgSrc(this string htmlCode, string imgHttp)
  {
      const string imgTagPattern = @"<img.+?>";
      var joined = new StringBuilder();
      foreach (Match tag in Regex.Matches(htmlCode.ToLower(), imgTagPattern))
      {
          // The input is already lower-cased, so only trimming is left to do per tag.
          joined.Append(GetImg(tag.Value.Trim(), imgHttp)).Append("|");
      }
      return joined.ToString();
  }
  /// <summary>
  /// Turns absolute img src addresses into root-relative paths by removing the scheme and host.
  /// Only tags written exactly as &lt;img src="http(s)://host/... are affected.
  /// </summary>
  /// <param name="s">HTML source.</param>
  /// <returns>The HTML with scheme and host removed from matching src attributes.</returns>
  public static string ConvertImgSrcToRelativePath(this string s)
  {
      // "https?" also covers https addresses. "[^/"]+" keeps the host match inside the
      // attribute value, where the former lazy ".+?" could run on into later attributes
      // when the address had no path.
      return Regex.Replace(s, @"<img src=""https?:\/\/[^/""]+/", @"<img src=""/");
  }
  /// <summary>
  /// Pattern for an img tag whose first attribute is src; the address is captured in the
  /// named group "src", with or without surrounding quotes.
  /// </summary>
  private const string ImgTagSrcPattern = @"<img[\s]+src[\s]*=[\s]*((['""](?<src>[^'""]*)[\'""])|(?<src>[^\s]*))";

  /// <summary>
  /// Finds all img tags in HTML.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>All matches; the image address of each is in the group named "src".</returns>
  public static MatchCollection MatchImgTags(this string html)
  {
      return Regex.Matches(html, ImgTagSrcPattern);
  }

  /// <summary>
  /// Finds the first img tag in HTML.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The first match (check Success); the image address is in the group named "src".</returns>
  public static Match MatchImgTag(this string html)
  {
      return Regex.Match(html, ImgTagSrcPattern);
  }
  /// <summary>
  /// Gets the src of the first img tag in HTML.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The image address, or an empty string when no img tag matches.</returns>
  public static string MatchFirstImgSrc(this string html)
  {
      Match firstImage = Regex.Match(html, @"<img\s+[^>]*\s*src\s*=\s*['""]?(\S+\.\w{3,4})['""]?[^>]*>");
      string src = firstImage.Groups[1].Value;

      // The capture can run past the closing quote; cut it at the first double quote.
      int quoteAt = src.IndexOf("\"", StringComparison.Ordinal);
      return quoteAt > 0 ? src.Substring(0, quoteAt) : src;
  }
  /// <summary>
  /// Gets the src of a randomly chosen img tag in HTML.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The image address of the chosen tag, or an empty string when no img tag matches.</returns>
  public static string MatchRandomImgSrc(this string html)
  {
      MatchCollection images = Regex.Matches(html, @"<img\s+[^>]*\s*src\s*=\s*['""]?(\S+\.\w{3,4})['""]?[^>]*>");
      if (images.Count <= 0)
      {
          return String.Empty;
      }

      // StrictNext is an extension defined outside this file; presumably it returns an
      // index below the given bound - confirm against its definition.
      int pick = new Random().StrictNext(images.Count);
      string src = images[pick].Groups[1].Value;

      // The capture can run past the closing quote; cut it at the first double quote.
      int quoteAt = src.IndexOf("\"", StringComparison.Ordinal);
      return quoteAt > 0 ? src.Substring(0, quoteAt) : src;
  }
  /// <summary>
  /// Extracts the image address from an img tag and completes it with a base address
  /// when it does not look absolute.
  /// </summary>
  /// <param name="imgString">Text of an img tag; it is lower-cased before matching.</param>
  /// <param name="imgHttp">Base address put in front of addresses that contain no known domain suffix.</param>
  /// <returns>
  /// The lower-cased address up to its bmp/jpg/gif/png extension, without surrounding quotes.
  /// It is returned unchanged when it contains one of the known domain suffixes, otherwise
  /// it is prefixed with <paramref name="imgHttp"/>.
  /// </returns>
  public static string GetImg(this string imgString, string imgHttp)
  {
      var address = new StringBuilder();
      foreach (Match m in Regex.Matches(imgString.ToLower(), @"src=.+\.(bmp|jpg|gif|png|)"))
      {
          // The match starts at "src=" and therefore includes the opening quote of the
          // attribute value; the original returned that quote as part of the address.
          address.Append(m.Value.Trim().Replace("src=", "").Trim('"', '\''));
      }

      string matchVale = address.ToString();
      string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
      foreach (string suffix in domainSuffixes)
      {
          if (matchVale.IndexOf(suffix, StringComparison.Ordinal) != -1)
          {
              // Contains a host name, so it is treated as already absolute.
              return matchVale;
          }
      }
      return imgHttp + matchVale;
  }
  462. #endregion
  463. #region 6、抓取远程页面内容
  /// <summary>
  /// Downloads a page with a GET request and returns its content line by line.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="tUrl">Address to request.</param>
  /// <returns>
  /// The response text decoded with the system default encoding, every line terminated by
  /// "\r\n". When the request fails, the exception message is returned instead of throwing.
  /// </returns>
  public static string Get_Http(this HttpWebRequest _, string tUrl)
  {
      try
      {
          var hwr = (HttpWebRequest)WebRequest.Create(tUrl);
          hwr.Timeout = 19600;
          using (var hwrs = (HttpWebResponse)hwr.GetResponse())
          using (Stream myStream = hwrs.GetResponseStream())
          using (var sr = new StreamReader(myStream, Encoding.Default))
          {
              var sb = new StringBuilder();
              string line;

              // ReadLine returns null at the end of the stream. The former Peek() test can
              // report -1 on a non-seekable network stream before all data has arrived.
              while ((line = sr.ReadLine()) != null)
              {
                  sb.Append(line).Append("\r\n");
              }
              return sb.ToString();
          }
      }
      catch (Exception ee)
      {
          // Existing contract: failures are reported through the returned text.
          return ee.Message;
      }
  }
  /// <summary>
  /// Submits form data with a POST request and returns the response text.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="url">Address to request.</param>
  /// <param name="postData">Form data, sent as application/x-www-form-urlencoded.</param>
  /// <param name="encodeType">Name of the encoding used to convert <paramref name="postData"/> to bytes.</param>
  /// <returns>
  /// The response text decoded with the system default encoding. When the request fails,
  /// the exception message is returned instead of throwing.
  /// </returns>
  public static string Post_Http(this HttpWebRequest _, string url, string postData, string encodeType)
  {
      try
      {
          Encoding encoding = Encoding.GetEncoding(encodeType);
          byte[] payload = encoding.GetBytes(postData);
          HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
          myRequest.Method = "POST";
          myRequest.ContentType = "application/x-www-form-urlencoded";
          myRequest.ContentLength = payload.Length;

          // using blocks release the request stream, response and reader even when a call throws.
          using (Stream requestStream = myRequest.GetRequestStream())
          {
              requestStream.Write(payload, 0, payload.Length);
          }

          using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
          using (StreamReader reader = new StreamReader(myResponse.GetResponseStream(), Encoding.Default))
          {
              return reader.ReadToEnd();
          }
      }
      catch (Exception ex)
      {
          // Existing contract: failures are reported through the returned text.
          return ex.Message;
      }
  }
  524. #endregion
  525. #region 7、压缩HTML输出
  /// <summary>
  /// Compacts HTML by removing whitespace between tags and line breaks with the
  /// indentation that follows them.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The compacted HTML; the body tag name is written in lower case.</returns>
  public static string ZipHtml(this string html)
  {
      // Whitespace between a closing '>' and the next '<'.
      string compact = Regex.Replace(html, @">\s+?<", "><");
      // Line breaks together with any whitespace that follows them.
      compact = Regex.Replace(compact, @"\r\n\s*", "");
      // Rewrites the body element with a lower-case tag name; content is unchanged.
      return Regex.Replace(compact, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
  }
  537. #endregion
  538. #region 8、过滤HTML标签
  539. #region 8.1过滤指定HTML标签
  /// <summary>
  /// Removes the opening and closing tags of one element type while keeping their content.
  /// </summary>
  /// <param name="sTextStr">Text to filter.</param>
  /// <param name="htmlStr">
  /// Tag name such as a, img, p or div. It is used as a regular expression, so an
  /// alternation such as "a|img" is accepted. When null or empty, every tag is removed.
  /// </param>
  /// <returns>The filtered text; an empty string when <paramref name="sTextStr"/> is null or empty.</returns>
  public static string DelHtml(this string sTextStr, string htmlStr)
  {
      if (string.IsNullOrEmpty(sTextStr))
      {
          return "";
      }

      string openTagPattern;
      string closeTagPattern;
      if (string.IsNullOrEmpty(htmlStr))
      {
          // Without a tag name the historical patterns apply unchanged.
          openTagPattern = "<[^>]*>";
          closeTagPattern = "</>";
      }
      else
      {
          // \b stops "p" from also matching <pre> or <param>; the group keeps an
          // alternation such as "a|img" confined to the tag-name position.
          openTagPattern = "<(?:" + htmlStr + @")\b[^>]*>";
          closeTagPattern = "</(?:" + htmlStr + @")\s*>";
      }

      string rStr = Regex.Replace(sTextStr, openTagPattern, "", RegexOptions.IgnoreCase);
      return Regex.Replace(rStr, closeTagPattern, "", RegexOptions.IgnoreCase);
  }
  555. #endregion
  556. #region 8.2过滤HTML中的不安全标签
  /// <summary>
  /// Defuses potentially unsafe markup: removes the leading "o" of attributes that start
  /// with "o" (such as onclick=) and inserts a dot after script, frame, form, meta,
  /// behavior and style when they are followed by whitespace, ':' , '|' or '&gt;'.
  /// </summary>
  /// <param name="content">HTML source.</param>
  /// <returns>The rewritten content.</returns>
  public static string RemoveUnsafeHtml(this string content)
  {
      const string eventAttributePattern = @"(\<|\s+)o([a-z]+\s?=)";
      const string riskyKeywordPattern = @"(script|frame|form|meta|behavior|style)([\s|:|>])+";

      string withoutHandlers = Regex.Replace(content, eventAttributePattern, "$1$2", RegexOptions.IgnoreCase);
      return Regex.Replace(withoutHandlers, riskyKeywordPattern, "$1.$2", RegexOptions.IgnoreCase);
  }
  568. #endregion
  569. #endregion
  570. #region 转换HTML操作
  571. #region HTML转行成TEXT
  /// <summary>
  /// Converts HTML to plain text by deleting script blocks, tags, the listed character
  /// entities, numeric entities, comment markers and any remaining angle brackets.
  /// </summary>
  /// <param name="strHtml">HTML source.</param>
  /// <returns>The plain text.</returns>
  public static string HtmlToTxt(this string strHtml)
  {
      // Singleline lets '.' cross line breaks so multi-line script blocks are removed
      // together with their content instead of leaving the script text behind.
      string strOutput = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);

      // Applied in order; every match is deleted.
      string[] aryReg =
      {
          @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
          @"([\r\n])[\s]+",
          @"&(quot|#34);",
          @"&(amp|#38);",
          @"&(lt|#60);",
          @"&(gt|#62);",
          @"&(nbsp|#160);",
          @"&(iexcl|#161);",
          @"&(cent|#162);",
          @"&(pound|#163);",
          @"&(copy|#169);",
          @"&#(\d+);",
          @"-->",
          @"<!--.*\n"
      };
      foreach (string pattern in aryReg)
      {
          strOutput = Regex.Replace(strOutput, pattern, string.Empty, RegexOptions.IgnoreCase);
      }

      // Strings are immutable: the original called Replace without using the result,
      // so these three clean-up steps never took effect.
      strOutput = strOutput.Replace("<", "");
      strOutput = strOutput.Replace(">", "");
      strOutput = strOutput.Replace("\r\n", "");
      return strOutput;
  }
  607. #endregion
  608. #region 字符串转换为 Html
  /// <summary>
  /// Converts plain text to HTML: escapes &amp;, quotes and angle brackets, turns spaces
  /// into non-breaking spaces, doubles single quotes and turns line breaks into br tags.
  /// </summary>
  /// <param name="str">Plain text.</param>
  /// <returns>The HTML representation of the text.</returns>
  public static string StringToHtml(this string str)
  {
      // '&' must be escaped first so the entities inserted below are not escaped again.
      str = str.Replace("&", "&amp;");
      str = str.Replace(" ", "&nbsp;");
      str = str.Replace("'", "''");
      str = str.Replace("\"", "&quot;");
      str = str.Replace("<", "&lt;");
      str = str.Replace(">", "&gt;");

      // "\r\n" has to be handled before the single characters; the original replaced it
      // last, by which time each Windows line break had already become two br tags.
      str = str.Replace("\r\n", "<br />");
      str = str.Replace("\n", "<br />");
      str = str.Replace("\r", "<br />");
      return str;
  }
  627. #endregion
  628. #region Html转换成字符串
  /// <summary>
  /// Converts HTML to text: br tags become line breaks, the gt, lt, nbsp and quot entities
  /// are decoded, and all tags are then removed.
  /// </summary>
  /// <remarks>
  /// Entities are decoded before tags are stripped, so markup that was written with
  /// encoded angle brackets is removed as well.
  /// </remarks>
  /// <param name="strHtml">HTML source.</param>
  /// <returns>The text without tags.</returns>
  public static string HtmlToString(this string strHtml)
  {
      string decoded = strHtml
          .Replace("<br>", "\r\n")
          .Replace(@"<br />", "\r\n")
          .Replace(@"<br/>", "\r\n")
          .Replace("&gt;", ">")
          .Replace("&lt;", "<")
          .Replace("&nbsp;", " ")
          .Replace("&quot;", "\"");
      return Regex.Replace(decoded, @"<\/?[^>]+>", "", RegexOptions.IgnoreCase);
  }
  646. #endregion
  647. #endregion
  648. #region 获取URL编码
  /// <summary>
  /// Determines the character set of a page from its content, falling back to the
  /// response header and then to the system default encoding.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="url">Address of the page. Redirects are not followed; the timeout is 20 seconds.</param>
  /// <returns>
  /// The charset declared in the page, otherwise the response's character set, otherwise the
  /// body name of the system default encoding. The default is also returned when the status
  /// is not 200 OK or the declared content length is 1 MB or more.
  /// </returns>
  public static string GetEncoding(this HttpWebRequest _, string url)
  {
      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.Timeout = 20000;
      request.AllowAutoRedirect = false;

      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      {
          if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
          {
              return Encoding.Default.BodyName;
          }

          bool isGzip = string.Equals(response.ContentEncoding, "gzip", StringComparison.InvariantCultureIgnoreCase);
          Stream body = response.GetResponseStream();
          using (StreamReader reader = isGzip
              ? new StreamReader(new GZipStream(body, CompressionMode.Decompress))
              : new StreamReader(body, Encoding.ASCII))
          {
              string html = reader.ReadToEnd();

              // The optional quote is consumed before the capture. The original capture
              // "[^"]*" stopped at the opening quote of <meta charset="utf-8"> and
              // returned an empty string for such pages.
              Match declared = Regex.Match(html, @"charset\b\s*=\s*[""']?(?<charset>[\w.:-]+)", RegexOptions.IgnoreCase);
              if (declared.Success)
              {
                  return declared.Groups["charset"].Value;
              }

              // Read the header value while the response is still open.
              if (!string.IsNullOrEmpty(response.CharacterSet))
              {
                  return response.CharacterSet;
              }
              return Encoding.Default.BodyName;
          }
      }
  }
  706. #endregion
  707. #region 判断URL是否有效
  /// <summary>
  /// Checks whether an address can be requested successfully.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension of HttpWebRequest.</param>
  /// <param name="url">Address to check; may point to a page, an image or any other resource.</param>
  /// <returns>
  /// 200 when the request succeeds; the HTTP status code of the error response when the
  /// server answers with an error; 401 for non-web exceptions such as an invalid address.
  /// </returns>
  public static int GetUrlError(this HttpWebRequest _, string url)
  {
      try
      {
          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
          ServicePointManager.Expect100Continue = false;
          using (request.GetResponse())
          {
              return 200;
          }
      }
      catch (WebException exception)
      {
          using (HttpWebResponse errorResponse = exception.Response as HttpWebResponse)
          {
              if (exception.Status != WebExceptionStatus.ProtocolError || errorResponse == null)
              {
                  // NOTE(review): failures without an HTTP response (timeout, DNS, refused
                  // connection) have always been reported as 200; kept for compatibility,
                  // but callers cannot tell them apart from success.
                  return 200;
              }

              // Take the code from the response. The original searched the exception message
              // for "500 ", "401 " and "404", which depends on the message wording and
              // reported every other error status as 200.
              return (int)errorResponse.StatusCode;
          }
      }
      catch
      {
          return 401;
      }
  }
  748. #endregion
  749. #region 返回 HTML 字符串的编码解码结果
  /// <summary>
  /// HTML-encodes a string using HttpUtility.HtmlEncode.
  /// </summary>
  /// <param name="inputData">Text to encode.</param>
  /// <returns>The encoded text.</returns>
  public static string HtmlEncode(string inputData) => HttpUtility.HtmlEncode(inputData);
  /// <summary>
  /// Decodes an HTML-encoded string using HttpUtility.HtmlDecode.
  /// </summary>
  /// <param name="str">Text to decode.</param>
  /// <returns>The decoded text.</returns>
  public static string HtmlDecode(string str) => HttpUtility.HtmlDecode(str);
  768. #endregion
  /// <summary>
  /// Parses cookies from a string and adds them to a cookie collection.
  /// </summary>
  /// <param name="cookie">Collection that receives the parsed cookies.</param>
  /// <param name="cookieString">
  /// Cookies in the form "name=value;Domain=host;Path=/path", separated by commas, for example
  /// "SID=abc;Domain=.google.com;Path=/,LSID=def;Domain=www.google.com;Path=/accounts".
  /// </param>
  /// <returns>The same collection instance, with the parsed cookies added.</returns>
  public static CookieCollection GetCookieCollection(this CookieCollection cookie, string cookieString)
  {
      Regex re = new Regex("([^;,]+)=([^;,]+);Domain=([^;,]+);Path=([^;,]+)", RegexOptions.IgnoreCase);
      foreach (Match m in re.Matches(cookieString))
      {
          string name = m.Groups[1].Value;
          string value = m.Groups[2].Value;
          string domain = m.Groups[3].Value;
          string path = m.Groups[4].Value;

          // Cookie(name, value, path, domain): the original passed the domain group for
          // both the path and the domain, so the parsed path was never used.
          cookie.Add(new Cookie(name, value, path, domain));
      }
      return cookie;
  }
  787. #region 从HTML中获取文本,保留br,p,img
  /// <summary>
  /// Removes all tags from HTML except br, p and img.
  /// </summary>
  /// <param name="HTML">HTML source.</param>
  /// <returns>The text with only br, p and img tags left in place.</returns>
  public static string GetTextFromHTML(this string HTML)
  {
      // \b makes the exclusion apply to the whole tag name; without it every tag whose
      // name merely starts with "p", "br" or "img" (for example <pre>) was kept as well.
      Regex regEx = new Regex(@"</?(?!/?(?:br|p|img)\b)[^>]*>", RegexOptions.IgnoreCase);
      return regEx.Replace(HTML, "");
  }
  798. #endregion
  799. #region 获取HTML页面内制定Key的Value内容
  /// <summary>
  /// Gets the value of a hidden input field with the given name.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <param name="key">Name of the hidden input; matched literally and case-insensitively.</param>
  /// <returns>
  /// The value of the first input written as &lt;input type="hidden" ... name="key" ... value="..."&gt;,
  /// or an empty string when there is none.
  /// </returns>
  public static string GetHiddenKeyValue(this string html, string key)
  {
      // Escape the key so that regex metacharacters in a field name (".", "[", "$", "#")
      // are matched literally instead of changing or breaking the pattern.
      string literalKey = Regex.Escape(key ?? string.Empty);
      string sRegex = "<input\\s*type=\"hidden\".*?name=\"" + literalKey + "\".*?\\s*value=[\"|'](?<value>.*?)[\"|'^/]";
      Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
      Match mc = re.Match(html);
      return mc.Success ? mc.Groups["value"].Value : "";
  }
  818. #endregion
  /// <summary>
  /// Replaces line breaks ("\r\n" and "\n") with HTML br tags.
  /// </summary>
  /// <param name="str">Text to convert; may be null.</param>
  /// <returns>The converted text, or an empty string when <paramref name="str"/> is null.</returns>
  public static string StrFormat(this string str)
  {
      if (str == null)
      {
          return "";
      }

      // Windows line breaks first, so they yield a single br tag.
      return str.Replace("\r\n", "<br />").Replace("\n", "<br />");
  }
  /// <summary>
  /// Replaces commas, single quotes and semicolons with the placeholders
  /// "&amp;def", "&amp;dot" and "&amp;dec".
  /// </summary>
  /// <param name="strHtml">Text to encode; may be null.</param>
  /// <returns>The encoded text, or an empty string when the input is null or empty.</returns>
  public static string EncodeHtml(this string strHtml)
  {
      // The original compared against "" only, so a null input threw a NullReferenceException.
      if (string.IsNullOrEmpty(strHtml))
      {
          return "";
      }

      return strHtml.Replace(",", "&def").Replace("'", "&dot").Replace(";", "&dec");
  }
  /// <summary>
  /// Escapes backslashes, single quotes and double quotes with a backslash so the text
  /// can be placed inside a script string literal.
  /// </summary>
  /// <param name="str">Text to escape.</param>
  /// <returns>The escaped text.</returns>
  [Obsolete("不建议使用", true)]
  public static string ReplaceStrToScript(string str)
  {
      // Backslashes first, otherwise the backslashes added for the quotes would be doubled.
      return str.Replace("\\", "\\\\")
                .Replace("'", "\\'")
                .Replace("\"", "\\\"");
  }
  866. }
  867. }