HtmlHelper.cs 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926
  1. using System;
  2. using System.IO;
  3. using System.IO.Compression;
  4. using System.Net;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using System.Threading;
  8. using System.Web;
  9. using Masuit.Tools.Win32;
  10. namespace Masuit.Tools.Html
  11. {
  /// <summary>
  /// 1. Fetching HTML<br/>
  /// 1.1 Get the HTML of a given page: GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)<br/>
  /// 1.2 Get HTML: GetHtml(string url, CookieContainer cookieContainer)<br/>
  /// 2. Fetching a stream<br/>
  /// 2.1 Get a stream: GetStream(string url, CookieContainer cookieContainer)<br/>
  /// 3. Removing HTML markup<br/>
  /// 3.1 Remove HTML markup: NoHTML(string Htmlstring)<br/>
  /// 4. Matching links in a page<br/>
  /// 4.1 Extract links with a regular expression: GetHref(string HtmlCode)<br/>
  /// 5. Matching image addresses in a page<br/>
  /// 5.1 Match image addresses in a page: GetImgSrc(string HtmlCode, string imgHttp)<br/>
  /// 5.2 Resolve the actual image link inside &lt;img src="" /&gt;: GetImg(string ImgString, string imgHttp)<br/>
  /// 6. Fetching remote page content<br/>
  /// 6.1 Fetch remote page content with GET: Get_Http(string tUrl)<br/>
  /// 6.2 Fetch remote page content with POST: Post_Http(string url, string postData, string encodeType)<br/>
  /// 7. Compressing HTML output<br/>
  /// 7.1 Compress HTML output: ZipHtml(string Html)<br/>
  /// 8. Filtering HTML tags<br/>
  /// 8.1 Filter the given HTML tags: DelHtml(string s_TextStr, string html_Str)<br/>
  /// 8.2 Filter unsafe tags out of HTML: RemoveUnsafeHtml(string content)<br/>
  /// Convert HTML to text: HtmlToTxt(string strHtml)<br/>
  /// Convert a string to HTML: StringToHtml(string str)<br/>
  /// Convert HTML to a string: HtmlToString(string strHtml)<br/>
  /// Get the encoding of a URL<br/>
  /// Check whether a URL is valid<br/>
  /// Return the HTML-encoded or HTML-decoded form of a string
  /// </summary>
  40. public static partial class HtmlTools
  41. {
  42. #region 私有字段
  // Cookie jar handed out through the CookieContainer property.
  private static CookieContainer cc = new CookieContainer();

  // Content-Type header value applied to the requests built by this class.
  private static string contentType = "application/x-www-form-urlencoded";

  // Accept header value; one media type per fragment, joined with ", ".
  private static string accept =
      "image/gif, " +
      "image/x-xbitmap, " +
      "image/jpeg, " +
      "image/pjpeg, " +
      "application/x-shockwave-flash, " +
      "application/x-silverlight, " +
      "application/vnd.ms-excel, " +
      "application/vnd.ms-powerpoint, " +
      "application/msword, " +
      "application/x-ms-application, " +
      "application/x-ms-xbap, " +
      "application/vnd.ms-xpsdocument, " +
      "application/xaml+xml, " +
      "application/x-silverlight-2-b1, " +
      "*/*";

  // User-Agent header value applied to the requests built by this class.
  private static string userAgent =
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; " +
      ".NET CLR 2.0.50727; " +
      ".NET CLR 3.0.04506.648; " +
      ".NET CLR 3.5.21022)";

  // Lower bound of the random pause returned by NetworkDelay (that value is passed to Thread.Sleep).
  private static int delay = 1000;

  // Running attempt counter for the request helpers in this class; starts at zero.
  private static int currentTry;
  55. #endregion
  56. #region 公有属性
  /// <summary>
  /// Gets the cookie container shared by this class.
  /// </summary>
  public static CookieContainer CookieContainer
  {
      get { return cc; }
  }

  /// <summary>
  /// Gets or sets the encoding used to decode page source when fetching HTML. Defaults to UTF-8.
  /// </summary>
  public static Encoding Encoding { get; set; } = Encoding.GetEncoding("utf-8");

  // One generator for every NetworkDelay read. Creating a Random per read can return the
  // same value on back-to-back reads when the default seed is time-based.
  private static readonly Random delayRandom = new Random();

  /// <summary>
  /// Gets a randomised pause in milliseconds, drawn from [delay, delay * 2), or sets the base delay.
  /// </summary>
  /// <remarks>
  /// A negative base delay is treated as 0 and the upper bound is capped at
  /// <see cref="int.MaxValue"/>, so reading the property never throws.
  /// </remarks>
  public static int NetworkDelay
  {
      get
      {
          int lower = Math.Max(delay, 0);
          int upper = lower > int.MaxValue / 2 ? int.MaxValue : lower * 2;

          // Random instances are not thread-safe, so the shared one is guarded.
          lock (delayRandom)
          {
              return delayRandom.Next(lower, upper);
          }
      }
      set { delay = value; }
  }

  /// <summary>
  /// Gets or sets the maximum number of attempts; it is also used as the connection limit of each request.
  /// </summary>
  public static int MaxTry { get; set; } = 300;
  85. #endregion
  86. #region 1、获取HTML
  /// <summary>
  /// Strips HTML tags and character entities from the input and optionally truncates the result.
  /// </summary>
  /// <param name="html">Source HTML; <c>null</c> or empty yields an empty string.</param>
  /// <param name="length">Maximum number of characters to keep; 0 or less keeps everything.</param>
  /// <returns>The remaining text, cut to <paramref name="length"/> characters when that limit is positive and exceeded.</returns>
  public static string RemoveHtmlTag(this string html, int length = 0)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      string text = Regex.Replace(html, "<[^>]+>", "");

      // Only well-formed entity references are removed. The looser "&[^;]+;" also swallowed
      // ordinary text that happened to sit between a bare '&' and a later ';'.
      text = Regex.Replace(text, "&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);", "");

      return length > 0 && text.Length > length ? text.Substring(0, length) : text;
  }
  /// <summary>
  /// Fetches the HTML of a page by sending <paramref name="postData"/> as a POST body, retrying on network failures.
  /// </summary>
  /// <remarks>
  /// When <paramref name="postData"/> is null or empty, or <paramref name="isPost"/> is false, the call is
  /// forwarded to the GET overload. A GET request cannot carry a body, so <paramref name="postData"/> is
  /// ignored in that case. Each attempt is preceded by a pause of <see cref="NetworkDelay"/> milliseconds.
  /// An HTTP error response that carries a body is read and returned like a normal page.
  /// </remarks>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="url">Address of the page.</param>
  /// <param name="postData">Form data to send as the request body.</param>
  /// <param name="isPost">Whether to send the request as POST.</param>
  /// <param name="cookieContainer">Cookies to send with the request.</param>
  /// <returns>The response body decoded with <see cref="Encoding"/>, or an empty string when every attempt fails.</returns>
  public static string GetHtml(this HttpWebRequest _, string url, string postData, bool isPost, CookieContainer cookieContainer)
  {
      // Writing a body on a GET request throws ProtocolViolationException, which previously
      // burned every retry before returning empty.
      if (string.IsNullOrEmpty(postData) || !isPost)
      {
          return GetHtml(null, url, cookieContainer);
      }

      byte[] body = Encoding.Default.GetBytes(postData);

      // A local counter replaces the shared static one: one initial attempt plus up to MaxTry retries.
      for (int attempt = 0; attempt <= MaxTry; attempt++)
      {
          Thread.Sleep(NetworkDelay);
          HttpWebRequest request = null;
          try
          {
              request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = contentType;
              request.ServicePoint.ConnectionLimit = MaxTry;
              request.Referer = url;
              request.Accept = accept;
              request.UserAgent = userAgent;
              request.Method = "POST";
              request.ContentLength = body.Length;
              request.AllowAutoRedirect = false;

              using (Stream requestStream = request.GetRequestStream())
              {
                  requestStream.Write(body, 0, body.Length);
              }

              HttpWebResponse response;
              try
              {
                  response = (HttpWebResponse)request.GetResponse();
              }
              catch (WebException ex) when (ex.Response is HttpWebResponse)
              {
                  // The server answered with an error status; its body is still returned.
                  response = (HttpWebResponse)ex.Response;
              }

              using (response)
              using (Stream responseStream = response.GetResponseStream())
              using (StreamReader reader = new StreamReader(responseStream, Encoding))
              {
                  // The result of a successful retry is returned instead of being thrown away.
                  return reader.ReadToEnd();
              }
          }
          catch (WebException)
          {
              // Network-level failure: fall through to the next attempt.
          }
          catch (IOException)
          {
              // Broken stream: fall through to the next attempt.
          }
          catch (Exception)
          {
              // Bad URL or invalid configuration will not fix itself; keep the
              // "never throws, returns empty" contract without retrying.
              return string.Empty;
          }
          finally
          {
              if (request != null)
              {
                  request.Abort();
              }
          }
      }

      return string.Empty;
  }
  /// <summary>
  /// Fetches the HTML of a page with a GET request, retrying on network failures.
  /// </summary>
  /// <remarks>
  /// Each attempt is preceded by a pause of <see cref="NetworkDelay"/> milliseconds. At most
  /// <see cref="MaxTry"/> retries follow the first attempt.
  /// </remarks>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="url">Address of the page.</param>
  /// <param name="cookieContainer">Cookies to send with the request.</param>
  /// <returns>The response body decoded with <see cref="Encoding"/>, or an empty string when every attempt fails.</returns>
  public static string GetHtml(this HttpWebRequest _, string url, CookieContainer cookieContainer)
  {
      // A local counter replaces the shared static one, so concurrent callers do not
      // disturb each other's retry budget.
      for (int attempt = 0; attempt <= MaxTry; attempt++)
      {
          Thread.Sleep(NetworkDelay);
          HttpWebRequest request = null;
          try
          {
              request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = contentType;
              request.ServicePoint.ConnectionLimit = MaxTry;
              request.Referer = url;
              request.Accept = accept;
              request.UserAgent = userAgent;
              request.Method = "GET";

              using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
              using (Stream responseStream = response.GetResponseStream())
              using (StreamReader reader = new StreamReader(responseStream, Encoding))
              {
                  // The result of a successful retry is returned instead of being thrown away.
                  return reader.ReadToEnd();
              }
          }
          catch (WebException)
          {
              // Network or HTTP failure: fall through to the next attempt.
          }
          catch (IOException)
          {
              // Broken stream: fall through to the next attempt.
          }
          catch (Exception)
          {
              // Bad URL or invalid configuration will not fix itself; keep the
              // "never throws, returns empty" contract without retrying.
              return string.Empty;
          }
          finally
          {
              if (request != null)
              {
                  request.Abort();
              }
          }
      }

      return string.Empty;
  }
  211. #endregion
  212. #region 2、获取字符流
  /// <summary>
  /// 2.1 Opens the response stream of a GET request, retrying on network failures.
  /// </summary>
  /// <remarks>
  /// The caller owns the returned stream and must dispose it.
  /// </remarks>
  /// <example>
  /// System.Net.CookieContainer cookie = new System.Net.CookieContainer();
  /// Stream s = HttpHelper.GetStream("http://www.baidu.com", cookie);
  /// picVerify.Image = Image.FromStream(s);
  /// </example>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="url">Address to request.</param>
  /// <param name="cookieContainer">Cookies to send with the request.</param>
  /// <returns>The open response stream, or <c>null</c> when every attempt fails.</returns>
  public static Stream GetStream(this HttpWebRequest _, string url, CookieContainer cookieContainer)
  {
      // One initial attempt plus up to MaxTry retries. Previously a failure re-ran GetHtml
      // (whose result was discarded) and then returned null regardless.
      for (int attempt = 0; attempt <= MaxTry; attempt++)
      {
          if (attempt > 0)
          {
              // Pause between retries only; the first attempt starts immediately as before.
              Thread.Sleep(NetworkDelay);
          }

          HttpWebRequest request = null;
          HttpWebResponse response = null;
          try
          {
              request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = contentType;
              request.ServicePoint.ConnectionLimit = MaxTry;
              request.Referer = url;
              request.Accept = accept;
              request.UserAgent = userAgent;
              request.Method = "GET";
              response = (HttpWebResponse)request.GetResponse();
              return response.GetResponseStream();
          }
          catch (Exception ex)
          {
              if (response != null)
              {
                  response.Close();
              }
              if (request != null)
              {
                  request.Abort();
              }

              // Only network-level failures are worth retrying.
              if (!(ex is WebException) && !(ex is IOException))
              {
                  return null;
              }
          }
      }

      return null;
  }
  263. #endregion
  264. #region 3、清除HTML标记
  /// <summary>
  /// Cleans redundant tag attributes out of HTML produced from a Word document.
  /// </summary>
  /// <param name="html">Full HTML document; only the content between the body tags is kept.</param>
  /// <returns>The cleaned body content, or an empty string when no body element is found.</returns>
  public static string ClearHtml(this string html)
  {
      // Background colours and font families go first, on the whole document.
      string withoutFonts = Regex.Replace(html, @"background-color:#?\w{3,7}|font-family:'?[\w|\(|\)]*'?;?", string.Empty);

      // Keep the inner content of <body> and drop the &#xa0; entity.
      Match body = Regex.Match(withoutFonts, @"<body[^>]*>([\s\S]*)<\/body>");
      string cleaned = body.Groups[1].Value.Replace("&#xa0;", string.Empty);

      // In order: zero-valued style properties, alt attributes, Word's "-aw" attributes.
      string[] residuePatterns =
      {
          @"\w+-?\w+:0\w+;?",
          "alt=\"(.+?)\"",
          @"-aw.+?\s"
      };
      foreach (string residue in residuePatterns)
      {
          cleaned = Regex.Replace(cleaned, residue, string.Empty);
      }

      return cleaned;
  }
  /// <summary>
  /// 3.1 Removes HTML markup, decodes a few common entities and strips leftover angle brackets.
  /// </summary>
  /// <param name="htmlstring">Source containing HTML; <c>null</c> or empty yields an empty string.</param>
  /// <returns>The text with markup removed.</returns>
  public static string RemoveHtml(this string htmlstring)
  {
      if (string.IsNullOrEmpty(htmlstring))
      {
          return string.Empty;
      }

      const RegexOptions options = RegexOptions.IgnoreCase;

      // Singleline lets ".*?" cross line breaks, so multi-line script bodies are removed
      // together with their tags instead of leaking into the text.
      htmlstring = Regex.Replace(htmlstring, @"<script[^>]*?>.*?</script>", "", options | RegexOptions.Singleline);

      // Tags, whitespace after line breaks, and comment delimiters.
      htmlstring = Regex.Replace(htmlstring, "<.+?>", "", options);
      htmlstring = Regex.Replace(htmlstring, @"<(.[^>]*)>", "", options);
      htmlstring = Regex.Replace(htmlstring, @"([\r\n])[\s]+", "", options);
      htmlstring = Regex.Replace(htmlstring, @"-->", "", options);
      htmlstring = Regex.Replace(htmlstring, @"<!--.*", "", options);

      // Entity pattern -> replacement, applied in this order.
      string[,] entities =
      {
          { @"&(quot|#34);", "\"" },
          { @"&(amp|#38);", "&" },
          { @"&(lt|#60);", "<" },
          { @"&(gt|#62);", ">" },
          { @"&(nbsp|#160);", " " },
          { @"&(iexcl|#161);", "\u00a1" },
          { @"&(cent|#162);", "\u00a2" },
          { @"&(pound|#163);", "\u00a3" },
          { @"&(copy|#169);", "\u00a9" }
      };
      for (int i = 0; i < entities.GetLength(0); i++)
      {
          htmlstring = Regex.Replace(htmlstring, entities[i, 0], entities[i, 1], options);
      }

      // Any remaining numeric entity is dropped.
      htmlstring = Regex.Replace(htmlstring, @"&#(\d+);", "", options);

      // Strings are immutable: the results must be kept. The original discarded them,
      // which made these three steps no-ops.
      return htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");
  }
  309. #endregion
  310. #region 4、匹配页面的链接
  311. #region 4.1获取页面的链接正则
  /// <summary>
  /// 4.1 Collects every href occurrence found by a regular expression.
  /// </summary>
  /// <param name="HtmlCode">HTML source to scan.</param>
  /// <returns>
  /// The lower-cased matches with the "href=" prefix removed, each followed by "|".
  /// Empty when nothing matches.
  /// </returns>
  public static string GetHref(this string HtmlCode)
  {
      const string hrefPattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";

      var links = new StringBuilder();
      MatchCollection hits = Regex.Matches(HtmlCode, hrefPattern);
      for (int i = 0; i < hits.Count; i++)
      {
          string link = hits[i].Value.ToLower().Replace("href=", "").Trim();
          links.Append(link).Append("|");
      }

      return links.ToString();
  }
  326. #endregion
  327. #region 4.2取得所有链接URL
  /// <summary>
  /// 4.2 Extracts the href value of every anchor written as &lt;a href=...&gt;...&lt;/a&gt;.
  /// </summary>
  /// <param name="html">HTML source; <c>null</c> or empty yields an empty string.</param>
  /// <returns>The captured href values, one per line. Quotes around a value are part of the capture.</returns>
  public static string GetAllUrl(this string html)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      // Matching case-insensitively (rather than lower-casing the input) keeps URL case intact.
      // Iterating the MatchCollection also fixes the original loop, which discarded the result
      // of NextMatch() and never terminated once a match was found.
      StringBuilder urls = new StringBuilder();
      foreach (Match anchor in Regex.Matches(html, "<a href=(.*?)>.*?</a>", RegexOptions.IgnoreCase))
      {
          urls.AppendLine(anchor.Groups[1].Value);
      }

      return urls.ToString();
  }
  344. #endregion
  345. #region 4.3获取所有连接文本
  /// <summary>
  /// 4.3 Extracts the text of every anchor written as &lt;a href=...&gt;text&lt;/a&gt;.
  /// </summary>
  /// <param name="html">HTML source; <c>null</c> or empty yields an empty string.</param>
  /// <returns>The anchor texts (1 to 100 characters each), one per line.</returns>
  public static string GetAllLinkText(this string html)
  {
      if (string.IsNullOrEmpty(html))
      {
          return string.Empty;
      }

      // The original pattern "(1,100})" was a typo for a bounded quantifier and only matched
      // that literal text. The lazy form stops at the first closing tag. Iterating the
      // MatchCollection also replaces a loop that never advanced past the first match.
      StringBuilder texts = new StringBuilder();
      foreach (Match anchor in Regex.Matches(html, "<a href=.*?>(.{1,100}?)</a>", RegexOptions.IgnoreCase))
      {
          texts.AppendLine(anchor.Groups[1].Value);
      }

      return texts.ToString();
  }
  362. #endregion
  363. #endregion
  364. #region 5、匹配页面的图片地址
  /// <summary>
  /// Prefixes the src of every image written as &lt;img src="..." with a base address.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <param name="imgDest">Base address inserted before each src value, followed by "/".</param>
  /// <returns>The HTML with each such src value prefixed.</returns>
  public static string ReplaceHtmlImgSource(this string html, string imgDest)
  {
      const string srcOpening = "<img src=\"";
      string prefixed = srcOpening + imgDest + "/";
      return html.Replace(srcOpening, prefixed);
  }
  /// <summary>
  /// Collects the image address of every img tag in the page.
  /// </summary>
  /// <param name="htmlCode">HTML source; it is lower-cased before matching.</param>
  /// <param name="imgHttp">Address prefix passed on to GetImg for each tag.</param>
  /// <returns>The GetImg result for each img tag, each followed by "|".</returns>
  public static string GetImgSrc(this string htmlCode, string imgHttp)
  {
      var joined = new StringBuilder();
      MatchCollection tags = Regex.Matches(htmlCode.ToLower(), @"<img.+?>");
      foreach (Match tag in tags)
      {
          string address = GetImg(tag.Value.ToLower().Trim(), imgHttp);
          joined.Append(address).Append("|");
      }

      return joined.ToString();
  }
  /// <summary>
  /// Turns absolute image addresses into root-relative ones by dropping the scheme and host.
  /// </summary>
  /// <param name="s">HTML containing tags written as &lt;img src="http(s)://host/...".</param>
  /// <returns>The HTML with "http://host/" or "https://host/" replaced by "/" in those src values.</returns>
  public static string ConvertImgSrcToRelativePath(this string s)
  {
      // https is handled as well as http. The host part may not contain '/' or '"', so an
      // address without a path (src="http://host") can no longer match past its closing quote.
      return Regex.Replace(s, @"<img src=""https?:\/\/[^/""]+/", @"<img src=""/");
  }
  /// <summary>
  /// Finds every img tag whose first attribute is src.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>All matches; the named group "src" holds the address of each.</returns>
  public static MatchCollection MatchImgTags(this string html)
  {
      const string imgWithSrc = @"<img[\s]+src[\s]*=[\s]*((['""](?<src>[^'""]*)[\'""])|(?<src>[^\s]*))";
      return new Regex(imgWithSrc).Matches(html);
  }
  /// <summary>
  /// Finds the first img tag whose first attribute is src.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The first match; the named group "src" holds the address. Check Success before use.</returns>
  public static Match MatchImgTag(this string html)
  {
      const string imgWithSrc = @"<img[\s]+src[\s]*=[\s]*((['""](?<src>[^'""]*)[\'""])|(?<src>[^\s]*))";
      return new Regex(imgWithSrc).Match(html);
  }
  /// <summary>
  /// Gets the src of the first img tag whose address ends in a 3- or 4-character extension.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The src value, or an empty string when no such tag exists.</returns>
  public static string MatchFirstImgSrc(this string html)
  {
      const string imgSrcPattern = @"<img\s+[^>]*\s*src\s*=\s*['""]?(\S+\.\w{3,4})['""]?[^>]*>";
      Match first = new Regex(imgSrcPattern).Match(html);
      return first.Groups[1].Value;
  }
  /// <summary>
  /// Picks the src of one randomly chosen img tag in the HTML.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The src of the chosen tag, or an empty string when there is no matching img tag.</returns>
  public static string MatchRandomImgSrc(this string html)
  {
      const string imgSrcPattern = @"<img\s+[^>]*\s*src\s*=\s*['""]?(\S+\.\w{3,4})['""]?[^>]*>";
      MatchCollection tags = Regex.Matches(html, imgSrcPattern);
      if (tags.Count <= 0)
      {
          return String.Empty;
      }

      // StrictNext is a project extension on Random; it receives the number of candidates.
      int pick = new Random().StrictNext(tags.Count);
      return tags[pick].Groups[1].Value;
  }
  /// <summary>
  /// Extracts the image path from an &lt;img src="" /&gt; string and prefixes it when it looks relative.
  /// </summary>
  /// <param name="imgString">The img tag text; it is lower-cased before matching.</param>
  /// <param name="imgHttp">Prefix added when the path contains none of the known domain suffixes.</param>
  /// <returns>
  /// The extracted path as-is when it contains .net, .com, .org, .cn, .cc, .info, .biz or .tv;
  /// otherwise <paramref name="imgHttp"/> followed by the path.
  /// </returns>
  public static string GetImg(this string imgString, string imgHttp)
  {
      // Concatenate every "src=..." hit, minus the "src=" prefix.
      var collected = new StringBuilder();
      foreach (Match hit in Regex.Matches(imgString.ToLower(), @"src=.+\.(bmp|jpg|gif|png|)"))
      {
          collected.Append(hit.Value.ToLower().Trim().Replace("src=", ""));
      }
      string path = collected.ToString();

      // A known domain suffix anywhere in the path marks it as already absolute.
      string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
      foreach (string suffix in domainSuffixes)
      {
          if (path.IndexOf(suffix) != -1)
          {
              return path;
          }
      }

      return imgHttp + path;
  }
  450. #endregion
  451. #region 6、抓取远程页面内容
  /// <summary>
  /// 6.1 Fetches the content of a remote page with a GET request.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="tUrl">Address of the page.</param>
  /// <returns>
  /// The response text decoded with the system default encoding, every line terminated by "\r\n".
  /// On failure the exception message is returned instead of being thrown.
  /// </returns>
  public static string Get_Http(this HttpWebRequest _, string tUrl)
  {
      try
      {
          var request = (HttpWebRequest)WebRequest.Create(tUrl);
          request.Timeout = 19600;

          using (var response = (HttpWebResponse)request.GetResponse())
          using (Stream body = response.GetResponseStream())
          using (var reader = new StreamReader(body, Encoding.Default))
          {
              var text = new StringBuilder();
              string line;

              // ReadLine() returning null is the reliable end-of-stream signal. Peek() can
              // report -1 on a network stream while more data is still on its way, which
              // truncated the page.
              while ((line = reader.ReadLine()) != null)
              {
                  text.Append(line).Append("\r\n");
              }

              return text.ToString();
          }
      }
      catch (Exception ee)
      {
          return ee.Message;
      }
  }
  /// <summary>
  /// 6.2 Fetches the content of a remote page with a POST request.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="url">Address of the page.</param>
  /// <param name="postData">Form data sent as the request body.</param>
  /// <param name="encodeType">Name of the encoding used to turn <paramref name="postData"/> into bytes.</param>
  /// <returns>
  /// The response text decoded with the system default encoding.
  /// On failure the exception message is returned instead of being thrown.
  /// </returns>
  public static string Post_Http(this HttpWebRequest _, string url, string postData, string encodeType)
  {
      try
      {
          Encoding requestEncoding = Encoding.GetEncoding(encodeType);
          byte[] payload = requestEncoding.GetBytes(postData);

          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
          request.Method = "POST";
          request.ContentType = "application/x-www-form-urlencoded";
          request.ContentLength = payload.Length;

          using (Stream requestStream = request.GetRequestStream())
          {
              requestStream.Write(payload, 0, payload.Length);
          }

          // The response and reader are disposed on every path; previously they were never
          // closed, which kept the connection occupied.
          using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
          using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
          {
              return reader.ReadToEnd();
          }
      }
      catch (Exception ex)
      {
          return ex.Message;
      }
  }
  512. #endregion
  513. #region 7、压缩HTML输出
  /// <summary>
  /// 7.1 Compacts HTML by removing whitespace between tags and after CRLF line breaks.
  /// </summary>
  /// <param name="html">HTML source.</param>
  /// <returns>The compacted HTML; body tags matched in any letter case are rewritten in lower case.</returns>
  public static string ZipHtml(this string html)
  {
      // Whitespace between a closing '>' and the next '<'.
      string compact = new Regex(@">\s+?<").Replace(html, "><");

      // CRLF and any whitespace that follows it.
      compact = new Regex(@"\r\n\s*").Replace(compact, "");

      // Re-emits the body element with lower-case tag names; attributes and content are kept.
      Regex bodyElement = new Regex(@"<body([\s|\S]*?)>([\s|\S]*?)</body>", RegexOptions.IgnoreCase);
      return bodyElement.Replace(compact, @"<body$1>$2</body>");
  }
  525. #endregion
  526. #region 8、过滤HTML标签
  527. #region 8.1过滤指定HTML标签
  /// <summary>
  /// 8.1 Removes the opening and closing tags of the given element; the content between them is kept.
  /// </summary>
  /// <param name="sTextStr">Text to filter; <c>null</c> or empty yields an empty string.</param>
  /// <param name="htmlStr">
  /// Tag name such as a, img, p or div. It is used as a regular-expression fragment, so an
  /// alternation like "a|img" is accepted. When null or empty, every tag is removed.
  /// </param>
  /// <returns>The text without the matching tags.</returns>
  public static string DelHtml(this string sTextStr, string htmlStr)
  {
      if (string.IsNullOrEmpty(sTextStr))
      {
          return "";
      }

      if (string.IsNullOrEmpty(htmlStr))
      {
          // With no tag name the original pattern degenerated to "<[^>]*>"; keep that behaviour.
          return Regex.Replace(sTextStr, "<[^>]*>", "", RegexOptions.IgnoreCase);
      }

      // The group keeps an alternation together. The lookahead requires the tag name to end
      // there, so "a" no longer removes <abbr> or <article>, and "p" no longer removes <pre>.
      string tag = "(?:" + htmlStr + ")";
      string withoutOpening = Regex.Replace(sTextStr, "<" + tag + @"(?![\w:-])[^>]*>", "", RegexOptions.IgnoreCase);
      return Regex.Replace(withoutOpening, "</" + tag + @"\s*>", "", RegexOptions.IgnoreCase);
  }
  543. #endregion
  544. #region 8.2过滤HTML中的不安全标签
  /// <summary>
  /// 8.2 Defuses unsafe HTML by breaking event-handler attributes and risky keywords.
  /// </summary>
  /// <param name="content">HTML source.</param>
  /// <returns>
  /// The HTML with the leading "o" removed from attributes that start with "o" (for example
  /// onclick= becomes nclick=) and a "." inserted after script, frame, form, meta, behavior
  /// and style when they are followed by whitespace, '|', ':' or '>'.
  /// </returns>
  public static string RemoveUnsafeHtml(this string content)
  {
      Regex attributeStartingWithO = new Regex(@"(\<|\s+)o([a-z]+\s?=)", RegexOptions.IgnoreCase);
      Regex riskyKeyword = new Regex(@"(script|frame|form|meta|behavior|style)([\s|:|>])+", RegexOptions.IgnoreCase);

      string withoutHandlers = attributeStartingWithO.Replace(content, "$1$2");
      return riskyKeyword.Replace(withoutHandlers, "$1.$2");
  }
  556. #endregion
  557. #endregion
  558. #region 转换HTML操作
  559. #region HTML转行成TEXT
  /// <summary>
  /// Converts HTML to plain text by deleting scripts, tags, common entities and comment markers.
  /// </summary>
  /// <param name="strHtml">HTML source; <c>null</c> or empty yields an empty string.</param>
  /// <returns>The remaining text with angle brackets and CRLF sequences removed.</returns>
  public static string HtmlToTxt(this string strHtml)
  {
      if (string.IsNullOrEmpty(strHtml))
      {
          return string.Empty;
      }

      // Singleline lets ".*?" cross line breaks, so multi-line script bodies are removed
      // together with their tags instead of leaking into the text.
      string text = Regex.Replace(strHtml, @"<script[^>]*?>.*?</script>", string.Empty,
          RegexOptions.IgnoreCase | RegexOptions.Singleline);

      // Everything matched by these patterns is deleted, in this order.
      string[] patterns =
      {
          @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
          @"([\r\n])[\s]+",
          @"&(quot|#34);",
          @"&(amp|#38);",
          @"&(lt|#60);",
          @"&(gt|#62);",
          @"&(nbsp|#160);",
          @"&(iexcl|#161);",
          @"&(cent|#162);",
          @"&(pound|#163);",
          @"&(copy|#169);",
          @"&#(\d+);",
          @"-->",
          @"<!--.*\n"
      };
      foreach (string pattern in patterns)
      {
          text = Regex.Replace(text, pattern, string.Empty, RegexOptions.IgnoreCase);
      }

      // Strings are immutable: the results must be kept. The original discarded them,
      // which made these three steps no-ops.
      return text.Replace("<", "").Replace(">", "").Replace("\r\n", "");
  }
  595. #endregion
  596. #region 字符串转换为 Html
  /// <summary>
  /// Converts plain text to HTML by escaping markup characters and turning line breaks into &lt;br /&gt;.
  /// </summary>
  /// <param name="str">Plain text; <c>null</c> yields an empty string.</param>
  /// <returns>
  /// The text with &amp;, space, double quote, &lt; and &gt; replaced by entities, each single
  /// quote doubled, and every line break (CRLF, LF or CR) replaced by one &lt;br /&gt;.
  /// </returns>
  public static string StringToHtml(this string str)
  {
      if (str == null)
      {
          return string.Empty;
      }

      // '&' must be escaped first so the entities produced below are not escaped again.
      str = str.Replace("&", "&amp;");
      str = str.Replace(" ", "&nbsp;");
      str = str.Replace("'", "''");
      str = str.Replace("\"", "&quot;");
      str = str.Replace("<", "&lt;");
      str = str.Replace(">", "&gt;");

      // CRLF is handled before its parts. The reverse order produced two <br /> per
      // Windows line break and left the CRLF replacement dead.
      str = str.Replace("\r\n", "<br />");
      str = str.Replace("\n", "<br />");
      str = str.Replace("\r", "<br />");
      return str;
  }
  615. #endregion
  616. #region Html转换成字符串
  /// <summary>
  /// Converts HTML to a string: br tags become CRLF, a few entities are decoded, and all tags are stripped.
  /// </summary>
  /// <param name="strHtml">HTML source.</param>
  /// <returns>
  /// The resulting text. Tags are stripped after decoding, so markup that was written
  /// with &amp;lt; and &amp;gt; entities is removed as well.
  /// </returns>
  public static string HtmlToString(this string strHtml)
  {
      // Literal -> replacement, applied in this order.
      string[,] literalSwaps =
      {
          { "<br>", "\r\n" },
          { "<br />", "\r\n" },
          { "<br/>", "\r\n" },
          { "&gt;", ">" },
          { "&lt;", "<" },
          { "&nbsp;", " " },
          { "&quot;", "\"" }
      };

      string text = strHtml;
      for (int row = 0; row < literalSwaps.GetLength(0); row++)
      {
          text = text.Replace(literalSwaps[row, 0], literalSwaps[row, 1]);
      }

      return Regex.Replace(text, @"<\/?[^>]+>", "", RegexOptions.IgnoreCase);
  }
  634. #endregion
  635. #endregion
  636. #region 获取URL编码
  /// <summary>
  /// Determines the character encoding name of the page at a URL.
  /// </summary>
  /// <remarks>
  /// The page is requested without following redirects and with a 20-second timeout. It is
  /// inspected only when the status is 200 and the reported content length is below 1 MiB.
  /// Request failures are not caught here and propagate to the caller.
  /// </remarks>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="url">Address of the page.</param>
  /// <returns>
  /// The charset declared in the page source when present; otherwise the charset of the response;
  /// otherwise the body name of the system default encoding.
  /// </returns>
  public static string GetEncoding(this HttpWebRequest _, string url)
  {
      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
      request.Timeout = 20000;
      request.AllowAutoRedirect = false;

      using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
      {
          if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
          {
              return Encoding.Default.BodyName;
          }

          bool gzipped = string.Equals(response.ContentEncoding, "gzip", StringComparison.InvariantCultureIgnoreCase);
          string html;
          using (Stream raw = response.GetResponseStream())
          using (StreamReader reader = gzipped
              ? new StreamReader(new GZipStream(raw, CompressionMode.Decompress))
              : new StreamReader(raw, Encoding.ASCII))
          {
              html = reader.ReadToEnd();
          }

          // Accepts both charset=utf-8 and charset="utf-8". The original capture stopped at
          // the first double quote and so returned an empty name for the quoted form.
          Match declared = Regex.Match(html, @"charset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:-]+)", RegexOptions.IgnoreCase);
          if (declared.Success)
          {
              return declared.Groups["charset"].Value;
          }

          // CharacterSet may be null as well as empty.
          if (!string.IsNullOrEmpty(response.CharacterSet))
          {
              return response.CharacterSet;
          }

          return Encoding.Default.BodyName;
      }
  }
  694. #endregion
  695. #region 判断URL是否有效
  /// <summary>
  /// Checks whether a URL can be fetched.
  /// </summary>
  /// <param name="_">Unused; only makes the method callable as an extension.</param>
  /// <param name="url">URL to check; may point to a page, an image, and so on.</param>
  /// <returns>
  /// 200 when the request succeeds; the HTTP status code when the server answers with an error;
  /// 401 for any failure that is not a <see cref="WebException"/>.
  /// </returns>
  public static int GetUrlError(this HttpWebRequest _, string url)
  {
      try
      {
          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
          ServicePointManager.Expect100Continue = false;
          using (request.GetResponse())
          {
              // Only reachability matters; the body is not read.
          }
          return 200;
      }
      catch (WebException exception)
      {
          // The error response is closed here as well, so the connection is released.
          using (HttpWebResponse errorResponse = exception.Response as HttpWebResponse)
          {
              if (exception.Status != WebExceptionStatus.ProtocolError || errorResponse == null)
              {
                  // NOTE(review): non-protocol failures (DNS, timeout, refused connection) have
                  // always been reported as 200 here; kept for compatibility - confirm intent.
                  return 200;
              }

              // The status code is read from the response itself. The exception message is
              // localized and formatted as "(500)", so searching it for "500 " or "401 "
              // did not find those codes.
              return (int)errorResponse.StatusCode;
          }
      }
      catch
      {
          return 401;
      }
  }
  736. #endregion
  737. #region 返回 HTML 字符串的编码解码结果
  /// <summary>
  /// Returns the HTML-encoded form of a string.
  /// </summary>
  /// <param name="inputData">Text to encode.</param>
  /// <returns>The result of <see cref="HttpUtility.HtmlEncode(string)"/>.</returns>
  public static string HtmlEncode(string inputData) => HttpUtility.HtmlEncode(inputData);
  /// <summary>
  /// Returns the HTML-decoded form of a string.
  /// </summary>
  /// <param name="str">Text to decode.</param>
  /// <returns>The result of <see cref="HttpUtility.HtmlDecode(string)"/>.</returns>
  public static string HtmlDecode(string str) => HttpUtility.HtmlDecode(str);
  756. #endregion
  /// <summary>
  /// Parses cookies out of a string and adds them to a collection.
  /// </summary>
  /// <param name="cookie">Collection that receives the parsed cookies.</param>
  /// <param name="cookieString">
  /// Comma-separated cookies, each in the form "name=value;Domain=domain;Path=path".
  /// <c>null</c> or empty adds nothing.
  /// </param>
  /// <returns>The same <paramref name="cookie"/> collection, for chaining.</returns>
  public static CookieCollection GetCookieCollection(this CookieCollection cookie, string cookieString)
  {
      if (string.IsNullOrEmpty(cookieString))
      {
          return cookie;
      }

      // Sample input:
      // "SID=ARRGy4M1...jqk=;Domain=.google.com;Path=/,LSID=AaMBTixN...UeQ=;Domain=www.google.com;Path=/accounts"
      Regex cookiePattern = new Regex("([^;,]+)=([^;,]+);Domain=([^;,]+);Path=([^;,]+)", RegexOptions.IgnoreCase);
      foreach (Match m in cookiePattern.Matches(cookieString))
      {
          string name = m.Groups[1].Value.Trim();
          string value = m.Groups[2].Value.Trim();
          string domain = m.Groups[3].Value.Trim();
          string path = m.Groups[4].Value.Trim();

          // Cookie(name, value, path, domain): the original passed the domain group for both
          // path and domain and never used the captured Path.
          cookie.Add(new Cookie(name, value, path, domain));
      }

      return cookie;
  }
  775. #region 从HTML中获取文本,保留br,p,img
  /// <summary>
  /// Strips tags from HTML while keeping br, p and img tags.
  /// </summary>
  /// <param name="HTML">HTML source.</param>
  /// <returns>The HTML without any tag other than those whose name starts with br, p or img.</returns>
  public static string GetTextFromHTML(this string HTML) =>
      Regex.Replace(HTML, @"</?(?!br|/?p|img)[^>]*>", "", RegexOptions.IgnoreCase);
  786. #endregion
  787. #region 获取HTML页面内制定Key的Value内容
  /// <summary>
  /// Gets the value of the hidden input with the given name from an HTML page.
  /// </summary>
  /// <param name="html">HTML source; <c>null</c> or empty yields an empty string.</param>
  /// <param name="key">Name of the hidden input, matched literally.</param>
  /// <returns>The value attribute of the first matching input, or an empty string when none is found.</returns>
  public static string GetHiddenKeyValue(this string html, string key)
  {
      if (string.IsNullOrEmpty(html))
      {
          return "";
      }

      // The key is escaped so characters such as '.', '[' or '$' are matched literally.
      // Escape also covers whitespace and '#', which IgnorePatternWhitespace would
      // otherwise drop or treat as the start of a comment.
      string pattern = string.Format(
          "<input\\s*type=\"hidden\".*?name=\"{0}\".*?\\s*value=[\"|'](?<value>.*?)[\"|'^/]",
          Regex.Escape(key ?? string.Empty));
      Regex finder = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

      Match hit = finder.Match(html);
      return hit.Success ? hit.Groups["value"].Value : "";
  }
  806. #endregion
  /// <summary>
  /// Replaces CRLF and LF line breaks with the HTML line break &lt;br /&gt;.
  /// </summary>
  /// <param name="str">Text to convert.</param>
  /// <returns>The converted text, or an empty string when <paramref name="str"/> is <c>null</c>.</returns>
  public static string StrFormat(this string str)
  {
      // CRLF goes first so it yields a single <br />.
      return str == null
          ? ""
          : str.Replace("\r\n", "<br />").Replace("\n", "<br />");
  }
  /// <summary>
  /// Replaces commas, single quotes and semicolons with the placeholders &amp;def, &amp;dot and &amp;dec.
  /// </summary>
  /// <param name="strHtml">Text to encode; <c>null</c> or empty yields an empty string.</param>
  /// <returns>The encoded text.</returns>
  public static string EncodeHtml(this string strHtml)
  {
      // Comparing against "" alone let null through to Replace, which threw.
      if (string.IsNullOrEmpty(strHtml))
      {
          return "";
      }

      return strHtml.Replace(",", "&def").Replace("'", "&dot").Replace(";", "&dec");
  }
  /// <summary>
  /// Escapes backslashes, single quotes and double quotes so the text can be placed in a script string.
  /// </summary>
  /// <param name="str">Text to escape.</param>
  /// <returns>The text with each of those characters preceded by a backslash.</returns>
  [Obsolete("不建议使用", true)]
  public static string ReplaceStrToScript(string str)
  {
      // Backslashes go first so the ones added for the quotes are not doubled.
      return str.Replace("\\", "\\\\")
                .Replace("'", "\\'")
                .Replace("\"", "\\\"");
  }
  854. }
  855. }