HtmlTools.cs 63 KB


  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.Data;
  5. using System.IO;
  6. using System.Net;
  7. using System.Net.Sockets;
  8. using System.Text;
  9. using System.Text.RegularExpressions;
  10. using System.Web;
  11. using System.Xml;
  12. using Ganss.XSS;
  13. using Masuit.Tools.Core.Logging;
  14. namespace Masuit.Tools.Core.Html
  15. {
  16. /// <summary>
  17. /// html工具类
  18. /// </summary>
  19. public static partial class HtmlTools
  20. {
  21. #region 防止html的xss净化器
  /// <summary>
  /// Sanitizes an HTML fragment with a fixed, stricter-than-default <c>HtmlSanitizer</c> policy.
  /// </summary>
  /// <remarks>
  /// Starting from the sanitizer defaults, the <c>id</c> and <c>alt</c> attributes, the
  /// <c>font-family</c> and <c>background-color</c> CSS properties, and the
  /// input/button/iframe/frame/textarea/select/form tags are taken off the allow lists.
  /// <c>KeepChildNodes</c> is switched on.
  /// </remarks>
  /// <param name="html">HTML to sanitize.</param>
  /// <returns>The sanitized HTML.</returns>
  public static string HtmlSantinizerStandard(this string html)
  {
      var sanitizer = new HtmlSanitizer { KeepChildNodes = true };

      foreach (string attribute in new[] { "id", "alt" })
      {
          sanitizer.AllowedAttributes.Remove(attribute);
      }

      foreach (string cssProperty in new[] { "font-family", "background-color" })
      {
          sanitizer.AllowedCssProperties.Remove(cssProperty);
      }

      foreach (string tag in new[] { "input", "button", "iframe", "frame", "textarea", "select", "form" })
      {
          sanitizer.AllowedTags.Remove(tag);
      }

      return sanitizer.Sanitize(html);
  }
  /// <summary>
  /// Sanitizes an HTML fragment with a caller-tightened <c>HtmlSanitizer</c> policy.
  /// </summary>
  /// <param name="html">HTML to sanitize.</param>
  /// <param name="labels">Tag names to remove from the allowed-tag list; <c>null</c> removes none.</param>
  /// <param name="attributes">Attribute names to remove from the allowed-attribute list; <c>null</c> removes none.</param>
  /// <param name="styles">CSS property names to remove from the allowed-property list; <c>null</c> removes none.</param>
  /// <returns>The sanitized HTML.</returns>
  public static string HtmlSantinizerCustom(this string html, string[] labels = null, string[] attributes = null, string[] styles = null)
  {
      var sanitizer = new HtmlSanitizer { KeepChildNodes = true };

      foreach (string tag in labels ?? new string[0])
      {
          sanitizer.AllowedTags.Remove(tag);
      }

      foreach (string attribute in attributes ?? new string[0])
      {
          sanitizer.AllowedAttributes.Remove(attribute);
      }

      foreach (string cssProperty in styles ?? new string[0])
      {
          sanitizer.AllowedCssProperties.Remove(cssProperty);
      }

      return sanitizer.Sanitize(html);
  }
  79. #endregion
  80. #region BaseMethod
  /// <summary>
  /// Collects one value per regex match found in the input.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="iGroupIndex">1-based capture group to return; 0 (or less) returns the whole match.</param>
  /// <returns>The selected value of every match, in match order.</returns>
  public static List<string> GetList(string sInput, string sRegex, int iGroupIndex)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      var values = new List<string>();
      MatchCollection matches = new Regex(sRegex, options).Matches(sInput);
      bool useGroup = iGroupIndex > 0;

      for (int i = 0; i < matches.Count; i++)
      {
          Match match = matches[i];
          values.Add(useGroup ? match.Groups[iGroupIndex].Value : match.Value);
      }

      return values;
  }
  /// <summary>
  /// Collects one value per regex match found in the input, selected by named group.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="sGroupName">Name of the capture group to return; the empty string returns the whole match.</param>
  /// <returns>The selected value of every match, in match order.</returns>
  public static List<string> GetList(string sInput, string sRegex, string sGroupName)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      var values = new List<string>();
      MatchCollection matches = new Regex(sRegex, options).Matches(sInput);
      bool wholeMatch = sGroupName == "";

      for (int i = 0; i < matches.Count; i++)
      {
          Match match = matches[i];
          values.Add(wholeMatch ? match.Value : match.Groups[sGroupName].Value);
      }

      return values;
  }
  /// <summary>
  /// Returns a value from the first regex match in the input.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="iGroupIndex">1-based capture group to return; 0 (or less) returns the whole match.</param>
  /// <returns>The selected value of the first match, or the empty string when nothing matches.</returns>
  public static string GetText(string sInput, string sRegex, int iGroupIndex)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      Match first = new Regex(sRegex, options).Match(sInput);

      if (!first.Success)
      {
          return "";
      }

      return iGroupIndex > 0 ? first.Groups[iGroupIndex].Value : first.Value;
  }
  /// <summary>
  /// Returns a value from the first regex match in the input, selected by named group.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="sGroupName">Name of the capture group to return; the empty string returns the whole match.</param>
  /// <returns>The selected value of the first match, or the empty string when nothing matches.</returns>
  public static string GetText(string sInput, string sRegex, string sGroupName)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      Match first = new Regex(sRegex, options).Match(sInput);

      if (!first.Success)
      {
          return "";
      }

      return sGroupName == "" ? first.Value : first.Groups[sGroupName].Value;
  }
  /// <summary>
  /// Replaces text selected by a regular expression with a fixed value.
  /// </summary>
  /// <remarks>
  /// The replacement is text-based, not position-based: for each match, every occurrence of the
  /// selected text anywhere in the (progressively updated) input is replaced. Matches or groups
  /// whose value is empty are skipped, because <c>string.Replace</c> rejects an empty search string.
  /// </remarks>
  /// <param name="sInput">Text to process.</param>
  /// <param name="sRegex">Regular expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="sReplace">Replacement text.</param>
  /// <param name="iGroupIndex">1-based capture group whose text is replaced; 0 (or less) replaces the whole match text.</param>
  /// <returns>The input with the selected text replaced.</returns>
  public static string Replace(string sInput, string sRegex, string sReplace, int iGroupIndex)
  {
      Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

      // The match collection keeps searching the original string; reassigning sInput below is safe.
      foreach (Match mc in re.Matches(sInput))
      {
          string target = iGroupIndex > 0 ? mc.Groups[iGroupIndex].Value : mc.Value;
          if (target.Length == 0)
          {
              // Zero-length match or non-participating group: nothing to replace.
              continue;
          }

          sInput = sInput.Replace(target, sReplace);
      }

      return sInput;
  }
  /// <summary>
  /// Replaces text selected by a named regex group with a fixed value.
  /// </summary>
  /// <remarks>
  /// The replacement is text-based, not position-based: for each match, every occurrence of the
  /// selected text anywhere in the (progressively updated) input is replaced. Matches or groups
  /// whose value is empty are skipped, because <c>string.Replace</c> rejects an empty search string.
  /// </remarks>
  /// <param name="sInput">Text to process.</param>
  /// <param name="sRegex">Regular expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="sReplace">Replacement text.</param>
  /// <param name="sGroupName">Name of the capture group whose text is replaced; the empty string replaces the whole match text.</param>
  /// <returns>The input with the selected text replaced.</returns>
  public static string Replace(string sInput, string sRegex, string sReplace, string sGroupName)
  {
      Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

      // The match collection keeps searching the original string; reassigning sInput below is safe.
      foreach (Match mc in re.Matches(sInput))
      {
          string target = sGroupName != "" ? mc.Groups[sGroupName].Value : mc.Value;
          if (target.Length == 0)
          {
              // Zero-length match or non-participating group: nothing to replace.
              continue;
          }

          sInput = sInput.Replace(target, sReplace);
      }

      return sInput;
  }
  /// <summary>
  /// Splits the input on a regular expression and keeps only the sufficiently long pieces.
  /// </summary>
  /// <param name="sInput">Text to split.</param>
  /// <param name="sRegex">Separator expression; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="iStrLen">Minimum length (after trimming) a piece must have to be kept.</param>
  /// <returns>The trimmed pieces whose length is at least <paramref name="iStrLen"/>, in input order.</returns>
  public static List<string> Split(string sInput, string sRegex, int iStrLen)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      var kept = new List<string>();

      foreach (string piece in new Regex(sRegex, options).Split(sInput))
      {
          string trimmed = piece.Trim();
          if (trimmed.Length >= iStrLen)
          {
              kept.Add(trimmed);
          }
      }

      return kept;
  }
  245. #endregion BaseMethod
  246. #region 获得特定内容
  /// <summary>
  /// Extracts the <c>href</c> value of every anchor tag in the input.
  /// </summary>
  /// <param name="sInput">HTML to scan.</param>
  /// <returns>The href values (single-quoted, double-quoted or unquoted), in document order.</returns>
  public static List<string> GetLinks(string sInput)
  {
      const string anchorPattern = @"<a[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>";
      List<string> hrefs = GetList(sInput, anchorPattern, "href");
      return hrefs;
  }
  /// <summary>
  /// Extracts the <c>href</c> value of the first anchor tag in the input.
  /// </summary>
  /// <param name="sInput">HTML to scan.</param>
  /// <returns>The href value, or the empty string when no anchor with an href is found.</returns>
  public static string GetLinkHelp(string sInput)
  {
      const string anchorPattern = @"<a[^>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>";
      string href = GetText(sInput, anchorPattern, "href");
      return href;
  }
  /// <summary>
  /// Extracts every complete <c>img</c> tag that carries a <c>src</c> attribute.
  /// </summary>
  /// <param name="sInput">HTML to scan.</param>
  /// <returns>The full text of each matching tag, in document order.</returns>
  public static List<string> GetImgTag(string sInput)
  {
      const string imagePattern = "<img[^>]+src=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>";
      List<string> tags = GetList(sInput, imagePattern, "");
      return tags;
  }
  /// <summary>
  /// Extracts the <c>src</c> value of the first <c>img</c> tag in the input.
  /// </summary>
  /// <param name="sInput">HTML to scan.</param>
  /// <returns>The src value, or the empty string when no image tag with a src is found.</returns>
  public static string GetImgSrc(string sInput)
  {
      const string imagePattern = "<img[^>]+src=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>";
      string source = GetText(sInput, imagePattern, "src");
      return source;
  }
  /// <summary>
  /// Extracts the scheme-plus-host prefix (for example <c>http://www.163.com</c>) of the first http(s) URL in the input.
  /// </summary>
  /// <param name="sInput">Text containing a URL.</param>
  /// <returns>The matched prefix, or the empty string when the input holds no http(s) URL.</returns>
  public static string GetDomain(string sInput)
  {
      const string domainPattern = @"http(s)?://([\w-]+\.)+(\w){2,}";
      string domain = GetText(sInput, domainPattern, 0);
      return domain;
  }
  287. #endregion 获得特定内容
  288. #region 根据表达式,获得文章内容
  /// <summary>
  /// Extracts an article title using a caller-supplied expression.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression that exposes the title in a group named <c>Title</c>.</param>
  /// <returns>The captured title with tags stripped, truncated to at most 99 characters; empty when nothing matches.</returns>
  public static string GetTitle(string sInput, string sRegex)
  {
      string title = ClearTag(GetText(sInput, sRegex, "Title"));
      return title.Length > 99 ? title.Substring(0, 99) : title;
  }
  /// <summary>
  /// Extracts the content of the page's <c>title</c> element.
  /// </summary>
  /// <remarks>
  /// The expression requires at least 10 characters between the opening and closing title tags,
  /// so shorter titles yield the empty string.
  /// </remarks>
  /// <param name="sInput">HTML to scan.</param>
  /// <returns>The raw title content, or the empty string when the expression does not match.</returns>
  public static string GetTitle(string sInput)
  {
      const string titlePattern = @"<Title[^>]*>(?<Title>[\s\S]{10,})</Title>";
      string title = GetText(sInput, titlePattern, "Title");
      return title;
  }
  /// <summary>
  /// Removes from the input the text captured by the <c>Head</c> group: each run of characters
  /// (containing no opening angle bracket) that is immediately followed by an opening angle bracket.
  /// </summary>
  /// <remarks>
  /// Removal is delegated to the named-group <c>Replace</c> helper with an empty replacement.
  /// NOTE(review): the original summary only says "page content"; the intended result is not clear from the code.
  /// </remarks>
  /// <param name="sInput">Text to process.</param>
  /// <returns>The input with the captured runs removed.</returns>
  public static string GetHtml(string sInput)
  {
      const string leadingTextPattern = @"(?<Head>[^<]+)<";
      string stripped = Replace(sInput, leadingTextPattern, "", "Head");
      return stripped;
  }
  /// <summary>
  /// Extracts the content of the page's <c>body</c> element.
  /// </summary>
  /// <remarks>
  /// The expression requires at least 10 characters between the opening and closing body tags.
  /// </remarks>
  /// <param name="sInput">HTML to scan.</param>
  /// <returns>The body content, or the empty string when the expression does not match.</returns>
  public static string GetBodyHelp(string sInput)
  {
      const string bodyPattern = @"<Body[^>]*>(?<Body>[\s\S]{10,})</body>";
      string body = GetText(sInput, bodyPattern, "Body");
      return body;
  }
  /// <summary>
  /// Extracts the page body using a caller-supplied expression.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression that exposes the body in a group named <c>Body</c>.</param>
  /// <returns>The captured body, or the empty string when nothing matches.</returns>
  public static string GetBody(string sInput, string sRegex)
  {
      string body = GetText(sInput, sRegex, "Body");
      return body;
  }
  /// <summary>
  /// Extracts the article source using a caller-supplied expression.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression that exposes the source in a group named <c>Source</c>.</param>
  /// <returns>The captured source with tags stripped, truncated to at most 99 characters; empty when nothing matches.</returns>
  public static string GetSource(string sInput, string sRegex)
  {
      string source = ClearTag(GetText(sInput, sRegex, "Source"));
      return source.Length > 99 ? source.Substring(0, 99) : source;
  }
  /// <summary>
  /// Extracts the author name using a caller-supplied expression.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression that exposes the author in a group named <c>Author</c>.</param>
  /// <returns>The captured author with tags stripped, truncated to at most 99 characters; empty when nothing matches.</returns>
  public static string GetAuthor(string sInput, string sRegex)
  {
      string author = ClearTag(GetText(sInput, sRegex, "Author"));
      return author.Length > 99 ? author.Substring(0, 99) : author;
  }
  /// <summary>
  /// Extracts pagination link addresses using a caller-supplied expression.
  /// </summary>
  /// <param name="sInput">Text to search.</param>
  /// <param name="sRegex">Regular expression that exposes each link in a group named <c>href</c>.</param>
  /// <returns>The captured href value of every match, in match order.</returns>
  public static List<string> GetPageLinks(string sInput, string sRegex)
  {
      List<string> pageLinks = GetList(sInput, sRegex, "href");
      return pageLinks;
  }
  /// <summary>
  /// Resolves a (possibly relative) link against the address of the page it was found on.
  /// </summary>
  /// <remarks>
  /// Handled forms, checked in this order after trimming the link:
  /// absolute <c>http://</c> / <c>https://</c> links (returned unchanged);
  /// protocol-relative <c>//host/path</c> links (prefixed with the page's scheme);
  /// root-relative <c>/path</c> links (prefixed with the page's domain);
  /// <c>../</c> links (each leading <c>../</c> climbs one directory, never above the host);
  /// <c>./</c> links and plain relative links (appended to the page's directory).
  /// Other schemes such as <c>mailto:</c> are not recognised and are treated as plain relative links.
  /// </remarks>
  /// <param name="sInput">Address of the page that contains the link.</param>
  /// <param name="sRelativeUrl">Link as written in the page.</param>
  /// <returns>The resolved address, or the empty string when the link is null or blank.</returns>
  public static string GetUrl(string sInput, string sRelativeUrl)
  {
      string relative = (sRelativeUrl ?? "").Trim();
      if (relative.Length == 0)
      {
          return "";
      }

      // Require the full scheme separator so a relative file such as "http-status.html" is not mistaken for an absolute URL.
      if (relative.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
          || relative.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
      {
          return relative;
      }

      if (relative.StartsWith("//", StringComparison.Ordinal))
      {
          // Protocol-relative link: inherit the scheme of the containing page.
          bool pageIsHttps = sInput != null
              && sInput.TrimStart().StartsWith("https://", StringComparison.OrdinalIgnoreCase);
          return (pageIsHttps ? "https:" : "http:") + relative;
      }

      if (relative.StartsWith("/", StringComparison.Ordinal))
      {
          return GetDomain(sInput) + relative;
      }

      // Directory of the page, always ending in "/", e.g. http://www.163.com/news/
      string baseUrl = _GetStandardUrlDepth(sInput);

      if (relative.StartsWith("../", StringComparison.Ordinal))
      {
          baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);
          int hostStart = baseUrl.IndexOf("://", StringComparison.Ordinal) + 3;

          // Only LEADING "../" segments climb; an inner "../" is left in the path untouched.
          while (relative.StartsWith("../", StringComparison.Ordinal))
          {
              int lastSlash = baseUrl.LastIndexOf('/');
              if (lastSlash >= hostStart)
              {
                  // Still below the host: drop one directory. Extra "../" beyond the root are ignored,
                  // mirroring how browsers tolerate such links.
                  baseUrl = baseUrl.Substring(0, lastSlash);
              }

              relative = relative.Substring(3);
          }

          return baseUrl + "/" + relative;
      }

      if (relative.StartsWith("./", StringComparison.Ordinal))
      {
          return baseUrl + relative.Substring(2);
      }

      // Plain relative link, e.g. 2007images/modecss.css
      return baseUrl + relative;
  }
  /// <summary>
  /// Normalises a page address to the directory it lives in, always ending with a slash.
  /// </summary>
  /// <remarks>
  /// Only a leading <c>http://</c> or <c>https://</c> is treated as the scheme (compared case-insensitively
  /// and emitted in lower case); a scheme-less input is assumed to be http. The rest of the address keeps
  /// its original casing. A last path segment containing a dot is considered a file name and is dropped.
  /// </remarks>
  /// <param name="url">Page address.</param>
  /// <returns>A value of the form <c>http://www.163.com/</c> or <c>http://www.163.com/news/</c>.</returns>
  private static string _GetStandardUrlDepth(string url)
  {
      string rest = url.Trim();
      string header = "http://";

      // Strip the scheme only when it is a prefix, so URLs embedded in a query string are left alone.
      if (rest.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
      {
          header = "https://";
          rest = rest.Substring("https://".Length);
      }
      else if (rest.StartsWith("http://", StringComparison.OrdinalIgnoreCase))
      {
          rest = rest.Substring("http://".Length);
      }

      int lastSlash = rest.LastIndexOf('/');
      bool endsWithSlash = lastSlash != -1 && lastSlash == rest.Length - 1;

      if (!endsWithSlash)
      {
          // "www.163.com/news/hello.htm" -> cut the file name; "www.163.com" or "www.163.com/news" -> append "/".
          bool lastSegmentIsFile = lastSlash != -1 && rest.IndexOf('.', lastSlash) != -1;
          rest = lastSegmentIsFile ? rest.Substring(0, lastSlash + 1) : rest + "/";
      }

      return header + rest;
  }
  /// <summary>
  /// Derives a comma-separated keyword list from a piece of text.
  /// </summary>
  /// <remarks>
  /// The text is first split on ASCII and full-width punctuation (comma, plus, ideographic full stop,
  /// semicolon, colon, curly double quotes, enumeration comma, underscore, parentheses). Within each
  /// fragment, runs of Latin letters longer than two characters become keywords; what is left between
  /// those runs is kept when it is longer than two characters after trimming. Duplicates are dropped,
  /// keeping the first occurrence. When no keyword is found the input itself is returned. The result is
  /// truncated to at most 99 characters.
  /// </remarks>
  /// <param name="sInput">Text to extract keywords from.</param>
  /// <returns>The keywords joined with commas, or the (possibly truncated) input when none were found.</returns>
  public static string GetKeyWord(string sInput)
  {
      // Separators are written as escapes so the pattern survives any source-file encoding:
      // , \uFF0C(fullwidth comma) + \uFF0B(fullwidth plus) \u3002(ideographic full stop) ; \uFF1B : \uFF1A
      // \u201C \u201D (curly quotes) \u3001(enumeration comma) _ ( \uFF08 ) \uFF09
      const string separators = @"[,\uFF0C+\uFF0B\u3002;\uFF1B:\uFF1A\u201C\u201D\u3001_(\uFF08)\uFF09]";
      List<string> fragments = Split(sInput, separators, 2);

      Regex latinWord = new Regex(@"[a-zA-Z]+", RegexOptions.IgnoreCase);
      HashSet<string> seen = new HashSet<string>();
      List<string> keywords = new List<string>();

      foreach (string fragment in fragments)
      {
          string remainder = fragment;
          foreach (Match word in latinWord.Matches(fragment))
          {
              if (word.Value.Length > 2 && seen.Add(word.Value))
              {
                  keywords.Add(word.Value);
              }

              // Blank out the Latin word so the surrounding text can be split around it.
              remainder = remainder.Replace(word.Value, ",");
          }

          foreach (string piece in remainder.Split(','))
          {
              if (piece.Trim().Length > 2 && seen.Add(piece))
              {
                  keywords.Add(piece);
              }
          }
      }

      string result = keywords.Count > 0 ? string.Join(",", keywords) : sInput;
      return result.Length > 99 ? result.Substring(0, 99) : result;
  }
  /// <summary>
  /// Cleans a fetched HTML fragment and rewrites the resource addresses it contains to absolute URLs.
  /// </summary>
  /// <remarks>
  /// Steps, in order: strip script and iframe elements and form-related tags; apply the caller's extra
  /// removal expressions; make image sources absolute; apply the anti-hotlink rewrites configured for the
  /// page's domain; make anchor, stylesheet-link, <c>background</c> attribute, CSS <c>background-image</c>,
  /// Flash movie and (for XML input) xml-stylesheet addresses absolute.
  /// </remarks>
  /// <param name="sOriContent">Raw HTML.</param>
  /// <param name="sOtherRemoveReg">Extra regular expressions to remove, one per line (separated by CR LF); may be null or empty.</param>
  /// <param name="sPageUrl">Address of the page the HTML came from; used to resolve relative links.</param>
  /// <param name="dtAntiLink">Anti-hotlink rules; rows are read through the <c>Domain</c>, <c>Type</c> and <c>imgUrl</c> columns. May be null.</param>
  /// <returns>The cleaned HTML.</returns>
  public static string GetContent(string sOriContent, string sOtherRemoveReg, string sPageUrl, DataTable dtAntiLink)
  {
      const RegexOptions stripOptions = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase;
      const string hotlinkProxy = "http://stat.580k.com/t.asp?url=";

      string sFormartted = sOriContent;

      // Remove potentially dangerous markup.
      sFormartted = Regex.Replace(sFormartted, @"<script[\s\S]*?</script>", "", stripOptions);
      sFormartted = Regex.Replace(sFormartted, @"<iframe[^>]*>[\s\S]*?</iframe>", "", stripOptions);
      Regex r = new Regex(@"<input[\s\S]+?>|<form[\s\S]+?>|</form[\s\S]*?>|<select[\s\S]+?>?</select>|<textarea[\s\S]*?>?</textarea>|<file[\s\S]*?>|<noscript>|</noscript>", RegexOptions.IgnoreCase);
      sFormartted = r.Replace(sFormartted, "");

      // Caller-supplied removal expressions (optional).
      if (!string.IsNullOrEmpty(sOtherRemoveReg))
      {
          string[] sOtherReg = sOtherRemoveReg.Split(new string[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
          foreach (string sRemoveReg in sOtherReg)
          {
              sFormartted = Replace(sFormartted, sRemoveReg, "", 0);
          }
      }

      // Image paths.
      sFormartted = _ReplaceUrl("<img[\\s\\S]+?src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", "src", sFormartted, sPageUrl);

      // Anti-hotlink rewrites for the page's domain (optional).
      if (dtAntiLink != null)
      {
          // Quotes are doubled defensively before the value is embedded in the filter expression.
          string domain = GetDomain(sPageUrl).Replace("'", "''");
          foreach (DataRow dr in dtAntiLink.Select("Domain='" + domain + "'"))
          {
              string imgUrl = dr["imgUrl"].ToString();
              if (imgUrl.Length == 0)
              {
                  // string.Replace rejects an empty search string; a rule without a URL has nothing to rewrite.
                  continue;
              }

              switch (Convert.ToInt32(dr["Type"]))
              {
                  case 1: // substitute: the rule's URL is replaced by the proxy prefix
                      sFormartted = sFormartted.Replace(imgUrl, hotlinkProxy);
                      break;
                  default: // append: the proxy prefix is put in front of the rule's URL
                      sFormartted = sFormartted.Replace(imgUrl, hotlinkProxy + imgUrl);
                      break;
              }
          }
      }

      // Anchor links.
      sFormartted = _ReplaceUrl(@"<a[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);
      // Stylesheet links.
      sFormartted = _ReplaceUrl(@"<link[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);
      // background="..." attributes.
      sFormartted = _ReplaceUrl(@"background\s*=\s*(?:'(?<img>[^']+)'|""(?<img>[^""]+)""|(?<img>[^>\s]+))", "img", sFormartted, sPageUrl);
      // Inline style backgrounds: background-image:url(...)
      sFormartted = _ReplaceUrl(@"background-image\s*:\s*url\s*\x28(?<img>[^\x29]+)\x29", "img", sFormartted, sPageUrl);
      // Flash movies.
      sFormartted = _ReplaceUrl(@"<param\s[^>]+""movie""[^>]+value\s*=\s*""(?<flash>[^"">]+\x2eswf)""[^>]*>", "flash", sFormartted, sPageUrl);
      // XSL stylesheets, only for XML input.
      if (IsXml(sFormartted))
      {
          sFormartted = _ReplaceUrl(@"<\x3fxml-stylesheet\s+[^\x3f>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)"")\s*[^\x3f>]*\x3f>", "href", sFormartted, sPageUrl);
      }

      // Script sources are intentionally not rewritten; script elements were stripped above.
      return sFormartted;
  }
  /// <summary>
  /// Rewrites, inside every match of an expression, the address captured by one named group to an absolute URL.
  /// </summary>
  /// <remarks>
  /// The captured address is replaced by position within the match, so other occurrences of the same text
  /// in the tag (for example in an <c>alt</c> or <c>title</c> attribute) are left untouched, and a
  /// whitespace-only capture cannot strip unrelated characters.
  /// </remarks>
  /// <param name="strRe">Expression to match; compiled with Multiline, IgnorePatternWhitespace and IgnoreCase.</param>
  /// <param name="subMatch">Name of the group that captures the address.</param>
  /// <param name="sFormartted">Text to process.</param>
  /// <param name="sPageUrl">Address of the containing page, used to resolve relative addresses.</param>
  /// <returns>The text with every captured address resolved through <c>GetUrl</c>.</returns>
  private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
  {
      Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);

      return re.Replace(sFormartted, delegate (Match mc)
      {
          Group address = mc.Groups[subMatch];
          if (!address.Success || address.Length == 0)
          {
              // Nothing captured: leave the match exactly as it was.
              return mc.Value;
          }

          // Splice the resolved URL in at the capture's own offset within the match.
          int offset = address.Index - mc.Index;
          return mc.Value.Substring(0, offset)
              + GetUrl(sPageUrl, address.Value)
              + mc.Value.Substring(offset + address.Length);
      });
  }
  /// <summary>
  /// Tells whether the text contains an XML declaration or processing instruction opening
  /// (the characters <c>&lt;?xml</c> followed by whitespace), ignoring case.
  /// </summary>
  /// <param name="sFormartted">Text to inspect.</param>
  /// <returns><c>true</c> when the marker occurs anywhere in the text; otherwise <c>false</c>.</returns>
  public static bool IsXml(string sFormartted)
  {
      const string xmlMarker = @"<\x3fxml\s+";
      return Regex.IsMatch(sFormartted, xmlMarker, RegexOptions.IgnoreCase);
  }
  589. #endregion 根据表达式,获得文章内容
  590. #region HTML相关操作
  /// <summary>
  /// Strips HTML tags, a few common entities and line-break/tab characters from a string.
  /// </summary>
  /// <remarks>
  /// Removed: anything that looks like a tag, the literal <c>&lt;&gt;</c>, the entities
  /// <c>&amp;nbsp;</c>, <c>&amp;gt;</c>, <c>&amp;lt;</c>, <c>&amp;amp;</c>, and CR, LF and TAB characters.
  /// </remarks>
  /// <param name="sHtml">HTML to clean; null is treated like the empty string.</param>
  /// <returns>The cleaned text; the empty string for null or empty input.</returns>
  public static string ClearTag(string sHtml)
  {
      if (string.IsNullOrEmpty(sHtml))
      {
          return "";
      }

      Regex re = new Regex(@"(<[^>\s]*\b(\w)+\b[^>]*>)|(<>)|(&nbsp;)|(&gt;)|(&lt;)|(&amp;)|\r|\n|\t", RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
      return re.Replace(sHtml, "");
  }
  /// <summary>
  /// Removes from a string everything that matches a caller-supplied expression.
  /// </summary>
  /// <param name="sHtml">HTML to clean.</param>
  /// <param name="sRegex">Expression describing what to remove; compiled with IgnoreCase, Multiline and IgnorePatternWhitespace.</param>
  /// <returns>The input with every match removed.</returns>
  public static string ClearTag(string sHtml, string sRegex)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;
      Regex remover = new Regex(sRegex, options);
      string cleaned = remover.Replace(sHtml, "");
      return cleaned;
  }
  /// <summary>
  /// Converts HTML into JavaScript that writes it out with <c>document.writeln</c>, one call per source line.
  /// </summary>
  /// <remarks>
  /// Lines are split on CR LF, bare LF or bare CR so no raw line break ends up inside a JavaScript string
  /// literal. Backslashes and double quotes are escaped. Each generated statement ends with CR LF.
  /// </remarks>
  /// <param name="sHtml">HTML to convert; null is treated like the empty string.</param>
  /// <returns>The generated JavaScript.</returns>
  public static string ConvertToJavascript(string sHtml)
  {
      StringBuilder script = new StringBuilder();

      foreach (string line in Regex.Split(sHtml ?? "", @"\r\n|\r|\n"))
      {
          // Escape backslashes first so the backslashes added for quotes are not doubled again.
          string escaped = line.Replace("\\", "\\\\").Replace("\"", "\\\"");
          script.Append("document.writeln(\"").Append(escaped).Append("\");\r\n");
      }

      return script.ToString();
  }
  /// <summary>
  /// Removes one kind of tag from an HTML string.
  /// </summary>
  /// <remarks>
  /// The tag name is matched case-insensitively, as a whole name (removing <c>b</c> does not touch
  /// <c>br</c> or <c>body</c>), and is regex-escaped before use. Surrounding whitespace in the name is ignored.
  /// </remarks>
  /// <param name="str">HTML to process.</param>
  /// <param name="tag">Tag name without angle brackets; null or blank leaves the input unchanged.</param>
  /// <param name="isContent">
  /// <c>true</c> removes each element together with its content (opening tag through the nearest closing tag);
  /// <c>false</c> removes only the opening and closing tags and keeps the content.
  /// </param>
  /// <returns>The processed HTML.</returns>
  public static string DelTag(string str, string tag, bool isContent)
  {
      if (tag == null || tag.Trim().Length == 0)
      {
          return str;
      }

      string name = Regex.Escape(tag.Trim());

      if (isContent)
      {
          // Whole element: <tag ...> ... </tag>, non-greedy so sibling elements are removed separately.
          return Regex.Replace(str, "<(" + name + @")\b[^>]*>[\s\S]*?</\1\s*>", "", RegexOptions.IgnoreCase);
      }

      // Opening tag (tolerating a missing '>' as before) or closing tag.
      return Regex.Replace(str, "<" + name + @"\b[^>]*>?|</" + name + @"\b[^>]*>", "", RegexOptions.IgnoreCase);
  }
  /// <summary>
  /// Removes several kinds of tags from an HTML string.
  /// </summary>
  /// <param name="str">HTML to process.</param>
  /// <param name="tagA">Comma-separated tag names.</param>
  /// <param name="isContent"><c>true</c> to remove each element's content as well; <c>false</c> to remove only the tags.</param>
  /// <returns>The HTML after every listed tag has been passed through <c>DelTag</c> in order.</returns>
  public static string DelTagArray(string str, string tagA, bool isContent)
  {
      string result = str;
      string[] tagNames = tagA.Split(',');

      for (int i = 0; i < tagNames.Length; i++)
      {
          result = DelTag(result, tagNames[i], isContent);
      }

      return result;
  }
  667. #endregion HTML相关操作
  668. #region 根据内容获得链接
  /// <summary>
  /// Extracts the target address from a piece of anchor markup.
  /// </summary>
  /// <remarks>
  /// A link embedded in a <c>javascript</c> href/onclick handler takes precedence and is returned as captured.
  /// Otherwise the anchor's <c>href</c> value is HTML-decoded and then cleaned with <c>RemoveByReg</c>
  /// (dropping semicolon-introduced suffixes and <c>javascript:</c> fragments).
  /// </remarks>
  /// <param name="sContent">Anchor markup.</param>
  /// <returns>The link, or the empty string when none is found.</returns>
  public static string GetLink(string sContent)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;
      const string anchorPattern = @"<a\s+[^>]*href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>";
      const string scriptPattern = @"(href|onclick)=[^>]+javascript[^>]+(('(?<href>[\w\d/-]+\.[^']*)')|(&quot;(?<href>[\w\d/-]+\.[^;]*)&quot;))[^>]*>";

      // Links hidden inside javascript handlers win over the plain href.
      Match scriptMatch = new Regex(scriptPattern, options).Match(sContent);
      if (scriptMatch.Success)
      {
          return scriptMatch.Groups["href"].Value;
      }

      Match anchorMatch = new Regex(anchorPattern, options).Match(sContent);
      if (!anchorMatch.Success)
      {
          return "";
      }

      string decoded = HttpUtility.HtmlDecode(anchorMatch.Groups["href"].Value);
      return RemoveByReg(decoded, @";[^?&]*|javascript:.*");
  }
  /// <summary>
  /// Extracts the inner content of an anchor element.
  /// </summary>
  /// <param name="sContent">Anchor markup.</param>
  /// <returns>
  /// Everything between the opening anchor tag and the last closing anchor tag; the empty string when the
  /// markup is a <c>mailto</c> link or contains no complete anchor element.
  /// </returns>
  public static string GetTextByLink(string sContent)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

      // E-mail links carry no usable link text.
      bool isMailLink = Regex.IsMatch(sContent, @"(href|onclick)=[^>]+mailto[^>]+@[^>]+>", options);
      if (isMailLink)
      {
          return "";
      }

      Match anchor = Regex.Match(sContent, @"<a(?:\s+[^>]*)?>([\s\S]*)?</a>", options);
      return anchor.Success ? anchor.Groups[1].Value : "";
  }
  /// <summary>
  /// Collects the content links of a page into a dictionary keyed by absolute address, with the link text as value.
  /// </summary>
  /// <remarks>
  /// Anchors are processed from last to first. A link is skipped when: its address is empty or is an
  /// http address ending in a slash; its text, after removing navigation/advertising words and bracket
  /// characters, encodes to fewer than 9 bytes in <c>Encoding.Default</c>; its text contains a quote
  /// character; its absolute address is 18 characters or shorter; or it is just a domain. Any fragment
  /// after the first hash character is dropped. Addresses and texts already present are not added again.
  /// NOTE(review): the byte-length threshold depends on the platform's default encoding.
  /// </remarks>
  /// <param name="sContent">HTML to scan.</param>
  /// <param name="sUrl">Address of the page, used to resolve relative links.</param>
  /// <param name="lisA">Dictionary that receives the links.</param>
  private static void _GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisA)
  {
      // Words and characters that mark navigation, advertising or otherwise meaningless link text.
      // Bracket characters are escaped so that no alternative can match the empty string
      // (an empty match would make the text-based removal throw).
      const string sFilter =
          @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采"
          + @"|登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入"
          + @"|更多|more|专题|精选|热卖|热销|推荐|精彩"
          + @"|加盟|联盟|友情|链接|相关"
          + @"|订阅|阅读器|RSS"
          + @"|免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私"
          + @"|〖|〗|【|】|\(|\)|\uFF08|\uFF09|\[|\]|\uFF3B|\uFF3D|『|』|\.";
      Regex re = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
      Regex re2 = new Regex(@"""|'", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
      MatchCollection mcs = re.Matches(sContent);

      for (int i = mcs.Count - 1; i >= 0; i--)
      {
          Match mc = mcs[i];
          string strHref = GetLink(mc.Value).Trim();
          strHref = strHref.Replace("\\\"", ""); // links emitted from JavaScript carry escaped quotes
          strHref = strHref.Replace("\\\'", "");

          // Skip addresses that start with "http" and end with "/".
          string strTemp = RemoveByReg(strHref, @"^http.*/$");
          if (strTemp.Length < 2)
          {
              continue;
          }

          // Skip advertising or meaningless links.
          string strText = ClearTag(GetTextByLink(mc.Value)).Trim();
          strTemp = RemoveByReg(strText, sFilter);
          if (Encoding.Default.GetBytes(strTemp).Length < 9)
          {
              continue;
          }

          if (re2.IsMatch(strText))
          {
              continue;
          }

          // Switch to the absolute address.
          strHref = GetUrlByRelative(sUrl, strHref);
          if (strHref.Length <= 18) // e.g. http://www.163.com is 18 characters
          {
              continue;
          }

          // Drop the fragment, then skip the link when only the domain is left.
          int charIndex = strHref.IndexOf('#');
          if (charIndex > -1)
          {
              strHref = strHref.Substring(0, charIndex);
          }

          strHref = strHref.Trim(new char[] { '/', '\\' });
          string tmpDomainURL = GetDomain(strHref);
          if (strHref.Equals(tmpDomainURL, StringComparison.OrdinalIgnoreCase))
          {
              continue;
          }

          if (!lisA.ContainsKey(strHref) && !lisA.ContainsValue(strText))
          {
              lisA.Add(strHref, strText);
          }
      }
  }
  /// <summary>
  /// Tells whether the HTML contains a <c>script</c> tag that loads an external file through a <c>src</c> attribute.
  /// </summary>
  /// <param name="sHtml">HTML to inspect.</param>
  /// <returns><c>true</c> when such a tag is present; otherwise <c>false</c>.</returns>
  public static bool IsExistsScriptLink(string sHtml)
  {
      const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase;
      const string scriptSrcPattern = @"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>";
      return Regex.IsMatch(sHtml, scriptSrcPattern, options);
  }
  /// <summary>
  /// Reads the entries of an RSS/Atom feed, discarding their descriptions.
  /// </summary>
  /// <param name="sContent">Feed XML.</param>
  /// <param name="sUrl">Address of the feed, used to resolve relative entry links.</param>
  /// <returns>Entry titles keyed by entry link.</returns>
  public static Dictionary<string, string> GetLinksFromRss(string sContent, string sUrl)
  {
      // The three-argument overload needs somewhere to put descriptions; this caller does not use them.
      var unusedDescriptions = new Dictionary<string, string>();
      Dictionary<string, string> links = GetLinksFromRss(sContent, sUrl, ref unusedDescriptions);
      return links;
  }
  /// <summary>
  /// Reads the entries of a feed, returning their titles and collecting their descriptions.
  /// </summary>
  /// <remarks>
  /// Formats are tried in this order and the first one that yields entries wins: RSS 2.0
  /// (<c>/rss/channel/item</c>), RSS 1.0 / RDF, Atom 0.3 (<c>http://purl.org/atom/ns#</c>) and
  /// Atom 1.0 (<c>http://www.w3.org/2005/Atom</c>). Entries are processed from last to first.
  /// A failure on a single entry (missing element, duplicate link) is logged and the remaining
  /// entries are still processed. Unparseable XML is logged and produces an empty result.
  /// The document is loaded without an XML resolver because feed content is untrusted.
  /// </remarks>
  /// <param name="sContent">Feed XML.</param>
  /// <param name="sUrl">Address of the feed, used to resolve relative entry links.</param>
  /// <param name="lisDes">Receives entry descriptions (Atom: entry content) keyed by entry link.</param>
  /// <returns>Entry titles keyed by entry link.</returns>
  public static Dictionary<string, string> GetLinksFromRss(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
  {
      Dictionary<string, string> listResult = new Dictionary<string, string>();

      // No resolver: external entities/DTDs referenced by an untrusted feed must not be fetched.
      XmlDocument xml = new XmlDocument { XmlResolver = null };
      try
      {
          xml.LoadXml(sContent.Trim());
      }
      catch (Exception e)
      {
          LogManager.Error(e);
          return listResult;
      }

      XmlNamespaceManager nsMgr = new XmlNamespaceManager(xml.NameTable);
      nsMgr.AddNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
      nsMgr.AddNamespace("rss", "http://purl.org/rss/1.0/");
      nsMgr.AddNamespace("atom", "http://purl.org/atom/ns#");
      nsMgr.AddNamespace("atom10", "http://www.w3.org/2005/Atom");

      // Short-circuits on the first format that contains entries.
      bool found =
          _AddFeedItems(xml, nsMgr, "/rss/channel/item", "link", "title", "description", false, sUrl, listResult, lisDes)
          || _AddFeedItems(xml, nsMgr, "/rdf:RDF//rss:item", "rss:link", "rss:title", "rss:description", false, sUrl, listResult, lisDes)
          || _AddFeedItems(xml, nsMgr, "/atom:feed/atom:entry", "atom:link", "atom:title", "atom:content", true, sUrl, listResult, lisDes)
          || _AddFeedItems(xml, nsMgr, "/atom10:feed/atom10:entry", "atom10:link", "atom10:title", "atom10:content", true, sUrl, listResult, lisDes);

      return listResult;
  }

  /// <summary>
  /// Adds the entries selected by one feed format's XPath expressions to the result dictionaries.
  /// </summary>
  /// <param name="root">Loaded feed document.</param>
  /// <param name="nsMgr">Namespace manager that resolves the prefixes used in the XPath expressions.</param>
  /// <param name="itemXPath">XPath selecting the entries.</param>
  /// <param name="linkXPath">XPath, relative to an entry, selecting its link element.</param>
  /// <param name="titleXPath">XPath, relative to an entry, selecting its title element.</param>
  /// <param name="descriptionXPath">XPath, relative to an entry, selecting its description/content element.</param>
  /// <param name="linkInHrefAttribute"><c>true</c> when the address is in the link element's <c>href</c> attribute; <c>false</c> when it is the element text.</param>
  /// <param name="sUrl">Address of the feed, used to resolve relative entry links.</param>
  /// <param name="titles">Receives entry titles keyed by link.</param>
  /// <param name="descriptions">Receives entry descriptions keyed by link.</param>
  /// <returns><c>true</c> when the format matched at least one entry; otherwise <c>false</c>.</returns>
  private static bool _AddFeedItems(XmlNode root, XmlNamespaceManager nsMgr, string itemXPath, string linkXPath, string titleXPath, string descriptionXPath, bool linkInHrefAttribute, string sUrl, Dictionary<string, string> titles, Dictionary<string, string> descriptions)
  {
      try
      {
          XmlNodeList nodes = root.SelectNodes(itemXPath, nsMgr);
          if (nodes == null || nodes.Count == 0)
          {
              return false;
          }

          for (int i = nodes.Count - 1; i >= 0; i--)
          {
              try
              {
                  XmlNode linkNode = nodes[i].SelectSingleNode(linkXPath, nsMgr);
                  string rawLink = linkInHrefAttribute ? linkNode.Attributes["href"].InnerText : linkNode.InnerText;
                  string sLink = GetUrlByRelative(sUrl, rawLink);
                  titles.Add(sLink, nodes[i].SelectSingleNode(titleXPath, nsMgr).InnerText);
                  descriptions.Add(sLink, nodes[i].SelectSingleNode(descriptionXPath, nsMgr).InnerText);
              }
              catch (Exception e)
              {
                  // Best effort: a malformed or duplicate entry must not abort the whole feed.
                  LogManager.Error(e);
              }
          }

          return true;
      }
      catch (Exception e)
      {
          LogManager.Error(e);
          return false;
      }
  }
  /// <summary>
  /// Reads the channel title of an RSS 2.0 feed.
  /// </summary>
  /// <remarks>
  /// Only the RSS 2.0 location <c>/rss/channel/title</c> is consulted. Unparseable XML is logged.
  /// The document is loaded without an XML resolver because feed content is untrusted.
  /// </remarks>
  /// <param name="sContent">Feed XML.</param>
  /// <returns>The title, or the empty string when the input is blank, cannot be parsed or has no channel title.</returns>
  public static string GetTitleFromRss(string sContent)
  {
      if (string.IsNullOrWhiteSpace(sContent))
      {
          return "";
      }

      try
      {
          // No resolver: external entities/DTDs referenced by an untrusted feed must not be fetched.
          XmlDocument xml = new XmlDocument { XmlResolver = null };
          xml.LoadXml(sContent.Trim());

          XmlNode titleNode = xml.SelectSingleNode("/rss/channel/title");
          return titleNode != null ? titleNode.InnerText : "";
      }
      catch (Exception e)
      {
          LogManager.Error(e);
          return "";
      }
  }
  /// <summary>
  /// Removes every substring of <paramref name="sContent"/> that matches a regular expression.
  /// </summary>
  /// <param name="sContent">Text (typically HTML) to clean.</param>
  /// <param name="sRegex">
  /// Pattern to remove. It is compiled with IgnoreCase, IgnorePatternWhitespace and Multiline,
  /// so literal whitespace inside the pattern is ignored.
  /// </param>
  /// <returns>The text with all matches removed.</returns>
  public static string RemoveByReg(string sContent, string sRegex)
  {
      Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

      // Let the regex engine delete the matches in place. Removing each match value with
      // string.Replace is order-dependent (an early short match can break up a later, longer
      // one so that it is no longer found) and throws when a match is empty.
      return re.Replace(sContent, "");
  }
  /// <summary>
  /// Replaces every regular-expression match in a text with a replacement string.
  /// </summary>
  /// <param name="sContent">Text (typically HTML) to process.</param>
  /// <param name="sReplace">Replacement text; regex substitutions such as <c>$1</c> are honoured.</param>
  /// <param name="sRegex">Pattern to look for; evaluated with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <returns>The text after replacement.</returns>
  public static string ReplaceByReg(string sContent, string sReplace, string sRegex)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      return Regex.Replace(sContent, sRegex, sReplace, options);
  }
  /// <summary>
  /// Extracts the content of the body element from an HTML document.
  /// </summary>
  /// <param name="sContent">HTML source.</param>
  /// <returns>
  /// The source with everything up to and including the opening body tag removed, and with the
  /// closing body tag followed by the closing html tag removed.
  /// </returns>
  public static string GetBody(string sContent)
  {
      const RegexOptions baseOptions = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace;

      // Drop the prefix that ends with the opening body tag.
      string withoutPrefix = Regex.Replace(sContent, @"[\s\S]*?<\bbody\b[^>]*>", "", baseOptions);

      // Drop the closing body/html pair, scanning from the end of the text.
      return Regex.Replace(withoutPrefix, @"</\bbody\b[^>]*>\s*</html>", "", baseOptions | RegexOptions.RightToLeft);
  }
  958. #endregion 根据超链接地址获取页面内容
  959. #region 根据内容作字符串分析
  /// <summary>
  /// Returns the first substring that matches a regular expression, with trailing underscores removed.
  /// </summary>
  /// <param name="sContent">Text (typically HTML) to search.</param>
  /// <param name="sRegex">Pattern to look for; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <returns>The matched text without trailing <c>_</c> characters, or an empty string when nothing matches.</returns>
  public static string GetTextByReg(string sContent, string sRegex)
  {
      Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
      Match mc = re.Match(sContent);
      if (!mc.Success)
      {
          return "";
      }

      // Strip only the trailing underscores; underscores inside the match must be preserved.
      return mc.Value.TrimEnd('_');
  }
  979. // charset=[\s]*(?<Coding>[^'"]+)[\s]*['"]?[\s]*[/]?>
  /// <summary>
  /// Returns the value of a named group from the first regular-expression match in a text.
  /// </summary>
  /// <param name="sContent">Text (typically HTML) to search.</param>
  /// <param name="sRegex">Pattern to look for; evaluated with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
  /// <param name="sGroupName">Name of the capture group whose value is wanted.</param>
  /// <returns>The group's value, or an empty string when the pattern does not match.</returns>
  public static string GetTextByReg(string sContent, string sRegex, string sGroupName)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      Match firstHit = Regex.Match(sContent, sRegex, options);
      return firstHit.Success ? firstHit.Groups[sGroupName].Value : "";
  }
  /// <summary>
  /// Resolves a (possibly relative) link against the URL of the page it was found on.
  /// </summary>
  /// <param name="sUrl">Absolute URL of the referring page.</param>
  /// <param name="sRUrl">Link to resolve; may be relative or absolute.</param>
  /// <returns>
  /// The absolute URI of the link, or <paramref name="sUrl"/> unchanged when either value
  /// cannot be parsed.
  /// </returns>
  public static string GetUrlByRelative(string sUrl, string sRUrl)
  {
      try
      {
          Uri baseUri = new Uri(sUrl);
          string[] segments = baseUri.Segments;
          if (segments.Length > 1)
          {
              string lastSegment = segments[segments.Length - 1];

              // A final path segment without a trailing slash and without an extension
              // (no '.' after its first character) is treated as a directory name.
              bool looksLikeDirectory = !lastSegment.EndsWith("/", StringComparison.Ordinal)
                                        && lastSegment.IndexOf('.') < 1;
              if (looksLikeDirectory)
              {
                  // Append the slash to the path itself. Appending it to the raw URL string
                  // would put it inside the query or fragment when one is present.
                  baseUri = new Uri(baseUri.GetLeftPart(UriPartial.Path) + "/" + baseUri.Query + baseUri.Fragment);
              }
          }

          return new Uri(baseUri, sRUrl).AbsoluteUri;
      }
      catch
      {
          // Best effort: an unparsable base or relative URL falls back to the original URL.
          return sUrl;
      }
  }
  /// <summary>
  /// Collects the value of the <c>href</c> named group from every match of a regular expression.
  /// </summary>
  /// <param name="sContent">Text (typically HTML) to search.</param>
  /// <param name="sRegex">
  /// Pattern to look for; evaluated with IgnoreCase, IgnorePatternWhitespace and Multiline.
  /// Matches of a pattern that defines no group named <c>href</c> contribute empty strings.
  /// </param>
  /// <returns>One entry per match, in match order.</returns>
  public static List<string> GetListByReg(string sContent, string sRegex)
  {
      const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;
      List<string> hrefs = new List<string>();
      for (Match hit = Regex.Match(sContent, sRegex, options); hit.Success; hit = hit.NextMatch())
      {
          hrefs.Add(hit.Groups["href"].Value);
      }

      return hrefs;
  }
  /// <summary>
  /// Reduces a URL to its scheme and authority (host plus any non-default port).
  /// </summary>
  /// <param name="sUrl">Absolute URL.</param>
  /// <returns>
  /// Text of the form <c>scheme://authority</c>, or <paramref name="sUrl"/> unchanged when it
  /// cannot be parsed as a URI.
  /// </returns>
  public static string GetDomainUrl(string sUrl)
  {
      try
      {
          Uri parsed = new Uri(sUrl);
          return string.Concat(parsed.Scheme, "://", parsed.Authority);
      }
      catch
      {
          // Unparsable input is handed back as-is.
          return sUrl;
      }
  }
  1062. #endregion
  1063. #region 杂项
  /// <summary>
  /// Reduces an HTML document to plain text on a single line.
  /// </summary>
  /// <param name="sHtml">HTML source.</param>
  /// <returns>
  /// The text left after removing the head section, script/iframe/style/title/meta/option
  /// elements, non-breaking-space entities, tabs, line breaks, anchor elements together with
  /// their link text, and all remaining tags; trimmed of surrounding whitespace.
  /// </returns>
  public static string GetTxtFromHtml(this string sHtml)
  {
      const string headSection = @"<head[^>]*>[\s\S]*?</head>";
      const string nonTextElements = @"(<script[^>]*>[\s\S]*?</script>)|(<IFRAME[^>]*>[\s\S]*?</IFRAME>)|(<style[^>]*>[\s\S]*?</style>|<title[^>]*>[\s\S]*?</title>|<meta[^>]*>|<option[^>]*>[\s\S]*?</option>)";
      const string nbspAndWhitespaceRuns = @"(&nbsp;)|([\n\t]+)";
      const string blockTags = @"(<table(\s+[^>]*)*>)|(<td(\s+[^>]*)*>)|(<tr(\s+[^>]*)*>)|(<p(\s+[^>]*)*>)|(<div(\s+[^>]*)*>)|(<ul(\s+[^>]*)*>)|(<li(\s+[^>]*)*>)|</table>|</td>|</tr>|</p>|<br>|</div>|</li>|</ul>|<p />|<br />";
      const string lineBreaks = @"[\f\n\r\v]+";
      const string anchors = @"<a(\s+[^>]*)*>[\s\S]*?</a>";
      const string anyTag = "<[^>]+>";

      // The order matters: whole elements go first, then layout tags, then whatever tags remain.
      string text = RemoveByReg(sHtml, headSection);
      text = RemoveByReg(text, nonTextElements);
      text = RemoveByReg(text, nbspAndWhitespaceRuns);
      text = ReplaceByReg(text, "", blockTags);
      text = ReplaceByReg(text, "", lineBreaks);
      text = RemoveByReg(text, anchors);
      text = RemoveByReg(text, anyTag);

      // Final sweep for any stray line-break characters before trimming.
      return text.Replace("\n", "").Replace("\r", "").Trim();
  }
  /// <summary>
  /// Reduces an HTML document to plain text while keeping its line breaks.
  /// </summary>
  /// <param name="sHtml">HTML source.</param>
  /// <returns>
  /// The text left after removing the head section, script/iframe/style/title/meta/option
  /// elements, non-breaking-space entities, tabs, anchor elements together with their link
  /// text, and all remaining tags; trimmed of surrounding whitespace. Line-break characters
  /// inside the text are not removed.
  /// </returns>
  public static string GetTxtFromHtml2(this string sHtml)
  {
      const string headSection = @"<head[^>]*>[\s\S]*?</head>";
      const string nonTextElements = @"(<script[^>]*>[\s\S]*?</script>)|(<IFRAME[^>]*>[\s\S]*?</IFRAME>)|(<style[^>]*>[\s\S]*?</style>|<title[^>]*>[\s\S]*?</title>|<meta[^>]*>|<option[^>]*>[\s\S]*?</option>)";
      const string nbspAndTabs = @"(&nbsp;)|([\t]+)";
      const string blockTags = @"(<table(\s+[^>]*)*>)|(<td(\s+[^>]*)*>)|(<tr(\s+[^>]*)*>)|(<p(\s+[^>]*)*>)|(<div(\s+[^>]*)*>)|(<ul(\s+[^>]*)*>)|(<li(\s+[^>]*)*>)|</table>|</td>|</tr>|</p>|<br>|</div>|</li>|</ul>|<p />|<br />";
      const string anchors = @"<a(\s+[^>]*)*>[\s\S]*?</a>";
      const string anyTag = "<[^>]+>";

      // Same pipeline as the single-line variant, minus every step that strips line breaks.
      string text = RemoveByReg(sHtml, headSection);
      text = RemoveByReg(text, nonTextElements);
      text = RemoveByReg(text, nbspAndTabs);
      text = ReplaceByReg(text, "", blockTags);
      text = RemoveByReg(text, anchors);
      text = RemoveByReg(text, anyTag);
      return text.Trim();
  }
  1108. #endregion
  /// <summary>
  /// Removes a suffix from the end of a string when the string ends with it.
  /// </summary>
  /// <param name="sOrg">Original text.</param>
  /// <param name="sEnd">Suffix to strip; compared ordinally (case-sensitive).</param>
  /// <returns>
  /// <paramref name="sOrg"/> without one trailing occurrence of <paramref name="sEnd"/>, or
  /// <paramref name="sOrg"/> unchanged when it does not end with the suffix or either argument
  /// is null or empty.
  /// </returns>
  public static string RemoveEndWith(string sOrg, string sEnd)
  {
      if (string.IsNullOrEmpty(sOrg) || string.IsNullOrEmpty(sEnd))
      {
          return sOrg;
      }

      if (!sOrg.EndsWith(sEnd, StringComparison.Ordinal))
      {
          return sOrg;
      }

      // Cut at the end of the string; searching with IndexOf would hit the FIRST occurrence
      // of the suffix, which is wrong whenever it also appears earlier in the text.
      return sOrg.Substring(0, sOrg.Length - sEnd.Length);
  }
  1121. #region 根据超链接地址获取页面内容
  /// <summary>
  /// Downloads the page at a URL, letting the text encoding be detected automatically.
  /// </summary>
  /// <param name="sUrl">Address of the page.</param>
  /// <returns>The page content as returned by the two-argument overload called with the encoding "auto".</returns>
  public static string GetHtmlByUrl(string sUrl) => GetHtmlByUrl(sUrl, "auto");
  /// <summary>
  /// Downloads the page at a URL and decodes it with the given encoding.
  /// </summary>
  /// <param name="sUrl">Address of the page.</param>
  /// <param name="sCoding">Encoding name passed through to the by-reference overload.</param>
  /// <returns>The page content as returned by the by-reference overload.</returns>
  public static string GetHtmlByUrl(string sUrl, string sCoding)
  {
      // The by-reference overload may rewrite the URL; that update is discarded here.
      string requestedUrl = sUrl;
      return GetHtmlByUrl(ref requestedUrl, sCoding);
  }
  /// <summary>
  /// Downloads the page at a URL, decodes it, and reports back the URI that answered the request.
  /// </summary>
  /// <param name="sUrl">
  /// Address of the page. On success it is overwritten with the absolute URI of the response.
  /// </param>
  /// <param name="sCoding">
  /// Encoding name to decode with. Null, empty or "auto" means: take the charset from the
  /// Content-Type response header, otherwise from a meta/xml declaration in the document,
  /// otherwise decode as gb2312.
  /// </param>
  /// <returns>
  /// The decoded page content, or an empty string when no response was obtained or any error
  /// occurred. An unrecognised encoding name falls back to gb2312.
  /// </returns>
  public static string GetHtmlByUrl(ref string sUrl, string sCoding)
  {
      try
      {
          bool detectCharset = string.IsNullOrEmpty(sCoding)
                               || string.Equals(sCoding, "auto", StringComparison.CurrentCultureIgnoreCase);
          byte[] buffer;
          string contentType = "";

          // 'using' guarantees the response and its stream are released even when reading fails.
          using (HttpWebResponse response = _MyGetResponse(sUrl))
          {
              if (response == null)
              {
                  return "";
              }

              sUrl = response.ResponseUri.AbsoluteUri;
              using (Stream stream = response.GetResponseStream())
              {
                  buffer = GetContent(stream);
              }

              if (detectCharset)
              {
                  contentType = response.GetResponseHeader("Content-Type") ?? "";
              }
          }

          // Characters that can surround a charset token in a header or attribute value.
          char[] tokenPadding = { ' ', '\t', '"', '\'', ';' };
          string charset;
          if (detectCharset)
          {
              // First choice: the charset announced in the response header.
              Match m = Regex.Match(contentType, "[\\s\\S]*charset=(?<charset>[\\S]*)", RegexOptions.IgnoreCase);
              charset = m.Success ? m.Groups["charset"].Value.Trim(tokenPadding) : "";
              if (charset == "-8")
              {
                  charset = "utf-8";
              }

              if (charset.Length == 0)
              {
                  // Second choice: decode provisionally as gb2312 and look for a declaration
                  // inside the document itself.
                  string provisional = Encoding.GetEncoding("gb2312").GetString(buffer);
                  m = Regex.Match(provisional, "(<meta[^>]*charset=(?<charset>[^>'\"]*)[\\s\\S]*?>)|(xml[^>]+encoding=(\"|')*(?<charset>[^>'\"]*)[\\s\\S]*?>)", RegexOptions.IgnoreCase);
                  if (!m.Success)
                  {
                      // No declaration anywhere: the gb2312 decoding is the best available.
                      return provisional;
                  }

                  charset = m.Groups["charset"].Value.Trim(tokenPadding);
              }
          }
          else
          {
              // Invariant casing keeps encoding names stable under cultures with special
              // casing rules.
              charset = sCoding.ToLowerInvariant();
          }

          try
          {
              return Encoding.GetEncoding(charset).GetString(buffer);
          }
          catch (ArgumentException)
          {
              // The encoding name is not recognised.
              return Encoding.GetEncoding("gb2312").GetString(buffer);
          }
      }
      catch
      {
          // Best effort by design: any failure yields an empty page.
          return "";
      }
  }
  /// <summary>
  /// Issues an HTTP request for a URL, trying at most twice.
  /// </summary>
  /// <param name="sUrl">Absolute URL to request; an invalid URL makes the <see cref="Uri"/> constructor throw.</param>
  /// <returns>
  /// The response, or null when both attempts end in a <see cref="WebException"/> or any
  /// other exception occurs while sending the request.
  /// </returns>
  private static HttpWebResponse _MyGetResponse(string sUrl)
  {
      const int connectTimeoutMs = 10000;
      const int maxAttempts = 2;
      Uri target = new Uri(sUrl);

      for (int attempt = 0; attempt < maxAttempts; attempt++)
      {
          // The first attempt goes out without a cookie container; the retry adds one.
          bool withCookies = attempt > 0;
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
              request.ReadWriteTimeout = 120000; // 120 seconds for stream reads/writes
              request.Timeout = connectTimeoutMs;
              request.MaximumAutomaticRedirections = 50;
              request.MaximumResponseHeadersLength = 5;
              request.AllowAutoRedirect = true;
              if (withCookies)
              {
                  request.CookieContainer = new CookieContainer();
              }

              request.UserAgent = "Mozilla/6.0 (compatible; MSIE 6.0; Windows NT 5.1)";
              return (HttpWebResponse)request.GetResponse();
          }
          catch (WebException)
          {
              // Fall through to the next attempt, if any is left.
          }
          catch
          {
              return null;
          }
      }

      return null;
  }
  /// <summary>
  /// Reads a stream to its end and returns the bytes that were read.
  /// </summary>
  /// <param name="stream">Stream to drain; it is not closed here.</param>
  /// <returns>
  /// All bytes read. When reading fails part-way the exception is logged and the bytes read
  /// so far are returned (an empty array if nothing was read).
  /// </returns>
  private static byte[] GetContent(Stream stream)
  {
      const int BufferSize = 4096;

      // MemoryStream collects raw bytes directly; no per-byte boxing into an ArrayList.
      using (MemoryStream collected = new MemoryStream())
      {
          try
          {
              byte[] chunk = new byte[BufferSize];
              int read;
              while ((read = stream.Read(chunk, 0, BufferSize)) > 0)
              {
                  collected.Write(chunk, 0, read);
              }
          }
          catch (Exception e)
          {
              LogManager.Error(e);
          }

          return collected.ToArray();
      }
  }
  /// <summary>
  /// Requests a URL and returns the response headers as text.
  /// </summary>
  /// <param name="sUrl">Absolute URL to request; an invalid URL makes the <see cref="Uri"/> constructor throw.</param>
  /// <returns>
  /// One <c>name:value</c> line per response header, each terminated by CRLF. Request
  /// failures are logged and whatever was collected so far (possibly nothing) is returned.
  /// </returns>
  public static string GetHttpHead(string sUrl)
  {
      StringBuilder head = new StringBuilder();
      Uri uri = new Uri(sUrl);
      try
      {
          WebRequest req = WebRequest.Create(uri);

          // Dispose the response so the underlying connection is released.
          using (WebResponse resp = req.GetResponse())
          {
              WebHeaderCollection headers = resp.Headers;
              foreach (string key in headers.AllKeys)
              {
                  head.Append(key).Append(":").Append(headers[key]).Append("\r\n");
              }
          }
      }
      catch (Exception e)
      {
          LogManager.Error(e);
      }

      return head.ToString();
  }
  /// <summary>
  /// Finds the frame elements of a frameset page and returns their source addresses.
  /// </summary>
  /// <param name="url">Address of the page, handed on for resolving each frame source.</param>
  /// <param name="content">HTML source of the page.</param>
  /// <returns>One entry per frame tag that carries a src attribute; empty when there is none.</returns>
  public static string[] DealWithFrame(string url, string content)
  {
      // src may be double-quoted, single-quoted or unquoted.
      const string framePattern = @"<frame\s+[^>]*src\s*=\s*(?:""(?<src>[^""]+)""|'(?<src>[^']+)'|(?<src>[^\s>""']+))[^>]*>";
      return DealWithFrame(framePattern, url, content);
  }
  /// <summary>
  /// Finds the inline frames (iframe elements) of a page and returns their source addresses.
  /// </summary>
  /// <param name="url">Address of the page, handed on for resolving each iframe source.</param>
  /// <param name="content">HTML source of the page.</param>
  /// <returns>One entry per iframe tag that carries a src attribute; empty when there is none.</returns>
  public static string[] DealWithIFrame(string url, string content)
  {
      // src may be double-quoted, single-quoted or unquoted.
      const string iframePattern = @"<iframe\s+[^>]*src\s*=\s*(?:""(?<src>[^""]+)""|'(?<src>[^']+)'|(?<src>[^\s>""']+))[^>]*>";
      return DealWithFrame(iframePattern, url, content);
  }
  /// <summary>
  /// Collects the <c>src</c> group of every match of a frame pattern, passing each value
  /// through <c>GetUrl</c> together with the page address.
  /// </summary>
  /// <param name="strReg">Pattern with a named group <c>src</c>; matched case-insensitively.</param>
  /// <param name="url">Address of the page the content came from.</param>
  /// <param name="content">HTML source to scan.</param>
  /// <returns>The processed source addresses in match order.</returns>
  private static string[] DealWithFrame(string strReg, string url, string content)
  {
      List<string> sources = new List<string>();
      foreach (Match hit in Regex.Matches(content, strReg, RegexOptions.IgnoreCase))
      {
          sources.Add(GetUrl(url, hit.Groups["src"].Value));
      }

      return sources.ToArray();
  }
  1337. #endregion 根据超链接地址获取页面内容
  1338. #region 获得多个页面
  /// <summary>
  /// Fetches several pages over one raw TCP connection using hand-written HTTP GET requests.
  /// </summary>
  /// <param name="listUrl">
  /// Pages to fetch as (key, URL) pairs. The connection is opened to the host and port of the
  /// FIRST URL only; every request is sent over that connection.
  /// </param>
  /// <param name="sCoding">Encoding name used both to encode the requests and to decode the replies.</param>
  /// <returns>
  /// (key, raw reply text) pairs for the URLs processed before the method stopped. Failures
  /// are logged rather than thrown, so the list may be shorter than the input or empty.
  /// </returns>
  public static List<KeyValuePair<int, string>> GetHtmlByUrlList(List<KeyValuePair<int, string>> listUrl, string sCoding)
  {
      const int TimeoutMs = 120000;
      const byte AsciiMax = 0x7f;
      List<KeyValuePair<int, string>> listResult = new List<KeyValuePair<int, string>>();
      Socket sock = null;
      try
      {
          // Resolve the encoding once instead of once per received chunk.
          Encoding encoding = Encoding.GetEncoding(sCoding);

          // Connect to the host of the first URL.
          Uri site = new Uri(listUrl[0].Value);
          IPHostEntry ipHostInfo = Dns.GetHostEntry(site.Host);
          IPAddress ipAddress = ipHostInfo.AddressList[0];
          IPEndPoint remoteEP = new IPEndPoint(ipAddress, site.Port);

          // Match the socket family to the resolved address so an IPv6 result can connect too.
          sock = new Socket(ipAddress.AddressFamily, SocketType.Stream, ProtocolType.Tcp);
          sock.SendTimeout = TimeoutMs;
          sock.ReceiveTimeout = TimeoutMs;
          sock.Connect(remoteEP);

          foreach (KeyValuePair<int, string> kvUrl in listUrl)
          {
              site = new Uri(kvUrl.Value);

              // NOTE(review): the path is URL-decoded before sending and a NUL character
              // follows the header block; both are kept as they were - confirm they are intended.
              string sendMsg = "GET " + HttpUtility.UrlDecode(site.PathAndQuery) + " HTTP/1.1\r\n" +
                  "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/msword, application/vnd.ms-powerpoint, */*\r\n" +
                  "Accept-Language:en-us\r\n" +
                  "Accept-Encoding:gb2312, deflate\r\n" +
                  "User-Agent: Mozilla/4.0\r\n" +
                  "Host: " + site.Host + "\r\n\r\n" + '\0';

              // Send the request.
              byte[] msg = encoding.GetBytes(sendMsg);
              int nBytes;
              if ((nBytes = sock.Send(msg)) == 0)
              {
                  // Nothing could be sent; the finally block closes the socket.
                  return listResult;
              }

              // Receive the reply. One byte of the buffer is held back so that an extra
              // byte can be appended when a chunk ends in the middle of a double-byte character.
              StringBuilder sbHtml = new StringBuilder();
              byte[] bytes = new byte[2048];
              do
              {
                  try
                  {
                      nBytes = sock.Receive(bytes, bytes.Length - 1, 0);
                  }
                  catch (Exception)
                  {
                      // A receive failure (for example the receive timeout) ends this reply.
                      nBytes = -1;
                  }

                  if (nBytes <= 0)
                  {
                      break;
                  }

                  if (bytes[nBytes - 1] > AsciiMax)
                  {
                      // Count the trailing run of non-ASCII bytes; an odd count means the
                      // last double-byte character is split across chunks.
                      int highBytes = 0;
                      for (int i = nBytes - 1; i >= 0 && bytes[i] > AsciiMax; i--)
                      {
                          highBytes++;
                      }

                      if (highBytes % 2 == 1)
                      {
                          int extra = sock.Receive(bytes, nBytes, 1, 0);
                          if (extra < 0)
                          {
                              break;
                          }

                          nBytes += extra;
                      }
                  }

                  sbHtml.Append(encoding.GetString(bytes, 0, nBytes));
              } while (nBytes > 0);

              listResult.Add(new KeyValuePair<int, string>(kvUrl.Key, sbHtml.ToString()));
          }
      }
      catch (Exception ex)
      {
          // Record the failure instead of discarding it; pages fetched so far are still returned.
          LogManager.Error(ex);
      }
      finally
      {
          // Single clean-up point: skipped when the socket was never created, and run only once.
          if (sock != null)
          {
              try
              {
                  if (sock.Connected)
                  {
                      sock.Shutdown(SocketShutdown.Both);
                  }
              }
              catch (Exception e)
              {
                  LogManager.Error(e);
              }

              sock.Close();
          }
      }

      return listResult;
  }
  1453. #endregion 根据超链接地址获取页面内容
  /// <summary>
  /// Kind of document a page turned out to be.
  /// </summary>
  public enum PageType
  {
      /// <summary>
      /// Ordinary HTML page.
      /// </summary>
      HTML = 0,

      /// <summary>
      /// RSS feed.
      /// </summary>
      RSS = 1
  }
  /// <summary>
  /// Decides whether a page is plain HTML or leads to an RSS feed.
  /// </summary>
  /// <param name="sUrl">Address of the page, used when processing the feed link found in it.</param>
  /// <param name="sHtml">
  /// Page source. When the page contains a link tag of type application/rss+xml with an href,
  /// this is replaced by the content downloaded from that feed address.
  /// </param>
  /// <returns>
  /// <see cref="PageType.RSS"/> when a feed link with an href was found and followed, or when
  /// the source itself contains an rss element; otherwise <see cref="PageType.HTML"/>.
  /// </returns>
  public static PageType GetPageType(string sUrl, ref string sHtml)
  {
      // Does the page advertise an RSS feed?
      Match feedLink = Regex.Match(sHtml, @"<link\s+[^>]*((type=""application/rss\+xml"")|(type=application/rss\+xml))[^>]*>", RegexOptions.IgnoreCase);
      if (!feedLink.Success)
      {
          // No feed link: the document may itself be an RSS feed.
          bool isFeed = Regex.IsMatch(sHtml, @"<rss\s+[^>]*>", RegexOptions.IgnoreCase);
          return isFeed ? PageType.RSS : PageType.HTML;
      }

      // Pull the href out of the link tag; it may be quoted either way or unquoted.
      Match href = Regex.Match(feedLink.Value, @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))", RegexOptions.IgnoreCase);
      if (!href.Success)
      {
          return PageType.HTML;
      }

      // The href may be relative, so it goes through GetUrl with the page address first.
      string rssFile = GetUrl(sUrl, href.Groups["href"].Value);
      sHtml = GetHtmlByUrl(rssFile);
      return PageType.RSS;
  }
  1505. }
  1506. }