fcp-html-analytic.js 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  /**
   * Register the namespace: baidu.htmlAnalytic
   * (must run before the constructor below is assigned onto it).
   */
  baidu.namespace.register("baidu.htmlAnalytic");
  /**
   * HTML lexical analyzer.
   *
   * Splits an HTML string into a flat list of `[text, type]` tokens, where
   * `type` is one of the `baidu.FL.*` constants, and offers two helpers built
   * on top of that: tag attribute parsing and unclosed-tag detection.
   *
   * @author lichengyin (FCP: original PHP code)
   * @cover zhaoxianlie (FCPHelper: PHP code rewritten in JavaScript)
   */
  baidu.htmlAnalytic = function(){
      /**
       * Current parse position (index into `this.content`).
       * @var int
       */
      this.parsePos = 0;

      /**
       * The content being parsed.
       * @var string
       */
      this.content = '';

      /**
       * Upper bound (exclusive) for `parsePos` while scanning.
       * @var int
       */
      this.contentLength = 0;

      /**
       * Tags that stand alone (never need a closing tag).
       * @var array
       */
      this.singleTag = [
          "br", "input", "link", "meta", "!doctype", "basefont", "base", "col",
          "area", "hr", "wbr", "param", "img", "isindex", "?xml", "embed"
      ];

      /**
       * Tags that are never reported by the unclosed-tag check.
       * @var array
       */
      this.closeTagWhiteList = [
          'html','body','li','tr','td'
      ];

      /**
       * Tokens produced by the most recent `run()`.
       * @var array
       */
      this._output = [];

      /** Kept from the PHP original; intentionally empty. */
      this.__construct = function(){
      };

      /**
       * Entry point. By default performs HTML tokenization; when `$type` is
       * anything other than 1, parses `$content` as a single tag instead.
       *
       * The parser state is reset on every call so one instance can be reused.
       *
       * @param {string} $content HTML (or a single tag) to analyze
       * @param {number} [$type=1] 1: tokenize, otherwise: attribute analysis
       * @return {Array} token list `[[text, type], ...]`, or the result of
       *                 `getTagAttributes`; content containing `<?xml` yields
       *                 `[[$content, baidu.FL.HTML_XML]]`
       */
      this.run = function($content, $type){
          if ($type == undefined) $type = 1;
          var $source = $content == null ? '' : String($content);

          // Start from a clean state; otherwise a second run() would continue
          // from the old position and append to the previous token list.
          this.parsePos = 0;
          this._output = [];

          this.content = $source.trim().replace(/\r\n/g, "\n");
          if (this.content.indexOf('<?xml') > -1){
              return [[$content, baidu.FL.HTML_XML]];
          }
          this.contentLength = this.content.length;
          if ($type === 1){
              this.tokenAnalytic();
              return this._output;
          }
          return this.getTagAttributes($source);
      };

      /**
       * Pulls tokens from `getNextToken` until EOF, collecting them in
       * `this._output`. Falsy results (e.g. a skipped "\r") are ignored.
       */
      this.tokenAnalytic = function(){
          var $token;
          while (true){
              $token = this.getNextToken();
              if ($token){
                  if ($token[1] === baidu.FL.FL_EOF) break;
                  this._output.push($token);
              }
          }
      };

      /**
       * Reads the next token starting at `parsePos`.
       *
       * @return {Array|string} `[text, type]`, or '' when the current
       *                        character is a bare "\r" that is skipped
       */
      this.getNextToken = function(){
          if (this.parsePos >= this.contentLength){
              return ['', baidu.FL.FL_EOF];
          }
          var $char = this.content[this.parsePos];
          this.parsePos++;

          var $outputCount = this._output.length;
          var $result;
          if ($outputCount){
              // Raw-text elements: whatever follows their start tag is passed
              // through untouched up to the matching end tag.
              var $tokenType = this._output[$outputCount - 1][1];
              if ($tokenType === baidu.FL.HTML_JS_START){
                  $result = this._getScriptOrStyleContent($char, 1);
                  if ($result) return $result;
              }else if ($tokenType === baidu.FL.HTML_CSS_START){
                  $result = this._getScriptOrStyleContent($char, 2);
                  if ($result) return $result;
              }else if ($tokenType === baidu.FL.HTML_TEXTAREA_START){
                  $result = this._getTextareaOrPreContent($char, 1);
                  if ($result) return $result;
              }else if ($tokenType === baidu.FL.HTML_PRE_START){
                  $result = this._getTextareaOrPreContent($char, 2);
                  if ($result) return $result;
              }
          }

          if ($char === "\x0d") return ''; // \r
          if ($char === "\x0a"){
              return [$char, baidu.FL.FL_NEW_LINE];
          }
          // Ordinary tag: current char is '<' and the next one is not '<'.
          if ($char === '<' && this.content[this.parsePos] !== '<'){
              $result = this._getTagToken($char);
              if ($result) return $result;
          }
          $result = this._getContentToken($char);
          if ($result) return $result;
          return [$char, baidu.FL.FL_NORMAL];
      };

      /**
       * Reads a tag-like token (`<...>`), including comments, IE conditional
       * comments and the doctype, and classifies it.
       *
       * @param {string} $char the '<' that was just consumed
       * @return {Array} `[text, type]`
       */
      this._getTagToken = function($char){
          var $resultString = $char;
          do {
              if (this.parsePos >= this.contentLength){
                  break;
              }
              $char = this.content[this.parsePos];
              this.parsePos++;
              if (($char === '"' || $char === "'") && $resultString[1] !== '!'){
                  // Quoted attribute value: copy it verbatim so that a '>'
                  // inside the quotes does not end the tag.
                  $resultString += $char;
                  $resultString += this._getUnformated($char);
              }else {
                  // Inside <!...> quotes have no special meaning, but they
                  // must still be kept in the token text.
                  $resultString += $char;
              }
          }while ($char !== '>');

          // Comment, doctype or IE conditional comment.
          if ($resultString[1] === '!'){
              if ($resultString.indexOf('[if') > -1){
                  if ($resultString.indexOf('!IE') > -1){
                      $resultString += this._getUnformated('-->', $resultString);
                  }
                  return [$resultString, baidu.FL.HTML_IE_HACK_START];
              }else if ($resultString.indexOf('[endif') > -1){
                  // Constant name (EDN) is spelled as declared in baidu.FL.
                  return [$resultString, baidu.FL.HTML_IE_HACK_EDN];
              }else if (this._checkEqual($resultString, 2, 7, 'doctype')){
                  return [$resultString, baidu.FL.HTML_DOC_TYPE];
              }else if (this._checkEqual($resultString, 4, 6, 'status')){
                  // Comment whose text starts with "status" right after "<!--".
                  $resultString += this._getUnformated('-->', $resultString);
                  return [$resultString, baidu.FL.HTML_STATUS_OK];
              }else {
                  $resultString += this._getUnformated('-->', $resultString);
                  return [$resultString, baidu.FL.HTML_COMMENT];
              }
          }

          if (this._checkEqual($resultString, 0, 7, '<script')){
              return [$resultString, baidu.FL.HTML_JS_START];
          }else if (this._checkEqual($resultString, 0, 9, '</script>')){
              return [$resultString, baidu.FL.HTML_JS_END];
          }else if (this._checkEqual($resultString, 0, 6, '<style')){
              return [$resultString, baidu.FL.HTML_CSS_START];
          }else if (this._checkEqual($resultString, 0, 8, '</style>')){
              return [$resultString, baidu.FL.HTML_CSS_END];
          }else if (this._checkEqual($resultString, 0, 9, '<textarea')){
              return [$resultString, baidu.FL.HTML_TEXTAREA_START];
          }else if (this._checkEqual($resultString, 0, 11, '</textarea>')){
              return [$resultString, baidu.FL.HTML_TEXTAREA_END];
          }else if (this._checkEqual($resultString, 0, 4, '<pre')){
              return [$resultString, baidu.FL.HTML_PRE_START];
          }else if (this._checkEqual($resultString, 0, 6, '</pre>')){
              return [$resultString, baidu.FL.HTML_PRE_END];
          }
          if (this._checkEqual($resultString, 0, 2, '</')){
              return [$resultString, baidu.FL.HTML_TAG_END];
          }
          return [$resultString, baidu.FL.HTML_TAG_START];
      };

      /**
       * Case-insensitively compares a slice of `$str` with `$result`.
       *
       * @param {string} $str
       * @param {number} $start slice start
       * @param {number} $len slice length
       * @param {string} $result expected text
       * @return {boolean}
       */
      this._checkEqual = function($str, $start, $len, $result){
          return $str.substr($start, $len).toLowerCase() === $result.toLowerCase();
      };

      /**
       * Reads a text node: everything up to the next '<' that looks like the
       * start of a tag.
       *
       * @param {string} $char first character of the text (already consumed)
       * @return {Array} `[text, baidu.FL.HTML_CONTENT]`
       */
      this._getContentToken = function($char){
          var $resultString = $char;
          while (true){
              if (this.parsePos >= this.contentLength){
                  break;
              }
              // Tolerate text such as <a href=""><<<</a>: a '<' followed by
              // '<' or '>' (or by nothing) is content, not a tag start.
              if (this.content[this.parsePos] === '<'
                  && this.content[this.parsePos + 1]
                  && this.content[this.parsePos + 1] !== '<'
                  && this.content[this.parsePos + 1] !== '>'){
                  break;
              }
              $resultString += this.content[this.parsePos];
              this.parsePos++;
          }
          return [$resultString, baidu.FL.HTML_CONTENT];
      };

      /**
       * Copies characters verbatim until the terminator `$char` has been read
       * (inclusive) or the content ends.
       *
       * @param {string} $char terminator to look for (may be multi-character)
       * @param {string} [$orign] text already read; if it contains the
       *                          terminator nothing more is consumed
       * @return {string} the characters consumed
       */
      this._getUnformated = function($char, $orign){
          if ($orign == undefined) $orign = '';
          if ($orign.indexOf($char) > -1) return '';
          var $resultString = '';
          var $c;
          do {
              if (this.parsePos >= this.contentLength){
                  break;
              }
              $c = this.content[this.parsePos];
              $resultString += $c;
              this.parsePos++;
          }while ($resultString.indexOf($char) == -1);
          // One-character tolerance, e.g. value=""" where an extra quote was
          // typed by mistake: swallow the repeated terminator as well.
          if ($char.length === 1){
              while ($char === this.content[this.parsePos]){
                  $resultString += this.content[this.parsePos];
                  this.parsePos++;
              }
          }
          return $resultString;
      };

      /**
       * Reads the raw content of a script or style element, up to (not
       * including) its end tag, and strips the usual comment/CDATA wrappers.
       *
       * @param {string} $char first character of the content (already consumed)
       * @param {number} $type 1: script, anything else (2): style
       * @return {Array|string} `[text, type]`, or '' when the end tag follows
       *                        immediately (there is no content)
       */
      this._getScriptOrStyleContent = function($char, $type){
          var $tokenText = $type == 1 ? '</script>' : '</style>';
          var $tokenLength = $tokenText.length;
          if (this.content.substr(this.parsePos - 1, $tokenLength).toLowerCase() === $tokenText){
              return '';
          }
          var $resultString = $char;
          while (this.parsePos < this.contentLength){
              if (this.content.substr(this.parsePos, $tokenLength).toLowerCase() === $tokenText){
                  break;
              }
              $resultString += this.content[this.parsePos];
              this.parsePos++;
          }
          $resultString = $resultString.trim();

          var $startEscape = ['<!--', '/*<![CDATA[*/', '//<![CDATA['];
          var $endEscape = ['//-->', '/*]]>*/', '//]]>'];
          var $i, $escape, $tailPos;
          // Iterate by index: for...in would yield "0", "1", "2" instead of
          // the escape strings.
          for ($i = 0; $i < $startEscape.length; $i++){
              $escape = $startEscape[$i];
              if ($resultString.indexOf($escape) === 0){
                  $resultString = $resultString.substr($escape.length);
                  break;
              }
          }
          for ($i = 0; $i < $endEscape.length; $i++){
              $escape = $endEscape[$i];
              $tailPos = $resultString.length - $escape.length;
              // $tailPos < 0 means the content is shorter than the escape.
              if ($tailPos >= 0 && $resultString.lastIndexOf($escape) === $tailPos){
                  $resultString = $resultString.substr(0, $tailPos);
                  break;
              }
          }
          return [$resultString.trim(), $type === 1 ? baidu.FL.HTML_JS_CONTENT : baidu.FL.HTML_CSS_CONTENT];
      };

      /**
       * Reads the raw content of a textarea or pre element, up to (not
       * including) its end tag.
       *
       * @param {string} $char first character of the content (already consumed)
       * @param {number} $type 1: textarea, anything else (2): pre
       * @return {Array|string} `[text, type]`, or '' when the end tag follows
       *                        immediately (there is no content)
       */
      this._getTextareaOrPreContent = function($char, $type){
          var $tokenText = $type == 1 ? '</textarea>' : '</pre>';
          var $tokenLength = $tokenText.length;
          if (this.content.substr(this.parsePos - 1, $tokenLength).toLowerCase() === $tokenText){
              return '';
          }
          var $resultString = $char;
          while (this.parsePos < this.contentLength){
              if (this.content.substr(this.parsePos, $tokenLength).toLowerCase() === $tokenText){
                  break;
              }
              $resultString += this.content[this.parsePos];
              this.parsePos++;
          }
          return [$resultString.trim(), $type === 1 ? baidu.FL.HTML_TEXTAREA_CONTENT : baidu.FL.HTML_PRE_CONTENT];
      };

      /**
       * Parses a single tag into its name and attributes.
       *
       * Overwrites `this.content`, `this.parsePos` and `this.contentLength`,
       * so it can be called directly without a preceding `run()`.
       *
       * @param {string} $tagContent the tag's outer text, e.g. `<a href="x">`
       * @return {Array} end tag: `[baidu.FL.HTML_TAG_END, name]`;
       *                 start tag: `[baidu.FL.HTML_TAG_START, name, attrs]`
       *                 where attrs is a list of `[name, value]` pairs, quoted
       *                 values keep their quotes and valueless attributes get ''
       */
      this.getTagAttributes = function($tagContent){
          var $tag = String($tagContent).trim().replace(/\r\n/g, "\n");
          // End tag.
          if ($tag.substr(0, 2) === '</') {
              return [
                  baidu.FL.HTML_TAG_END,
                  $tag.substr(2, $tag.length - 3).trim()
              ];
          }

          // Start tag.
          var $result = [baidu.FL.HTML_TAG_START, '', []];
          // Scan exactly the string whose length is measured below.
          this.content = $tag;
          this.parsePos = 1;
          // Stop before the closing ">" or "/>" so it never leaks into
          // an attribute name or value.
          this.contentLength = $tag.replace(/[\/>]+$/, '').length;

          var $char;
          var $tagName = '';
          while (this.parsePos < this.contentLength){
              $char = this.content[this.parsePos];
              if (!/^[a-z0-9]$/i.test($char)){
                  break;
              }
              $tagName += $char;
              this.parsePos++;
          }
          $result[1] = $tagName;

          var $attr = '';
          var $name = '';
          var $re;
          while (this.parsePos < this.contentLength){
              $char = this.content[this.parsePos];
              this.parsePos++;
              if ($char === '"' || $char === "'"){
                  $re = $char + this._getUnformated($char);
                  $result[2].push([$name, $re]);
                  // The pair is complete; do not let the name stick to
                  // whatever follows.
                  $name = $attr = '';
              }else if ($char === '='){
                  $name = $attr;
                  $attr = '';
              }else if (/\s/.test($char)){
                  // Any whitespace (space, tab, newline) separates attributes.
                  if ($attr){
                      if ($name){
                          $result[2].push([$name, $attr]);
                      }else{
                          $result[2].push([$attr, '']);
                      }
                  }
                  $name = $attr = '';
              }else{
                  $attr += $char;
              }
          }
          // Flush a trailing unquoted value or valueless attribute.
          if ($attr){
              if ($name){
                  $result[2].push([$name, $attr]);
              }else{
                  $result[2].push([$attr, '']);
              }
          }
          return $result;
      };

      /**
       * Equivalent of PHP's in_array for the tag lists above; `$tag` is
       * trimmed before the (loose) comparison.
       *
       * @param {Array} $array list to search
       * @param {string} $tag value to look for
       * @return {boolean}
       */
      this._in_array = function($array,$tag){
          for(var i = 0,len = $array.length;i < len;i++){
              if($tag.trim() == $array[i]) return true;
          }
          return false;
      };

      /**
       * Extracts the tag name from a tag's outer HTML.
       *
       * @param {string} $tagOuterHtml e.g. `<div class="a">` or `</div>`
       * @param {number} $type 0: start tag, otherwise: end tag
       * @return {string} the tag name, or '' when it cannot be determined
       */
      this._getTagName = function($tagOuterHtml,$type){
          var reg_start = /^<([^\s\/>]+)\s*\/?/;
          // Allow whitespace before '>' (e.g. "</div >").
          var reg_end = /^<\/([^\s>]+)\s*>/;
          var tagAttrs = ($type == 0 ? reg_start : reg_end).exec($tagOuterHtml);
          return tagAttrs ? tagAttrs[1] : '';
      };

      /**
       * Finds tags in an HTML fragment that are not properly paired.
       *
       * @param {string} str HTML fragment to check
       * @return {Array} list of `{tagName, outerHTML, type}` where type 1
       *                 means "opened but never closed" and type 0 means
       *                 "closed but never opened"; stand-alone tags that are
       *                 merely unclosed and whitelisted tags are left out
       */
      this.getUnclosedTags = function(str){
          // Removes the most recently opened entry with the given tag name.
          // Local helper on purpose: Array.prototype is left untouched.
          // Tag names are matched case-insensitively, as in HTML.
          var removeLastByTagName = function(stack, tagName){
              var wanted = tagName.toLowerCase();
              for (var index = stack.length - 1; index >= 0; index--) {
                  if (stack[index].tagName.toLowerCase() === wanted) {
                      stack.splice(index,1);
                      return true;
                  }
              }
              return false;
          };

          // Tokenize, then keep only the tokens that open or close an element.
          var analyticRst = this.run(str);
          var rawHtml = [];
          var i, len;
          for(i = 0,len = analyticRst.length;i < len;i++){
              if(analyticRst[i][1] === baidu.FL.HTML_PRE_START ||
                  analyticRst[i][1] === baidu.FL.HTML_PRE_END ||
                  analyticRst[i][1] === baidu.FL.HTML_TEXTAREA_START ||
                  analyticRst[i][1] === baidu.FL.HTML_TEXTAREA_END ||
                  analyticRst[i][1] === baidu.FL.HTML_TAG_START ||
                  analyticRst[i][1] === baidu.FL.HTML_TAG_END ||
                  analyticRst[i][1] === baidu.FL.HTML_XML) {
                  rawHtml.push(analyticRst[i]);
              }
          }

          var tag = '';
          var startUncloseTags = []; // end tags without a start, e.g. </div> with no <div> before it
          var endUncloseTags = [];   // start tags without an end, e.g. <div> with no </div> after it

          for(i = 0,len = rawHtml.length;i < len;i++) {
              if(rawHtml[i][1] !== baidu.FL.HTML_PRE_END &&
                  rawHtml[i][1] !== baidu.FL.HTML_TEXTAREA_END &&
                  rawHtml[i][1] !== baidu.FL.HTML_TAG_END) {
                  // Start tag: assume unclosed until its end tag shows up.
                  tag = this._getTagName(rawHtml[i][0],0);
                  endUncloseTags.push({
                      tagName : tag,
                      outerHTML : rawHtml[i][0],
                      type : 1 // 1: opened but not closed
                  });
              }
              else {
                  // End tag: pair it with the nearest matching open tag;
                  // if there is none, the end tag itself is unmatched.
                  tag = this._getTagName(rawHtml[i][0],1);
                  if (!removeLastByTagName(endUncloseTags, tag)) {
                      startUncloseTags.push({
                          tagName : tag,
                          outerHTML : rawHtml[i][0],
                          type : 0 // 0: closed but not opened
                      });
                  }
              }
          }

          // Drop stand-alone tags that are merely "unclosed" and anything on
          // the whitelist.
          var rst = [],temp = endUncloseTags.concat(startUncloseTags);
          for(i = 0,len = temp.length;i < len;i++) {
              if((!this._in_array(this.singleTag ,temp[i].tagName.toLowerCase()) || temp[i].type == 0)
                  && !this._in_array(this.closeTagWhiteList ,temp[i].tagName.toLowerCase())) {
                  rst.push(temp[i]);
              }
          }
          return rst;
      };
  };