xmltok_impl.c 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775
  1. /*
  2. Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  3. See the file COPYING for copying permission.
  4. */
  5. #ifndef IS_INVALID_CHAR
  6. #define IS_INVALID_CHAR(enc, ptr, n) (0)
  7. #endif
  8. #ifndef INVALID_LEAD_CASE
  9. #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  10. case BT_LEAD ## n: \
  11. if (end - ptr < n) \
  12. return XML_TOK_PARTIAL_CHAR; \
  13. if (IS_INVALID_CHAR(enc, ptr, n)) { \
  14. *(nextTokPtr) = (ptr); \
  15. return XML_TOK_INVALID; \
  16. } \
  17. ptr += n; \
  18. break;
  19. #endif
  20. #define INVALID_CASES(ptr, nextTokPtr) \
  21. INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  22. INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  23. INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  24. case BT_NONXML: \
  25. case BT_MALFORM: \
  26. case BT_TRAIL: \
  27. *(nextTokPtr) = (ptr); \
  28. return XML_TOK_INVALID;
  29. #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  30. case BT_LEAD ## n: \
  31. if (end - ptr < n) \
  32. return XML_TOK_PARTIAL_CHAR; \
  33. if (!IS_NAME_CHAR(enc, ptr, n)) { \
  34. *nextTokPtr = ptr; \
  35. return XML_TOK_INVALID; \
  36. } \
  37. ptr += n; \
  38. break;
  39. #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  40. case BT_NONASCII: \
  41. if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
  42. *nextTokPtr = ptr; \
  43. return XML_TOK_INVALID; \
  44. } \
  45. case BT_NMSTRT: \
  46. case BT_HEX: \
  47. case BT_DIGIT: \
  48. case BT_NAME: \
  49. case BT_MINUS: \
  50. ptr += MINBPC(enc); \
  51. break; \
  52. CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  53. CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  54. CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  55. #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  56. case BT_LEAD ## n: \
  57. if (end - ptr < n) \
  58. return XML_TOK_PARTIAL_CHAR; \
  59. if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
  60. *nextTokPtr = ptr; \
  61. return XML_TOK_INVALID; \
  62. } \
  63. ptr += n; \
  64. break;
  65. #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  66. case BT_NONASCII: \
  67. if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
  68. *nextTokPtr = ptr; \
  69. return XML_TOK_INVALID; \
  70. } \
  71. case BT_NMSTRT: \
  72. case BT_HEX: \
  73. ptr += MINBPC(enc); \
  74. break; \
  75. CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  76. CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  77. CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  78. #ifndef PREFIX
  79. #define PREFIX(ident) ident
  80. #endif
  81. /* ptr points to character following "<!-" */
  82. static
  83. int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
  84. const char **nextTokPtr)
  85. {
  86. if (ptr != end) {
  87. if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  88. *nextTokPtr = ptr;
  89. return XML_TOK_INVALID;
  90. }
  91. ptr += MINBPC(enc);
  92. while (ptr != end) {
  93. switch (BYTE_TYPE(enc, ptr)) {
  94. INVALID_CASES(ptr, nextTokPtr)
  95. case BT_MINUS:
  96. if ((ptr += MINBPC(enc)) == end)
  97. return XML_TOK_PARTIAL;
  98. if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  99. if ((ptr += MINBPC(enc)) == end)
  100. return XML_TOK_PARTIAL;
  101. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  102. *nextTokPtr = ptr;
  103. return XML_TOK_INVALID;
  104. }
  105. *nextTokPtr = ptr + MINBPC(enc);
  106. return XML_TOK_COMMENT;
  107. }
  108. break;
  109. default:
  110. ptr += MINBPC(enc);
  111. break;
  112. }
  113. }
  114. }
  115. return XML_TOK_PARTIAL;
  116. }
  117. /* ptr points to character following "<!" */
  118. static
  119. int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
  120. const char **nextTokPtr)
  121. {
  122. if (ptr == end)
  123. return XML_TOK_PARTIAL;
  124. switch (BYTE_TYPE(enc, ptr)) {
  125. case BT_MINUS:
  126. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  127. case BT_LSQB:
  128. *nextTokPtr = ptr + MINBPC(enc);
  129. return XML_TOK_COND_SECT_OPEN;
  130. case BT_NMSTRT:
  131. case BT_HEX:
  132. ptr += MINBPC(enc);
  133. break;
  134. default:
  135. *nextTokPtr = ptr;
  136. return XML_TOK_INVALID;
  137. }
  138. while (ptr != end) {
  139. switch (BYTE_TYPE(enc, ptr)) {
  140. case BT_PERCNT:
  141. if (ptr + MINBPC(enc) == end)
  142. return XML_TOK_PARTIAL;
  143. /* don't allow <!ENTITY% foo "whatever"> */
  144. switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  145. case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
  146. *nextTokPtr = ptr;
  147. return XML_TOK_INVALID;
  148. }
  149. /* fall through */
  150. case BT_S: case BT_CR: case BT_LF:
  151. *nextTokPtr = ptr;
  152. return XML_TOK_DECL_OPEN;
  153. case BT_NMSTRT:
  154. case BT_HEX:
  155. ptr += MINBPC(enc);
  156. break;
  157. default:
  158. *nextTokPtr = ptr;
  159. return XML_TOK_INVALID;
  160. }
  161. }
  162. return XML_TOK_PARTIAL;
  163. }
  164. static
  165. int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
  166. {
  167. int upper = 0;
  168. cmExpatUnused(enc);
  169. *tokPtr = XML_TOK_PI;
  170. if (end - ptr != MINBPC(enc)*3)
  171. return 1;
  172. switch (BYTE_TO_ASCII(enc, ptr)) {
  173. case ASCII_x:
  174. break;
  175. case ASCII_X:
  176. upper = 1;
  177. break;
  178. default:
  179. return 1;
  180. }
  181. ptr += MINBPC(enc);
  182. switch (BYTE_TO_ASCII(enc, ptr)) {
  183. case ASCII_m:
  184. break;
  185. case ASCII_M:
  186. upper = 1;
  187. break;
  188. default:
  189. return 1;
  190. }
  191. ptr += MINBPC(enc);
  192. switch (BYTE_TO_ASCII(enc, ptr)) {
  193. case ASCII_l:
  194. break;
  195. case ASCII_L:
  196. upper = 1;
  197. break;
  198. default:
  199. return 1;
  200. }
  201. if (upper)
  202. return 0;
  203. *tokPtr = XML_TOK_XML_DECL;
  204. return 1;
  205. }
  206. /* ptr points to character following "<?" */
  207. static
  208. int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
  209. const char **nextTokPtr)
  210. {
  211. int tok;
  212. const char *target = ptr;
  213. if (ptr == end)
  214. return XML_TOK_PARTIAL;
  215. switch (BYTE_TYPE(enc, ptr)) {
  216. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  217. default:
  218. *nextTokPtr = ptr;
  219. return XML_TOK_INVALID;
  220. }
  221. while (ptr != end) {
  222. switch (BYTE_TYPE(enc, ptr)) {
  223. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  224. case BT_S: case BT_CR: case BT_LF:
  225. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  226. *nextTokPtr = ptr;
  227. return XML_TOK_INVALID;
  228. }
  229. ptr += MINBPC(enc);
  230. while (ptr != end) {
  231. switch (BYTE_TYPE(enc, ptr)) {
  232. INVALID_CASES(ptr, nextTokPtr)
  233. case BT_QUEST:
  234. ptr += MINBPC(enc);
  235. if (ptr == end)
  236. return XML_TOK_PARTIAL;
  237. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  238. *nextTokPtr = ptr + MINBPC(enc);
  239. return tok;
  240. }
  241. break;
  242. default:
  243. ptr += MINBPC(enc);
  244. break;
  245. }
  246. }
  247. return XML_TOK_PARTIAL;
  248. case BT_QUEST:
  249. if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  250. *nextTokPtr = ptr;
  251. return XML_TOK_INVALID;
  252. }
  253. ptr += MINBPC(enc);
  254. if (ptr == end)
  255. return XML_TOK_PARTIAL;
  256. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  257. *nextTokPtr = ptr + MINBPC(enc);
  258. return tok;
  259. }
  260. /* fall through */
  261. default:
  262. *nextTokPtr = ptr;
  263. return XML_TOK_INVALID;
  264. }
  265. }
  266. return XML_TOK_PARTIAL;
  267. }
  268. static
  269. int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
  270. const char **nextTokPtr)
  271. {
  272. static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
  273. int i;
  274. cmExpatUnused(enc);
  275. /* CDATA[ */
  276. if (end - ptr < 6 * MINBPC(enc))
  277. return XML_TOK_PARTIAL;
  278. for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  279. if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  280. *nextTokPtr = ptr;
  281. return XML_TOK_INVALID;
  282. }
  283. }
  284. *nextTokPtr = ptr;
  285. return XML_TOK_CDATA_SECT_OPEN;
  286. }
  287. static
  288. int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
  289. const char **nextTokPtr)
  290. {
  291. if (ptr == end)
  292. return XML_TOK_NONE;
  293. if (MINBPC(enc) > 1) {
  294. size_t n = end - ptr;
  295. if (n & (MINBPC(enc) - 1)) {
  296. n &= ~(MINBPC(enc) - 1);
  297. if (n == 0)
  298. return XML_TOK_PARTIAL;
  299. end = ptr + n;
  300. }
  301. }
  302. switch (BYTE_TYPE(enc, ptr)) {
  303. case BT_RSQB:
  304. ptr += MINBPC(enc);
  305. if (ptr == end)
  306. return XML_TOK_PARTIAL;
  307. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  308. break;
  309. ptr += MINBPC(enc);
  310. if (ptr == end)
  311. return XML_TOK_PARTIAL;
  312. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  313. ptr -= MINBPC(enc);
  314. break;
  315. }
  316. *nextTokPtr = ptr + MINBPC(enc);
  317. return XML_TOK_CDATA_SECT_CLOSE;
  318. case BT_CR:
  319. ptr += MINBPC(enc);
  320. if (ptr == end)
  321. return XML_TOK_PARTIAL;
  322. if (BYTE_TYPE(enc, ptr) == BT_LF)
  323. ptr += MINBPC(enc);
  324. *nextTokPtr = ptr;
  325. return XML_TOK_DATA_NEWLINE;
  326. case BT_LF:
  327. *nextTokPtr = ptr + MINBPC(enc);
  328. return XML_TOK_DATA_NEWLINE;
  329. INVALID_CASES(ptr, nextTokPtr)
  330. default:
  331. ptr += MINBPC(enc);
  332. break;
  333. }
  334. while (ptr != end) {
  335. switch (BYTE_TYPE(enc, ptr)) {
  336. #define LEAD_CASE(n) \
  337. case BT_LEAD ## n: \
  338. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  339. *nextTokPtr = ptr; \
  340. return XML_TOK_DATA_CHARS; \
  341. } \
  342. ptr += n; \
  343. break;
  344. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  345. #undef LEAD_CASE
  346. case BT_NONXML:
  347. case BT_MALFORM:
  348. case BT_TRAIL:
  349. case BT_CR:
  350. case BT_LF:
  351. case BT_RSQB:
  352. *nextTokPtr = ptr;
  353. return XML_TOK_DATA_CHARS;
  354. default:
  355. ptr += MINBPC(enc);
  356. break;
  357. }
  358. }
  359. *nextTokPtr = ptr;
  360. return XML_TOK_DATA_CHARS;
  361. }
  362. /* ptr points to character following "</" */
  363. static
  364. int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
  365. const char **nextTokPtr)
  366. {
  367. if (ptr == end)
  368. return XML_TOK_PARTIAL;
  369. switch (BYTE_TYPE(enc, ptr)) {
  370. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  371. default:
  372. *nextTokPtr = ptr;
  373. return XML_TOK_INVALID;
  374. }
  375. while (ptr != end) {
  376. switch (BYTE_TYPE(enc, ptr)) {
  377. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  378. case BT_S: case BT_CR: case BT_LF:
  379. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  380. switch (BYTE_TYPE(enc, ptr)) {
  381. case BT_S: case BT_CR: case BT_LF:
  382. break;
  383. case BT_GT:
  384. *nextTokPtr = ptr + MINBPC(enc);
  385. return XML_TOK_END_TAG;
  386. default:
  387. *nextTokPtr = ptr;
  388. return XML_TOK_INVALID;
  389. }
  390. }
  391. return XML_TOK_PARTIAL;
  392. #ifdef XML_NS
  393. case BT_COLON:
  394. /* no need to check qname syntax here, since end-tag must match exactly */
  395. ptr += MINBPC(enc);
  396. break;
  397. #endif
  398. case BT_GT:
  399. *nextTokPtr = ptr + MINBPC(enc);
  400. return XML_TOK_END_TAG;
  401. default:
  402. *nextTokPtr = ptr;
  403. return XML_TOK_INVALID;
  404. }
  405. }
  406. return XML_TOK_PARTIAL;
  407. }
  408. /* ptr points to character following "&#X" */
  409. static
  410. int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
  411. const char **nextTokPtr)
  412. {
  413. if (ptr != end) {
  414. switch (BYTE_TYPE(enc, ptr)) {
  415. case BT_DIGIT:
  416. case BT_HEX:
  417. break;
  418. default:
  419. *nextTokPtr = ptr;
  420. return XML_TOK_INVALID;
  421. }
  422. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  423. switch (BYTE_TYPE(enc, ptr)) {
  424. case BT_DIGIT:
  425. case BT_HEX:
  426. break;
  427. case BT_SEMI:
  428. *nextTokPtr = ptr + MINBPC(enc);
  429. return XML_TOK_CHAR_REF;
  430. default:
  431. *nextTokPtr = ptr;
  432. return XML_TOK_INVALID;
  433. }
  434. }
  435. }
  436. return XML_TOK_PARTIAL;
  437. }
  438. /* ptr points to character following "&#" */
  439. static
  440. int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
  441. const char **nextTokPtr)
  442. {
  443. if (ptr != end) {
  444. if (CHAR_MATCHES(enc, ptr, ASCII_x))
  445. return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  446. switch (BYTE_TYPE(enc, ptr)) {
  447. case BT_DIGIT:
  448. break;
  449. default:
  450. *nextTokPtr = ptr;
  451. return XML_TOK_INVALID;
  452. }
  453. for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  454. switch (BYTE_TYPE(enc, ptr)) {
  455. case BT_DIGIT:
  456. break;
  457. case BT_SEMI:
  458. *nextTokPtr = ptr + MINBPC(enc);
  459. return XML_TOK_CHAR_REF;
  460. default:
  461. *nextTokPtr = ptr;
  462. return XML_TOK_INVALID;
  463. }
  464. }
  465. }
  466. return XML_TOK_PARTIAL;
  467. }
  468. /* ptr points to character following "&" */
  469. static
  470. int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  471. const char **nextTokPtr)
  472. {
  473. if (ptr == end)
  474. return XML_TOK_PARTIAL;
  475. switch (BYTE_TYPE(enc, ptr)) {
  476. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  477. case BT_NUM:
  478. return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  479. default:
  480. *nextTokPtr = ptr;
  481. return XML_TOK_INVALID;
  482. }
  483. while (ptr != end) {
  484. switch (BYTE_TYPE(enc, ptr)) {
  485. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  486. case BT_SEMI:
  487. *nextTokPtr = ptr + MINBPC(enc);
  488. return XML_TOK_ENTITY_REF;
  489. default:
  490. *nextTokPtr = ptr;
  491. return XML_TOK_INVALID;
  492. }
  493. }
  494. return XML_TOK_PARTIAL;
  495. }
  496. /* Scans the attribute list of a start tag.
          On entry ptr points to the character following the first character
          of an attribute name.  Each pass of the outer loop handles one
          character of the name; once "=" is reached the quoted value and
          whatever follows it are consumed in the same pass.
          Returns XML_TOK_START_TAG_WITH_ATTS or
          XML_TOK_EMPTY_ELEMENT_WITH_ATTS with *nextTokPtr after the closing
          ">", XML_TOK_INVALID with *nextTokPtr at the offending character,
          XML_TOK_PARTIAL when the input ends first, or a non-positive token
          propagated from scanRef (or from the name-checking macros). */
  497. static
  498. int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
  499. const char **nextTokPtr)
  500. {
  501. #ifdef XML_NS
       /* Set once a ":" has been seen in the current attribute name. */
  502. int hadColon = 0;
  503. #endif
  504. while (ptr != end) {
  505. switch (BYTE_TYPE(enc, ptr)) {
  506. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  507. #ifdef XML_NS
  508. case BT_COLON:
       /* At most one ":" per name, and a name-start character must follow. */
  509. if (hadColon) {
  510. *nextTokPtr = ptr;
  511. return XML_TOK_INVALID;
  512. }
  513. hadColon = 1;
  514. ptr += MINBPC(enc);
  515. if (ptr == end)
  516. return XML_TOK_PARTIAL;
  517. switch (BYTE_TYPE(enc, ptr)) {
  518. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  519. default:
  520. *nextTokPtr = ptr;
  521. return XML_TOK_INVALID;
  522. }
  523. break;
  524. #endif
  525. case BT_S: case BT_CR: case BT_LF:
       /* Whitespace after the name: skip it; only "=" may follow. */
  526. for (;;) {
  527. int t;
  528. ptr += MINBPC(enc);
  529. if (ptr == end)
  530. return XML_TOK_PARTIAL;
  531. t = BYTE_TYPE(enc, ptr);
  532. if (t == BT_EQUALS)
  533. break;
  534. switch (t) {
  535. case BT_S:
  536. case BT_LF:
  537. case BT_CR:
  538. break;
  539. default:
  540. *nextTokPtr = ptr;
  541. return XML_TOK_INVALID;
  542. }
  543. }
  544. /* fall through */
  545. case BT_EQUALS:
  546. {
       /* Byte type of the opening quote (BT_QUOT or BT_APOS). */
  547. int open;
  548. #ifdef XML_NS
  549. hadColon = 0;
  550. #endif
       /* Skip whitespace between "=" and the opening quote. */
  551. for (;;) {
  552. ptr += MINBPC(enc);
  553. if (ptr == end)
  554. return XML_TOK_PARTIAL;
  555. open = BYTE_TYPE(enc, ptr);
  556. if (open == BT_QUOT || open == BT_APOS)
  557. break;
  558. switch (open) {
  559. case BT_S:
  560. case BT_LF:
  561. case BT_CR:
  562. break;
  563. default:
  564. *nextTokPtr = ptr;
  565. return XML_TOK_INVALID;
  566. }
  567. }
  568. ptr += MINBPC(enc);
  569. /* in attribute value */
  570. for (;;) {
  571. int t;
  572. if (ptr == end)
  573. return XML_TOK_PARTIAL;
  574. t = BYTE_TYPE(enc, ptr);
       /* The value ends at the next quote of the same kind that opened it. */
  575. if (t == open)
  576. break;
  577. switch (t) {
  578. INVALID_CASES(ptr, nextTokPtr)
  579. case BT_AMP:
  580. {
       /* scanRef writes the position after the reference (or the error
          position) back into ptr through its last argument. */
  581. int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
  582. if (tok <= 0) {
  583. if (tok == XML_TOK_INVALID)
  584. *nextTokPtr = ptr;
  585. return tok;
  586. }
  587. break;
  588. }
  589. case BT_LT:
       /* "<" is rejected inside an attribute value. */
  590. *nextTokPtr = ptr;
  591. return XML_TOK_INVALID;
  592. default:
  593. ptr += MINBPC(enc);
  594. break;
  595. }
  596. }
       /* Step past the closing quote; whitespace, "/" or ">" must follow. */
  597. ptr += MINBPC(enc);
  598. if (ptr == end)
  599. return XML_TOK_PARTIAL;
  600. switch (BYTE_TYPE(enc, ptr)) {
  601. case BT_S:
  602. case BT_CR:
  603. case BT_LF:
  604. break;
  605. case BT_SOL:
  606. goto sol;
  607. case BT_GT:
  608. goto gt;
  609. default:
  610. *nextTokPtr = ptr;
  611. return XML_TOK_INVALID;
  612. }
  613. /* ptr points to closing quote */
       /* NOTE(review): on this path ptr was already advanced past the quote
          and is at the whitespace character matched just above. */
  614. for (;;) {
  615. ptr += MINBPC(enc);
  616. if (ptr == end)
  617. return XML_TOK_PARTIAL;
  618. switch (BYTE_TYPE(enc, ptr)) {
       /* A name-start character begins the next attribute: the macro
          advances past it and breaks out of this switch, and the "break"
          after the switch then leaves the for loop. */
  619. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  620. case BT_S: case BT_CR: case BT_LF:
  621. continue;
  622. case BT_GT:
  623. gt:
  624. *nextTokPtr = ptr + MINBPC(enc);
  625. return XML_TOK_START_TAG_WITH_ATTS;
  626. case BT_SOL:
  627. sol:
       /* "/" must be followed immediately by ">". */
  628. ptr += MINBPC(enc);
  629. if (ptr == end)
  630. return XML_TOK_PARTIAL;
  631. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  632. *nextTokPtr = ptr;
  633. return XML_TOK_INVALID;
  634. }
  635. *nextTokPtr = ptr + MINBPC(enc);
  636. return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
  637. default:
  638. *nextTokPtr = ptr;
  639. return XML_TOK_INVALID;
  640. }
  641. break;
  642. }
  643. break;
  644. }
  645. default:
  646. *nextTokPtr = ptr;
  647. return XML_TOK_INVALID;
  648. }
  649. }
  650. return XML_TOK_PARTIAL;
  651. }
  652. /* Scans markup that starts with "<" in content.
          On entry ptr points to the character following "<".
          "!" dispatches to scanComment or scanCdataSection, "?" to scanPi
          and "/" to scanEndTag.  A name-start character begins a start tag,
          which is scanned here up to its first attribute (then handed to
          scanAtts) or to its closing ">" or "/>". */
  653. static
  654. int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
  655. const char **nextTokPtr)
  656. {
  657. #ifdef XML_NS
       /* Set once a ":" has been seen in the element name. */
  658. int hadColon;
  659. #endif
  660. if (ptr == end)
  661. return XML_TOK_PARTIAL;
  662. switch (BYTE_TYPE(enc, ptr)) {
  663. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  664. case BT_EXCL:
       /* "<!" in content: only "<!-" and "<![" are accepted. */
  665. if ((ptr += MINBPC(enc)) == end)
  666. return XML_TOK_PARTIAL;
  667. switch (BYTE_TYPE(enc, ptr)) {
  668. case BT_MINUS:
  669. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  670. case BT_LSQB:
  671. return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  672. }
  673. *nextTokPtr = ptr;
  674. return XML_TOK_INVALID;
  675. case BT_QUEST:
  676. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  677. case BT_SOL:
  678. return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  679. default:
  680. *nextTokPtr = ptr;
  681. return XML_TOK_INVALID;
  682. }
  683. #ifdef XML_NS
  684. hadColon = 0;
  685. #endif
  686. /* we have a start-tag */
  687. while (ptr != end) {
  688. switch (BYTE_TYPE(enc, ptr)) {
  689. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  690. #ifdef XML_NS
  691. case BT_COLON:
       /* At most one ":" per name, and a name-start character must follow. */
  692. if (hadColon) {
  693. *nextTokPtr = ptr;
  694. return XML_TOK_INVALID;
  695. }
  696. hadColon = 1;
  697. ptr += MINBPC(enc);
  698. if (ptr == end)
  699. return XML_TOK_PARTIAL;
  700. switch (BYTE_TYPE(enc, ptr)) {
  701. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  702. default:
  703. *nextTokPtr = ptr;
  704. return XML_TOK_INVALID;
  705. }
  706. break;
  707. #endif
  708. case BT_S: case BT_CR: case BT_LF:
  709. {
       /* Whitespace after the element name: skip it and see what follows. */
  710. ptr += MINBPC(enc);
  711. while (ptr != end) {
  712. switch (BYTE_TYPE(enc, ptr)) {
       /* A name-start character begins the first attribute: the macro
          advances past it and breaks out of this switch, reaching the
          scanAtts call below. */
  713. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  714. case BT_GT:
  715. goto gt;
  716. case BT_SOL:
  717. goto sol;
  718. case BT_S: case BT_CR: case BT_LF:
  719. ptr += MINBPC(enc);
  720. continue;
  721. default:
  722. *nextTokPtr = ptr;
  723. return XML_TOK_INVALID;
  724. }
  725. return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
  726. }
  727. return XML_TOK_PARTIAL;
  728. }
  729. case BT_GT:
  730. gt:
  731. *nextTokPtr = ptr + MINBPC(enc);
  732. return XML_TOK_START_TAG_NO_ATTS;
  733. case BT_SOL:
  734. sol:
       /* "/" must be followed immediately by ">". */
  735. ptr += MINBPC(enc);
  736. if (ptr == end)
  737. return XML_TOK_PARTIAL;
  738. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  739. *nextTokPtr = ptr;
  740. return XML_TOK_INVALID;
  741. }
  742. *nextTokPtr = ptr + MINBPC(enc);
  743. return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
  744. default:
  745. *nextTokPtr = ptr;
  746. return XML_TOK_INVALID;
  747. }
  748. }
  749. return XML_TOK_PARTIAL;
  750. }
  751. static
  752. int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  753. const char **nextTokPtr)
  754. {
  755. if (ptr == end)
  756. return XML_TOK_NONE;
  757. if (MINBPC(enc) > 1) {
  758. size_t n = end - ptr;
  759. if (n & (MINBPC(enc) - 1)) {
  760. n &= ~(MINBPC(enc) - 1);
  761. if (n == 0)
  762. return XML_TOK_PARTIAL;
  763. end = ptr + n;
  764. }
  765. }
  766. switch (BYTE_TYPE(enc, ptr)) {
  767. case BT_LT:
  768. return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  769. case BT_AMP:
  770. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  771. case BT_CR:
  772. ptr += MINBPC(enc);
  773. if (ptr == end)
  774. return XML_TOK_TRAILING_CR;
  775. if (BYTE_TYPE(enc, ptr) == BT_LF)
  776. ptr += MINBPC(enc);
  777. *nextTokPtr = ptr;
  778. return XML_TOK_DATA_NEWLINE;
  779. case BT_LF:
  780. *nextTokPtr = ptr + MINBPC(enc);
  781. return XML_TOK_DATA_NEWLINE;
  782. case BT_RSQB:
  783. ptr += MINBPC(enc);
  784. if (ptr == end)
  785. return XML_TOK_TRAILING_RSQB;
  786. if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  787. break;
  788. ptr += MINBPC(enc);
  789. if (ptr == end)
  790. return XML_TOK_TRAILING_RSQB;
  791. if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  792. ptr -= MINBPC(enc);
  793. break;
  794. }
  795. *nextTokPtr = ptr;
  796. return XML_TOK_INVALID;
  797. INVALID_CASES(ptr, nextTokPtr)
  798. default:
  799. ptr += MINBPC(enc);
  800. break;
  801. }
  802. while (ptr != end) {
  803. switch (BYTE_TYPE(enc, ptr)) {
  804. #define LEAD_CASE(n) \
  805. case BT_LEAD ## n: \
  806. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  807. *nextTokPtr = ptr; \
  808. return XML_TOK_DATA_CHARS; \
  809. } \
  810. ptr += n; \
  811. break;
  812. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  813. #undef LEAD_CASE
  814. case BT_RSQB:
  815. if (ptr + MINBPC(enc) != end) {
  816. if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  817. ptr += MINBPC(enc);
  818. break;
  819. }
  820. if (ptr + 2*MINBPC(enc) != end) {
  821. if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
  822. ptr += MINBPC(enc);
  823. break;
  824. }
  825. *nextTokPtr = ptr + 2*MINBPC(enc);
  826. return XML_TOK_INVALID;
  827. }
  828. }
  829. /* fall through */
  830. case BT_AMP:
  831. case BT_LT:
  832. case BT_NONXML:
  833. case BT_MALFORM:
  834. case BT_TRAIL:
  835. case BT_CR:
  836. case BT_LF:
  837. *nextTokPtr = ptr;
  838. return XML_TOK_DATA_CHARS;
  839. default:
  840. ptr += MINBPC(enc);
  841. break;
  842. }
  843. }
  844. *nextTokPtr = ptr;
  845. return XML_TOK_DATA_CHARS;
  846. }
  847. /* ptr points to character following "%" */
  848. static
  849. int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  850. const char **nextTokPtr)
  851. {
  852. if (ptr == end)
  853. return XML_TOK_PARTIAL;
  854. switch (BYTE_TYPE(enc, ptr)) {
  855. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  856. case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
  857. *nextTokPtr = ptr;
  858. return XML_TOK_PERCENT;
  859. default:
  860. *nextTokPtr = ptr;
  861. return XML_TOK_INVALID;
  862. }
  863. while (ptr != end) {
  864. switch (BYTE_TYPE(enc, ptr)) {
  865. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  866. case BT_SEMI:
  867. *nextTokPtr = ptr + MINBPC(enc);
  868. return XML_TOK_PARAM_ENTITY_REF;
  869. default:
  870. *nextTokPtr = ptr;
  871. return XML_TOK_INVALID;
  872. }
  873. }
  874. return XML_TOK_PARTIAL;
  875. }
  876. static
  877. int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  878. const char **nextTokPtr)
  879. {
  880. if (ptr == end)
  881. return XML_TOK_PARTIAL;
  882. switch (BYTE_TYPE(enc, ptr)) {
  883. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  884. default:
  885. *nextTokPtr = ptr;
  886. return XML_TOK_INVALID;
  887. }
  888. while (ptr != end) {
  889. switch (BYTE_TYPE(enc, ptr)) {
  890. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  891. case BT_CR: case BT_LF: case BT_S:
  892. case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
  893. *nextTokPtr = ptr;
  894. return XML_TOK_POUND_NAME;
  895. default:
  896. *nextTokPtr = ptr;
  897. return XML_TOK_INVALID;
  898. }
  899. }
  900. return -XML_TOK_POUND_NAME;
  901. }
  902. static
  903. int PREFIX(scanLit)(int open, const ENCODING *enc,
  904. const char *ptr, const char *end,
  905. const char **nextTokPtr)
  906. {
  907. while (ptr != end) {
  908. int t = BYTE_TYPE(enc, ptr);
  909. switch (t) {
  910. INVALID_CASES(ptr, nextTokPtr)
  911. case BT_QUOT:
  912. case BT_APOS:
  913. ptr += MINBPC(enc);
  914. if (t != open)
  915. break;
  916. if (ptr == end)
  917. return -XML_TOK_LITERAL;
  918. *nextTokPtr = ptr;
  919. switch (BYTE_TYPE(enc, ptr)) {
  920. case BT_S: case BT_CR: case BT_LF:
  921. case BT_GT: case BT_PERCNT: case BT_LSQB:
  922. return XML_TOK_LITERAL;
  923. default:
  924. return XML_TOK_INVALID;
  925. }
  926. default:
  927. ptr += MINBPC(enc);
  928. break;
  929. }
  930. }
  931. return XML_TOK_PARTIAL;
  932. }
/* Returns the next token starting at ptr, dispatching on the byte type of
   the first character.

   - Returns XML_TOK_NONE when ptr == end.
   - For encodings with MINBPC(enc) > 1, end is first trimmed so that
     end - ptr is a whole number of MINBPC(enc) units; XML_TOK_PARTIAL is
     returned if nothing is left after trimming.
   - Quotes, BT_LT followed by BT_EXCL / BT_QUEST, BT_PERCNT and BT_NUM are
     delegated to the scanLit / scanDecl / scanPi / scanPercent /
     scanPoundName helpers.
   - On success *nextTokPtr is set to the position following the token
     (or, for XML_TOK_INSTANCE_START, to the BT_LT character itself).
   - A negated token code is returned when the input ends at a point where
     the token seen so far could still be extended by more input (a name
     running up to end, a trailing BT_CR, a BT_RSQB or BT_RPAR that is the
     last character).  *nextTokPtr is not written on those paths.
   - XML_TOK_INVALID is returned with *nextTokPtr at the offending position. */
static
int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                      const char **nextTokPtr)
{
  int tok;
  if (ptr == end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Ignore a trailing fragment that is shorter than one code unit. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT:
    {
      /* The character after BT_LT decides what kind of construct starts. */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_EXCL:
        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_QUEST:
        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_NMSTRT:
      case BT_HEX:
      case BT_NONASCII:
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
        /* Back up so that *nextTokPtr points at the BT_LT character. */
        *nextTokPtr = ptr - MINBPC(enc);
        return XML_TOK_INSTANCE_START;
      }
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  case BT_CR:
    /* A BT_CR that is the last character is reported as a negated token. */
    if (ptr + MINBPC(enc) == end)
      return -XML_TOK_PROLOG_S;
    /* fall through */
  case BT_S: case BT_LF:
    /* Collect a run of whitespace into a single XML_TOK_PROLOG_S. */
    for (;;) {
      ptr += MINBPC(enc);
      if (ptr == end)
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_BRACKET;
    /* A second ASCII_RSQB followed by ASCII_GT forms
       XML_TOK_COND_SECT_CLOSE; one more character is needed to decide. */
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      if (ptr + MINBPC(enc) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2*MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    /* BT_RPAR may be followed by BT_AST, BT_QUEST or BT_PLUS, which is
       folded into the returned token. */
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_PAREN;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR: case BT_LF: case BT_S:
    case BT_GT: case BT_COMMA: case BT_VERBAR:
    case BT_RPAR:
      /* The following character is not consumed. */
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  /* Multi-byte lead bytes: an n-byte character starts a NAME if it is a
     name-start character, an NMTOKEN if it is only a name character, and is
     invalid otherwise.  XML_TOK_PARTIAL_CHAR is returned when fewer than n
     bytes are available. */
#define LEAD_CASE(n) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#ifdef XML_NS
  case BT_COLON:
#endif
    /* These may appear in a name but may not start one. */
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Only the NAME / NMTOKEN cases break out of the switch above; tok holds
     the token kind so far and ptr is just past its first character. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT: case BT_RPAR: case BT_COMMA:
    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
    case BT_S: case BT_CR: case BT_LF:
      /* Delimiter: the token ends here and the delimiter is not consumed. */
      *nextTokPtr = ptr;
      return tok;
#ifdef XML_NS
    case BT_COLON:
      /* The first colon in a NAME turns it into a PREFIXED_NAME, provided a
         name character follows; otherwise, or on a second colon, the token
         degrades to NMTOKEN. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        if (ptr == end)
          return XML_TOK_PARTIAL;
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#endif
    /* A trailing BT_PLUS / BT_AST / BT_QUEST is folded into the token; it is
       rejected after an NMTOKEN. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* Input ended inside the name: report the token negated. */
  return -tok;
}
  1160. static
  1161. int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
  1162. const char **nextTokPtr)
  1163. {
  1164. const char *start;
  1165. if (ptr == end)
  1166. return XML_TOK_NONE;
  1167. start = ptr;
  1168. while (ptr != end) {
  1169. switch (BYTE_TYPE(enc, ptr)) {
  1170. #define LEAD_CASE(n) \
  1171. case BT_LEAD ## n: ptr += n; break;
  1172. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1173. #undef LEAD_CASE
  1174. case BT_AMP:
  1175. if (ptr == start)
  1176. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1177. *nextTokPtr = ptr;
  1178. return XML_TOK_DATA_CHARS;
  1179. case BT_LT:
  1180. /* this is for inside entity references */
  1181. *nextTokPtr = ptr;
  1182. return XML_TOK_INVALID;
  1183. case BT_LF:
  1184. if (ptr == start) {
  1185. *nextTokPtr = ptr + MINBPC(enc);
  1186. return XML_TOK_DATA_NEWLINE;
  1187. }
  1188. *nextTokPtr = ptr;
  1189. return XML_TOK_DATA_CHARS;
  1190. case BT_CR:
  1191. if (ptr == start) {
  1192. ptr += MINBPC(enc);
  1193. if (ptr == end)
  1194. return XML_TOK_TRAILING_CR;
  1195. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1196. ptr += MINBPC(enc);
  1197. *nextTokPtr = ptr;
  1198. return XML_TOK_DATA_NEWLINE;
  1199. }
  1200. *nextTokPtr = ptr;
  1201. return XML_TOK_DATA_CHARS;
  1202. case BT_S:
  1203. if (ptr == start) {
  1204. *nextTokPtr = ptr + MINBPC(enc);
  1205. return XML_TOK_ATTRIBUTE_VALUE_S;
  1206. }
  1207. *nextTokPtr = ptr;
  1208. return XML_TOK_DATA_CHARS;
  1209. default:
  1210. ptr += MINBPC(enc);
  1211. break;
  1212. }
  1213. }
  1214. *nextTokPtr = ptr;
  1215. return XML_TOK_DATA_CHARS;
  1216. }
  1217. static
  1218. int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
  1219. const char **nextTokPtr)
  1220. {
  1221. const char *start;
  1222. if (ptr == end)
  1223. return XML_TOK_NONE;
  1224. start = ptr;
  1225. while (ptr != end) {
  1226. switch (BYTE_TYPE(enc, ptr)) {
  1227. #define LEAD_CASE(n) \
  1228. case BT_LEAD ## n: ptr += n; break;
  1229. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1230. #undef LEAD_CASE
  1231. case BT_AMP:
  1232. if (ptr == start)
  1233. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1234. *nextTokPtr = ptr;
  1235. return XML_TOK_DATA_CHARS;
  1236. case BT_PERCNT:
  1237. if (ptr == start) {
  1238. int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
  1239. end, nextTokPtr);
  1240. return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1241. }
  1242. *nextTokPtr = ptr;
  1243. return XML_TOK_DATA_CHARS;
  1244. case BT_LF:
  1245. if (ptr == start) {
  1246. *nextTokPtr = ptr + MINBPC(enc);
  1247. return XML_TOK_DATA_NEWLINE;
  1248. }
  1249. *nextTokPtr = ptr;
  1250. return XML_TOK_DATA_CHARS;
  1251. case BT_CR:
  1252. if (ptr == start) {
  1253. ptr += MINBPC(enc);
  1254. if (ptr == end)
  1255. return XML_TOK_TRAILING_CR;
  1256. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1257. ptr += MINBPC(enc);
  1258. *nextTokPtr = ptr;
  1259. return XML_TOK_DATA_NEWLINE;
  1260. }
  1261. *nextTokPtr = ptr;
  1262. return XML_TOK_DATA_CHARS;
  1263. default:
  1264. ptr += MINBPC(enc);
  1265. break;
  1266. }
  1267. }
  1268. *nextTokPtr = ptr;
  1269. return XML_TOK_DATA_CHARS;
  1270. }
  1271. #ifdef XML_DTD
  1272. static
  1273. int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
  1274. const char **nextTokPtr)
  1275. {
  1276. int level = 0;
  1277. if (MINBPC(enc) > 1) {
  1278. size_t n = end - ptr;
  1279. if (n & (MINBPC(enc) - 1)) {
  1280. n &= ~(MINBPC(enc) - 1);
  1281. end = ptr + n;
  1282. }
  1283. }
  1284. while (ptr != end) {
  1285. switch (BYTE_TYPE(enc, ptr)) {
  1286. INVALID_CASES(ptr, nextTokPtr)
  1287. case BT_LT:
  1288. if ((ptr += MINBPC(enc)) == end)
  1289. return XML_TOK_PARTIAL;
  1290. if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
  1291. if ((ptr += MINBPC(enc)) == end)
  1292. return XML_TOK_PARTIAL;
  1293. if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
  1294. ++level;
  1295. ptr += MINBPC(enc);
  1296. }
  1297. }
  1298. break;
  1299. case BT_RSQB:
  1300. if ((ptr += MINBPC(enc)) == end)
  1301. return XML_TOK_PARTIAL;
  1302. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1303. if ((ptr += MINBPC(enc)) == end)
  1304. return XML_TOK_PARTIAL;
  1305. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  1306. ptr += MINBPC(enc);
  1307. if (level == 0) {
  1308. *nextTokPtr = ptr;
  1309. return XML_TOK_IGNORE_SECT;
  1310. }
  1311. --level;
  1312. }
  1313. }
  1314. break;
  1315. default:
  1316. ptr += MINBPC(enc);
  1317. break;
  1318. }
  1319. }
  1320. return XML_TOK_PARTIAL;
  1321. }
  1322. #endif /* XML_DTD */
  1323. static
  1324. int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1325. const char **badPtr)
  1326. {
  1327. ptr += MINBPC(enc);
  1328. end -= MINBPC(enc);
  1329. for (; ptr != end; ptr += MINBPC(enc)) {
  1330. switch (BYTE_TYPE(enc, ptr)) {
  1331. case BT_DIGIT:
  1332. case BT_HEX:
  1333. case BT_MINUS:
  1334. case BT_APOS:
  1335. case BT_LPAR:
  1336. case BT_RPAR:
  1337. case BT_PLUS:
  1338. case BT_COMMA:
  1339. case BT_SOL:
  1340. case BT_EQUALS:
  1341. case BT_QUEST:
  1342. case BT_CR:
  1343. case BT_LF:
  1344. case BT_SEMI:
  1345. case BT_EXCL:
  1346. case BT_AST:
  1347. case BT_PERCNT:
  1348. case BT_NUM:
  1349. #ifdef XML_NS
  1350. case BT_COLON:
  1351. #endif
  1352. break;
  1353. case BT_S:
  1354. if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1355. *badPtr = ptr;
  1356. return 0;
  1357. }
  1358. break;
  1359. case BT_NAME:
  1360. case BT_NMSTRT:
  1361. if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1362. break;
  1363. default:
  1364. switch (BYTE_TO_ASCII(enc, ptr)) {
  1365. case 0x24: /* $ */
  1366. case 0x40: /* @ */
  1367. break;
  1368. default:
  1369. *badPtr = ptr;
  1370. return 0;
  1371. }
  1372. break;
  1373. }
  1374. }
  1375. return 1;
  1376. }
  1377. /* This must only be called for a well-formed start-tag or empty element tag.
  1378. Returns the number of attributes. Pointers to the first attsMax attributes
  1379. are stored in atts. */
  1380. static
  1381. int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
  1382. int attsMax, ATTRIBUTE *atts)
  1383. {
  1384. enum { other, inName, inValue } state = inName;
  1385. int nAtts = 0;
  1386. int open = 0; /* defined when state == inValue;
  1387. initialization just to shut up compilers */
  1388. for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1389. switch (BYTE_TYPE(enc, ptr)) {
  1390. #define START_NAME \
  1391. if (state == other) { \
  1392. if (nAtts < attsMax) { \
  1393. atts[nAtts].name = ptr; \
  1394. atts[nAtts].normalized = 1; \
  1395. } \
  1396. state = inName; \
  1397. }
  1398. #define LEAD_CASE(n) \
  1399. case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
  1400. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1401. #undef LEAD_CASE
  1402. case BT_NONASCII:
  1403. case BT_NMSTRT:
  1404. case BT_HEX:
  1405. START_NAME
  1406. break;
  1407. #undef START_NAME
  1408. case BT_QUOT:
  1409. if (state != inValue) {
  1410. if (nAtts < attsMax)
  1411. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1412. state = inValue;
  1413. open = BT_QUOT;
  1414. }
  1415. else if (open == BT_QUOT) {
  1416. state = other;
  1417. if (nAtts < attsMax)
  1418. atts[nAtts].valueEnd = ptr;
  1419. nAtts++;
  1420. }
  1421. break;
  1422. case BT_APOS:
  1423. if (state != inValue) {
  1424. if (nAtts < attsMax)
  1425. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1426. state = inValue;
  1427. open = BT_APOS;
  1428. }
  1429. else if (open == BT_APOS) {
  1430. state = other;
  1431. if (nAtts < attsMax)
  1432. atts[nAtts].valueEnd = ptr;
  1433. nAtts++;
  1434. }
  1435. break;
  1436. case BT_AMP:
  1437. if (nAtts < attsMax)
  1438. atts[nAtts].normalized = 0;
  1439. break;
  1440. case BT_S:
  1441. if (state == inName)
  1442. state = other;
  1443. else if (state == inValue
  1444. && nAtts < attsMax
  1445. && atts[nAtts].normalized
  1446. && (ptr == atts[nAtts].valuePtr
  1447. || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1448. || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1449. || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1450. atts[nAtts].normalized = 0;
  1451. break;
  1452. case BT_CR: case BT_LF:
  1453. /* This case ensures that the first attribute name is counted
  1454. Apart from that we could just change state on the quote. */
  1455. if (state == inName)
  1456. state = other;
  1457. else if (state == inValue && nAtts < attsMax)
  1458. atts[nAtts].normalized = 0;
  1459. break;
  1460. case BT_GT:
  1461. case BT_SOL:
  1462. if (state != inValue)
  1463. return nAtts;
  1464. break;
  1465. default:
  1466. break;
  1467. }
  1468. }
  1469. /* not reached */
  1470. }
  1471. static
  1472. int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  1473. {
  1474. int result = 0;
  1475. cmExpatUnused(enc);
  1476. /* skip &# */
  1477. ptr += 2*MINBPC(enc);
  1478. if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1479. for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1480. int c = BYTE_TO_ASCII(enc, ptr);
  1481. switch (c) {
  1482. case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
  1483. case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
  1484. result <<= 4;
  1485. result |= (c - ASCII_0);
  1486. break;
  1487. case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
  1488. result <<= 4;
  1489. result += 10 + (c - ASCII_A);
  1490. break;
  1491. case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
  1492. result <<= 4;
  1493. result += 10 + (c - ASCII_a);
  1494. break;
  1495. }
  1496. if (result >= 0x110000)
  1497. return -1;
  1498. }
  1499. }
  1500. else {
  1501. for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1502. int c = BYTE_TO_ASCII(enc, ptr);
  1503. result *= 10;
  1504. result += (c - ASCII_0);
  1505. if (result >= 0x110000)
  1506. return -1;
  1507. }
  1508. }
  1509. return checkCharRefNumber(result);
  1510. }
  1511. static
  1512. int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
  1513. {
  1514. cmExpatUnused(enc);
  1515. switch ((end - ptr)/MINBPC(enc)) {
  1516. case 2:
  1517. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1518. switch (BYTE_TO_ASCII(enc, ptr)) {
  1519. case ASCII_l:
  1520. return ASCII_LT;
  1521. case ASCII_g:
  1522. return ASCII_GT;
  1523. }
  1524. }
  1525. break;
  1526. case 3:
  1527. if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1528. ptr += MINBPC(enc);
  1529. if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1530. ptr += MINBPC(enc);
  1531. if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1532. return ASCII_AMP;
  1533. }
  1534. }
  1535. break;
  1536. case 4:
  1537. switch (BYTE_TO_ASCII(enc, ptr)) {
  1538. case ASCII_q:
  1539. ptr += MINBPC(enc);
  1540. if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1541. ptr += MINBPC(enc);
  1542. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1543. ptr += MINBPC(enc);
  1544. if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1545. return ASCII_QUOT;
  1546. }
  1547. }
  1548. break;
  1549. case ASCII_a:
  1550. ptr += MINBPC(enc);
  1551. if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1552. ptr += MINBPC(enc);
  1553. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1554. ptr += MINBPC(enc);
  1555. if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1556. return ASCII_APOS;
  1557. }
  1558. }
  1559. break;
  1560. }
  1561. }
  1562. return 0;
  1563. }
  1564. static
  1565. int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
  1566. {
  1567. for (;;) {
  1568. switch (BYTE_TYPE(enc, ptr1)) {
  1569. #define LEAD_CASE(n) \
  1570. case BT_LEAD ## n: \
  1571. if (*ptr1++ != *ptr2++) \
  1572. return 0;
  1573. LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
  1574. #undef LEAD_CASE
  1575. /* fall through */
  1576. if (*ptr1++ != *ptr2++)
  1577. return 0;
  1578. break;
  1579. case BT_NONASCII:
  1580. case BT_NMSTRT:
  1581. #ifdef XML_NS
  1582. case BT_COLON:
  1583. #endif
  1584. case BT_HEX:
  1585. case BT_DIGIT:
  1586. case BT_NAME:
  1587. case BT_MINUS:
  1588. if (*ptr2++ != *ptr1++)
  1589. return 0;
  1590. if (MINBPC(enc) > 1) {
  1591. if (*ptr2++ != *ptr1++)
  1592. return 0;
  1593. if (MINBPC(enc) > 2) {
  1594. if (*ptr2++ != *ptr1++)
  1595. return 0;
  1596. if (MINBPC(enc) > 3) {
  1597. if (*ptr2++ != *ptr1++)
  1598. return 0;
  1599. }
  1600. }
  1601. }
  1602. break;
  1603. default:
  1604. if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
  1605. return 1;
  1606. switch (BYTE_TYPE(enc, ptr2)) {
  1607. case BT_LEAD2:
  1608. case BT_LEAD3:
  1609. case BT_LEAD4:
  1610. case BT_NONASCII:
  1611. case BT_NMSTRT:
  1612. #ifdef XML_NS
  1613. case BT_COLON:
  1614. #endif
  1615. case BT_HEX:
  1616. case BT_DIGIT:
  1617. case BT_NAME:
  1618. case BT_MINUS:
  1619. return 0;
  1620. default:
  1621. return 1;
  1622. }
  1623. }
  1624. }
  1625. /* not reached */
  1626. }
  1627. static
  1628. int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1629. const char *end1, const char *ptr2)
  1630. {
  1631. cmExpatUnused(enc);
  1632. for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
  1633. if (ptr1 == end1)
  1634. return 0;
  1635. if (!CHAR_MATCHES(enc, ptr1, *ptr2))
  1636. return 0;
  1637. }
  1638. return ptr1 == end1;
  1639. }
  1640. static
  1641. int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
  1642. {
  1643. const char *start = ptr;
  1644. for (;;) {
  1645. switch (BYTE_TYPE(enc, ptr)) {
  1646. #define LEAD_CASE(n) \
  1647. case BT_LEAD ## n: ptr += n; break;
  1648. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1649. #undef LEAD_CASE
  1650. case BT_NONASCII:
  1651. case BT_NMSTRT:
  1652. #ifdef XML_NS
  1653. case BT_COLON:
  1654. #endif
  1655. case BT_HEX:
  1656. case BT_DIGIT:
  1657. case BT_NAME:
  1658. case BT_MINUS:
  1659. ptr += MINBPC(enc);
  1660. break;
  1661. default:
  1662. return ptr - start;
  1663. }
  1664. }
  1665. }
  1666. static
  1667. const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
  1668. {
  1669. for (;;) {
  1670. switch (BYTE_TYPE(enc, ptr)) {
  1671. case BT_LF:
  1672. case BT_CR:
  1673. case BT_S:
  1674. ptr += MINBPC(enc);
  1675. break;
  1676. default:
  1677. return ptr;
  1678. }
  1679. }
  1680. }
  1681. static
  1682. void PREFIX(updatePosition)(const ENCODING *enc,
  1683. const char *ptr,
  1684. const char *end,
  1685. POSITION *pos)
  1686. {
  1687. while (ptr != end) {
  1688. switch (BYTE_TYPE(enc, ptr)) {
  1689. #define LEAD_CASE(n) \
  1690. case BT_LEAD ## n: \
  1691. ptr += n; \
  1692. break;
  1693. LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  1694. #undef LEAD_CASE
  1695. case BT_LF:
  1696. pos->columnNumber = (unsigned)-1;
  1697. pos->lineNumber++;
  1698. ptr += MINBPC(enc);
  1699. break;
  1700. case BT_CR:
  1701. pos->lineNumber++;
  1702. ptr += MINBPC(enc);
  1703. if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
  1704. ptr += MINBPC(enc);
  1705. pos->columnNumber = (unsigned)-1;
  1706. break;
  1707. default:
  1708. ptr += MINBPC(enc);
  1709. break;
  1710. }
  1711. pos->columnNumber++;
  1712. }
  1713. }
  1714. #undef DO_LEAD_CASE
  1715. #undef MULTIBYTE_CASES
  1716. #undef INVALID_CASES
  1717. #undef CHECK_NAME_CASE
  1718. #undef CHECK_NAME_CASES
  1719. #undef CHECK_NMSTRT_CASE
  1720. #undef CHECK_NMSTRT_CASES