xmltok_impl.c 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809
  1. /* This file is included!
  2. __ __ _
  3. ___\ \/ /_ __ __ _| |_
  4. / _ \\ /| '_ \ / _` | __|
  5. | __// \| |_) | (_| | |_
  6. \___/_/\_\ .__/ \__,_|\__|
  7. |_| XML parser
  8. Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
  9. Copyright (c) 2000-2017 Expat development team
  10. Licensed under the MIT license:
  11. Permission is hereby granted, free of charge, to any person obtaining
  12. a copy of this software and associated documentation files (the
  13. "Software"), to deal in the Software without restriction, including
  14. without limitation the rights to use, copy, modify, merge, publish,
  15. distribute, sublicense, and/or sell copies of the Software, and to permit
  16. persons to whom the Software is furnished to do so, subject to the
  17. following conditions:
  18. The above copyright notice and this permission notice shall be included
  19. in all copies or substantial portions of the Software.
  20. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
  23. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  24. DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  25. OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  26. USE OR OTHER DEALINGS IN THE SOFTWARE.
  27. */
  28. #ifdef XML_TOK_IMPL_C
  /* Default used when the including file has not already defined
     IS_INVALID_CHAR: no multi-byte sequence is rejected. */
  29. # ifndef IS_INVALID_CHAR
  30. # define IS_INVALID_CHAR(enc, ptr, n) (0)
  31. # endif
  /* Emits the switch case for an n-byte lead byte (BT_LEADn). It returns
     XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before `end`,
     reports XML_TOK_INVALID with *nextTokPtr at the sequence when
     IS_INVALID_CHAR rejects it, and otherwise advances ptr by n bytes.
     `enc` and `end` are picked up from the scope where the macro expands. */
  32. # define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  33. case BT_LEAD##n: \
  34. if (end - ptr < n) \
  35. return XML_TOK_PARTIAL_CHAR; \
  36. if (IS_INVALID_CHAR(enc, ptr, n)) { \
  37. *(nextTokPtr) = (ptr); \
  38. return XML_TOK_INVALID; \
  39. } \
  40. ptr += n; \
  41. break;
  /* Emits the switch cases for 2-, 3- and 4-byte lead bytes (see
     INVALID_LEAD_CASE) followed by cases that reject BT_NONXML, BT_MALFORM
     and BT_TRAIL bytes with XML_TOK_INVALID at ptr. */
  42. # define INVALID_CASES(ptr, nextTokPtr) \
  43. INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  44. INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  45. INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  46. case BT_NONXML: \
  47. case BT_MALFORM: \
  48. case BT_TRAIL: \
  49. *(nextTokPtr) = (ptr); \
  50. return XML_TOK_INVALID;
  /* Switch case for an n-byte lead byte (BT_LEADn) occurring inside a name.
     Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  The
     sequence must be both well-formed (not rejected by IS_INVALID_CHAR) and a
     name character; otherwise XML_TOK_INVALID is reported at ptr.  The
     well-formedness test comes first so that a malformed multi-byte sequence
     is never accepted as part of a name.  On success ptr is advanced past the
     sequence. */
  # define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
    case BT_LEAD##n:                                                            \
      if (end - ptr < n)                                                        \
        return XML_TOK_PARTIAL_CHAR;                                            \
      if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {        \
        *nextTokPtr = ptr;                                                      \
        return XML_TOK_INVALID;                                                 \
      }                                                                         \
      ptr += n;                                                                 \
      break;
  /* Switch cases that consume one name character: a BT_NONASCII unit is
     checked with IS_NAME_CHAR_MINBPC, the plain name byte types advance by
     MINBPC(enc), and multi-byte lead bytes go through CHECK_NAME_CASE. */
  # define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
    case BT_NONASCII:                                                           \
      if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                    \
        *nextTokPtr = ptr;                                                      \
        return XML_TOK_INVALID;                                                 \
      }                                                                         \
      /* fall through */                                                        \
    case BT_NMSTRT:                                                             \
    case BT_HEX:                                                                \
    case BT_DIGIT:                                                              \
    case BT_NAME:                                                               \
    case BT_MINUS:                                                              \
      ptr += MINBPC(enc);                                                       \
      break;                                                                    \
      CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                             \
      CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                             \
      CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
  /* Same as CHECK_NAME_CASE, but the sequence must be a name-start
     character.  Malformed sequences are rejected before the name-start
     test is applied. */
  # define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
    case BT_LEAD##n:                                                            \
      if (end - ptr < n)                                                        \
        return XML_TOK_PARTIAL_CHAR;                                            \
      if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {      \
        *nextTokPtr = ptr;                                                      \
        return XML_TOK_INVALID;                                                 \
      }                                                                         \
      ptr += n;                                                                 \
      break;
  /* Switch cases that consume one name-start character: a BT_NONASCII unit
     is checked with IS_NMSTRT_CHAR_MINBPC, BT_NMSTRT and BT_HEX advance by
     MINBPC(enc), and multi-byte lead bytes go through CHECK_NMSTRT_CASE. */
  # define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
    case BT_NONASCII:                                                           \
      if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                  \
        *nextTokPtr = ptr;                                                      \
        return XML_TOK_INVALID;                                                 \
      }                                                                         \
      /* fall through */                                                        \
    case BT_NMSTRT:                                                             \
    case BT_HEX:                                                                \
      ptr += MINBPC(enc);                                                       \
      break;                                                                    \
      CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                           \
      CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                           \
      CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
  /* PREFIX decorates the function names in this file; the default leaves
     them unchanged when the including file has not defined it. */
  # ifndef PREFIX
  #   define PREFIX(ident) ident
  # endif

  /* True when at least `count` characters of MINBPC(enc) bytes each lie
     between ptr and end.  Every argument is parenthesized so that
     expressions such as `n + 1` can be passed safely. */
  # define HAS_CHARS(enc, ptr, end, count)                                      \
      ((end) - (ptr) >= ((count) * MINBPC(enc)))

  /* True when at least one character lies between ptr and end. */
  # define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

  /* Returns XML_TOK_PARTIAL from the enclosing function unless `count`
     characters are available. */
  # define REQUIRE_CHARS(enc, ptr, end, count)                                  \
      {                                                                         \
        if (! HAS_CHARS(enc, ptr, end, count)) {                                \
          return XML_TOK_PARTIAL;                                               \
        }                                                                       \
      }

  /* Returns XML_TOK_PARTIAL from the enclosing function unless one
     character is available. */
  # define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
  114. /* ptr points to character following "<!-" */
  115. static int PTRCALL
  116. PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
  117. const char **nextTokPtr) {
  118. if (HAS_CHAR(enc, ptr, end)) {
  119. if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  120. *nextTokPtr = ptr;
  121. return XML_TOK_INVALID;
  122. }
  123. ptr += MINBPC(enc);
  124. while (HAS_CHAR(enc, ptr, end)) {
  125. switch (BYTE_TYPE(enc, ptr)) {
  126. INVALID_CASES(ptr, nextTokPtr)
  127. case BT_MINUS:
  128. ptr += MINBPC(enc);
  129. REQUIRE_CHAR(enc, ptr, end);
  130. if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  131. ptr += MINBPC(enc);
  132. REQUIRE_CHAR(enc, ptr, end);
  133. if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  134. *nextTokPtr = ptr;
  135. return XML_TOK_INVALID;
  136. }
  137. *nextTokPtr = ptr + MINBPC(enc);
  138. return XML_TOK_COMMENT;
  139. }
  140. break;
  141. default:
  142. ptr += MINBPC(enc);
  143. break;
  144. }
  145. }
  146. }
  147. return XML_TOK_PARTIAL;
  148. }
  149. /* ptr points to character following "<!" */
  150. static int PTRCALL
  151. PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
  152. const char **nextTokPtr) {
  153. REQUIRE_CHAR(enc, ptr, end);
  154. switch (BYTE_TYPE(enc, ptr)) {
  155. case BT_MINUS:
  156. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  157. case BT_LSQB:
  158. *nextTokPtr = ptr + MINBPC(enc);
  159. return XML_TOK_COND_SECT_OPEN;
  160. case BT_NMSTRT:
  161. case BT_HEX:
  162. ptr += MINBPC(enc);
  163. break;
  164. default:
  165. *nextTokPtr = ptr;
  166. return XML_TOK_INVALID;
  167. }
  168. while (HAS_CHAR(enc, ptr, end)) {
  169. switch (BYTE_TYPE(enc, ptr)) {
  170. case BT_PERCNT:
  171. REQUIRE_CHARS(enc, ptr, end, 2);
  172. /* don't allow <!ENTITY% foo "whatever"> */
  173. switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  174. case BT_S:
  175. case BT_CR:
  176. case BT_LF:
  177. case BT_PERCNT:
  178. *nextTokPtr = ptr;
  179. return XML_TOK_INVALID;
  180. }
  181. /* fall through */
  182. case BT_S:
  183. case BT_CR:
  184. case BT_LF:
  185. *nextTokPtr = ptr;
  186. return XML_TOK_DECL_OPEN;
  187. case BT_NMSTRT:
  188. case BT_HEX:
  189. ptr += MINBPC(enc);
  190. break;
  191. default:
  192. *nextTokPtr = ptr;
  193. return XML_TOK_INVALID;
  194. }
  195. }
  196. return XML_TOK_PARTIAL;
  197. }
  198. static int PTRCALL
  199. PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
  200. int *tokPtr) {
  201. int upper = 0;
  202. UNUSED_P(enc);
  203. *tokPtr = XML_TOK_PI;
  204. if (end - ptr != MINBPC(enc) * 3)
  205. return 1;
  206. switch (BYTE_TO_ASCII(enc, ptr)) {
  207. case ASCII_x:
  208. break;
  209. case ASCII_X:
  210. upper = 1;
  211. break;
  212. default:
  213. return 1;
  214. }
  215. ptr += MINBPC(enc);
  216. switch (BYTE_TO_ASCII(enc, ptr)) {
  217. case ASCII_m:
  218. break;
  219. case ASCII_M:
  220. upper = 1;
  221. break;
  222. default:
  223. return 1;
  224. }
  225. ptr += MINBPC(enc);
  226. switch (BYTE_TO_ASCII(enc, ptr)) {
  227. case ASCII_l:
  228. break;
  229. case ASCII_L:
  230. upper = 1;
  231. break;
  232. default:
  233. return 1;
  234. }
  235. if (upper)
  236. return 0;
  237. *tokPtr = XML_TOK_XML_DECL;
  238. return 1;
  239. }
  240. /* ptr points to character following "<?" */
  241. static int PTRCALL
  242. PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
  243. const char **nextTokPtr) {
  244. int tok;
  245. const char *target = ptr;
  246. REQUIRE_CHAR(enc, ptr, end);
  247. switch (BYTE_TYPE(enc, ptr)) {
  248. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  249. default:
  250. *nextTokPtr = ptr;
  251. return XML_TOK_INVALID;
  252. }
  253. while (HAS_CHAR(enc, ptr, end)) {
  254. switch (BYTE_TYPE(enc, ptr)) {
  255. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  256. case BT_S:
  257. case BT_CR:
  258. case BT_LF:
  259. if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  260. *nextTokPtr = ptr;
  261. return XML_TOK_INVALID;
  262. }
  263. ptr += MINBPC(enc);
  264. while (HAS_CHAR(enc, ptr, end)) {
  265. switch (BYTE_TYPE(enc, ptr)) {
  266. INVALID_CASES(ptr, nextTokPtr)
  267. case BT_QUEST:
  268. ptr += MINBPC(enc);
  269. REQUIRE_CHAR(enc, ptr, end);
  270. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  271. *nextTokPtr = ptr + MINBPC(enc);
  272. return tok;
  273. }
  274. break;
  275. default:
  276. ptr += MINBPC(enc);
  277. break;
  278. }
  279. }
  280. return XML_TOK_PARTIAL;
  281. case BT_QUEST:
  282. if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  283. *nextTokPtr = ptr;
  284. return XML_TOK_INVALID;
  285. }
  286. ptr += MINBPC(enc);
  287. REQUIRE_CHAR(enc, ptr, end);
  288. if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  289. *nextTokPtr = ptr + MINBPC(enc);
  290. return tok;
  291. }
  292. /* fall through */
  293. default:
  294. *nextTokPtr = ptr;
  295. return XML_TOK_INVALID;
  296. }
  297. }
  298. return XML_TOK_PARTIAL;
  299. }
  300. static int PTRCALL
  301. PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
  302. const char **nextTokPtr) {
  303. static const char CDATA_LSQB[]
  304. = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
  305. int i;
  306. UNUSED_P(enc);
  307. /* CDATA[ */
  308. REQUIRE_CHARS(enc, ptr, end, 6);
  309. for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  310. if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  311. *nextTokPtr = ptr;
  312. return XML_TOK_INVALID;
  313. }
  314. }
  315. *nextTokPtr = ptr;
  316. return XML_TOK_CDATA_SECT_OPEN;
  317. }
  /* Returns the next token inside a CDATA section that starts at ptr.
     - XML_TOK_NONE when ptr >= end.
     - XML_TOK_CDATA_SECT_CLOSE for "]]>", with *nextTokPtr just past it.
     - XML_TOK_DATA_NEWLINE for CR, CR LF or LF.
     - XML_TOK_DATA_CHARS for a run of other characters; *nextTokPtr is where
       the run stops.
     - XML_TOK_INVALID, XML_TOK_PARTIAL or XML_TOK_PARTIAL_CHAR when the first
       character is invalid or the input ends in the middle of a construct. */
  318. static int PTRCALL
  319. PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
  320. const char **nextTokPtr) {
  321. if (ptr >= end)
  322. return XML_TOK_NONE;
  /* For multi-byte code units, drop a trailing fragment shorter than one
     unit so that only whole units are examined.  NOTE(review): the masking
     presumably relies on MINBPC(enc) being a power of two; confirm. */
  323. if (MINBPC(enc) > 1) {
  324. size_t n = end - ptr;
  325. if (n & (MINBPC(enc) - 1)) {
  326. n &= ~(MINBPC(enc) - 1);
  327. if (n == 0)
  328. return XML_TOK_PARTIAL;
  329. end = ptr + n;
  330. }
  331. }
  /* The first character decides which kind of token is produced. */
  332. switch (BYTE_TYPE(enc, ptr)) {
  333. case BT_RSQB:
  334. ptr += MINBPC(enc);
  335. REQUIRE_CHAR(enc, ptr, end);
  336. if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  337. break; /* single ']' is ordinary data; continue with the data run */
  338. ptr += MINBPC(enc);
  339. REQUIRE_CHAR(enc, ptr, end);
  340. if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  /* "]]" without '>': step back to the second ']' so the data run below
     stops there and it is examined again on the next call. */
  341. ptr -= MINBPC(enc);
  342. break;
  343. }
  344. *nextTokPtr = ptr + MINBPC(enc);
  345. return XML_TOK_CDATA_SECT_CLOSE;
  346. case BT_CR:
  347. ptr += MINBPC(enc);
  348. REQUIRE_CHAR(enc, ptr, end);
  349. if (BYTE_TYPE(enc, ptr) == BT_LF)
  350. ptr += MINBPC(enc); /* CR LF is reported as one newline */
  351. *nextTokPtr = ptr;
  352. return XML_TOK_DATA_NEWLINE;
  353. case BT_LF:
  354. *nextTokPtr = ptr + MINBPC(enc);
  355. return XML_TOK_DATA_NEWLINE;
  356. INVALID_CASES(ptr, nextTokPtr)
  357. default:
  358. ptr += MINBPC(enc);
  359. break;
  360. }
  /* Extend the data run until a character that must start its own token. */
  361. while (HAS_CHAR(enc, ptr, end)) {
  362. switch (BYTE_TYPE(enc, ptr)) {
  /* An incomplete or invalid multi-byte sequence ends the run in front of
     it instead of failing; the data collected so far is returned. */
  363. # define LEAD_CASE(n) \
  364. case BT_LEAD##n: \
  365. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  366. *nextTokPtr = ptr; \
  367. return XML_TOK_DATA_CHARS; \
  368. } \
  369. ptr += n; \
  370. break;
  371. LEAD_CASE(2)
  372. LEAD_CASE(3)
  373. LEAD_CASE(4)
  374. # undef LEAD_CASE
  375. case BT_NONXML:
  376. case BT_MALFORM:
  377. case BT_TRAIL:
  378. case BT_CR:
  379. case BT_LF:
  380. case BT_RSQB:
  381. *nextTokPtr = ptr;
  382. return XML_TOK_DATA_CHARS;
  383. default:
  384. ptr += MINBPC(enc);
  385. break;
  386. }
  387. }
  388. *nextTokPtr = ptr;
  389. return XML_TOK_DATA_CHARS;
  390. }
  391. /* ptr points to character following "</" */
  392. static int PTRCALL
  393. PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
  394. const char **nextTokPtr) {
  395. REQUIRE_CHAR(enc, ptr, end);
  396. switch (BYTE_TYPE(enc, ptr)) {
  397. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  398. default:
  399. *nextTokPtr = ptr;
  400. return XML_TOK_INVALID;
  401. }
  402. while (HAS_CHAR(enc, ptr, end)) {
  403. switch (BYTE_TYPE(enc, ptr)) {
  404. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  405. case BT_S:
  406. case BT_CR:
  407. case BT_LF:
  408. for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
  409. switch (BYTE_TYPE(enc, ptr)) {
  410. case BT_S:
  411. case BT_CR:
  412. case BT_LF:
  413. break;
  414. case BT_GT:
  415. *nextTokPtr = ptr + MINBPC(enc);
  416. return XML_TOK_END_TAG;
  417. default:
  418. *nextTokPtr = ptr;
  419. return XML_TOK_INVALID;
  420. }
  421. }
  422. return XML_TOK_PARTIAL;
  423. # ifdef XML_NS
  424. case BT_COLON:
  425. /* no need to check qname syntax here,
  426. since end-tag must match exactly */
  427. ptr += MINBPC(enc);
  428. break;
  429. # endif
  430. case BT_GT:
  431. *nextTokPtr = ptr + MINBPC(enc);
  432. return XML_TOK_END_TAG;
  433. default:
  434. *nextTokPtr = ptr;
  435. return XML_TOK_INVALID;
  436. }
  437. }
  438. return XML_TOK_PARTIAL;
  439. }
  440. /* ptr points to character following "&#X" */
  441. static int PTRCALL
  442. PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
  443. const char **nextTokPtr) {
  444. if (HAS_CHAR(enc, ptr, end)) {
  445. switch (BYTE_TYPE(enc, ptr)) {
  446. case BT_DIGIT:
  447. case BT_HEX:
  448. break;
  449. default:
  450. *nextTokPtr = ptr;
  451. return XML_TOK_INVALID;
  452. }
  453. for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
  454. switch (BYTE_TYPE(enc, ptr)) {
  455. case BT_DIGIT:
  456. case BT_HEX:
  457. break;
  458. case BT_SEMI:
  459. *nextTokPtr = ptr + MINBPC(enc);
  460. return XML_TOK_CHAR_REF;
  461. default:
  462. *nextTokPtr = ptr;
  463. return XML_TOK_INVALID;
  464. }
  465. }
  466. }
  467. return XML_TOK_PARTIAL;
  468. }
  469. /* ptr points to character following "&#" */
  470. static int PTRCALL
  471. PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
  472. const char **nextTokPtr) {
  473. if (HAS_CHAR(enc, ptr, end)) {
  474. if (CHAR_MATCHES(enc, ptr, ASCII_x))
  475. return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  476. switch (BYTE_TYPE(enc, ptr)) {
  477. case BT_DIGIT:
  478. break;
  479. default:
  480. *nextTokPtr = ptr;
  481. return XML_TOK_INVALID;
  482. }
  483. for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
  484. switch (BYTE_TYPE(enc, ptr)) {
  485. case BT_DIGIT:
  486. break;
  487. case BT_SEMI:
  488. *nextTokPtr = ptr + MINBPC(enc);
  489. return XML_TOK_CHAR_REF;
  490. default:
  491. *nextTokPtr = ptr;
  492. return XML_TOK_INVALID;
  493. }
  494. }
  495. }
  496. return XML_TOK_PARTIAL;
  497. }
  498. /* ptr points to character following "&" */
  499. static int PTRCALL
  500. PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  501. const char **nextTokPtr) {
  502. REQUIRE_CHAR(enc, ptr, end);
  503. switch (BYTE_TYPE(enc, ptr)) {
  504. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  505. case BT_NUM:
  506. return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  507. default:
  508. *nextTokPtr = ptr;
  509. return XML_TOK_INVALID;
  510. }
  511. while (HAS_CHAR(enc, ptr, end)) {
  512. switch (BYTE_TYPE(enc, ptr)) {
  513. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  514. case BT_SEMI:
  515. *nextTokPtr = ptr + MINBPC(enc);
  516. return XML_TOK_ENTITY_REF;
  517. default:
  518. *nextTokPtr = ptr;
  519. return XML_TOK_INVALID;
  520. }
  521. }
  522. return XML_TOK_PARTIAL;
  523. }
  /* Scans the attributes of a start tag through to the end of the tag.
     Returns XML_TOK_START_TAG_WITH_ATTS for ">" or
     XML_TOK_EMPTY_ELEMENT_WITH_ATTS for "/>", with *nextTokPtr just past the
     tag; XML_TOK_INVALID with *nextTokPtr at the offending character; or a
     partial token (XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR) when the input
     ends first. */
  524. /* ptr points to character following first character of attribute name */
  525. static int PTRCALL
  526. PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
  527. const char **nextTokPtr) {
  528. # ifdef XML_NS
  529. int hadColon = 0; /* set once the current attribute name contains ':' */
  530. # endif
  /* Each pass consumes either one more character of the attribute name or,
     once whitespace or '=' is reached, the rest of that attribute. */
  531. while (HAS_CHAR(enc, ptr, end)) {
  532. switch (BYTE_TYPE(enc, ptr)) {
  533. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  534. # ifdef XML_NS
  535. case BT_COLON:
  /* At most one ':' per attribute name, and it must be followed by a
     name-start character. */
  536. if (hadColon) {
  537. *nextTokPtr = ptr;
  538. return XML_TOK_INVALID;
  539. }
  540. hadColon = 1;
  541. ptr += MINBPC(enc);
  542. REQUIRE_CHAR(enc, ptr, end);
  543. switch (BYTE_TYPE(enc, ptr)) {
  544. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  545. default:
  546. *nextTokPtr = ptr;
  547. return XML_TOK_INVALID;
  548. }
  549. break;
  550. # endif
  551. case BT_S:
  552. case BT_CR:
  553. case BT_LF:
  /* Whitespace after the name: only more whitespace or '=' may follow. */
  554. for (;;) {
  555. int t;
  556. ptr += MINBPC(enc);
  557. REQUIRE_CHAR(enc, ptr, end);
  558. t = BYTE_TYPE(enc, ptr);
  559. if (t == BT_EQUALS)
  560. break;
  561. switch (t) {
  562. case BT_S:
  563. case BT_LF:
  564. case BT_CR:
  565. break;
  566. default:
  567. *nextTokPtr = ptr;
  568. return XML_TOK_INVALID;
  569. }
  570. }
  571. /* fall through */
  572. case BT_EQUALS: {
  573. int open; /* byte type of the opening quote: BT_QUOT or BT_APOS */
  574. # ifdef XML_NS
  575. hadColon = 0; /* the name is complete; the next one may have its own ':' */
  576. # endif
  /* Skip whitespace between '=' and the opening quote. */
  577. for (;;) {
  578. ptr += MINBPC(enc);
  579. REQUIRE_CHAR(enc, ptr, end);
  580. open = BYTE_TYPE(enc, ptr);
  581. if (open == BT_QUOT || open == BT_APOS)
  582. break;
  583. switch (open) {
  584. case BT_S:
  585. case BT_LF:
  586. case BT_CR:
  587. break;
  588. default:
  589. *nextTokPtr = ptr;
  590. return XML_TOK_INVALID;
  591. }
  592. }
  593. ptr += MINBPC(enc);
  594. /* in attribute value */
  595. for (;;) {
  596. int t;
  597. REQUIRE_CHAR(enc, ptr, end);
  598. t = BYTE_TYPE(enc, ptr);
  599. if (t == open)
  600. break; /* matching close quote; the other quote kind is plain data */
  601. switch (t) {
  602. INVALID_CASES(ptr, nextTokPtr)
  603. case BT_AMP: {
  /* scanRef advances ptr past the reference on success.  A result <= 0 is
     returned as is; *nextTokPtr is only written for XML_TOK_INVALID. */
  604. int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
  605. if (tok <= 0) {
  606. if (tok == XML_TOK_INVALID)
  607. *nextTokPtr = ptr;
  608. return tok;
  609. }
  610. break;
  611. }
  612. case BT_LT:
  /* '<' is not allowed inside an attribute value. */
  613. *nextTokPtr = ptr;
  614. return XML_TOK_INVALID;
  615. default:
  616. ptr += MINBPC(enc);
  617. break;
  618. }
  619. }
  /* Step past the closing quote; whitespace, '/' or '>' must follow. */
  620. ptr += MINBPC(enc);
  621. REQUIRE_CHAR(enc, ptr, end);
  622. switch (BYTE_TYPE(enc, ptr)) {
  623. case BT_S:
  624. case BT_CR:
  625. case BT_LF:
  626. break;
  627. case BT_SOL:
  628. goto sol;
  629. case BT_GT:
  630. goto gt;
  631. default:
  632. *nextTokPtr = ptr;
  633. return XML_TOK_INVALID;
  634. }
  635. /* ptr points to the whitespace character after the closing quote */
  636. for (;;) {
  637. ptr += MINBPC(enc);
  638. REQUIRE_CHAR(enc, ptr, end);
  639. switch (BYTE_TYPE(enc, ptr)) {
  /* A name-start character begins the next attribute: the macro consumes
     it and breaks out of this switch, and the breaks below then return
     control to the outer name loop. */
  640. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  641. case BT_S:
  642. case BT_CR:
  643. case BT_LF:
  644. continue;
  645. case BT_GT:
  646. gt:
  647. *nextTokPtr = ptr + MINBPC(enc);
  648. return XML_TOK_START_TAG_WITH_ATTS;
  649. case BT_SOL:
  650. sol:
  651. ptr += MINBPC(enc);
  652. REQUIRE_CHAR(enc, ptr, end);
  653. if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  654. *nextTokPtr = ptr;
  655. return XML_TOK_INVALID;
  656. }
  657. *nextTokPtr = ptr + MINBPC(enc);
  658. return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
  659. default:
  660. *nextTokPtr = ptr;
  661. return XML_TOK_INVALID;
  662. }
  663. break;
  664. }
  665. break;
  666. }
  667. default:
  668. *nextTokPtr = ptr;
  669. return XML_TOK_INVALID;
  670. }
  671. }
  672. return XML_TOK_PARTIAL;
  673. }
  /* Scans markup that begins with '<' and dispatches on the next character:
     "!-" goes to scanComment, "![" to scanCdataSection, '?' to scanPi and
     '/' to scanEndTag; a name-start character begins a start tag.  For a
     start tag without attributes it returns XML_TOK_START_TAG_NO_ATTS or
     XML_TOK_EMPTY_ELEMENT_NO_ATTS with *nextTokPtr just past the tag; when an
     attribute name begins, the result of scanAtts is returned. */
  674. /* ptr points to character following "<" */
  675. static int PTRCALL
  676. PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
  677. const char **nextTokPtr) {
  678. # ifdef XML_NS
  679. int hadColon; /* set once the element name contains ':' */
  680. # endif
  681. REQUIRE_CHAR(enc, ptr, end);
  682. switch (BYTE_TYPE(enc, ptr)) {
  683. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  684. case BT_EXCL:
  685. ptr += MINBPC(enc);
  686. REQUIRE_CHAR(enc, ptr, end);
  687. switch (BYTE_TYPE(enc, ptr)) {
  688. case BT_MINUS:
  689. return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  690. case BT_LSQB:
  691. return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  692. }
  /* Any other character after "<!" is rejected here. */
  693. *nextTokPtr = ptr;
  694. return XML_TOK_INVALID;
  695. case BT_QUEST:
  696. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  697. case BT_SOL:
  698. return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  699. default:
  700. *nextTokPtr = ptr;
  701. return XML_TOK_INVALID;
  702. }
  703. # ifdef XML_NS
  704. hadColon = 0;
  705. # endif
  706. /* we have a start-tag */
  707. while (HAS_CHAR(enc, ptr, end)) {
  708. switch (BYTE_TYPE(enc, ptr)) {
  709. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  710. # ifdef XML_NS
  711. case BT_COLON:
  /* At most one ':' in the element name, and it must be followed by a
     name-start character. */
  712. if (hadColon) {
  713. *nextTokPtr = ptr;
  714. return XML_TOK_INVALID;
  715. }
  716. hadColon = 1;
  717. ptr += MINBPC(enc);
  718. REQUIRE_CHAR(enc, ptr, end);
  719. switch (BYTE_TYPE(enc, ptr)) {
  720. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  721. default:
  722. *nextTokPtr = ptr;
  723. return XML_TOK_INVALID;
  724. }
  725. break;
  726. # endif
  727. case BT_S:
  728. case BT_CR:
  729. case BT_LF: {
  /* Whitespace ends the element name; find out what follows it. */
  730. ptr += MINBPC(enc);
  731. while (HAS_CHAR(enc, ptr, end)) {
  732. switch (BYTE_TYPE(enc, ptr)) {
  /* A name-start character begins an attribute: the macro consumes it and
     breaks out of the switch, which leads to the scanAtts call below. */
  733. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  734. case BT_GT:
  735. goto gt;
  736. case BT_SOL:
  737. goto sol;
  738. case BT_S:
  739. case BT_CR:
  740. case BT_LF:
  741. ptr += MINBPC(enc);
  742. continue;
  743. default:
  744. *nextTokPtr = ptr;
  745. return XML_TOK_INVALID;
  746. }
  747. return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
  748. }
  749. return XML_TOK_PARTIAL;
  750. }
  751. case BT_GT:
  752. gt:
  753. *nextTokPtr = ptr + MINBPC(enc);
  754. return XML_TOK_START_TAG_NO_ATTS;
  755. case BT_SOL:
  756. sol:
  /* '/' must be immediately followed by '>' to close an empty element. */
  757. ptr += MINBPC(enc);
  758. REQUIRE_CHAR(enc, ptr, end);
  759. if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  760. *nextTokPtr = ptr;
  761. return XML_TOK_INVALID;
  762. }
  763. *nextTokPtr = ptr + MINBPC(enc);
  764. return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
  765. default:
  766. *nextTokPtr = ptr;
  767. return XML_TOK_INVALID;
  768. }
  769. }
  770. return XML_TOK_PARTIAL;
  771. }
  /* Returns the next token of element content starting at ptr.
     - XML_TOK_NONE when ptr >= end.
     - '<' and '&' are delegated to scanLt and scanRef.
     - XML_TOK_DATA_NEWLINE for CR, CR LF or LF; XML_TOK_TRAILING_CR when a CR
       is the last available character.
     - XML_TOK_TRAILING_RSQB when the input ends after "]" or "]]".
     - XML_TOK_INVALID when "]]>" appears in content.
     - XML_TOK_DATA_CHARS for a run of other characters; *nextTokPtr is where
       the run stops. */
  772. static int PTRCALL
  773. PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  774. const char **nextTokPtr) {
  775. if (ptr >= end)
  776. return XML_TOK_NONE;
  /* For multi-byte code units, drop a trailing fragment shorter than one
     unit so that only whole units are examined.  NOTE(review): the masking
     presumably relies on MINBPC(enc) being a power of two; confirm. */
  777. if (MINBPC(enc) > 1) {
  778. size_t n = end - ptr;
  779. if (n & (MINBPC(enc) - 1)) {
  780. n &= ~(MINBPC(enc) - 1);
  781. if (n == 0)
  782. return XML_TOK_PARTIAL;
  783. end = ptr + n;
  784. }
  785. }
  /* The first character decides which kind of token is produced. */
  786. switch (BYTE_TYPE(enc, ptr)) {
  787. case BT_LT:
  788. return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  789. case BT_AMP:
  790. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  791. case BT_CR:
  792. ptr += MINBPC(enc);
  793. if (! HAS_CHAR(enc, ptr, end))
  794. return XML_TOK_TRAILING_CR;
  795. if (BYTE_TYPE(enc, ptr) == BT_LF)
  796. ptr += MINBPC(enc); /* CR LF is reported as one newline */
  797. *nextTokPtr = ptr;
  798. return XML_TOK_DATA_NEWLINE;
  799. case BT_LF:
  800. *nextTokPtr = ptr + MINBPC(enc);
  801. return XML_TOK_DATA_NEWLINE;
  802. case BT_RSQB:
  803. ptr += MINBPC(enc);
  804. if (! HAS_CHAR(enc, ptr, end))
  805. return XML_TOK_TRAILING_RSQB;
  806. if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  807. break; /* single ']' is ordinary data; continue with the data run */
  808. ptr += MINBPC(enc);
  809. if (! HAS_CHAR(enc, ptr, end))
  810. return XML_TOK_TRAILING_RSQB;
  811. if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  /* "]]" without '>': step back to the second ']' and continue with the
     data run below. */
  812. ptr -= MINBPC(enc);
  813. break;
  814. }
  /* "]]>" in content is rejected; *nextTokPtr points at the '>'. */
  815. *nextTokPtr = ptr;
  816. return XML_TOK_INVALID;
  817. INVALID_CASES(ptr, nextTokPtr)
  818. default:
  819. ptr += MINBPC(enc);
  820. break;
  821. }
  /* Extend the data run until a character that must start its own token. */
  822. while (HAS_CHAR(enc, ptr, end)) {
  823. switch (BYTE_TYPE(enc, ptr)) {
  /* An incomplete or invalid multi-byte sequence ends the run in front of
     it instead of failing; the data collected so far is returned. */
  824. # define LEAD_CASE(n) \
  825. case BT_LEAD##n: \
  826. if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  827. *nextTokPtr = ptr; \
  828. return XML_TOK_DATA_CHARS; \
  829. } \
  830. ptr += n; \
  831. break;
  832. LEAD_CASE(2)
  833. LEAD_CASE(3)
  834. LEAD_CASE(4)
  835. # undef LEAD_CASE
  836. case BT_RSQB:
  /* A ']' that is provably not the start of "]]>" is consumed as data.
     "]]>" is reported as invalid at the '>'.  When too few characters are
     available to decide, the run ends in front of the ']'. */
  837. if (HAS_CHARS(enc, ptr, end, 2)) {
  838. if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  839. ptr += MINBPC(enc);
  840. break;
  841. }
  842. if (HAS_CHARS(enc, ptr, end, 3)) {
  843. if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
  844. ptr += MINBPC(enc);
  845. break;
  846. }
  847. *nextTokPtr = ptr + 2 * MINBPC(enc);
  848. return XML_TOK_INVALID;
  849. }
  850. }
  851. /* fall through */
  852. case BT_AMP:
  853. case BT_LT:
  854. case BT_NONXML:
  855. case BT_MALFORM:
  856. case BT_TRAIL:
  857. case BT_CR:
  858. case BT_LF:
  859. *nextTokPtr = ptr;
  860. return XML_TOK_DATA_CHARS;
  861. default:
  862. ptr += MINBPC(enc);
  863. break;
  864. }
  865. }
  866. *nextTokPtr = ptr;
  867. return XML_TOK_DATA_CHARS;
  868. }
  869. /* ptr points to character following "%" */
  870. static int PTRCALL
  871. PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  872. const char **nextTokPtr) {
  873. REQUIRE_CHAR(enc, ptr, end);
  874. switch (BYTE_TYPE(enc, ptr)) {
  875. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  876. case BT_S:
  877. case BT_LF:
  878. case BT_CR:
  879. case BT_PERCNT:
  880. *nextTokPtr = ptr;
  881. return XML_TOK_PERCENT;
  882. default:
  883. *nextTokPtr = ptr;
  884. return XML_TOK_INVALID;
  885. }
  886. while (HAS_CHAR(enc, ptr, end)) {
  887. switch (BYTE_TYPE(enc, ptr)) {
  888. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  889. case BT_SEMI:
  890. *nextTokPtr = ptr + MINBPC(enc);
  891. return XML_TOK_PARAM_ENTITY_REF;
  892. default:
  893. *nextTokPtr = ptr;
  894. return XML_TOK_INVALID;
  895. }
  896. }
  897. return XML_TOK_PARTIAL;
  898. }
  899. static int PTRCALL
  900. PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  901. const char **nextTokPtr) {
  902. REQUIRE_CHAR(enc, ptr, end);
  903. switch (BYTE_TYPE(enc, ptr)) {
  904. CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  905. default:
  906. *nextTokPtr = ptr;
  907. return XML_TOK_INVALID;
  908. }
  909. while (HAS_CHAR(enc, ptr, end)) {
  910. switch (BYTE_TYPE(enc, ptr)) {
  911. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  912. case BT_CR:
  913. case BT_LF:
  914. case BT_S:
  915. case BT_RPAR:
  916. case BT_GT:
  917. case BT_PERCNT:
  918. case BT_VERBAR:
  919. *nextTokPtr = ptr;
  920. return XML_TOK_POUND_NAME;
  921. default:
  922. *nextTokPtr = ptr;
  923. return XML_TOK_INVALID;
  924. }
  925. }
  926. return -XML_TOK_POUND_NAME;
  927. }
  928. static int PTRCALL
  929. PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
  930. const char **nextTokPtr) {
  931. while (HAS_CHAR(enc, ptr, end)) {
  932. int t = BYTE_TYPE(enc, ptr);
  933. switch (t) {
  934. INVALID_CASES(ptr, nextTokPtr)
  935. case BT_QUOT:
  936. case BT_APOS:
  937. ptr += MINBPC(enc);
  938. if (t != open)
  939. break;
  940. if (! HAS_CHAR(enc, ptr, end))
  941. return -XML_TOK_LITERAL;
  942. *nextTokPtr = ptr;
  943. switch (BYTE_TYPE(enc, ptr)) {
  944. case BT_S:
  945. case BT_CR:
  946. case BT_LF:
  947. case BT_GT:
  948. case BT_PERCNT:
  949. case BT_LSQB:
  950. return XML_TOK_LITERAL;
  951. default:
  952. return XML_TOK_INVALID;
  953. }
  954. default:
  955. ptr += MINBPC(enc);
  956. break;
  957. }
  958. }
  959. return XML_TOK_PARTIAL;
  960. }
  961. static int PTRCALL
  962. PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  963. const char **nextTokPtr) {
  964. int tok;
  965. if (ptr >= end)
  966. return XML_TOK_NONE;
  967. if (MINBPC(enc) > 1) {
  968. size_t n = end - ptr;
  969. if (n & (MINBPC(enc) - 1)) {
  970. n &= ~(MINBPC(enc) - 1);
  971. if (n == 0)
  972. return XML_TOK_PARTIAL;
  973. end = ptr + n;
  974. }
  975. }
  976. switch (BYTE_TYPE(enc, ptr)) {
  977. case BT_QUOT:
  978. return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  979. case BT_APOS:
  980. return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  981. case BT_LT: {
  982. ptr += MINBPC(enc);
  983. REQUIRE_CHAR(enc, ptr, end);
  984. switch (BYTE_TYPE(enc, ptr)) {
  985. case BT_EXCL:
  986. return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  987. case BT_QUEST:
  988. return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  989. case BT_NMSTRT:
  990. case BT_HEX:
  991. case BT_NONASCII:
  992. case BT_LEAD2:
  993. case BT_LEAD3:
  994. case BT_LEAD4:
  995. *nextTokPtr = ptr - MINBPC(enc);
  996. return XML_TOK_INSTANCE_START;
  997. }
  998. *nextTokPtr = ptr;
  999. return XML_TOK_INVALID;
  1000. }
  1001. case BT_CR:
  1002. if (ptr + MINBPC(enc) == end) {
  1003. *nextTokPtr = end;
  1004. /* indicate that this might be part of a CR/LF pair */
  1005. return -XML_TOK_PROLOG_S;
  1006. }
  1007. /* fall through */
  1008. case BT_S:
  1009. case BT_LF:
  1010. for (;;) {
  1011. ptr += MINBPC(enc);
  1012. if (! HAS_CHAR(enc, ptr, end))
  1013. break;
  1014. switch (BYTE_TYPE(enc, ptr)) {
  1015. case BT_S:
  1016. case BT_LF:
  1017. break;
  1018. case BT_CR:
  1019. /* don't split CR/LF pair */
  1020. if (ptr + MINBPC(enc) != end)
  1021. break;
  1022. /* fall through */
  1023. default:
  1024. *nextTokPtr = ptr;
  1025. return XML_TOK_PROLOG_S;
  1026. }
  1027. }
  1028. *nextTokPtr = ptr;
  1029. return XML_TOK_PROLOG_S;
  1030. case BT_PERCNT:
  1031. return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1032. case BT_COMMA:
  1033. *nextTokPtr = ptr + MINBPC(enc);
  1034. return XML_TOK_COMMA;
  1035. case BT_LSQB:
  1036. *nextTokPtr = ptr + MINBPC(enc);
  1037. return XML_TOK_OPEN_BRACKET;
  1038. case BT_RSQB:
  1039. ptr += MINBPC(enc);
  1040. if (! HAS_CHAR(enc, ptr, end))
  1041. return -XML_TOK_CLOSE_BRACKET;
  1042. if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
  1043. REQUIRE_CHARS(enc, ptr, end, 2);
  1044. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
  1045. *nextTokPtr = ptr + 2 * MINBPC(enc);
  1046. return XML_TOK_COND_SECT_CLOSE;
  1047. }
  1048. }
  1049. *nextTokPtr = ptr;
  1050. return XML_TOK_CLOSE_BRACKET;
  1051. case BT_LPAR:
  1052. *nextTokPtr = ptr + MINBPC(enc);
  1053. return XML_TOK_OPEN_PAREN;
  1054. case BT_RPAR:
  1055. ptr += MINBPC(enc);
  1056. if (! HAS_CHAR(enc, ptr, end))
  1057. return -XML_TOK_CLOSE_PAREN;
  1058. switch (BYTE_TYPE(enc, ptr)) {
  1059. case BT_AST:
  1060. *nextTokPtr = ptr + MINBPC(enc);
  1061. return XML_TOK_CLOSE_PAREN_ASTERISK;
  1062. case BT_QUEST:
  1063. *nextTokPtr = ptr + MINBPC(enc);
  1064. return XML_TOK_CLOSE_PAREN_QUESTION;
  1065. case BT_PLUS:
  1066. *nextTokPtr = ptr + MINBPC(enc);
  1067. return XML_TOK_CLOSE_PAREN_PLUS;
  1068. case BT_CR:
  1069. case BT_LF:
  1070. case BT_S:
  1071. case BT_GT:
  1072. case BT_COMMA:
  1073. case BT_VERBAR:
  1074. case BT_RPAR:
  1075. *nextTokPtr = ptr;
  1076. return XML_TOK_CLOSE_PAREN;
  1077. }
  1078. *nextTokPtr = ptr;
  1079. return XML_TOK_INVALID;
  1080. case BT_VERBAR:
  1081. *nextTokPtr = ptr + MINBPC(enc);
  1082. return XML_TOK_OR;
  1083. case BT_GT:
  1084. *nextTokPtr = ptr + MINBPC(enc);
  1085. return XML_TOK_DECL_CLOSE;
  1086. case BT_NUM:
  1087. return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1088. # define LEAD_CASE(n) \
  1089. case BT_LEAD##n: \
  1090. if (end - ptr < n) \
  1091. return XML_TOK_PARTIAL_CHAR; \
  1092. if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
  1093. ptr += n; \
  1094. tok = XML_TOK_NAME; \
  1095. break; \
  1096. } \
  1097. if (IS_NAME_CHAR(enc, ptr, n)) { \
  1098. ptr += n; \
  1099. tok = XML_TOK_NMTOKEN; \
  1100. break; \
  1101. } \
  1102. *nextTokPtr = ptr; \
  1103. return XML_TOK_INVALID;
  1104. LEAD_CASE(2)
  1105. LEAD_CASE(3)
  1106. LEAD_CASE(4)
  1107. # undef LEAD_CASE
  1108. case BT_NMSTRT:
  1109. case BT_HEX:
  1110. tok = XML_TOK_NAME;
  1111. ptr += MINBPC(enc);
  1112. break;
  1113. case BT_DIGIT:
  1114. case BT_NAME:
  1115. case BT_MINUS:
  1116. # ifdef XML_NS
  1117. case BT_COLON:
  1118. # endif
  1119. tok = XML_TOK_NMTOKEN;
  1120. ptr += MINBPC(enc);
  1121. break;
  1122. case BT_NONASCII:
  1123. if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
  1124. ptr += MINBPC(enc);
  1125. tok = XML_TOK_NAME;
  1126. break;
  1127. }
  1128. if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
  1129. ptr += MINBPC(enc);
  1130. tok = XML_TOK_NMTOKEN;
  1131. break;
  1132. }
  1133. /* fall through */
  1134. default:
  1135. *nextTokPtr = ptr;
  1136. return XML_TOK_INVALID;
  1137. }
  1138. while (HAS_CHAR(enc, ptr, end)) {
  1139. switch (BYTE_TYPE(enc, ptr)) {
  1140. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1141. case BT_GT:
  1142. case BT_RPAR:
  1143. case BT_COMMA:
  1144. case BT_VERBAR:
  1145. case BT_LSQB:
  1146. case BT_PERCNT:
  1147. case BT_S:
  1148. case BT_CR:
  1149. case BT_LF:
  1150. *nextTokPtr = ptr;
  1151. return tok;
  1152. # ifdef XML_NS
  1153. case BT_COLON:
  1154. ptr += MINBPC(enc);
  1155. switch (tok) {
  1156. case XML_TOK_NAME:
  1157. REQUIRE_CHAR(enc, ptr, end);
  1158. tok = XML_TOK_PREFIXED_NAME;
  1159. switch (BYTE_TYPE(enc, ptr)) {
  1160. CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  1161. default:
  1162. tok = XML_TOK_NMTOKEN;
  1163. break;
  1164. }
  1165. break;
  1166. case XML_TOK_PREFIXED_NAME:
  1167. tok = XML_TOK_NMTOKEN;
  1168. break;
  1169. }
  1170. break;
  1171. # endif
  1172. case BT_PLUS:
  1173. if (tok == XML_TOK_NMTOKEN) {
  1174. *nextTokPtr = ptr;
  1175. return XML_TOK_INVALID;
  1176. }
  1177. *nextTokPtr = ptr + MINBPC(enc);
  1178. return XML_TOK_NAME_PLUS;
  1179. case BT_AST:
  1180. if (tok == XML_TOK_NMTOKEN) {
  1181. *nextTokPtr = ptr;
  1182. return XML_TOK_INVALID;
  1183. }
  1184. *nextTokPtr = ptr + MINBPC(enc);
  1185. return XML_TOK_NAME_ASTERISK;
  1186. case BT_QUEST:
  1187. if (tok == XML_TOK_NMTOKEN) {
  1188. *nextTokPtr = ptr;
  1189. return XML_TOK_INVALID;
  1190. }
  1191. *nextTokPtr = ptr + MINBPC(enc);
  1192. return XML_TOK_NAME_QUESTION;
  1193. default:
  1194. *nextTokPtr = ptr;
  1195. return XML_TOK_INVALID;
  1196. }
  1197. }
  1198. return -tok;
  1199. }
  1200. static int PTRCALL
  1201. PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
  1202. const char **nextTokPtr) {
  1203. const char *start;
  1204. if (ptr >= end)
  1205. return XML_TOK_NONE;
  1206. else if (! HAS_CHAR(enc, ptr, end)) {
  1207. /* This line cannot be executed. The incoming data has already
  1208. * been tokenized once, so incomplete characters like this have
  1209. * already been eliminated from the input. Retaining the paranoia
  1210. * check is still valuable, however.
  1211. */
  1212. return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
  1213. }
  1214. start = ptr;
  1215. while (HAS_CHAR(enc, ptr, end)) {
  1216. switch (BYTE_TYPE(enc, ptr)) {
  1217. # define LEAD_CASE(n) \
  1218. case BT_LEAD##n: \
  1219. ptr += n; \
  1220. break;
  1221. LEAD_CASE(2)
  1222. LEAD_CASE(3)
  1223. LEAD_CASE(4)
  1224. # undef LEAD_CASE
  1225. case BT_AMP:
  1226. if (ptr == start)
  1227. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1228. *nextTokPtr = ptr;
  1229. return XML_TOK_DATA_CHARS;
  1230. case BT_LT:
  1231. /* this is for inside entity references */
  1232. *nextTokPtr = ptr;
  1233. return XML_TOK_INVALID;
  1234. case BT_LF:
  1235. if (ptr == start) {
  1236. *nextTokPtr = ptr + MINBPC(enc);
  1237. return XML_TOK_DATA_NEWLINE;
  1238. }
  1239. *nextTokPtr = ptr;
  1240. return XML_TOK_DATA_CHARS;
  1241. case BT_CR:
  1242. if (ptr == start) {
  1243. ptr += MINBPC(enc);
  1244. if (! HAS_CHAR(enc, ptr, end))
  1245. return XML_TOK_TRAILING_CR;
  1246. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1247. ptr += MINBPC(enc);
  1248. *nextTokPtr = ptr;
  1249. return XML_TOK_DATA_NEWLINE;
  1250. }
  1251. *nextTokPtr = ptr;
  1252. return XML_TOK_DATA_CHARS;
  1253. case BT_S:
  1254. if (ptr == start) {
  1255. *nextTokPtr = ptr + MINBPC(enc);
  1256. return XML_TOK_ATTRIBUTE_VALUE_S;
  1257. }
  1258. *nextTokPtr = ptr;
  1259. return XML_TOK_DATA_CHARS;
  1260. default:
  1261. ptr += MINBPC(enc);
  1262. break;
  1263. }
  1264. }
  1265. *nextTokPtr = ptr;
  1266. return XML_TOK_DATA_CHARS;
  1267. }
  1268. static int PTRCALL
  1269. PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
  1270. const char **nextTokPtr) {
  1271. const char *start;
  1272. if (ptr >= end)
  1273. return XML_TOK_NONE;
  1274. else if (! HAS_CHAR(enc, ptr, end)) {
  1275. /* This line cannot be executed. The incoming data has already
  1276. * been tokenized once, so incomplete characters like this have
  1277. * already been eliminated from the input. Retaining the paranoia
  1278. * check is still valuable, however.
  1279. */
  1280. return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
  1281. }
  1282. start = ptr;
  1283. while (HAS_CHAR(enc, ptr, end)) {
  1284. switch (BYTE_TYPE(enc, ptr)) {
  1285. # define LEAD_CASE(n) \
  1286. case BT_LEAD##n: \
  1287. ptr += n; \
  1288. break;
  1289. LEAD_CASE(2)
  1290. LEAD_CASE(3)
  1291. LEAD_CASE(4)
  1292. # undef LEAD_CASE
  1293. case BT_AMP:
  1294. if (ptr == start)
  1295. return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1296. *nextTokPtr = ptr;
  1297. return XML_TOK_DATA_CHARS;
  1298. case BT_PERCNT:
  1299. if (ptr == start) {
  1300. int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  1301. return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
  1302. }
  1303. *nextTokPtr = ptr;
  1304. return XML_TOK_DATA_CHARS;
  1305. case BT_LF:
  1306. if (ptr == start) {
  1307. *nextTokPtr = ptr + MINBPC(enc);
  1308. return XML_TOK_DATA_NEWLINE;
  1309. }
  1310. *nextTokPtr = ptr;
  1311. return XML_TOK_DATA_CHARS;
  1312. case BT_CR:
  1313. if (ptr == start) {
  1314. ptr += MINBPC(enc);
  1315. if (! HAS_CHAR(enc, ptr, end))
  1316. return XML_TOK_TRAILING_CR;
  1317. if (BYTE_TYPE(enc, ptr) == BT_LF)
  1318. ptr += MINBPC(enc);
  1319. *nextTokPtr = ptr;
  1320. return XML_TOK_DATA_NEWLINE;
  1321. }
  1322. *nextTokPtr = ptr;
  1323. return XML_TOK_DATA_CHARS;
  1324. default:
  1325. ptr += MINBPC(enc);
  1326. break;
  1327. }
  1328. }
  1329. *nextTokPtr = ptr;
  1330. return XML_TOK_DATA_CHARS;
  1331. }
  #ifdef XML_DTD

  /* Scans forward for the "]]>" that balances the section being skipped.

     Every "<![" met on the way raises the nesting depth and every "]]>"
     met while the depth is non-zero lowers it.  The "]]>" found at depth zero
     ends the scan: *nextTokPtr is set just past it and XML_TOK_IGNORE_SECT is
     returned.  XML_TOK_PARTIAL is returned when the input runs out first.
     REQUIRE_CHAR and INVALID_CASES are macros defined earlier in this file
     and may presumably return on their own. */
  static int PTRCALL
  PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                           const char **nextTokPtr) {
    int depth = 0;

    if (MINBPC(enc) > 1) {
      /* Trim end so the range is a whole multiple of MINBPC(enc); the mask
         arithmetic presumes MINBPC(enc) is a power of two. */
      size_t avail = end - ptr;
      if (avail & (MINBPC(enc) - 1)) {
        avail &= ~(MINBPC(enc) - 1);
        end = ptr + avail;
      }
    }

    while (HAS_CHAR(enc, ptr, end)) {
      switch (BYTE_TYPE(enc, ptr)) {
        INVALID_CASES(ptr, nextTokPtr)
      case BT_LT:
        /* Looking for "<![". */
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        if (! CHAR_MATCHES(enc, ptr, ASCII_EXCL))
          break;
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        if (! CHAR_MATCHES(enc, ptr, ASCII_LSQB))
          break;
        ++depth;
        ptr += MINBPC(enc);
        break;
      case BT_RSQB:
        /* Looking for "]]>". */
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
          break;
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        if (! CHAR_MATCHES(enc, ptr, ASCII_GT))
          break;
        ptr += MINBPC(enc);
        if (depth == 0) {
          *nextTokPtr = ptr;
          return XML_TOK_IGNORE_SECT;
        }
        --depth;
        break;
      default:
        ptr += MINBPC(enc);
        break;
      }
    }
    return XML_TOK_PARTIAL;
  }

  #endif /* XML_DTD */
  1383. static int PTRCALL
  1384. PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
  1385. const char **badPtr) {
  1386. ptr += MINBPC(enc);
  1387. end -= MINBPC(enc);
  1388. for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
  1389. switch (BYTE_TYPE(enc, ptr)) {
  1390. case BT_DIGIT:
  1391. case BT_HEX:
  1392. case BT_MINUS:
  1393. case BT_APOS:
  1394. case BT_LPAR:
  1395. case BT_RPAR:
  1396. case BT_PLUS:
  1397. case BT_COMMA:
  1398. case BT_SOL:
  1399. case BT_EQUALS:
  1400. case BT_QUEST:
  1401. case BT_CR:
  1402. case BT_LF:
  1403. case BT_SEMI:
  1404. case BT_EXCL:
  1405. case BT_AST:
  1406. case BT_PERCNT:
  1407. case BT_NUM:
  1408. # ifdef XML_NS
  1409. case BT_COLON:
  1410. # endif
  1411. break;
  1412. case BT_S:
  1413. if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
  1414. *badPtr = ptr;
  1415. return 0;
  1416. }
  1417. break;
  1418. case BT_NAME:
  1419. case BT_NMSTRT:
  1420. if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
  1421. break;
  1422. /* fall through */
  1423. default:
  1424. switch (BYTE_TO_ASCII(enc, ptr)) {
  1425. case 0x24: /* $ */
  1426. case 0x40: /* @ */
  1427. break;
  1428. default:
  1429. *badPtr = ptr;
  1430. return 0;
  1431. }
  1432. break;
  1433. }
  1434. }
  1435. return 1;
  1436. }
  1437. /* This must only be called for a well-formed start-tag or empty
  1438. element tag. Returns the number of attributes. Pointers to the
  1439. first attsMax attributes are stored in atts.
  1440. */
  1441. static int PTRCALL
  1442. PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
  1443. ATTRIBUTE *atts) {
  1444. enum { other, inName, inValue } state = inName;
  1445. int nAtts = 0;
  1446. int open = 0; /* defined when state == inValue;
  1447. initialization just to shut up compilers */
  1448. for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
  1449. switch (BYTE_TYPE(enc, ptr)) {
  1450. # define START_NAME \
  1451. if (state == other) { \
  1452. if (nAtts < attsMax) { \
  1453. atts[nAtts].name = ptr; \
  1454. atts[nAtts].normalized = 1; \
  1455. } \
  1456. state = inName; \
  1457. }
  1458. # define LEAD_CASE(n) \
  1459. case BT_LEAD##n: \
  1460. START_NAME ptr += (n - MINBPC(enc)); \
  1461. break;
  1462. LEAD_CASE(2)
  1463. LEAD_CASE(3)
  1464. LEAD_CASE(4)
  1465. # undef LEAD_CASE
  1466. case BT_NONASCII:
  1467. case BT_NMSTRT:
  1468. case BT_HEX:
  1469. START_NAME
  1470. break;
  1471. # undef START_NAME
  1472. case BT_QUOT:
  1473. if (state != inValue) {
  1474. if (nAtts < attsMax)
  1475. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1476. state = inValue;
  1477. open = BT_QUOT;
  1478. } else if (open == BT_QUOT) {
  1479. state = other;
  1480. if (nAtts < attsMax)
  1481. atts[nAtts].valueEnd = ptr;
  1482. nAtts++;
  1483. }
  1484. break;
  1485. case BT_APOS:
  1486. if (state != inValue) {
  1487. if (nAtts < attsMax)
  1488. atts[nAtts].valuePtr = ptr + MINBPC(enc);
  1489. state = inValue;
  1490. open = BT_APOS;
  1491. } else if (open == BT_APOS) {
  1492. state = other;
  1493. if (nAtts < attsMax)
  1494. atts[nAtts].valueEnd = ptr;
  1495. nAtts++;
  1496. }
  1497. break;
  1498. case BT_AMP:
  1499. if (nAtts < attsMax)
  1500. atts[nAtts].normalized = 0;
  1501. break;
  1502. case BT_S:
  1503. if (state == inName)
  1504. state = other;
  1505. else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
  1506. && (ptr == atts[nAtts].valuePtr
  1507. || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
  1508. || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
  1509. || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
  1510. atts[nAtts].normalized = 0;
  1511. break;
  1512. case BT_CR:
  1513. case BT_LF:
  1514. /* This case ensures that the first attribute name is counted
  1515. Apart from that we could just change state on the quote. */
  1516. if (state == inName)
  1517. state = other;
  1518. else if (state == inValue && nAtts < attsMax)
  1519. atts[nAtts].normalized = 0;
  1520. break;
  1521. case BT_GT:
  1522. case BT_SOL:
  1523. if (state != inValue)
  1524. return nAtts;
  1525. break;
  1526. default:
  1527. break;
  1528. }
  1529. }
  1530. /* not reached */
  1531. }
  1532. static int PTRFASTCALL
  1533. PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
  1534. int result = 0;
  1535. /* skip &# */
  1536. UNUSED_P(enc);
  1537. ptr += 2 * MINBPC(enc);
  1538. if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
  1539. for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
  1540. ptr += MINBPC(enc)) {
  1541. int c = BYTE_TO_ASCII(enc, ptr);
  1542. switch (c) {
  1543. case ASCII_0:
  1544. case ASCII_1:
  1545. case ASCII_2:
  1546. case ASCII_3:
  1547. case ASCII_4:
  1548. case ASCII_5:
  1549. case ASCII_6:
  1550. case ASCII_7:
  1551. case ASCII_8:
  1552. case ASCII_9:
  1553. result <<= 4;
  1554. result |= (c - ASCII_0);
  1555. break;
  1556. case ASCII_A:
  1557. case ASCII_B:
  1558. case ASCII_C:
  1559. case ASCII_D:
  1560. case ASCII_E:
  1561. case ASCII_F:
  1562. result <<= 4;
  1563. result += 10 + (c - ASCII_A);
  1564. break;
  1565. case ASCII_a:
  1566. case ASCII_b:
  1567. case ASCII_c:
  1568. case ASCII_d:
  1569. case ASCII_e:
  1570. case ASCII_f:
  1571. result <<= 4;
  1572. result += 10 + (c - ASCII_a);
  1573. break;
  1574. }
  1575. if (result >= 0x110000)
  1576. return -1;
  1577. }
  1578. } else {
  1579. for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
  1580. int c = BYTE_TO_ASCII(enc, ptr);
  1581. result *= 10;
  1582. result += (c - ASCII_0);
  1583. if (result >= 0x110000)
  1584. return -1;
  1585. }
  1586. }
  1587. return checkCharRefNumber(result);
  1588. }
  1589. static int PTRCALL
  1590. PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
  1591. const char *end) {
  1592. UNUSED_P(enc);
  1593. switch ((end - ptr) / MINBPC(enc)) {
  1594. case 2:
  1595. if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
  1596. switch (BYTE_TO_ASCII(enc, ptr)) {
  1597. case ASCII_l:
  1598. return ASCII_LT;
  1599. case ASCII_g:
  1600. return ASCII_GT;
  1601. }
  1602. }
  1603. break;
  1604. case 3:
  1605. if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
  1606. ptr += MINBPC(enc);
  1607. if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
  1608. ptr += MINBPC(enc);
  1609. if (CHAR_MATCHES(enc, ptr, ASCII_p))
  1610. return ASCII_AMP;
  1611. }
  1612. }
  1613. break;
  1614. case 4:
  1615. switch (BYTE_TO_ASCII(enc, ptr)) {
  1616. case ASCII_q:
  1617. ptr += MINBPC(enc);
  1618. if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
  1619. ptr += MINBPC(enc);
  1620. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1621. ptr += MINBPC(enc);
  1622. if (CHAR_MATCHES(enc, ptr, ASCII_t))
  1623. return ASCII_QUOT;
  1624. }
  1625. }
  1626. break;
  1627. case ASCII_a:
  1628. ptr += MINBPC(enc);
  1629. if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
  1630. ptr += MINBPC(enc);
  1631. if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
  1632. ptr += MINBPC(enc);
  1633. if (CHAR_MATCHES(enc, ptr, ASCII_s))
  1634. return ASCII_APOS;
  1635. }
  1636. }
  1637. break;
  1638. }
  1639. }
  1640. return 0;
  1641. }
  1642. static int PTRCALL
  1643. PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
  1644. const char *end1, const char *ptr2) {
  1645. UNUSED_P(enc);
  1646. for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
  1647. if (end1 - ptr1 < MINBPC(enc)) {
  1648. /* This line cannot be executed. The incoming data has already
  1649. * been tokenized once, so incomplete characters like this have
  1650. * already been eliminated from the input. Retaining the
  1651. * paranoia check is still valuable, however.
  1652. */
  1653. return 0; /* LCOV_EXCL_LINE */
  1654. }
  1655. if (! CHAR_MATCHES(enc, ptr1, *ptr2))
  1656. return 0;
  1657. }
  1658. return ptr1 == end1;
  1659. }
  1660. static int PTRFASTCALL
  1661. PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
  1662. const char *start = ptr;
  1663. for (;;) {
  1664. switch (BYTE_TYPE(enc, ptr)) {
  1665. # define LEAD_CASE(n) \
  1666. case BT_LEAD##n: \
  1667. ptr += n; \
  1668. break;
  1669. LEAD_CASE(2)
  1670. LEAD_CASE(3)
  1671. LEAD_CASE(4)
  1672. # undef LEAD_CASE
  1673. case BT_NONASCII:
  1674. case BT_NMSTRT:
  1675. # ifdef XML_NS
  1676. case BT_COLON:
  1677. # endif
  1678. case BT_HEX:
  1679. case BT_DIGIT:
  1680. case BT_NAME:
  1681. case BT_MINUS:
  1682. ptr += MINBPC(enc);
  1683. break;
  1684. default:
  1685. return (int)(ptr - start);
  1686. }
  1687. }
  1688. }
  1689. static const char *PTRFASTCALL
  1690. PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
  1691. for (;;) {
  1692. switch (BYTE_TYPE(enc, ptr)) {
  1693. case BT_LF:
  1694. case BT_CR:
  1695. case BT_S:
  1696. ptr += MINBPC(enc);
  1697. break;
  1698. default:
  1699. return ptr;
  1700. }
  1701. }
  1702. }
  1703. static void PTRCALL
  1704. PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
  1705. POSITION *pos) {
  1706. while (HAS_CHAR(enc, ptr, end)) {
  1707. switch (BYTE_TYPE(enc, ptr)) {
  1708. // The "if (end - ptr < n)" branch is a patch from chromium\third_party\expat\files\lib
  1709. # define LEAD_CASE(n) \
  1710. case BT_LEAD##n: \
  1711. if (end - ptr < n) { \
  1712. return; \
  1713. } \
  1714. ptr += n; \
  1715. pos->columnNumber++; \
  1716. break;
  1717. LEAD_CASE(2)
  1718. LEAD_CASE(3)
  1719. LEAD_CASE(4)
  1720. # undef LEAD_CASE
  1721. case BT_LF:
  1722. pos->columnNumber = 0;
  1723. pos->lineNumber++;
  1724. ptr += MINBPC(enc);
  1725. break;
  1726. case BT_CR:
  1727. pos->lineNumber++;
  1728. ptr += MINBPC(enc);
  1729. if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
  1730. ptr += MINBPC(enc);
  1731. pos->columnNumber = 0;
  1732. break;
  1733. default:
  1734. ptr += MINBPC(enc);
  1735. pos->columnNumber++;
  1736. break;
  1737. }
  1738. }
  1739. }
  1740. # undef DO_LEAD_CASE
  1741. # undef MULTIBYTE_CASES
  1742. # undef INVALID_CASES
  1743. # undef CHECK_NAME_CASE
  1744. # undef CHECK_NAME_CASES
  1745. # undef CHECK_NMSTRT_CASE
  1746. # undef CHECK_NMSTRT_CASES
  1747. #endif /* XML_TOK_IMPL_C */