/* xmltok.c */
  1. /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  2. See the file COPYING for copying permission.
  3. */
  4. #include <stddef.h>
  5. #ifdef COMPILING_FOR_WINDOWS
  6. #include "winconfig.h"
  7. #elif defined(MACOS_CLASSIC)
  8. #include "macconfig.h"
  9. #elif defined(__amigaos__)
  10. #include "amigaconfig.h"
  11. #elif defined(__WATCOMC__)
  12. #include "watcomconfig.h"
  13. #else
  14. #ifdef HAVE_EXPAT_CONFIG_H
  15. #include <expat_config.h>
  16. #endif
  17. #endif /* ndef COMPILING_FOR_WINDOWS */
  18. #include "expat_external.h"
  19. #include "internal.h"
  20. #include "xmltok.h"
  21. #include "nametab.h"
  22. #ifdef XML_DTD
  23. #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
  24. #else
  25. #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
  26. #endif
  27. #define VTABLE1 \
  28. { PREFIX(prologTok), PREFIX(contentTok), \
  29. PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  30. { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  31. PREFIX(sameName), \
  32. PREFIX(nameMatchesAscii), \
  33. PREFIX(nameLength), \
  34. PREFIX(skipS), \
  35. PREFIX(getAtts), \
  36. PREFIX(charRefNumber), \
  37. PREFIX(predefinedEntityName), \
  38. PREFIX(updatePosition), \
  39. PREFIX(isPublicId)
  40. #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  41. #define UCS2_GET_NAMING(pages, hi, lo) \
  42. (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
  43. /* A 2 byte UTF-8 representation splits the characters 11 bits between
  44. the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
  45. pages, 3 bits to add to that index and 5 bits to generate the mask.
  46. */
  47. #define UTF8_GET_NAMING2(pages, byte) \
  48. (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
  49. + ((((byte)[0]) & 3) << 1) \
  50. + ((((byte)[1]) >> 5) & 1)] \
  51. & (1 << (((byte)[1]) & 0x1F)))
  52. /* A 3 byte UTF-8 representation splits the characters 16 bits between
  53. the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
  54. into pages, 3 bits to add to that index and 5 bits to generate the
  55. mask.
  56. */
  57. #define UTF8_GET_NAMING3(pages, byte) \
  58. (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
  59. + ((((byte)[1]) >> 2) & 0xF)] \
  60. << 3) \
  61. + ((((byte)[1]) & 3) << 1) \
  62. + ((((byte)[2]) >> 5) & 1)] \
  63. & (1 << (((byte)[2]) & 0x1F)))
  64. #define UTF8_GET_NAMING(pages, p, n) \
  65. ((n) == 2 \
  66. ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  67. : ((n) == 3 \
  68. ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
  69. : 0))
  70. /* Detection of invalid UTF-8 sequences is based on Table 3.1B
  71. of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
  72. with the additional restriction of not allowing the Unicode
  73. code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
  74. Implementation details:
  75. (A & 0x80) == 0 means A < 0x80
  76. and
  77. (A & 0xC0) == 0xC0 means A > 0xBF
  78. */
  79. #define UTF8_INVALID2(p) \
  80. ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
  81. #define UTF8_INVALID3(p) \
  82. (((p)[2] & 0x80) == 0 \
  83. || \
  84. ((*p) == 0xEF && (p)[1] == 0xBF \
  85. ? \
  86. (p)[2] > 0xBD \
  87. : \
  88. ((p)[2] & 0xC0) == 0xC0) \
  89. || \
  90. ((*p) == 0xE0 \
  91. ? \
  92. (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
  93. : \
  94. ((p)[1] & 0x80) == 0 \
  95. || \
  96. ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
  97. #define UTF8_INVALID4(p) \
  98. (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  99. || \
  100. ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  101. || \
  102. ((*p) == 0xF0 \
  103. ? \
  104. (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
  105. : \
  106. ((p)[1] & 0x80) == 0 \
  107. || \
  108. ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
  109. static int PTRFASTCALL
  110. isNever(const ENCODING *enc, const char *p)
  111. {
  112. return 0;
  113. }
  114. static int PTRFASTCALL
  115. utf8_isName2(const ENCODING *enc, const char *p)
  116. {
  117. return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
  118. }
  119. static int PTRFASTCALL
  120. utf8_isName3(const ENCODING *enc, const char *p)
  121. {
  122. return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
  123. }
  124. #define utf8_isName4 isNever
  125. static int PTRFASTCALL
  126. utf8_isNmstrt2(const ENCODING *enc, const char *p)
  127. {
  128. return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
  129. }
  130. static int PTRFASTCALL
  131. utf8_isNmstrt3(const ENCODING *enc, const char *p)
  132. {
  133. return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
  134. }
  135. #define utf8_isNmstrt4 isNever
  136. static int PTRFASTCALL
  137. utf8_isInvalid2(const ENCODING *enc, const char *p)
  138. {
  139. return UTF8_INVALID2((const unsigned char *)p);
  140. }
  141. static int PTRFASTCALL
  142. utf8_isInvalid3(const ENCODING *enc, const char *p)
  143. {
  144. return UTF8_INVALID3((const unsigned char *)p);
  145. }
  146. static int PTRFASTCALL
  147. utf8_isInvalid4(const ENCODING *enc, const char *p)
  148. {
  149. return UTF8_INVALID4((const unsigned char *)p);
  150. }
  151. struct normal_encoding {
  152. ENCODING enc;
  153. unsigned char type[256];
  154. #ifdef XML_MIN_SIZE
  155. int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  156. int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  157. int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  158. int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  159. int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
  160. #endif /* XML_MIN_SIZE */
  161. int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  162. int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  163. int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  164. int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  165. int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  166. int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  167. int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  168. int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  169. int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
  170. };
  171. #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
  172. #ifdef XML_MIN_SIZE
  173. #define STANDARD_VTABLE(E) \
  174. E ## byteType, \
  175. E ## isNameMin, \
  176. E ## isNmstrtMin, \
  177. E ## byteToAscii, \
  178. E ## charMatches,
  179. #else
  180. #define STANDARD_VTABLE(E) /* as nothing */
  181. #endif
  182. #define NORMAL_VTABLE(E) \
  183. E ## isName2, \
  184. E ## isName3, \
  185. E ## isName4, \
  186. E ## isNmstrt2, \
  187. E ## isNmstrt3, \
  188. E ## isNmstrt4, \
  189. E ## isInvalid2, \
  190. E ## isInvalid3, \
  191. E ## isInvalid4
  192. #define NULL_VTABLE \
  193. /* isName2 */ NULL, \
  194. /* isName3 */ NULL, \
  195. /* isName4 */ NULL, \
  196. /* isNmstrt2 */ NULL, \
  197. /* isNmstrt3 */ NULL, \
  198. /* isNmstrt4 */ NULL, \
  199. /* isInvalid2 */ NULL, \
  200. /* isInvalid3 */ NULL, \
  201. /* isInvalid4 */ NULL
  202. static int FASTCALL checkCharRefNumber(int);
  203. #include "xmltok_impl.h"
  204. #include "ascii.h"
  205. #ifdef XML_MIN_SIZE
  206. #define sb_isNameMin isNever
  207. #define sb_isNmstrtMin isNever
  208. #endif
  209. #ifdef XML_MIN_SIZE
  210. #define MINBPC(enc) ((enc)->minBytesPerChar)
  211. #else
  212. /* minimum bytes per character */
  213. #define MINBPC(enc) 1
  214. #endif
  215. #define SB_BYTE_TYPE(enc, p) \
  216. (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  217. #ifdef XML_MIN_SIZE
  218. static int PTRFASTCALL
  219. sb_byteType(const ENCODING *enc, const char *p)
  220. {
  221. return SB_BYTE_TYPE(enc, p);
  222. }
  223. #define BYTE_TYPE(enc, p) \
  224. (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
  225. #else
  226. #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  227. #endif
  228. #ifdef XML_MIN_SIZE
  229. #define BYTE_TO_ASCII(enc, p) \
  230. (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
  231. static int PTRFASTCALL
  232. sb_byteToAscii(const ENCODING *enc, const char *p)
  233. {
  234. return *p;
  235. }
  236. #else
  237. #define BYTE_TO_ASCII(enc, p) (*(p))
  238. #endif
  239. #define IS_NAME_CHAR(enc, p, n) \
  240. (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
  241. #define IS_NMSTRT_CHAR(enc, p, n) \
  242. (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
  243. #define IS_INVALID_CHAR(enc, p, n) \
  244. (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
  245. #ifdef XML_MIN_SIZE
  246. #define IS_NAME_CHAR_MINBPC(enc, p) \
  247. (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
  248. #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  249. (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
  250. #else
  251. #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  252. #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  253. #endif
  254. #ifdef XML_MIN_SIZE
  255. #define CHAR_MATCHES(enc, p, c) \
  256. (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
  257. static int PTRCALL
  258. sb_charMatches(const ENCODING *enc, const char *p, int c)
  259. {
  260. return *p == c;
  261. }
  262. #else
  263. /* c is an ASCII character */
  264. #define CHAR_MATCHES(enc, p, c) (*(p) == c)
  265. #endif
  266. #define PREFIX(ident) normal_ ## ident
  267. #define XML_TOK_IMPL_C
  268. #include "xmltok_impl.c"
  269. #undef XML_TOK_IMPL_C
  270. #undef MINBPC
  271. #undef BYTE_TYPE
  272. #undef BYTE_TO_ASCII
  273. #undef CHAR_MATCHES
  274. #undef IS_NAME_CHAR
  275. #undef IS_NAME_CHAR_MINBPC
  276. #undef IS_NMSTRT_CHAR
  277. #undef IS_NMSTRT_CHAR_MINBPC
  278. #undef IS_INVALID_CHAR
  279. enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
  280. UTF8_cval1 = 0x00,
  281. UTF8_cval2 = 0xc0,
  282. UTF8_cval3 = 0xe0,
  283. UTF8_cval4 = 0xf0
  284. };
  285. static void PTRCALL
  286. utf8_toUtf8(const ENCODING *enc,
  287. const char **fromP, const char *fromLim,
  288. char **toP, const char *toLim)
  289. {
  290. char *to;
  291. const char *from;
  292. if (fromLim - *fromP > toLim - *toP) {
  293. /* Avoid copying partial characters. */
  294. for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
  295. if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
  296. break;
  297. }
  298. for (to = *toP, from = *fromP; from != fromLim; from++, to++)
  299. *to = *from;
  300. *fromP = from;
  301. *toP = to;
  302. }
  303. static void PTRCALL
  304. utf8_toUtf16(const ENCODING *enc,
  305. const char **fromP, const char *fromLim,
  306. unsigned short **toP, const unsigned short *toLim)
  307. {
  308. unsigned short *to = *toP;
  309. const char *from = *fromP;
  310. while (from != fromLim && to != toLim) {
  311. switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
  312. case BT_LEAD2:
  313. *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
  314. from += 2;
  315. break;
  316. case BT_LEAD3:
  317. *to++ = (unsigned short)(((from[0] & 0xf) << 12)
  318. | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
  319. from += 3;
  320. break;
  321. case BT_LEAD4:
  322. {
  323. unsigned long n;
  324. if (to + 1 == toLim)
  325. goto after;
  326. n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
  327. | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
  328. n -= 0x10000;
  329. to[0] = (unsigned short)((n >> 10) | 0xD800);
  330. to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
  331. to += 2;
  332. from += 4;
  333. }
  334. break;
  335. default:
  336. *to++ = *from++;
  337. break;
  338. }
  339. }
  340. after:
  341. *fromP = from;
  342. *toP = to;
  343. }
  344. #ifdef XML_NS
  345. static const struct normal_encoding utf8_encoding_ns = {
  346. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  347. {
  348. #include "asciitab.h"
  349. #include "utf8tab.h"
  350. },
  351. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  352. };
  353. #endif
  354. static const struct normal_encoding utf8_encoding = {
  355. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  356. {
  357. #define BT_COLON BT_NMSTRT
  358. #include "asciitab.h"
  359. #undef BT_COLON
  360. #include "utf8tab.h"
  361. },
  362. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  363. };
  364. #ifdef XML_NS
  365. static const struct normal_encoding internal_utf8_encoding_ns = {
  366. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  367. {
  368. #include "iasciitab.h"
  369. #include "utf8tab.h"
  370. },
  371. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  372. };
  373. #endif
  374. static const struct normal_encoding internal_utf8_encoding = {
  375. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  376. {
  377. #define BT_COLON BT_NMSTRT
  378. #include "iasciitab.h"
  379. #undef BT_COLON
  380. #include "utf8tab.h"
  381. },
  382. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  383. };
  384. static void PTRCALL
  385. latin1_toUtf8(const ENCODING *enc,
  386. const char **fromP, const char *fromLim,
  387. char **toP, const char *toLim)
  388. {
  389. for (;;) {
  390. unsigned char c;
  391. if (*fromP == fromLim)
  392. break;
  393. c = (unsigned char)**fromP;
  394. if (c & 0x80) {
  395. if (toLim - *toP < 2)
  396. break;
  397. *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
  398. *(*toP)++ = (char)((c & 0x3f) | 0x80);
  399. (*fromP)++;
  400. }
  401. else {
  402. if (*toP == toLim)
  403. break;
  404. *(*toP)++ = *(*fromP)++;
  405. }
  406. }
  407. }
  408. static void PTRCALL
  409. latin1_toUtf16(const ENCODING *enc,
  410. const char **fromP, const char *fromLim,
  411. unsigned short **toP, const unsigned short *toLim)
  412. {
  413. while (*fromP != fromLim && *toP != toLim)
  414. *(*toP)++ = (unsigned char)*(*fromP)++;
  415. }
  416. #ifdef XML_NS
  417. static const struct normal_encoding latin1_encoding_ns = {
  418. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  419. {
  420. #include "asciitab.h"
  421. #include "latin1tab.h"
  422. },
  423. STANDARD_VTABLE(sb_) NULL_VTABLE
  424. };
  425. #endif
  426. static const struct normal_encoding latin1_encoding = {
  427. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  428. {
  429. #define BT_COLON BT_NMSTRT
  430. #include "asciitab.h"
  431. #undef BT_COLON
  432. #include "latin1tab.h"
  433. },
  434. STANDARD_VTABLE(sb_) NULL_VTABLE
  435. };
  436. static void PTRCALL
  437. ascii_toUtf8(const ENCODING *enc,
  438. const char **fromP, const char *fromLim,
  439. char **toP, const char *toLim)
  440. {
  441. while (*fromP != fromLim && *toP != toLim)
  442. *(*toP)++ = *(*fromP)++;
  443. }
  444. #ifdef XML_NS
  445. static const struct normal_encoding ascii_encoding_ns = {
  446. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  447. {
  448. #include "asciitab.h"
  449. /* BT_NONXML == 0 */
  450. },
  451. STANDARD_VTABLE(sb_) NULL_VTABLE
  452. };
  453. #endif
  454. static const struct normal_encoding ascii_encoding = {
  455. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  456. {
  457. #define BT_COLON BT_NMSTRT
  458. #include "asciitab.h"
  459. #undef BT_COLON
  460. /* BT_NONXML == 0 */
  461. },
  462. STANDARD_VTABLE(sb_) NULL_VTABLE
  463. };
  464. static int PTRFASTCALL
  465. unicode_byte_type(char hi, char lo)
  466. {
  467. switch ((unsigned char)hi) {
  468. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  469. return BT_LEAD4;
  470. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  471. return BT_TRAIL;
  472. case 0xFF:
  473. switch ((unsigned char)lo) {
  474. case 0xFF:
  475. case 0xFE:
  476. return BT_NONXML;
  477. }
  478. break;
  479. }
  480. return BT_NONASCII;
  481. }
  482. #define DEFINE_UTF16_TO_UTF8(E) \
  483. static void PTRCALL \
  484. E ## toUtf8(const ENCODING *enc, \
  485. const char **fromP, const char *fromLim, \
  486. char **toP, const char *toLim) \
  487. { \
  488. const char *from; \
  489. for (from = *fromP; from != fromLim; from += 2) { \
  490. int plane; \
  491. unsigned char lo2; \
  492. unsigned char lo = GET_LO(from); \
  493. unsigned char hi = GET_HI(from); \
  494. switch (hi) { \
  495. case 0: \
  496. if (lo < 0x80) { \
  497. if (*toP == toLim) { \
  498. *fromP = from; \
  499. return; \
  500. } \
  501. *(*toP)++ = lo; \
  502. break; \
  503. } \
  504. /* fall through */ \
  505. case 0x1: case 0x2: case 0x3: \
  506. case 0x4: case 0x5: case 0x6: case 0x7: \
  507. if (toLim - *toP < 2) { \
  508. *fromP = from; \
  509. return; \
  510. } \
  511. *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
  512. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  513. break; \
  514. default: \
  515. if (toLim - *toP < 3) { \
  516. *fromP = from; \
  517. return; \
  518. } \
  519. /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
  520. *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
  521. *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
  522. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  523. break; \
  524. case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
  525. if (toLim - *toP < 4) { \
  526. *fromP = from; \
  527. return; \
  528. } \
  529. plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
  530. *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
  531. *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
  532. from += 2; \
  533. lo2 = GET_LO(from); \
  534. *(*toP)++ = (((lo & 0x3) << 4) \
  535. | ((GET_HI(from) & 0x3) << 2) \
  536. | (lo2 >> 6) \
  537. | 0x80); \
  538. *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
  539. break; \
  540. } \
  541. } \
  542. *fromP = from; \
  543. }
  544. #define DEFINE_UTF16_TO_UTF16(E) \
  545. static void PTRCALL \
  546. E ## toUtf16(const ENCODING *enc, \
  547. const char **fromP, const char *fromLim, \
  548. unsigned short **toP, const unsigned short *toLim) \
  549. { \
  550. /* Avoid copying first half only of surrogate */ \
  551. if (fromLim - *fromP > ((toLim - *toP) << 1) \
  552. && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
  553. fromLim -= 2; \
  554. for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
  555. *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  556. }
  557. #define SET2(ptr, ch) \
  558. (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  559. #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  560. #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  561. DEFINE_UTF16_TO_UTF8(little2_)
  562. DEFINE_UTF16_TO_UTF16(little2_)
  563. #undef SET2
  564. #undef GET_LO
  565. #undef GET_HI
  566. #define SET2(ptr, ch) \
  567. (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  568. #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  569. #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  570. DEFINE_UTF16_TO_UTF8(big2_)
  571. DEFINE_UTF16_TO_UTF16(big2_)
  572. #undef SET2
  573. #undef GET_LO
  574. #undef GET_HI
  575. #define LITTLE2_BYTE_TYPE(enc, p) \
  576. ((p)[1] == 0 \
  577. ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  578. : unicode_byte_type((p)[1], (p)[0]))
  579. #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  580. #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
  581. #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  582. UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
  583. #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  584. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
  585. #ifdef XML_MIN_SIZE
  586. static int PTRFASTCALL
  587. little2_byteType(const ENCODING *enc, const char *p)
  588. {
  589. return LITTLE2_BYTE_TYPE(enc, p);
  590. }
  591. static int PTRFASTCALL
  592. little2_byteToAscii(const ENCODING *enc, const char *p)
  593. {
  594. return LITTLE2_BYTE_TO_ASCII(enc, p);
  595. }
  596. static int PTRCALL
  597. little2_charMatches(const ENCODING *enc, const char *p, int c)
  598. {
  599. return LITTLE2_CHAR_MATCHES(enc, p, c);
  600. }
  601. static int PTRFASTCALL
  602. little2_isNameMin(const ENCODING *enc, const char *p)
  603. {
  604. return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  605. }
  606. static int PTRFASTCALL
  607. little2_isNmstrtMin(const ENCODING *enc, const char *p)
  608. {
  609. return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  610. }
  611. #undef VTABLE
  612. #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  613. #else /* not XML_MIN_SIZE */
  614. #undef PREFIX
  615. #define PREFIX(ident) little2_ ## ident
  616. #define MINBPC(enc) 2
  617. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  618. #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  619. #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
  620. #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  621. #define IS_NAME_CHAR(enc, p, n) 0
  622. #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  623. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  624. #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  625. #define XML_TOK_IMPL_C
  626. #include "xmltok_impl.c"
  627. #undef XML_TOK_IMPL_C
  628. #undef MINBPC
  629. #undef BYTE_TYPE
  630. #undef BYTE_TO_ASCII
  631. #undef CHAR_MATCHES
  632. #undef IS_NAME_CHAR
  633. #undef IS_NAME_CHAR_MINBPC
  634. #undef IS_NMSTRT_CHAR
  635. #undef IS_NMSTRT_CHAR_MINBPC
  636. #undef IS_INVALID_CHAR
  637. #endif /* not XML_MIN_SIZE */
  638. #ifdef XML_NS
  639. static const struct normal_encoding little2_encoding_ns = {
  640. { VTABLE, 2, 0,
  641. #if BYTEORDER == 1234
  642. 1
  643. #else
  644. 0
  645. #endif
  646. },
  647. {
  648. #include "asciitab.h"
  649. #include "latin1tab.h"
  650. },
  651. STANDARD_VTABLE(little2_) NULL_VTABLE
  652. };
  653. #endif
  654. static const struct normal_encoding little2_encoding = {
  655. { VTABLE, 2, 0,
  656. #if BYTEORDER == 1234
  657. 1
  658. #else
  659. 0
  660. #endif
  661. },
  662. {
  663. #define BT_COLON BT_NMSTRT
  664. #include "asciitab.h"
  665. #undef BT_COLON
  666. #include "latin1tab.h"
  667. },
  668. STANDARD_VTABLE(little2_) NULL_VTABLE
  669. };
  670. #if BYTEORDER != 4321
  671. #ifdef XML_NS
  672. static const struct normal_encoding internal_little2_encoding_ns = {
  673. { VTABLE, 2, 0, 1 },
  674. {
  675. #include "iasciitab.h"
  676. #include "latin1tab.h"
  677. },
  678. STANDARD_VTABLE(little2_) NULL_VTABLE
  679. };
  680. #endif
  681. static const struct normal_encoding internal_little2_encoding = {
  682. { VTABLE, 2, 0, 1 },
  683. {
  684. #define BT_COLON BT_NMSTRT
  685. #include "iasciitab.h"
  686. #undef BT_COLON
  687. #include "latin1tab.h"
  688. },
  689. STANDARD_VTABLE(little2_) NULL_VTABLE
  690. };
  691. #endif
  692. #define BIG2_BYTE_TYPE(enc, p) \
  693. ((p)[0] == 0 \
  694. ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  695. : unicode_byte_type((p)[0], (p)[1]))
  696. #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  697. #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
  698. #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  699. UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
  700. #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  701. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
  702. #ifdef XML_MIN_SIZE
  703. static int PTRFASTCALL
  704. big2_byteType(const ENCODING *enc, const char *p)
  705. {
  706. return BIG2_BYTE_TYPE(enc, p);
  707. }
  708. static int PTRFASTCALL
  709. big2_byteToAscii(const ENCODING *enc, const char *p)
  710. {
  711. return BIG2_BYTE_TO_ASCII(enc, p);
  712. }
  713. static int PTRCALL
  714. big2_charMatches(const ENCODING *enc, const char *p, int c)
  715. {
  716. return BIG2_CHAR_MATCHES(enc, p, c);
  717. }
  718. static int PTRFASTCALL
  719. big2_isNameMin(const ENCODING *enc, const char *p)
  720. {
  721. return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  722. }
  723. static int PTRFASTCALL
  724. big2_isNmstrtMin(const ENCODING *enc, const char *p)
  725. {
  726. return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  727. }
  728. #undef VTABLE
  729. #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  730. #else /* not XML_MIN_SIZE */
  731. #undef PREFIX
  732. #define PREFIX(ident) big2_ ## ident
  733. #define MINBPC(enc) 2
  734. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  735. #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  736. #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
  737. #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  738. #define IS_NAME_CHAR(enc, p, n) 0
  739. #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  740. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  741. #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  742. #define XML_TOK_IMPL_C
  743. #include "xmltok_impl.c"
  744. #undef XML_TOK_IMPL_C
  745. #undef MINBPC
  746. #undef BYTE_TYPE
  747. #undef BYTE_TO_ASCII
  748. #undef CHAR_MATCHES
  749. #undef IS_NAME_CHAR
  750. #undef IS_NAME_CHAR_MINBPC
  751. #undef IS_NMSTRT_CHAR
  752. #undef IS_NMSTRT_CHAR_MINBPC
  753. #undef IS_INVALID_CHAR
  754. #endif /* not XML_MIN_SIZE */
  755. #ifdef XML_NS
  756. static const struct normal_encoding big2_encoding_ns = {
  757. { VTABLE, 2, 0,
  758. #if BYTEORDER == 4321
  759. 1
  760. #else
  761. 0
  762. #endif
  763. },
  764. {
  765. #include "asciitab.h"
  766. #include "latin1tab.h"
  767. },
  768. STANDARD_VTABLE(big2_) NULL_VTABLE
  769. };
  770. #endif
  771. static const struct normal_encoding big2_encoding = {
  772. { VTABLE, 2, 0,
  773. #if BYTEORDER == 4321
  774. 1
  775. #else
  776. 0
  777. #endif
  778. },
  779. {
  780. #define BT_COLON BT_NMSTRT
  781. #include "asciitab.h"
  782. #undef BT_COLON
  783. #include "latin1tab.h"
  784. },
  785. STANDARD_VTABLE(big2_) NULL_VTABLE
  786. };
/* "Internal" variants of the big2 descriptors.  They are compiled only
   when BYTEORDER is not 1234, use iasciitab.h instead of asciitab.h for
   the first half of the byte-type table, and always set the last value of
   the leading initializer to 1.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   here -- confirm before relying on it. */
#if BYTEORDER != 1234
#ifdef XML_NS
/* Namespace-aware internal descriptor (BT_COLON keeps its own type). */
static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
#endif
/* Non-namespace internal descriptor: BT_COLON entries become BT_NMSTRT. */
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
#endif
  809. #undef PREFIX
  810. static int FASTCALL
  811. streqci(const char *s1, const char *s2)
  812. {
  813. for (;;) {
  814. char c1 = *s1++;
  815. char c2 = *s2++;
  816. if (ASCII_a <= c1 && c1 <= ASCII_z)
  817. c1 += ASCII_A - ASCII_a;
  818. if (ASCII_a <= c2 && c2 <= ASCII_z)
  819. c2 += ASCII_A - ASCII_a;
  820. if (c1 != c2)
  821. return 0;
  822. if (!c1)
  823. break;
  824. }
  825. return 1;
  826. }
/* Position-update callback that ignores its enc argument and always
   delegates to normal_updatePosition using the UTF-8 encoding, passing
   ptr, end and pos through unchanged. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr,
                   const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
  833. static int
  834. toAscii(const ENCODING *enc, const char *ptr, const char *end)
  835. {
  836. char buf[1];
  837. char *p = buf;
  838. XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  839. if (p == buf)
  840. return -1;
  841. else
  842. return buf[0];
  843. }
  844. static int FASTCALL
  845. isSpace(int c)
  846. {
  847. switch (c) {
  848. case 0x20:
  849. case 0xD:
  850. case 0xA:
  851. case 0x9:
  852. return 1;
  853. }
  854. return 0;
  855. }
/* Parse one pseudo-attribute (name, '=', quoted value) starting at ptr
   and bounded by end.

   Return 1 if there's just optional white space or there's an S
   followed by name=val.  When nothing but white space is left,
   *namePtr is set to NULL and no other output is written.  When an
   attribute is found, *namePtr and *nameEndPtr delimit the name,
   *valPtr points just past the opening quote and *nextTokPtr points
   just past the closing quote.

   Return 0 on malformed input, with *nextTokPtr set to the position
   at which parsing stopped.
*/
static int
parsePseudoAttribute(const ENCODING *enc,
                     const char *ptr,
                     const char *end,
                     const char **namePtr,
                     const char **nameEndPtr,
                     const char **valPtr,
                     const char **nextTokPtr)
{
  int c;
  char open;
  /* Nothing left at all: report "no attribute". */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* An attribute must be preceded by at least one white space character. */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Skip the run of leading white space. */
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  /* Only white space remained: report "no attribute". */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' or up to white space followed by '='. */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      /* toAscii could not produce a character here. */
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      /* The name ends at the first white space; the next non-space
         character must be '='. */
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* ptr is now at '='; an empty name is an error. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space before the opening quote. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Remember which quote opened the value so the same one closes it. */
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* Value characters are limited to letters, digits, '.', '-' and '_'. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (!(ASCII_a <= c && c <= ASCII_z)
        && !(ASCII_A <= c && c <= ASCII_Z)
        && !(ASCII_0 <= c && c <= ASCII_9)
        && c != ASCII_PERIOD
        && c != ASCII_MINUS
        && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  /* Resume after the closing quote. */
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
/* NUL-terminated keyword strings that doParseXmlDecl matches against
   pseudo-attribute names and values.  They are spelled with ASCII_*
   constants rather than string literals.
   NOTE(review): presumably so the bytes are ASCII regardless of the
   compiler's execution character set -- confirm. */

/* "version" */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};
/* "encoding" */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};
/* "standalone" */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};
/* "yes" */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s, '\0'
};
/* "no" */
static const char KW_no[] = {
  ASCII_n, ASCII_o, '\0'
};
/* Parse the pseudo-attributes of an XML or text declaration.

   ptr is advanced by five characters and end is pulled back by two
   characters before parsing (presumably to skip the opening "<?xml" and
   the closing "?>" -- the caller is expected to pass the whole
   declaration).

   Accepted order: version, then encoding, then standalone.
   - version may be omitted only when isGeneralTextEntity is non-zero;
   - encoding is mandatory when isGeneralTextEntity is non-zero and a
     version was present;
   - standalone is rejected when isGeneralTextEntity is non-zero and its
     value must be "yes" or "no".

   versionPtr, versionEndPtr, encodingName, encoding and standalone are
   optional outputs (each is checked for NULL before being written).
   encodingFinder is called with the encoding value only when encoding is
   non-NULL.  badPtr is written unconditionally on every failure path.

   Returns 1 on success, 0 on failure with *badPtr pointing at the
   offending position.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                 const char *,
                                                 const char *),
               int isGeneralTextEntity,
               const ENCODING *enc,
               const char *ptr,
               const char *end,
               const char **badPtr,
               const char **versionPtr,
               const char **versionEndPtr,
               const char **encodingName,
               const ENCODING **encoding,
               int *standalone)
{
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* Narrow [ptr, end) to the pseudo-attribute region. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may start with something other than
       version; the attribute just read is examined again below. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    /* ptr is just past the closing quote of the version value. */
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name must begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* The value ends one character before ptr (the closing quote). */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Anything left must be standalone, which a text declaration forbids. */
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only trailing white space may follow the standalone attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
  1050. static int FASTCALL
  1051. checkCharRefNumber(int result)
  1052. {
  1053. switch (result >> 8) {
  1054. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  1055. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1056. return -1;
  1057. case 0:
  1058. if (latin1_encoding.type[result] == BT_NONXML)
  1059. return -1;
  1060. break;
  1061. case 0xFF:
  1062. if (result == 0xFFFE || result == 0xFFFF)
  1063. return -1;
  1064. break;
  1065. }
  1066. return result;
  1067. }
  1068. int FASTCALL
  1069. XmlUtf8Encode(int c, char *buf)
  1070. {
  1071. enum {
  1072. /* minN is minimum legal resulting value for N byte sequence */
  1073. min2 = 0x80,
  1074. min3 = 0x800,
  1075. min4 = 0x10000
  1076. };
  1077. if (c < 0)
  1078. return 0;
  1079. if (c < min2) {
  1080. buf[0] = (char)(c | UTF8_cval1);
  1081. return 1;
  1082. }
  1083. if (c < min3) {
  1084. buf[0] = (char)((c >> 6) | UTF8_cval2);
  1085. buf[1] = (char)((c & 0x3f) | 0x80);
  1086. return 2;
  1087. }
  1088. if (c < min4) {
  1089. buf[0] = (char)((c >> 12) | UTF8_cval3);
  1090. buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
  1091. buf[2] = (char)((c & 0x3f) | 0x80);
  1092. return 3;
  1093. }
  1094. if (c < 0x110000) {
  1095. buf[0] = (char)((c >> 18) | UTF8_cval4);
  1096. buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
  1097. buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
  1098. buf[3] = (char)((c & 0x3f) | 0x80);
  1099. return 4;
  1100. }
  1101. return 0;
  1102. }
  1103. int FASTCALL
  1104. XmlUtf16Encode(int charNum, unsigned short *buf)
  1105. {
  1106. if (charNum < 0)
  1107. return 0;
  1108. if (charNum < 0x10000) {
  1109. buf[0] = (unsigned short)charNum;
  1110. return 1;
  1111. }
  1112. if (charNum < 0x110000) {
  1113. charNum -= 0x10000;
  1114. buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
  1115. buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
  1116. return 2;
  1117. }
  1118. return 0;
  1119. }
/* Encoding object for user-supplied ("unknown") encodings.  It starts
   with a struct normal_encoding, so a pointer to it can be viewed as an
   ENCODING / normal_encoding pointer and back. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* User callback that maps the byte sequence at a pointer to a
     character number, and the opaque value handed to it. */
  CONVERTER convert;
  void *userData;
  /* Per-byte UTF-16 unit; 0 means "ask convert" (see unknown_toUtf16). */
  unsigned short utf16[256];
  /* Per-byte UTF-8 data: element [0] is the byte count, followed by the
     bytes; a count of 0 means "ask convert" (see unknown_toUtf8). */
  char utf8[256][4];
};
/* View an ENCODING pointer as the enclosing unknown_encoding. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
  1128. int
  1129. XmlSizeOfUnknownEncoding(void)
  1130. {
  1131. return sizeof(struct unknown_encoding);
  1132. }
  1133. static int PTRFASTCALL
  1134. unknown_isName(const ENCODING *enc, const char *p)
  1135. {
  1136. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1137. int c = uenc->convert(uenc->userData, p);
  1138. if (c & ~0xFFFF)
  1139. return 0;
  1140. return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1141. }
  1142. static int PTRFASTCALL
  1143. unknown_isNmstrt(const ENCODING *enc, const char *p)
  1144. {
  1145. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1146. int c = uenc->convert(uenc->userData, p);
  1147. if (c & ~0xFFFF)
  1148. return 0;
  1149. return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1150. }
  1151. static int PTRFASTCALL
  1152. unknown_isInvalid(const ENCODING *enc, const char *p)
  1153. {
  1154. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1155. int c = uenc->convert(uenc->userData, p);
  1156. return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1157. }
  1158. static void PTRCALL
  1159. unknown_toUtf8(const ENCODING *enc,
  1160. const char **fromP, const char *fromLim,
  1161. char **toP, const char *toLim)
  1162. {
  1163. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1164. char buf[XML_UTF8_ENCODE_MAX];
  1165. for (;;) {
  1166. const char *utf8;
  1167. int n;
  1168. if (*fromP == fromLim)
  1169. break;
  1170. utf8 = uenc->utf8[(unsigned char)**fromP];
  1171. n = *utf8++;
  1172. if (n == 0) {
  1173. int c = uenc->convert(uenc->userData, *fromP);
  1174. n = XmlUtf8Encode(c, buf);
  1175. if (n > toLim - *toP)
  1176. break;
  1177. utf8 = buf;
  1178. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1179. - (BT_LEAD2 - 2));
  1180. }
  1181. else {
  1182. if (n > toLim - *toP)
  1183. break;
  1184. (*fromP)++;
  1185. }
  1186. do {
  1187. *(*toP)++ = *utf8++;
  1188. } while (--n != 0);
  1189. }
  1190. }
  1191. static void PTRCALL
  1192. unknown_toUtf16(const ENCODING *enc,
  1193. const char **fromP, const char *fromLim,
  1194. unsigned short **toP, const unsigned short *toLim)
  1195. {
  1196. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1197. while (*fromP != fromLim && *toP != toLim) {
  1198. unsigned short c = uenc->utf16[(unsigned char)**fromP];
  1199. if (c == 0) {
  1200. c = (unsigned short)
  1201. uenc->convert(uenc->userData, *fromP);
  1202. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1203. - (BT_LEAD2 - 2));
  1204. }
  1205. else
  1206. (*fromP)++;
  1207. *(*toP)++ = c;
  1208. }
  1209. }
  1210. ENCODING *
  1211. XmlInitUnknownEncoding(void *mem,
  1212. int *table,
  1213. CONVERTER convert,
  1214. void *userData)
  1215. {
  1216. int i;
  1217. struct unknown_encoding *e = (struct unknown_encoding *)mem;
  1218. for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1219. ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1220. for (i = 0; i < 128; i++)
  1221. if (latin1_encoding.type[i] != BT_OTHER
  1222. && latin1_encoding.type[i] != BT_NONXML
  1223. && table[i] != i)
  1224. return 0;
  1225. for (i = 0; i < 256; i++) {
  1226. int c = table[i];
  1227. if (c == -1) {
  1228. e->normal.type[i] = BT_MALFORM;
  1229. /* This shouldn't really get used. */
  1230. e->utf16[i] = 0xFFFF;
  1231. e->utf8[i][0] = 1;
  1232. e->utf8[i][1] = 0;
  1233. }
  1234. else if (c < 0) {
  1235. if (c < -4)
  1236. return 0;
  1237. e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
  1238. e->utf8[i][0] = 0;
  1239. e->utf16[i] = 0;
  1240. }
  1241. else if (c < 0x80) {
  1242. if (latin1_encoding.type[c] != BT_OTHER
  1243. && latin1_encoding.type[c] != BT_NONXML
  1244. && c != i)
  1245. return 0;
  1246. e->normal.type[i] = latin1_encoding.type[c];
  1247. e->utf8[i][0] = 1;
  1248. e->utf8[i][1] = (char)c;
  1249. e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
  1250. }
  1251. else if (checkCharRefNumber(c) < 0) {
  1252. e->normal.type[i] = BT_NONXML;
  1253. /* This shouldn't really get used. */
  1254. e->utf16[i] = 0xFFFF;
  1255. e->utf8[i][0] = 1;
  1256. e->utf8[i][1] = 0;
  1257. }
  1258. else {
  1259. if (c > 0xFFFF)
  1260. return 0;
  1261. if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1262. e->normal.type[i] = BT_NMSTRT;
  1263. else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1264. e->normal.type[i] = BT_NAME;
  1265. else
  1266. e->normal.type[i] = BT_OTHER;
  1267. e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1268. e->utf16[i] = (unsigned short)c;
  1269. }
  1270. }
  1271. e->userData = userData;
  1272. e->convert = convert;
  1273. if (convert) {
  1274. e->normal.isName2 = unknown_isName;
  1275. e->normal.isName3 = unknown_isName;
  1276. e->normal.isName4 = unknown_isName;
  1277. e->normal.isNmstrt2 = unknown_isNmstrt;
  1278. e->normal.isNmstrt3 = unknown_isNmstrt;
  1279. e->normal.isNmstrt4 = unknown_isNmstrt;
  1280. e->normal.isInvalid2 = unknown_isInvalid;
  1281. e->normal.isInvalid3 = unknown_isInvalid;
  1282. e->normal.isInvalid4 = unknown_isInvalid;
  1283. }
  1284. e->normal.enc.utf8Convert = unknown_toUtf8;
  1285. e->normal.enc.utf16Convert = unknown_toUtf16;
  1286. return &(e->normal.enc);
  1287. }
  1288. /* If this enumeration is changed, getEncodingIndex and encodings
  1289. must also be changed. */
  1290. enum {
  1291. UNKNOWN_ENC = -1,
  1292. ISO_8859_1_ENC = 0,
  1293. US_ASCII_ENC,
  1294. UTF_8_ENC,
  1295. UTF_16_ENC,
  1296. UTF_16BE_ENC,
  1297. UTF_16LE_ENC,
  1298. /* must match encodingNames up to here */
  1299. NO_ENC
  1300. };
/* NUL-terminated names of the built-in encodings, spelled with ASCII_*
   constants.  getEncodingIndex compares against them without regard to
   letter case. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
  1323. static int FASTCALL
  1324. getEncodingIndex(const char *name)
  1325. {
  1326. static const char * const encodingNames[] = {
  1327. KW_ISO_8859_1,
  1328. KW_US_ASCII,
  1329. KW_UTF_8,
  1330. KW_UTF_16,
  1331. KW_UTF_16BE,
  1332. KW_UTF_16LE,
  1333. };
  1334. int i;
  1335. if (name == NULL)
  1336. return NO_ENC;
  1337. for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1338. if (streqci(name, encodingNames[i]))
  1339. return i;
  1340. return UNKNOWN_ENC;
  1341. }
  1342. /* For binary compatibility, we store the index of the encoding
  1343. specified at initialization in the isUtf16 member.
  1344. */
  1345. #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  1346. #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first bytes of [ptr, end) are inspected for a byte order mark or
   for a recognisable UTF-16 pattern.  The chosen encoding is stored
   through enc->encPtr.

   Returns XML_TOK_NONE when the input is empty, XML_TOK_PARTIAL when
   more bytes are needed to decide, XML_TOK_BOM when a byte order mark
   was recognised (*nextTokPtr then points just past it), and otherwise
   the result of XmlTok run with the chosen encoding.
*/
static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* With an external ISO-8859-1 label in an external text entity
         these bytes are treated as data, not as the start of a BOM. */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* Dispatch on the first two bytes combined, first byte in the high
       half. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* FE FF: treated as a big-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* 3C 00: '<' in little-endian UTF-16, with no byte order mark */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* FF FE: treated as a little-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* The third byte is needed to confirm the BOM. */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing conclusive was detected: use the externally specified
     encoding. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
/* xmltok_ns.c is included as a template, parameterised by the NS() and
   ns() name-decorating macros.  XML_TOK_NS_C is defined around each
   inclusion.
   NOTE(review): which identifiers xmltok_ns.c wraps in NS()/ns() is not
   visible here -- check that file. */

/* First instantiation: NS() and ns() leave names unchanged. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
#ifdef XML_NS
/* Second instantiation, only when XML_NS is defined: NS() appends "NS"
   and ns() appends "_ns" to each name. */
#define NS(x) x ## NS
#define ns(x) x ## _ns
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
  1488. ENCODING *
  1489. XmlInitUnknownEncodingNS(void *mem,
  1490. int *table,
  1491. CONVERTER convert,
  1492. void *userData)
  1493. {
  1494. ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1495. if (enc)
  1496. ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1497. return enc;
  1498. }
  1499. #endif /* XML_NS */