1
0

xmltok.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584
  1. /*
  2. Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  3. See the file COPYING for copying permission.
  4. */
  5. #include <cmexpat/expatConfig.h>
  6. #include "xmltok.h"
  7. #include "nametab.h"
  8. #if defined(__BORLANDC__)
  9. #pragma warn -8008 // Disable "condition is always true" warning.
  10. #pragma warn -8066 // Disable "unreachable code" warning.
  11. #endif
  12. #ifdef XML_DTD
  13. #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
  14. #else
  15. #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
  16. #endif
  17. #define VTABLE1 \
  18. { PREFIX(prologTok), PREFIX(contentTok), \
  19. PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  20. { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  21. PREFIX(sameName), \
  22. PREFIX(nameMatchesAscii), \
  23. PREFIX(nameLength), \
  24. PREFIX(skipS), \
  25. PREFIX(getAtts), \
  26. PREFIX(charRefNumber), \
  27. PREFIX(predefinedEntityName), \
  28. PREFIX(updatePosition), \
  29. PREFIX(isPublicId)
  30. #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  31. #define UCS2_GET_NAMING(pages, hi, lo) \
  32. (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
  33. /* A 2 byte UTF-8 representation splits the characters 11 bits
  34. between the bottom 5 and 6 bits of the bytes.
  35. We need 8 bits to index into pages, 3 bits to add to that index and
  36. 5 bits to generate the mask. */
  37. #define UTF8_GET_NAMING2(pages, byte) \
  38. (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
  39. + ((((byte)[0]) & 3) << 1) \
  40. + ((((byte)[1]) >> 5) & 1)] \
  41. & (1 << (((byte)[1]) & 0x1F)))
  42. /* A 3 byte UTF-8 representation splits the characters 16 bits
  43. between the bottom 4, 6 and 6 bits of the bytes.
  44. We need 8 bits to index into pages, 3 bits to add to that index and
  45. 5 bits to generate the mask. */
  46. #define UTF8_GET_NAMING3(pages, byte) \
  47. (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
  48. + ((((byte)[1]) >> 2) & 0xF)] \
  49. << 3) \
  50. + ((((byte)[1]) & 3) << 1) \
  51. + ((((byte)[2]) >> 5) & 1)] \
  52. & (1 << (((byte)[2]) & 0x1F)))
  53. #define UTF8_GET_NAMING(pages, p, n) \
  54. ((n) == 2 \
  55. ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  56. : ((n) == 3 \
  57. ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
  58. : 0))
  59. #define UTF8_INVALID3(p) \
  60. ((*p) == 0xED \
  61. ? (((p)[1] & 0x20) != 0) \
  62. : ((*p) == 0xEF \
  63. ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
  64. : 0))
  65. #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
  66. static
  67. int isNever(const ENCODING *enc, const char *p)
  68. {
  69. cmExpatUnused(enc);
  70. cmExpatUnused(p);
  71. return 0;
  72. }
  73. static
  74. int utf8_isName2(const ENCODING *enc, const char *p)
  75. {
  76. cmExpatUnused(enc);
  77. return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
  78. }
  79. static
  80. int utf8_isName3(const ENCODING *enc, const char *p)
  81. {
  82. cmExpatUnused(enc);
  83. return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
  84. }
  85. #define utf8_isName4 isNever
  86. static
  87. int utf8_isNmstrt2(const ENCODING *enc, const char *p)
  88. {
  89. cmExpatUnused(enc);
  90. return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
  91. }
  92. static
  93. int utf8_isNmstrt3(const ENCODING *enc, const char *p)
  94. {
  95. cmExpatUnused(enc);
  96. return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
  97. }
  98. #define utf8_isNmstrt4 isNever
  99. #define utf8_isInvalid2 isNever
  100. static
  101. int utf8_isInvalid3(const ENCODING *enc, const char *p)
  102. {
  103. cmExpatUnused(enc);
  104. return UTF8_INVALID3((const unsigned char *)p);
  105. }
  106. static
  107. int utf8_isInvalid4(const ENCODING *enc, const char *p)
  108. {
  109. cmExpatUnused(enc);
  110. return UTF8_INVALID4((const unsigned char *)p);
  111. }
  112. struct normal_encoding {
  113. ENCODING enc;
  114. unsigned char type[256];
  115. #ifdef XML_MIN_SIZE
  116. int (*byteType)(const ENCODING *, const char *);
  117. int (*isNameMin)(const ENCODING *, const char *);
  118. int (*isNmstrtMin)(const ENCODING *, const char *);
  119. int (*byteToAscii)(const ENCODING *, const char *);
  120. int (*charMatches)(const ENCODING *, const char *, int);
  121. #endif /* XML_MIN_SIZE */
  122. int (*isName2)(const ENCODING *, const char *);
  123. int (*isName3)(const ENCODING *, const char *);
  124. int (*isName4)(const ENCODING *, const char *);
  125. int (*isNmstrt2)(const ENCODING *, const char *);
  126. int (*isNmstrt3)(const ENCODING *, const char *);
  127. int (*isNmstrt4)(const ENCODING *, const char *);
  128. int (*isInvalid2)(const ENCODING *, const char *);
  129. int (*isInvalid3)(const ENCODING *, const char *);
  130. int (*isInvalid4)(const ENCODING *, const char *);
  131. };
  132. #ifdef XML_MIN_SIZE
  133. #define STANDARD_VTABLE(E) \
  134. E ## byteType, \
  135. E ## isNameMin, \
  136. E ## isNmstrtMin, \
  137. E ## byteToAscii, \
  138. E ## charMatches,
  139. #else
  140. #define STANDARD_VTABLE(E) /* as nothing */
  141. #endif
  142. #define NORMAL_VTABLE(E) \
  143. E ## isName2, \
  144. E ## isName3, \
  145. E ## isName4, \
  146. E ## isNmstrt2, \
  147. E ## isNmstrt3, \
  148. E ## isNmstrt4, \
  149. E ## isInvalid2, \
  150. E ## isInvalid3, \
  151. E ## isInvalid4
  152. #define EMPTY_VTABLE(E) 0, 0, 0, 0, 0, 0, 0, 0, 0
  153. static int checkCharRefNumber(int);
  154. #include "xmltok_impl.h"
  155. #include "ascii.h"
  156. #ifdef XML_MIN_SIZE
  157. #define sb_isNameMin isNever
  158. #define sb_isNmstrtMin isNever
  159. #endif
  160. #ifdef XML_MIN_SIZE
  161. #define MINBPC(enc) ((enc)->minBytesPerChar)
  162. #else
  163. /* minimum bytes per character */
  164. #define MINBPC(enc) 1
  165. #endif
  166. #define SB_BYTE_TYPE(enc, p) \
  167. (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  168. #ifdef XML_MIN_SIZE
  169. static
  170. int sb_byteType(const ENCODING *enc, const char *p)
  171. {
  172. return SB_BYTE_TYPE(enc, p);
  173. }
  174. #define BYTE_TYPE(enc, p) \
  175. (((const struct normal_encoding *)(enc))->byteType(enc, p))
  176. #else
  177. #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  178. #endif
  179. #ifdef XML_MIN_SIZE
  180. #define BYTE_TO_ASCII(enc, p) \
  181. (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
  182. static
  183. int sb_byteToAscii(const ENCODING *enc, const char *p)
  184. {
  185. return *p;
  186. }
  187. #else
  188. #define BYTE_TO_ASCII(enc, p) (*(p))
  189. #endif
  190. #define IS_NAME_CHAR(enc, p, n) \
  191. (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
  192. #define IS_NMSTRT_CHAR(enc, p, n) \
  193. (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
  194. #define IS_INVALID_CHAR(enc, p, n) \
  195. (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
  196. #ifdef XML_MIN_SIZE
  197. #define IS_NAME_CHAR_MINBPC(enc, p) \
  198. (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
  199. #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  200. (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
  201. #else
  202. #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  203. #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  204. #endif
  205. #ifdef XML_MIN_SIZE
  206. #define CHAR_MATCHES(enc, p, c) \
  207. (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
  208. static
  209. int sb_charMatches(const ENCODING *enc, const char *p, int c)
  210. {
  211. return *p == c;
  212. }
  213. #else
  214. /* c is an ASCII character */
  215. #define CHAR_MATCHES(enc, p, c) (*(p) == c)
  216. #endif
  217. #define PREFIX(ident) normal_ ## ident
  218. #include "xmltok_impl.c"
  219. #undef MINBPC
  220. #undef BYTE_TYPE
  221. #undef BYTE_TO_ASCII
  222. #undef CHAR_MATCHES
  223. #undef IS_NAME_CHAR
  224. #undef IS_NAME_CHAR_MINBPC
  225. #undef IS_NMSTRT_CHAR
  226. #undef IS_NMSTRT_CHAR_MINBPC
  227. #undef IS_INVALID_CHAR
  228. enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
  229. UTF8_cval1 = 0x00,
  230. UTF8_cval2 = 0xc0,
  231. UTF8_cval3 = 0xe0,
  232. UTF8_cval4 = 0xf0
  233. };
  234. static
  235. void utf8_toUtf8(const ENCODING *enc,
  236. const char **fromP, const char *fromLim,
  237. char **toP, const char *toLim)
  238. {
  239. char *to;
  240. const char *from;
  241. cmExpatUnused(enc);
  242. if (fromLim - *fromP > toLim - *toP) {
  243. /* Avoid copying partial characters. */
  244. for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
  245. if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
  246. break;
  247. }
  248. for (to = *toP, from = *fromP; from != fromLim; from++, to++)
  249. *to = *from;
  250. *fromP = from;
  251. *toP = to;
  252. }
  253. static
  254. void utf8_toUtf16(const ENCODING *enc,
  255. const char **fromP, const char *fromLim,
  256. unsigned short **toP, const unsigned short *toLim)
  257. {
  258. unsigned short *to = *toP;
  259. const char *from = *fromP;
  260. while (from != fromLim && to != toLim) {
  261. switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
  262. case BT_LEAD2:
  263. *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
  264. from += 2;
  265. break;
  266. case BT_LEAD3:
  267. *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
  268. from += 3;
  269. break;
  270. case BT_LEAD4:
  271. {
  272. unsigned long n;
  273. if (to + 1 == toLim)
  274. break;
  275. n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
  276. n -= 0x10000;
  277. to[0] = (unsigned short)((n >> 10) | 0xD800);
  278. to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
  279. to += 2;
  280. from += 4;
  281. }
  282. break;
  283. default:
  284. *to++ = *from++;
  285. break;
  286. }
  287. }
  288. *fromP = from;
  289. *toP = to;
  290. }
  291. #ifdef XML_NS
  292. static const struct normal_encoding utf8_encoding_ns = {
  293. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  294. {
  295. #include "asciitab.h"
  296. #include "utf8tab.h"
  297. },
  298. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  299. };
  300. #endif
  301. static const struct normal_encoding utf8_encoding = {
  302. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  303. {
  304. #define BT_COLON BT_NMSTRT
  305. #include "asciitab.h"
  306. #undef BT_COLON
  307. #include "utf8tab.h"
  308. },
  309. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  310. };
  311. #ifdef XML_NS
  312. static const struct normal_encoding internal_utf8_encoding_ns = {
  313. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  314. {
  315. #include "iasciitab.h"
  316. #include "utf8tab.h"
  317. },
  318. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  319. };
  320. #endif
  321. static const struct normal_encoding internal_utf8_encoding = {
  322. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  323. {
  324. #define BT_COLON BT_NMSTRT
  325. #include "iasciitab.h"
  326. #undef BT_COLON
  327. #include "utf8tab.h"
  328. },
  329. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  330. };
  331. static
  332. void latin1_toUtf8(const ENCODING *enc,
  333. const char **fromP, const char *fromLim,
  334. char **toP, const char *toLim)
  335. {
  336. cmExpatUnused(enc);
  337. for (;;) {
  338. unsigned char c;
  339. if (*fromP == fromLim)
  340. break;
  341. c = (unsigned char)**fromP;
  342. if (c & 0x80) {
  343. if (toLim - *toP < 2)
  344. break;
  345. *(*toP)++ = ((c >> 6) | UTF8_cval2);
  346. *(*toP)++ = ((c & 0x3f) | 0x80);
  347. (*fromP)++;
  348. }
  349. else {
  350. if (*toP == toLim)
  351. break;
  352. *(*toP)++ = *(*fromP)++;
  353. }
  354. }
  355. }
  356. static
  357. void latin1_toUtf16(const ENCODING *enc,
  358. const char **fromP, const char *fromLim,
  359. unsigned short **toP, const unsigned short *toLim)
  360. {
  361. cmExpatUnused(enc);
  362. while (*fromP != fromLim && *toP != toLim)
  363. *(*toP)++ = (unsigned char)*(*fromP)++;
  364. }
  365. #ifdef XML_NS
  366. static const struct normal_encoding latin1_encoding_ns = {
  367. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  368. {
  369. #include "asciitab.h"
  370. #include "latin1tab.h"
  371. },
  372. STANDARD_VTABLE(sb_) EMPTY_VTABLE(sb_)
  373. };
  374. #endif
  375. static const struct normal_encoding latin1_encoding = {
  376. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  377. {
  378. #define BT_COLON BT_NMSTRT
  379. #include "asciitab.h"
  380. #undef BT_COLON
  381. #include "latin1tab.h"
  382. },
  383. STANDARD_VTABLE(sb_) EMPTY_VTABLE(sb_)
  384. };
  385. static
  386. void ascii_toUtf8(const ENCODING *enc,
  387. const char **fromP, const char *fromLim,
  388. char **toP, const char *toLim)
  389. {
  390. cmExpatUnused(enc);
  391. while (*fromP != fromLim && *toP != toLim)
  392. *(*toP)++ = *(*fromP)++;
  393. }
  394. #ifdef XML_NS
  395. static const struct normal_encoding ascii_encoding_ns = {
  396. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  397. {
  398. #include "asciitab.h"
  399. /* BT_NONXML == 0 */
  400. },
  401. STANDARD_VTABLE(sb_) EMPTY_VTABLE(sb_)
  402. };
  403. #endif
  404. static const struct normal_encoding ascii_encoding = {
  405. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  406. {
  407. #define BT_COLON BT_NMSTRT
  408. #include "asciitab.h"
  409. #undef BT_COLON
  410. /* BT_NONXML == 0 */
  411. },
  412. STANDARD_VTABLE(sb_) EMPTY_VTABLE(sb_)
  413. };
  414. static int unicode_byte_type(char hi, char lo)
  415. {
  416. switch ((unsigned char)hi) {
  417. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  418. return BT_LEAD4;
  419. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  420. return BT_TRAIL;
  421. case 0xFF:
  422. switch ((unsigned char)lo) {
  423. case 0xFF:
  424. case 0xFE:
  425. return BT_NONXML;
  426. }
  427. break;
  428. }
  429. return BT_NONASCII;
  430. }
  431. #define DEFINE_UTF16_TO_UTF8(E) \
  432. static \
  433. void E ## toUtf8(const ENCODING *enc, \
  434. const char **fromP, const char *fromLim, \
  435. char **toP, const char *toLim) \
  436. { \
  437. const char *from; \
  438. cmExpatUnused(enc);\
  439. for (from = *fromP; from != fromLim; from += 2) { \
  440. int plane; \
  441. unsigned char lo2; \
  442. unsigned char lo = GET_LO(from); \
  443. unsigned char hi = GET_HI(from); \
  444. switch (hi) { \
  445. case 0: \
  446. if (lo < 0x80) { \
  447. if (*toP == toLim) { \
  448. *fromP = from; \
  449. return; \
  450. } \
  451. *(*toP)++ = lo; \
  452. break; \
  453. } \
  454. /* fall through */ \
  455. case 0x1: case 0x2: case 0x3: \
  456. case 0x4: case 0x5: case 0x6: case 0x7: \
  457. if (toLim - *toP < 2) { \
  458. *fromP = from; \
  459. return; \
  460. } \
  461. *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
  462. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  463. break; \
  464. default: \
  465. if (toLim - *toP < 3) { \
  466. *fromP = from; \
  467. return; \
  468. } \
  469. /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
  470. *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
  471. *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
  472. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  473. break; \
  474. case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
  475. if (toLim - *toP < 4) { \
  476. *fromP = from; \
  477. return; \
  478. } \
  479. plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
  480. *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
  481. *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
  482. from += 2; \
  483. lo2 = GET_LO(from); \
  484. *(*toP)++ = (((lo & 0x3) << 4) \
  485. | ((GET_HI(from) & 0x3) << 2) \
  486. | (lo2 >> 6) \
  487. | 0x80); \
  488. *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
  489. break; \
  490. } \
  491. } \
  492. *fromP = from; \
  493. }
  494. #define DEFINE_UTF16_TO_UTF16(E) \
  495. static \
  496. void E ## toUtf16(const ENCODING *enc, \
  497. const char **fromP, const char *fromLim, \
  498. unsigned short **toP, const unsigned short *toLim) \
  499. { \
  500. cmExpatUnused(enc);\
  501. /* Avoid copying first half only of surrogate */ \
  502. if (fromLim - *fromP > ((toLim - *toP) << 1) \
  503. && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
  504. fromLim -= 2; \
  505. for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
  506. *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  507. }
  508. #define SET2(ptr, ch) \
  509. (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  510. #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  511. #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  512. DEFINE_UTF16_TO_UTF8(little2_)
  513. DEFINE_UTF16_TO_UTF16(little2_)
  514. #undef SET2
  515. #undef GET_LO
  516. #undef GET_HI
  517. #define SET2(ptr, ch) \
  518. (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  519. #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  520. #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  521. DEFINE_UTF16_TO_UTF8(big2_)
  522. DEFINE_UTF16_TO_UTF16(big2_)
  523. #undef SET2
  524. #undef GET_LO
  525. #undef GET_HI
  526. #define LITTLE2_BYTE_TYPE(enc, p) \
  527. ((p)[1] == 0 \
  528. ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  529. : unicode_byte_type((p)[1], (p)[0]))
  530. #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  531. #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
  532. #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  533. UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
  534. #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  535. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
  536. #ifdef XML_MIN_SIZE
  537. static
  538. int little2_byteType(const ENCODING *enc, const char *p)
  539. {
  540. return LITTLE2_BYTE_TYPE(enc, p);
  541. }
  542. static
  543. int little2_byteToAscii(const ENCODING *enc, const char *p)
  544. {
  545. return LITTLE2_BYTE_TO_ASCII(enc, p);
  546. }
  547. static
  548. int little2_charMatches(const ENCODING *enc, const char *p, int c)
  549. {
  550. return LITTLE2_CHAR_MATCHES(enc, p, c);
  551. }
  552. static
  553. int little2_isNameMin(const ENCODING *enc, const char *p)
  554. {
  555. return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  556. }
  557. static
  558. int little2_isNmstrtMin(const ENCODING *enc, const char *p)
  559. {
  560. return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  561. }
  562. #undef VTABLE
  563. #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  564. #else /* not XML_MIN_SIZE */
  565. #undef PREFIX
  566. #define PREFIX(ident) little2_ ## ident
  567. #define MINBPC(enc) 2
  568. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  569. #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  570. #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
  571. #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  572. #define IS_NAME_CHAR(enc, p, n) 0
  573. #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  574. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  575. #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  576. #include "xmltok_impl.c"
  577. #undef MINBPC
  578. #undef BYTE_TYPE
  579. #undef BYTE_TO_ASCII
  580. #undef CHAR_MATCHES
  581. #undef IS_NAME_CHAR
  582. #undef IS_NAME_CHAR_MINBPC
  583. #undef IS_NMSTRT_CHAR
  584. #undef IS_NMSTRT_CHAR_MINBPC
  585. #undef IS_INVALID_CHAR
  586. #endif /* not XML_MIN_SIZE */
  587. #ifdef XML_NS
  588. static const struct normal_encoding little2_encoding_ns = {
  589. { VTABLE, 2, 0,
  590. #if XML_BYTE_ORDER == 12
  591. 1
  592. #else
  593. 0
  594. #endif
  595. },
  596. {
  597. #include "asciitab.h"
  598. #include "latin1tab.h"
  599. },
  600. STANDARD_VTABLE(little2_) EMPTY_VTABLE(little2_)
  601. };
  602. #endif
  603. static const struct normal_encoding little2_encoding = {
  604. { VTABLE, 2, 0,
  605. #if XML_BYTE_ORDER == 12
  606. 1
  607. #else
  608. 0
  609. #endif
  610. },
  611. {
  612. #define BT_COLON BT_NMSTRT
  613. #include "asciitab.h"
  614. #undef BT_COLON
  615. #include "latin1tab.h"
  616. },
  617. STANDARD_VTABLE(little2_) EMPTY_VTABLE(little2_)
  618. };
  619. #if XML_BYTE_ORDER != 21
  620. #ifdef XML_NS
  621. static const struct normal_encoding internal_little2_encoding_ns = {
  622. { VTABLE, 2, 0, 1 },
  623. {
  624. #include "iasciitab.h"
  625. #include "latin1tab.h"
  626. },
  627. STANDARD_VTABLE(little2_) EMPTY_VTABLE(little2_)
  628. };
  629. #endif
  630. static const struct normal_encoding internal_little2_encoding = {
  631. { VTABLE, 2, 0, 1 },
  632. {
  633. #define BT_COLON BT_NMSTRT
  634. #include "iasciitab.h"
  635. #undef BT_COLON
  636. #include "latin1tab.h"
  637. },
  638. STANDARD_VTABLE(little2_) EMPTY_VTABLE(little2_)
  639. };
  640. #endif
  641. #define BIG2_BYTE_TYPE(enc, p) \
  642. ((p)[0] == 0 \
  643. ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  644. : unicode_byte_type((p)[0], (p)[1]))
  645. #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  646. #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
  647. #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  648. UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
  649. #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  650. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
  651. #ifdef XML_MIN_SIZE
  652. static
  653. int big2_byteType(const ENCODING *enc, const char *p)
  654. {
  655. return BIG2_BYTE_TYPE(enc, p);
  656. }
  657. static
  658. int big2_byteToAscii(const ENCODING *enc, const char *p)
  659. {
  660. return BIG2_BYTE_TO_ASCII(enc, p);
  661. }
  662. static
  663. int big2_charMatches(const ENCODING *enc, const char *p, int c)
  664. {
  665. return BIG2_CHAR_MATCHES(enc, p, c);
  666. }
  667. static
  668. int big2_isNameMin(const ENCODING *enc, const char *p)
  669. {
  670. return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  671. }
  672. static
  673. int big2_isNmstrtMin(const ENCODING *enc, const char *p)
  674. {
  675. return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  676. }
  677. #undef VTABLE
  678. #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  679. #else /* not XML_MIN_SIZE */
  680. #undef PREFIX
  681. #define PREFIX(ident) big2_ ## ident
  682. #define MINBPC(enc) 2
  683. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  684. #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  685. #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
  686. #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  687. #define IS_NAME_CHAR(enc, p, n) 0
  688. #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  689. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  690. #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  691. #include "xmltok_impl.c"
  692. #undef MINBPC
  693. #undef BYTE_TYPE
  694. #undef BYTE_TO_ASCII
  695. #undef CHAR_MATCHES
  696. #undef IS_NAME_CHAR
  697. #undef IS_NAME_CHAR_MINBPC
  698. #undef IS_NMSTRT_CHAR
  699. #undef IS_NMSTRT_CHAR_MINBPC
  700. #undef IS_INVALID_CHAR
  701. #endif /* not XML_MIN_SIZE */
  702. #ifdef XML_NS
  703. static const struct normal_encoding big2_encoding_ns = {
  704. { VTABLE, 2, 0,
  705. #if XML_BYTE_ORDER == 21
  706. 1
  707. #else
  708. 0
  709. #endif
  710. },
  711. {
  712. #include "asciitab.h"
  713. #include "latin1tab.h"
  714. },
  715. STANDARD_VTABLE(big2_) EMPTY_VTABLE(big2_)
  716. };
  717. #endif
  718. static const struct normal_encoding big2_encoding = {
  719. { VTABLE, 2, 0,
  720. #if XML_BYTE_ORDER == 21
  721. 1
  722. #else
  723. 0
  724. #endif
  725. },
  726. {
  727. #define BT_COLON BT_NMSTRT
  728. #include "asciitab.h"
  729. #undef BT_COLON
  730. #include "latin1tab.h"
  731. },
  732. STANDARD_VTABLE(big2_) EMPTY_VTABLE(big2_)
  733. };
  734. #if XML_BYTE_ORDER != 12
  735. #ifdef XML_NS
  736. static const struct normal_encoding internal_big2_encoding_ns = {
  737. { VTABLE, 2, 0, 1 },
  738. {
  739. #include "iasciitab.h"
  740. #include "latin1tab.h"
  741. },
  742. STANDARD_VTABLE(big2_) EMPTY_VTABLE(big2_)
  743. };
  744. #endif
  745. static const struct normal_encoding internal_big2_encoding = {
  746. { VTABLE, 2, 0, 1 },
  747. {
  748. #define BT_COLON BT_NMSTRT
  749. #include "iasciitab.h"
  750. #undef BT_COLON
  751. #include "latin1tab.h"
  752. },
  753. STANDARD_VTABLE(big2_) EMPTY_VTABLE(big2_)
  754. };
  755. #endif
  756. #undef PREFIX
  757. static
  758. int streqci(const char *s1, const char *s2)
  759. {
  760. for (;;) {
  761. char c1 = *s1++;
  762. char c2 = *s2++;
  763. if (ASCII_a <= c1 && c1 <= ASCII_z)
  764. c1 += ASCII_A - ASCII_a;
  765. if (ASCII_a <= c2 && c2 <= ASCII_z)
  766. c2 += ASCII_A - ASCII_a;
  767. if (c1 != c2)
  768. return 0;
  769. if (!c1)
  770. break;
  771. }
  772. return 1;
  773. }
  774. static
  775. void initUpdatePosition(const ENCODING *enc, const char *ptr,
  776. const char *end, POSITION *pos)
  777. {
  778. cmExpatUnused(enc);
  779. normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
  780. }
  781. static
  782. int toAscii(const ENCODING *enc, const char *ptr, const char *end)
  783. {
  784. char buf[1];
  785. char *p = buf;
  786. XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  787. if (p == buf)
  788. return -1;
  789. else
  790. return buf[0];
  791. }
  792. static
  793. int isSpace(int c)
  794. {
  795. switch (c) {
  796. case 0x20:
  797. case 0xD:
  798. case 0xA:
  799. case 0x9:
  800. return 1;
  801. }
  802. return 0;
  803. }
  804. /* Return 1 if there's just optional white space
  805. or there's an S followed by name=val. */
  806. static
  807. int parsePseudoAttribute(const ENCODING *enc,
  808. const char *ptr,
  809. const char *end,
  810. const char **namePtr,
  811. const char **nameEndPtr,
  812. const char **valPtr,
  813. const char **nextTokPtr)
  814. {
  815. int c;
  816. char open;
  817. if (ptr == end) {
  818. *namePtr = 0;
  819. return 1;
  820. }
  821. if (!isSpace(toAscii(enc, ptr, end))) {
  822. *nextTokPtr = ptr;
  823. return 0;
  824. }
  825. do {
  826. ptr += enc->minBytesPerChar;
  827. } while (isSpace(toAscii(enc, ptr, end)));
  828. if (ptr == end) {
  829. *namePtr = 0;
  830. return 1;
  831. }
  832. *namePtr = ptr;
  833. for (;;) {
  834. c = toAscii(enc, ptr, end);
  835. if (c == -1) {
  836. *nextTokPtr = ptr;
  837. return 0;
  838. }
  839. if (c == ASCII_EQUALS) {
  840. *nameEndPtr = ptr;
  841. break;
  842. }
  843. if (isSpace(c)) {
  844. *nameEndPtr = ptr;
  845. do {
  846. ptr += enc->minBytesPerChar;
  847. } while (isSpace(c = toAscii(enc, ptr, end)));
  848. if (c != ASCII_EQUALS) {
  849. *nextTokPtr = ptr;
  850. return 0;
  851. }
  852. break;
  853. }
  854. ptr += enc->minBytesPerChar;
  855. }
  856. if (ptr == *namePtr) {
  857. *nextTokPtr = ptr;
  858. return 0;
  859. }
  860. ptr += enc->minBytesPerChar;
  861. c = toAscii(enc, ptr, end);
  862. while (isSpace(c)) {
  863. ptr += enc->minBytesPerChar;
  864. c = toAscii(enc, ptr, end);
  865. }
  866. if (c != ASCII_QUOT && c != ASCII_APOS) {
  867. *nextTokPtr = ptr;
  868. return 0;
  869. }
  870. open = c;
  871. ptr += enc->minBytesPerChar;
  872. *valPtr = ptr;
  873. for (;; ptr += enc->minBytesPerChar) {
  874. c = toAscii(enc, ptr, end);
  875. if (c == open)
  876. break;
  877. if (!(ASCII_a <= c && c <= ASCII_z)
  878. && !(ASCII_A <= c && c <= ASCII_Z)
  879. && !(ASCII_0 <= c && c <= ASCII_9)
  880. && c != ASCII_PERIOD
  881. && c != ASCII_MINUS
  882. && c != ASCII_UNDERSCORE) {
  883. *nextTokPtr = ptr;
  884. return 0;
  885. }
  886. }
  887. *nextTokPtr = ptr + enc->minBytesPerChar;
  888. return 1;
  889. }
  890. static const char KW_version[] = {
  891. ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
  892. };
  893. static const char KW_encoding[] = {
  894. ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
  895. };
  896. static const char KW_standalone[] = {
  897. ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
  898. };
  899. static const char KW_yes[] = {
  900. ASCII_y, ASCII_e, ASCII_s, '\0'
  901. };
  902. static const char KW_no[] = {
  903. ASCII_n, ASCII_o, '\0'
  904. };
  905. static
  906. int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
  907. const char *,
  908. const char *),
  909. int isGeneralTextEntity,
  910. const ENCODING *enc,
  911. const char *ptr,
  912. const char *end,
  913. const char **badPtr,
  914. const char **versionPtr,
  915. const char **versionEndPtr,
  916. const char **encodingName,
  917. const ENCODING **encoding,
  918. int *standalone)
  919. {
  920. const char *val = 0;
  921. const char *name = 0;
  922. const char *nameEnd = 0;
  923. ptr += 5 * enc->minBytesPerChar;
  924. end -= 2 * enc->minBytesPerChar;
  925. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) {
  926. *badPtr = ptr;
  927. return 0;
  928. }
  929. if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
  930. if (!isGeneralTextEntity) {
  931. *badPtr = name;
  932. return 0;
  933. }
  934. }
  935. else {
  936. if (versionPtr)
  937. *versionPtr = val;
  938. if (versionEndPtr)
  939. *versionEndPtr = ptr;
  940. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  941. *badPtr = ptr;
  942. return 0;
  943. }
  944. if (!name) {
  945. if (isGeneralTextEntity) {
  946. /* a TextDecl must have an EncodingDecl */
  947. *badPtr = ptr;
  948. return 0;
  949. }
  950. return 1;
  951. }
  952. }
  953. if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
  954. int c = toAscii(enc, val, end);
  955. if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
  956. *badPtr = val;
  957. return 0;
  958. }
  959. if (encodingName)
  960. *encodingName = val;
  961. if (encoding)
  962. *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
  963. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  964. *badPtr = ptr;
  965. return 0;
  966. }
  967. if (!name)
  968. return 1;
  969. }
  970. if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) || isGeneralTextEntity) {
  971. *badPtr = name;
  972. return 0;
  973. }
  974. if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
  975. if (standalone)
  976. *standalone = 1;
  977. }
  978. else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
  979. if (standalone)
  980. *standalone = 0;
  981. }
  982. else {
  983. *badPtr = val;
  984. return 0;
  985. }
  986. while (isSpace(toAscii(enc, ptr, end)))
  987. ptr += enc->minBytesPerChar;
  988. if (ptr != end) {
  989. *badPtr = ptr;
  990. return 0;
  991. }
  992. return 1;
  993. }
  994. static
  995. int checkCharRefNumber(int result)
  996. {
  997. switch (result >> 8) {
  998. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  999. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1000. return -1;
  1001. case 0:
  1002. if (latin1_encoding.type[result] == BT_NONXML)
  1003. return -1;
  1004. break;
  1005. case 0xFF:
  1006. if (result == 0xFFFE || result == 0xFFFF)
  1007. return -1;
  1008. break;
  1009. }
  1010. return result;
  1011. }
  1012. int XmlUtf8Encode(int c, char *buf)
  1013. {
  1014. enum {
  1015. /* minN is minimum legal resulting value for N byte sequence */
  1016. min2 = 0x80,
  1017. min3 = 0x800,
  1018. min4 = 0x10000
  1019. };
  1020. if (c < 0)
  1021. return 0;
  1022. if (c < min2) {
  1023. buf[0] = (c | UTF8_cval1);
  1024. return 1;
  1025. }
  1026. if (c < min3) {
  1027. buf[0] = ((c >> 6) | UTF8_cval2);
  1028. buf[1] = ((c & 0x3f) | 0x80);
  1029. return 2;
  1030. }
  1031. if (c < min4) {
  1032. buf[0] = ((c >> 12) | UTF8_cval3);
  1033. buf[1] = (((c >> 6) & 0x3f) | 0x80);
  1034. buf[2] = ((c & 0x3f) | 0x80);
  1035. return 3;
  1036. }
  1037. if (c < 0x110000) {
  1038. buf[0] = ((c >> 18) | UTF8_cval4);
  1039. buf[1] = (((c >> 12) & 0x3f) | 0x80);
  1040. buf[2] = (((c >> 6) & 0x3f) | 0x80);
  1041. buf[3] = ((c & 0x3f) | 0x80);
  1042. return 4;
  1043. }
  1044. return 0;
  1045. }
  /* Encode character number charNum as UTF-16 code units into buf.
     Returns 1 for values below 0x10000, 2 (a surrogate pair) for values
     below 0x110000, and 0 for negative or larger values. */
  int XmlUtf16Encode(int charNum, unsigned short *buf)
  {
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
      return 0;
    if (charNum < 0x10000) {
      buf[0] = (unsigned short)charNum;
      return 1;
    }
    /* Split the 20-bit offset into high and low surrogates. */
    offset = charNum - 0x10000;
    buf[0] = (unsigned short)(0xD800 + (offset >> 10));
    buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
    return 2;
  }
  1062. struct unknown_encoding {
  1063. struct normal_encoding normal;
  1064. int (*convert)(void *userData, const char *p);
  1065. void *userData;
  1066. unsigned short utf16[256];
  1067. char utf8[256][4];
  1068. };
  1069. int XmlSizeOfUnknownEncoding(void)
  1070. {
  1071. return sizeof(struct unknown_encoding);
  1072. }
  1073. static
  1074. int unknown_isName(const ENCODING *enc, const char *p)
  1075. {
  1076. int c = ((const struct unknown_encoding *)enc)
  1077. ->convert(((const struct unknown_encoding *)enc)->userData, p);
  1078. if (c & ~0xFFFF)
  1079. return 0;
  1080. return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1081. }
  1082. static
  1083. int unknown_isNmstrt(const ENCODING *enc, const char *p)
  1084. {
  1085. int c = ((const struct unknown_encoding *)enc)
  1086. ->convert(((const struct unknown_encoding *)enc)->userData, p);
  1087. if (c & ~0xFFFF)
  1088. return 0;
  1089. return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1090. }
  1091. static
  1092. int unknown_isInvalid(const ENCODING *enc, const char *p)
  1093. {
  1094. int c = ((const struct unknown_encoding *)enc)
  1095. ->convert(((const struct unknown_encoding *)enc)->userData, p);
  1096. return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1097. }
  1098. static
  1099. void unknown_toUtf8(const ENCODING *enc,
  1100. const char **fromP, const char *fromLim,
  1101. char **toP, const char *toLim)
  1102. {
  1103. char buf[XML_UTF8_ENCODE_MAX];
  1104. for (;;) {
  1105. const char *utf8;
  1106. int n;
  1107. if (*fromP == fromLim)
  1108. break;
  1109. utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
  1110. n = *utf8++;
  1111. if (n == 0) {
  1112. int c = ((const struct unknown_encoding *)enc)
  1113. ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
  1114. n = XmlUtf8Encode(c, buf);
  1115. if (n > toLim - *toP)
  1116. break;
  1117. utf8 = buf;
  1118. *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
  1119. - (BT_LEAD2 - 2);
  1120. }
  1121. else {
  1122. if (n > toLim - *toP)
  1123. break;
  1124. (*fromP)++;
  1125. }
  1126. do {
  1127. *(*toP)++ = *utf8++;
  1128. } while (--n != 0);
  1129. }
  1130. }
  1131. static
  1132. void unknown_toUtf16(const ENCODING *enc,
  1133. const char **fromP, const char *fromLim,
  1134. unsigned short **toP, const unsigned short *toLim)
  1135. {
  1136. while (*fromP != fromLim && *toP != toLim) {
  1137. unsigned short c
  1138. = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
  1139. if (c == 0) {
  1140. c = (unsigned short)((const struct unknown_encoding *)enc)
  1141. ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
  1142. *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
  1143. - (BT_LEAD2 - 2);
  1144. }
  1145. else
  1146. (*fromP)++;
  1147. *(*toP)++ = c;
  1148. }
  1149. }
  1150. ENCODING *
  1151. XmlInitUnknownEncoding(void *mem,
  1152. int *table,
  1153. int (*convert)(void *userData, const char *p),
  1154. void *userData)
  1155. {
  1156. int i;
  1157. struct unknown_encoding *e = mem;
  1158. for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1159. ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1160. for (i = 0; i < 128; i++)
  1161. if (latin1_encoding.type[i] != BT_OTHER
  1162. && latin1_encoding.type[i] != BT_NONXML
  1163. && table[i] != i)
  1164. return 0;
  1165. for (i = 0; i < 256; i++) {
  1166. int c = table[i];
  1167. if (c == -1) {
  1168. e->normal.type[i] = BT_MALFORM;
  1169. /* This shouldn't really get used. */
  1170. e->utf16[i] = 0xFFFF;
  1171. e->utf8[i][0] = 1;
  1172. e->utf8[i][1] = 0;
  1173. }
  1174. else if (c < 0) {
  1175. if (c < -4)
  1176. return 0;
  1177. e->normal.type[i] = BT_LEAD2 - (c + 2);
  1178. e->utf8[i][0] = 0;
  1179. e->utf16[i] = 0;
  1180. }
  1181. else if (c < 0x80) {
  1182. if (latin1_encoding.type[c] != BT_OTHER
  1183. && latin1_encoding.type[c] != BT_NONXML
  1184. && c != i)
  1185. return 0;
  1186. e->normal.type[i] = latin1_encoding.type[c];
  1187. e->utf8[i][0] = 1;
  1188. e->utf8[i][1] = (char)c;
  1189. e->utf16[i] = c == 0 ? 0xFFFF : c;
  1190. }
  1191. else if (checkCharRefNumber(c) < 0) {
  1192. e->normal.type[i] = BT_NONXML;
  1193. /* This shouldn't really get used. */
  1194. e->utf16[i] = 0xFFFF;
  1195. e->utf8[i][0] = 1;
  1196. e->utf8[i][1] = 0;
  1197. }
  1198. else {
  1199. if (c > 0xFFFF)
  1200. return 0;
  1201. if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1202. e->normal.type[i] = BT_NMSTRT;
  1203. else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1204. e->normal.type[i] = BT_NAME;
  1205. else
  1206. e->normal.type[i] = BT_OTHER;
  1207. e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1208. e->utf16[i] = c;
  1209. }
  1210. }
  1211. e->userData = userData;
  1212. e->convert = convert;
  1213. if (convert) {
  1214. e->normal.isName2 = unknown_isName;
  1215. e->normal.isName3 = unknown_isName;
  1216. e->normal.isName4 = unknown_isName;
  1217. e->normal.isNmstrt2 = unknown_isNmstrt;
  1218. e->normal.isNmstrt3 = unknown_isNmstrt;
  1219. e->normal.isNmstrt4 = unknown_isNmstrt;
  1220. e->normal.isInvalid2 = unknown_isInvalid;
  1221. e->normal.isInvalid3 = unknown_isInvalid;
  1222. e->normal.isInvalid4 = unknown_isInvalid;
  1223. }
  1224. e->normal.enc.utf8Convert = unknown_toUtf8;
  1225. e->normal.enc.utf16Convert = unknown_toUtf16;
  1226. return &(e->normal.enc);
  1227. }
  /* Indices of the built-in encodings.  getEncodingIndex and the encodings
     tables depend on these values: change them together. */
  enum {
    UNKNOWN_ENC    = -1,
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC   = 1,
    UTF_8_ENC      = 2,
    UTF_16_ENC     = 3,
    UTF_16BE_ENC   = 4,
    UTF_16LE_ENC   = 5,
    /* must match encodingNames up to here */
    NO_ENC         = 6
  };
  1241. static const char KW_ISO_8859_1[] = {
  1242. ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
  1243. };
  1244. static const char KW_US_ASCII[] = {
  1245. ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
  1246. };
  1247. static const char KW_UTF_8[] = {
  1248. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
  1249. };
  1250. static const char KW_UTF_16[] = {
  1251. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
  1252. };
  1253. static const char KW_UTF_16BE[] = {
  1254. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
  1255. };
  1256. static const char KW_UTF_16LE[] = {
  1257. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
  1258. };
  1259. static
  1260. int getEncodingIndex(const char *name)
  1261. {
  1262. static const char *encodingNames[] = {
  1263. KW_ISO_8859_1,
  1264. KW_US_ASCII,
  1265. KW_UTF_8,
  1266. KW_UTF_16,
  1267. KW_UTF_16BE,
  1268. KW_UTF_16LE,
  1269. };
  1270. int i;
  1271. if (name == 0)
  1272. return NO_ENC;
  1273. for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1274. if (streqci(name, encodingNames[i]))
  1275. return i;
  1276. return UNKNOWN_ENC;
  1277. }
  /* For binary compatibility, the index of the encoding specified at
     initialization is stored in the isUtf16 member.

     INIT_ENC_INDEX(enc)        reads that index as an int.
     SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument is
     parenthesized so the cast applies to the whole expression passed in,
     not just to its first operand. */
  #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
  1282. /* This is what detects the encoding.
  1283. encodingTable maps from encoding indices to encodings;
  1284. INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
  1285. state is XML_CONTENT_STATE if we're parsing an external text entity,
  1286. and XML_PROLOG_STATE otherwise.
  1287. */
  1288. static
  1289. int initScan(const ENCODING **encodingTable,
  1290. const INIT_ENCODING *enc,
  1291. int state,
  1292. const char *ptr,
  1293. const char *end,
  1294. const char **nextTokPtr)
  1295. {
  1296. const ENCODING **encPtr;
  1297. if (ptr == end)
  1298. return XML_TOK_NONE;
  1299. encPtr = enc->encPtr;
  1300. if (ptr + 1 == end) {
  1301. /* only a single byte available for auto-detection */
  1302. #ifndef XML_DTD /* FIXME */
  1303. /* a well-formed document entity must have more than one byte */
  1304. if (state != XML_CONTENT_STATE)
  1305. return XML_TOK_PARTIAL;
  1306. #endif
  1307. /* so we're parsing an external text entity... */
  1308. /* if UTF-16 was externally specified, then we need at least 2 bytes */
  1309. switch (INIT_ENC_INDEX(enc)) {
  1310. case UTF_16_ENC:
  1311. case UTF_16LE_ENC:
  1312. case UTF_16BE_ENC:
  1313. return XML_TOK_PARTIAL;
  1314. }
  1315. switch ((unsigned char)*ptr) {
  1316. case 0xFE:
  1317. case 0xFF:
  1318. case 0xEF: /* possibly first byte of UTF-8 BOM */
  1319. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1320. && state == XML_CONTENT_STATE)
  1321. break;
  1322. /* fall through */
  1323. case 0x00:
  1324. case 0x3C:
  1325. return XML_TOK_PARTIAL;
  1326. }
  1327. }
  1328. else {
  1329. switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
  1330. case 0xFEFF:
  1331. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1332. && state == XML_CONTENT_STATE)
  1333. break;
  1334. *nextTokPtr = ptr + 2;
  1335. *encPtr = encodingTable[UTF_16BE_ENC];
  1336. return XML_TOK_BOM;
  1337. /* 00 3C is handled in the default case */
  1338. case 0x3C00:
  1339. if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
  1340. || INIT_ENC_INDEX(enc) == UTF_16_ENC)
  1341. && state == XML_CONTENT_STATE)
  1342. break;
  1343. *encPtr = encodingTable[UTF_16LE_ENC];
  1344. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1345. case 0xFFFE:
  1346. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1347. && state == XML_CONTENT_STATE)
  1348. break;
  1349. *nextTokPtr = ptr + 2;
  1350. *encPtr = encodingTable[UTF_16LE_ENC];
  1351. return XML_TOK_BOM;
  1352. case 0xEFBB:
  1353. /* Maybe a UTF-8 BOM (EF BB BF) */
  1354. /* If there's an explicitly specified (external) encoding
  1355. of ISO-8859-1 or some flavour of UTF-16
  1356. and this is an external text entity,
  1357. don't look for the BOM,
  1358. because it might be a legal data. */
  1359. if (state == XML_CONTENT_STATE) {
  1360. int e = INIT_ENC_INDEX(enc);
  1361. if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
  1362. break;
  1363. }
  1364. if (ptr + 2 == end)
  1365. return XML_TOK_PARTIAL;
  1366. if ((unsigned char)ptr[2] == 0xBF) {
  1367. *nextTokPtr = ptr + 3;
  1368. *encPtr = encodingTable[UTF_8_ENC];
  1369. return XML_TOK_BOM;
  1370. }
  1371. break;
  1372. default:
  1373. if (ptr[0] == '\0') {
  1374. /* 0 isn't a legal data character. Furthermore a document entity can only
  1375. start with ASCII characters. So the only way this can fail to be big-endian
  1376. UTF-16 if it it's an external parsed general entity that's labelled as
  1377. UTF-16LE. */
  1378. if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
  1379. break;
  1380. *encPtr = encodingTable[UTF_16BE_ENC];
  1381. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1382. }
  1383. else if (ptr[1] == '\0') {
  1384. /* We could recover here in the case:
  1385. - parsing an external entity
  1386. - second byte is 0
  1387. - no externally specified encoding
  1388. - no encoding declaration
  1389. by assuming UTF-16LE. But we don't, because this would mean when
  1390. presented just with a single byte, we couldn't reliably determine
  1391. whether we needed further bytes. */
  1392. if (state == XML_CONTENT_STATE)
  1393. break;
  1394. *encPtr = encodingTable[UTF_16LE_ENC];
  1395. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1396. }
  1397. break;
  1398. }
  1399. }
  1400. *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  1401. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1402. }
  1403. #define NS(x) x
  1404. #define ns(x) x
  1405. #include "xmltok_ns.c"
  1406. #undef NS
  1407. #undef ns
  1408. #ifdef XML_NS
  1409. #define NS(x) x ## NS
  1410. #define ns(x) x ## _ns
  1411. #include "xmltok_ns.c"
  1412. #undef NS
  1413. #undef ns
  1414. ENCODING *
  1415. XmlInitUnknownEncodingNS(void *mem,
  1416. int *table,
  1417. int (*convert)(void *userData, const char *p),
  1418. void *userData)
  1419. {
  1420. ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1421. if (enc)
  1422. ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1423. return enc;
  1424. }
  1425. #endif /* XML_NS */