xmltok.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651
  1. /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
  2. See the file COPYING for copying permission.
  3. */
  4. #include <stddef.h>
  5. #ifdef COMPILED_FROM_DSP
  6. #include "winconfig.h"
  7. #elif defined(MACOS_CLASSIC)
  8. #include "macconfig.h"
  9. #elif defined(__amigaos__)
  10. #include "amigaconfig.h"
  11. #elif defined(__WATCOMC__)
  12. #include "watcomconfig.h"
  13. #else
  14. #ifdef HAVE_EXPAT_CONFIG_H
  15. #include <expat_config.h>
  16. #endif
  17. #endif /* ndef COMPILED_FROM_DSP */
  18. #include "expat_external.h"
  19. #include "internal.h"
  20. #include "xmltok.h"
  21. #include "nametab.h"
  /* Optional extra entry for the first brace group of VTABLE1: the
     ignore-section tokenizer is only listed when XML_DTD is defined.  The
     leading comma is part of the macro so that it can be appended directly
     after the previous entry. */
  22. #ifdef XML_DTD
  23. #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
  24. #else
  25. #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
  26. #endif
  /* VTABLE1 expands to the leading initializers of an ENCODING: one brace
     group of prolog/content/CDATA-section tokenizers (plus the optional
     ignore-section one), one brace group of literal tokenizers, and nine
     helper functions.  Every name goes through PREFIX(), so the list is
     re-targeted each time PREFIX is redefined. */
  27. #define VTABLE1 \
  28. { PREFIX(prologTok), PREFIX(contentTok), \
  29. PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  30. { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  31. PREFIX(sameName), \
  32. PREFIX(nameMatchesAscii), \
  33. PREFIX(nameLength), \
  34. PREFIX(skipS), \
  35. PREFIX(getAtts), \
  36. PREFIX(charRefNumber), \
  37. PREFIX(predefinedEntityName), \
  38. PREFIX(updatePosition), \
  39. PREFIX(isPublicId)
  /* VTABLE is VTABLE1 followed by the two PREFIX-named converters. */
  40. #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
  /* Test whether the 16-bit character with high byte `hi` and low byte `lo`
     has its bit set in namingBitmap.  pages[hi] selects a group of eight
     32-bit words, the top 3 bits of `lo` pick the word inside that group and
     the bottom 5 bits of `lo` pick the bit.  The result is non-zero when the
     bit is set.  The mask is built from an unsigned 1 so that selecting
     bit 31 never shifts into the sign bit of an int, and `pages` is
     parenthesized so that an expression may be passed for it. */
  #define UCS2_GET_NAMING(pages, hi, lo) \
    (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
  /* A 2 byte UTF-8 representation splits the character's 11 bits between
     the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
     pages, 3 bits to add to that index and 5 bits to generate the mask.
     The mask is built from an unsigned 1 so that selecting bit 31 does not
     shift into the sign bit of an int.
  */
  #define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))
  /* A 3 byte UTF-8 representation splits the character's 16 bits between
     the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
     into pages, 3 bits to add to that index and 5 bits to generate the
     mask.
  */
  #define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                           + ((((byte)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((byte)[1]) & 3) << 1) \
                  + ((((byte)[2]) >> 5) & 1)] \
     & (1u << (((byte)[2]) & 0x1F)))
  /* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
     in the bitmap, any other length yields 0.  p may be a plain char
     pointer; it is viewed as unsigned bytes before the lookup.
  */
  #define UTF8_GET_NAMING(pages, p, n) \
    ((n) == 2 \
     ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
     : ((n) == 3 \
        ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
        : 0))
  /* Detection of invalid UTF-8 sequences is based on Table 3.1B
     of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
     with the additional restriction of not allowing the Unicode
     code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
     Implementation details:
       (A & 0x80) == 0     means A < 0x80
     and
       (A & 0xC0) == 0xC0  means A > 0xBF

     Each macro takes a pointer to unsigned bytes positioned on the lead
     byte and evaluates to non-zero when the sequence is invalid.  The
     argument is fully parenthesized everywhere (including the dereference
     of the lead byte), so an expression such as `buf + 1` is accepted.
  */
  #define UTF8_INVALID2(p) \
    ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
  #define UTF8_INVALID3(p) \
    (((p)[2] & 0x80) == 0 \
    || \
    ((*(p)) == 0xEF && (p)[1] == 0xBF \
      ? \
      (p)[2] > 0xBD \
      : \
      ((p)[2] & 0xC0) == 0xC0) \
    || \
    ((*(p)) == 0xE0 \
      ? \
      (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
      : \
      ((p)[1] & 0x80) == 0 \
      || \
      ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
  #define UTF8_INVALID4(p) \
    (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
    || \
    ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
    || \
    ((*(p)) == 0xF0 \
      ? \
      (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
      : \
      ((p)[1] & 0x80) == 0 \
      || \
      ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
  109. static int PTRFASTCALL
  110. isNever(const ENCODING *enc, const char *p)
  111. {
  112. return 0;
  113. }
  114. static int PTRFASTCALL
  115. utf8_isName2(const ENCODING *enc, const char *p)
  116. {
  117. return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
  118. }
  119. static int PTRFASTCALL
  120. utf8_isName3(const ENCODING *enc, const char *p)
  121. {
  122. return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
  123. }
  124. #define utf8_isName4 isNever
  125. static int PTRFASTCALL
  126. utf8_isNmstrt2(const ENCODING *enc, const char *p)
  127. {
  128. return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
  129. }
  130. static int PTRFASTCALL
  131. utf8_isNmstrt3(const ENCODING *enc, const char *p)
  132. {
  133. return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
  134. }
  135. #define utf8_isNmstrt4 isNever
  136. static int PTRFASTCALL
  137. utf8_isInvalid2(const ENCODING *enc, const char *p)
  138. {
  139. return UTF8_INVALID2((const unsigned char *)p);
  140. }
  141. static int PTRFASTCALL
  142. utf8_isInvalid3(const ENCODING *enc, const char *p)
  143. {
  144. return UTF8_INVALID3((const unsigned char *)p);
  145. }
  146. static int PTRFASTCALL
  147. utf8_isInvalid4(const ENCODING *enc, const char *p)
  148. {
  149. return UTF8_INVALID4((const unsigned char *)p);
  150. }
  /* An ENCODING extended with a per-byte type table and a set of predicate
     callbacks.  The ENCODING member comes first; the code in this file
     converts between ENCODING pointers and pointers to this struct by plain
     casts.  Member order matters because the static instances are filled
     with positional initializers. */
  151. struct normal_encoding {
  152. ENCODING enc;
  /* byte type for each of the 256 possible byte values */
  153. unsigned char type[256];
  /* the next five callbacks exist only in XML_MIN_SIZE builds */
  154. #ifdef XML_MIN_SIZE
  155. int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  156. int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  157. int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  158. int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  159. int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
  160. #endif /* XML_MIN_SIZE */
  /* name, name-start and validity predicates; the digit in each member
     name is the suffix pasted on by IS_NAME_CHAR / IS_NMSTRT_CHAR /
     IS_INVALID_CHAR (for UTF-8 it is the sequence length in bytes) */
  161. int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  162. int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  163. int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  164. int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  165. int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  166. int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  167. int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  168. int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  169. int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
  170. };
  /* View an ENCODING pointer as a pointer to the struct normal_encoding
     that contains it. */
  171. #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
  /* STANDARD_VTABLE(E) lists initializers for the XML_MIN_SIZE-only members
     of struct normal_encoding, using functions whose names start with E.
     The list ends with a comma so that NORMAL_VTABLE can follow it
     directly.  Without XML_MIN_SIZE it expands to nothing. */
  172. #ifdef XML_MIN_SIZE
  173. #define STANDARD_VTABLE(E) \
  174. E ## byteType, \
  175. E ## isNameMin, \
  176. E ## isNmstrtMin, \
  177. E ## byteToAscii, \
  178. E ## charMatches,
  179. #else
  180. #define STANDARD_VTABLE(E) /* as nothing */
  181. #endif
  /* NORMAL_VTABLE(E) lists initializers for the nine isName / isNmstrt /
     isInvalid members, again built from the name prefix E. */
  182. #define NORMAL_VTABLE(E) \
  183. E ## isName2, \
  184. E ## isName3, \
  185. E ## isName4, \
  186. E ## isNmstrt2, \
  187. E ## isNmstrt3, \
  188. E ## isNmstrt4, \
  189. E ## isInvalid2, \
  190. E ## isInvalid3, \
  191. E ## isInvalid4
  192. static int FASTCALL checkCharRefNumber(int);
  193. #include "xmltok_impl.h"
  194. #include "ascii.h"
  /* In XML_MIN_SIZE builds the single-byte ("sb_") encodings use isNever
     for the two minimum-width name predicates. */
  195. #ifdef XML_MIN_SIZE
  196. #define sb_isNameMin isNever
  197. #define sb_isNmstrtMin isNever
  198. #endif
  /* MINBPC: read from the encoding at run time in XML_MIN_SIZE builds,
     otherwise fixed to 1 for this instantiation. */
  199. #ifdef XML_MIN_SIZE
  200. #define MINBPC(enc) ((enc)->minBytesPerChar)
  201. #else
  202. /* minimum bytes per character */
  203. #define MINBPC(enc) 1
  204. #endif
  /* Byte type of the byte at p, looked up in the encoding's type table. */
  205. #define SB_BYTE_TYPE(enc, p) \
  206. (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
  /* BYTE_TYPE goes through the byteType callback in XML_MIN_SIZE builds
     (sb_byteType is the single-byte implementation of that callback) and
     is a direct table lookup otherwise. */
  207. #ifdef XML_MIN_SIZE
  208. static int PTRFASTCALL
  209. sb_byteType(const ENCODING *enc, const char *p)
  210. {
  211. return SB_BYTE_TYPE(enc, p);
  212. }
  213. #define BYTE_TYPE(enc, p) \
  214. (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
  215. #else
  216. #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
  217. #endif
  /* BYTE_TO_ASCII yields the character at p as an ASCII value.  In
     XML_MIN_SIZE builds it dispatches through the byteToAscii callback
     (sb_byteToAscii simply returns the byte); otherwise it reads the byte
     directly. */
  218. #ifdef XML_MIN_SIZE
  219. #define BYTE_TO_ASCII(enc, p) \
  220. (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
  221. static int PTRFASTCALL
  222. sb_byteToAscii(const ENCODING *enc, const char *p)
  223. {
  224. return *p;
  225. }
  226. #else
  227. #define BYTE_TO_ASCII(enc, p) (*(p))
  228. #endif
  /* Character-class tests for n-byte sequences: n is pasted onto the member
     name, so IS_NAME_CHAR(enc, p, 2) calls the isName2 callback, and so
     on. */
  229. #define IS_NAME_CHAR(enc, p, n) \
  230. (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
  231. #define IS_NMSTRT_CHAR(enc, p, n) \
  232. (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
  233. #define IS_INVALID_CHAR(enc, p, n) \
  234. (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
  /* Name tests for a character of minimum width: callbacks in XML_MIN_SIZE
     builds, constant 0 otherwise. */
  235. #ifdef XML_MIN_SIZE
  236. #define IS_NAME_CHAR_MINBPC(enc, p) \
  237. (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
  238. #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  239. (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
  240. #else
  241. #define IS_NAME_CHAR_MINBPC(enc, p) (0)
  242. #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
  243. #endif
  /* CHAR_MATCHES tests whether the character at p equals the ASCII
     character c.  In XML_MIN_SIZE builds it dispatches through the
     charMatches callback (sb_charMatches compares the single byte);
     otherwise it compares the byte directly.
     NOTE(review): the direct form does not parenthesize c, so callers
     should pass a simple expression. */
  244. #ifdef XML_MIN_SIZE
  245. #define CHAR_MATCHES(enc, p, c) \
  246. (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
  247. static int PTRCALL
  248. sb_charMatches(const ENCODING *enc, const char *p, int c)
  249. {
  250. return *p == c;
  251. }
  252. #else
  253. /* c is an ASCII character */
  254. #define CHAR_MATCHES(enc, p, c) (*(p) == c)
  255. #endif
  /* Instantiate the tokenizer template for byte-oriented encodings: every
     PREFIX(name) used by the included file becomes normal_name.  The helper
     macros defined above are undefined again afterwards; PREFIX itself
     stays defined as normal_. */
  256. #define PREFIX(ident) normal_ ## ident
  257. #define XML_TOK_IMPL_C
  258. #include "xmltok_impl.c"
  259. #undef XML_TOK_IMPL_C
  260. #undef MINBPC
  261. #undef BYTE_TYPE
  262. #undef BYTE_TO_ASCII
  263. #undef CHAR_MATCHES
  264. #undef IS_NAME_CHAR
  265. #undef IS_NAME_CHAR_MINBPC
  266. #undef IS_NMSTRT_CHAR
  267. #undef IS_NMSTRT_CHAR_MINBPC
  268. #undef IS_INVALID_CHAR
  /* Marker bits of a UTF-8 lead byte: UTF8_cvalN is OR-ed into the first
     byte of an N byte sequence. */
  enum {
    UTF8_cval1 = 0x00, /* 1 byte sequence */
    UTF8_cval2 = 0xC0, /* 2 byte sequence */
    UTF8_cval3 = 0xE0, /* 3 byte sequence */
    UTF8_cval4 = 0xF0  /* 4 byte sequence */
  };
  275. static void PTRCALL
  276. utf8_toUtf8(const ENCODING *enc,
  277. const char **fromP, const char *fromLim,
  278. char **toP, const char *toLim)
  279. {
  280. char *to;
  281. const char *from;
  282. if (fromLim - *fromP > toLim - *toP) {
  283. /* Avoid copying partial characters. */
  284. for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
  285. if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
  286. break;
  287. }
  288. for (to = *toP, from = *fromP; from != fromLim; from++, to++)
  289. *to = *from;
  290. *fromP = from;
  291. *toP = to;
  292. }
  293. static void PTRCALL
  294. utf8_toUtf16(const ENCODING *enc,
  295. const char **fromP, const char *fromLim,
  296. unsigned short **toP, const unsigned short *toLim)
  297. {
  298. unsigned short *to = *toP;
  299. const char *from = *fromP;
  300. while (from != fromLim && to != toLim) {
  301. switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
  302. case BT_LEAD2:
  303. *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
  304. from += 2;
  305. break;
  306. case BT_LEAD3:
  307. *to++ = (unsigned short)(((from[0] & 0xf) << 12)
  308. | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
  309. from += 3;
  310. break;
  311. case BT_LEAD4:
  312. {
  313. unsigned long n;
  314. if (to + 1 == toLim)
  315. goto after;
  316. n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
  317. | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
  318. n -= 0x10000;
  319. to[0] = (unsigned short)((n >> 10) | 0xD800);
  320. to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
  321. to += 2;
  322. from += 4;
  323. }
  324. break;
  325. default:
  326. *to++ = *from++;
  327. break;
  328. }
  329. }
  330. after:
  331. *fromP = from;
  332. *toP = to;
  333. }
  /* UTF-8 encoding descriptors.  Each one fills its ENCODING part from
     VTABLE1 (the normal_ functions at this point in the file) followed by
     the UTF-8 converters, builds the 256-entry byte-type table from two
     included table headers, and installs the sb_ and utf8_ callbacks.
     The _ns variants are only compiled with XML_NS.  The non-_ns variants
     define BT_COLON as BT_NMSTRT while the ASCII table header is included.
     The internal_ variants differ only in using "iasciitab.h".
     NOTE(review): the trailing "1, 1, 0" presumably initializes ENCODING
     fields declared in xmltok.h (minBytesPerChar being one of them) --
     confirm against that header. */
  334. #ifdef XML_NS
  335. static const struct normal_encoding utf8_encoding_ns = {
  336. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  337. {
  338. #include "asciitab.h"
  339. #include "utf8tab.h"
  340. },
  341. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  342. };
  343. #endif
  344. static const struct normal_encoding utf8_encoding = {
  345. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  346. {
  347. #define BT_COLON BT_NMSTRT
  348. #include "asciitab.h"
  349. #undef BT_COLON
  350. #include "utf8tab.h"
  351. },
  352. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  353. };
  354. #ifdef XML_NS
  355. static const struct normal_encoding internal_utf8_encoding_ns = {
  356. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  357. {
  358. #include "iasciitab.h"
  359. #include "utf8tab.h"
  360. },
  361. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  362. };
  363. #endif
  364. static const struct normal_encoding internal_utf8_encoding = {
  365. { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  366. {
  367. #define BT_COLON BT_NMSTRT
  368. #include "iasciitab.h"
  369. #undef BT_COLON
  370. #include "utf8tab.h"
  371. },
  372. STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
  373. };
  374. static void PTRCALL
  375. latin1_toUtf8(const ENCODING *enc,
  376. const char **fromP, const char *fromLim,
  377. char **toP, const char *toLim)
  378. {
  379. for (;;) {
  380. unsigned char c;
  381. if (*fromP == fromLim)
  382. break;
  383. c = (unsigned char)**fromP;
  384. if (c & 0x80) {
  385. if (toLim - *toP < 2)
  386. break;
  387. *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
  388. *(*toP)++ = (char)((c & 0x3f) | 0x80);
  389. (*fromP)++;
  390. }
  391. else {
  392. if (*toP == toLim)
  393. break;
  394. *(*toP)++ = *(*fromP)++;
  395. }
  396. }
  397. }
  398. static void PTRCALL
  399. latin1_toUtf16(const ENCODING *enc,
  400. const char **fromP, const char *fromLim,
  401. unsigned short **toP, const unsigned short *toLim)
  402. {
  403. while (*fromP != fromLim && *toP != toLim)
  404. *(*toP)++ = (unsigned char)*(*fromP)++;
  405. }
  /* Latin-1 encoding descriptors.  The ENCODING part uses VTABLE1 with the
     Latin-1 converters; the byte-type table comes from "asciitab.h"
     followed by "latin1tab.h".  Only STANDARD_VTABLE(sb_) is listed, so the
     isName / isNmstrt / isInvalid members are left to default (zero)
     initialization.  The _ns variant is only compiled with XML_NS; the
     other one defines BT_COLON as BT_NMSTRT while including the ASCII
     table. */
  406. #ifdef XML_NS
  407. static const struct normal_encoding latin1_encoding_ns = {
  408. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  409. {
  410. #include "asciitab.h"
  411. #include "latin1tab.h"
  412. },
  413. STANDARD_VTABLE(sb_)
  414. };
  415. #endif
  416. static const struct normal_encoding latin1_encoding = {
  417. { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  418. {
  419. #define BT_COLON BT_NMSTRT
  420. #include "asciitab.h"
  421. #undef BT_COLON
  422. #include "latin1tab.h"
  423. },
  424. STANDARD_VTABLE(sb_)
  425. };
  426. static void PTRCALL
  427. ascii_toUtf8(const ENCODING *enc,
  428. const char **fromP, const char *fromLim,
  429. char **toP, const char *toLim)
  430. {
  431. while (*fromP != fromLim && *toP != toLim)
  432. *(*toP)++ = *(*fromP)++;
  433. }
  /* ASCII encoding descriptors.  They use ascii_toUtf8 together with the
     Latin-1 to UTF-16 converter.  Only "asciitab.h" is included for the
     byte-type table, so the remaining entries of type[] are zero, which
     per the comment below is BT_NONXML.  The _ns variant is only compiled
     with XML_NS; the other one defines BT_COLON as BT_NMSTRT while
     including the table. */
  434. #ifdef XML_NS
  435. static const struct normal_encoding ascii_encoding_ns = {
  436. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  437. {
  438. #include "asciitab.h"
  439. /* BT_NONXML == 0 */
  440. },
  441. STANDARD_VTABLE(sb_)
  442. };
  443. #endif
  444. static const struct normal_encoding ascii_encoding = {
  445. { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  446. {
  447. #define BT_COLON BT_NMSTRT
  448. #include "asciitab.h"
  449. #undef BT_COLON
  450. /* BT_NONXML == 0 */
  451. },
  452. STANDARD_VTABLE(sb_)
  453. };
  454. static int PTRFASTCALL
  455. unicode_byte_type(char hi, char lo)
  456. {
  457. switch ((unsigned char)hi) {
  458. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  459. return BT_LEAD4;
  460. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  461. return BT_TRAIL;
  462. case 0xFF:
  463. switch ((unsigned char)lo) {
  464. case 0xFF:
  465. case 0xFE:
  466. return BT_NONXML;
  467. }
  468. break;
  469. }
  470. return BT_NONASCII;
  471. }
  /* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
     16-bit code units to UTF-8.  The byte order is supplied by whatever
     GET_LO / GET_HI are defined as when the macro is expanded.
     Per code unit, selected by its high byte:
       high == 0 and low < 0x80 : one output byte
       high 0x00..0x07          : two output bytes
       high 0xD8..0xDB          : combined with the following code unit into
                                  four output bytes
       anything else            : three output bytes
     Before writing, each branch checks the remaining output room; if it is
     too small the function stores the current input position in *fromP and
     returns without converting that character.
     NOTE(review): the loop condition is `from != fromLim` with a step of
     two, so an odd number of input bytes would step past fromLim; and the
     0xD8..0xDB branch reads the following code unit without comparing
     against fromLim.  Callers must pass whole characters. */
  472. #define DEFINE_UTF16_TO_UTF8(E) \
  473. static void PTRCALL \
  474. E ## toUtf8(const ENCODING *enc, \
  475. const char **fromP, const char *fromLim, \
  476. char **toP, const char *toLim) \
  477. { \
  478. const char *from; \
  479. for (from = *fromP; from != fromLim; from += 2) { \
  480. int plane; \
  481. unsigned char lo2; \
  482. unsigned char lo = GET_LO(from); \
  483. unsigned char hi = GET_HI(from); \
  484. switch (hi) { \
  485. case 0: \
  486. if (lo < 0x80) { \
  487. if (*toP == toLim) { \
  488. *fromP = from; \
  489. return; \
  490. } \
  491. *(*toP)++ = lo; \
  492. break; \
  493. } \
  494. /* fall through */ \
  495. case 0x1: case 0x2: case 0x3: \
  496. case 0x4: case 0x5: case 0x6: case 0x7: \
  497. if (toLim - *toP < 2) { \
  498. *fromP = from; \
  499. return; \
  500. } \
  501. *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
  502. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  503. break; \
  504. default: \
  505. if (toLim - *toP < 3) { \
  506. *fromP = from; \
  507. return; \
  508. } \
  509. /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
  510. *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
  511. *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
  512. *(*toP)++ = ((lo & 0x3f) | 0x80); \
  513. break; \
  514. case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
  515. if (toLim - *toP < 4) { \
  516. *fromP = from; \
  517. return; \
  518. } \
  519. plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
  520. *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
  521. *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
  522. from += 2; \
  523. lo2 = GET_LO(from); \
  524. *(*toP)++ = (((lo & 0x3) << 4) \
  525. | ((GET_HI(from) & 0x3) << 2) \
  526. | (lo2 >> 6) \
  527. | 0x80); \
  528. *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
  529. break; \
  530. } \
  531. } \
  532. *fromP = from; \
  533. }
  /* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
     16-bit code units from a byte buffer into an array of unsigned short,
     assembling each unit from GET_HI / GET_LO.  When the output is too
     small for all of the input and the last input unit has a high byte in
     0xD8..0xDF, fromLim is first pulled back by one unit (see the comment
     inside the macro).  Copying stops at the end of input or output and
     advances *fromP and *toP. */
  534. #define DEFINE_UTF16_TO_UTF16(E) \
  535. static void PTRCALL \
  536. E ## toUtf16(const ENCODING *enc, \
  537. const char **fromP, const char *fromLim, \
  538. unsigned short **toP, const unsigned short *toLim) \
  539. { \
  540. /* Avoid copying first half only of surrogate */ \
  541. if (fromLim - *fromP > ((toLim - *toP) << 1) \
  542. && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
  543. fromLim -= 2; \
  544. for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
  545. *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  546. }
  /* Little-endian accessors: the low byte of a code unit is stored first.
     SET2 stores a 16-bit value, GET_LO / GET_HI read the two bytes.  The
     little2_ converters are generated with these definitions in effect. */
  547. #define SET2(ptr, ch) \
  548. (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
  549. #define GET_LO(ptr) ((unsigned char)(ptr)[0])
  550. #define GET_HI(ptr) ((unsigned char)(ptr)[1])
  551. DEFINE_UTF16_TO_UTF8(little2_)
  552. DEFINE_UTF16_TO_UTF16(little2_)
  553. #undef SET2
  554. #undef GET_LO
  555. #undef GET_HI
  /* Big-endian accessors: the high byte of a code unit is stored first.
     The big2_ converters are generated with these definitions in effect. */
  556. #define SET2(ptr, ch) \
  557. (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
  558. #define GET_LO(ptr) ((unsigned char)(ptr)[1])
  559. #define GET_HI(ptr) ((unsigned char)(ptr)[0])
  560. DEFINE_UTF16_TO_UTF8(big2_)
  561. DEFINE_UTF16_TO_UTF16(big2_)
  562. #undef SET2
  563. #undef GET_LO
  564. #undef GET_HI
  /* Helpers for little-endian 2-byte characters (low byte at p[0], high
     byte at p[1]).
     LITTLE2_BYTE_TYPE: when the high byte is zero the type comes from the
     encoding's table indexed by the low byte, otherwise from
     unicode_byte_type().
     LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is zero, else -1.
     LITTLE2_CHAR_MATCHES: true when the character equals the ASCII value c.
     LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: bitmap
     lookups through UCS2_GET_NAMING.
     NOTE(review): c and p are not parenthesized in the last three macros,
     so callers should pass simple expressions. */
  565. #define LITTLE2_BYTE_TYPE(enc, p) \
  566. ((p)[1] == 0 \
  567. ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  568. : unicode_byte_type((p)[1], (p)[0]))
  569. #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
  570. #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
  571. #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  572. UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
  573. #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  574. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
  /* Two ways of providing the little-endian tokenizer.
     With XML_MIN_SIZE: the LITTLE2_* helpers are wrapped in little2_
     callback functions (the ones STANDARD_VTABLE(little2_) names), and
     VTABLE is redefined so that only the two converters are little2_
     specific; PREFIX is not changed in this branch.
     Without XML_MIN_SIZE: PREFIX becomes little2_, the helper macros are
     bound to the LITTLE2_* forms with MINBPC fixed at 2, and the tokenizer
     template is included again to generate a dedicated set of little2_
     functions.  The multi-byte name tests expand to 0 here. */
  575. #ifdef XML_MIN_SIZE
  576. static int PTRFASTCALL
  577. little2_byteType(const ENCODING *enc, const char *p)
  578. {
  579. return LITTLE2_BYTE_TYPE(enc, p);
  580. }
  581. static int PTRFASTCALL
  582. little2_byteToAscii(const ENCODING *enc, const char *p)
  583. {
  584. return LITTLE2_BYTE_TO_ASCII(enc, p);
  585. }
  586. static int PTRCALL
  587. little2_charMatches(const ENCODING *enc, const char *p, int c)
  588. {
  589. return LITTLE2_CHAR_MATCHES(enc, p, c);
  590. }
  591. static int PTRFASTCALL
  592. little2_isNameMin(const ENCODING *enc, const char *p)
  593. {
  594. return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
  595. }
  596. static int PTRFASTCALL
  597. little2_isNmstrtMin(const ENCODING *enc, const char *p)
  598. {
  599. return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  600. }
  601. #undef VTABLE
  602. #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
  603. #else /* not XML_MIN_SIZE */
  604. #undef PREFIX
  605. #define PREFIX(ident) little2_ ## ident
  606. #define MINBPC(enc) 2
  607. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  608. #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
  609. #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
  610. #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
  611. #define IS_NAME_CHAR(enc, p, n) 0
  612. #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
  613. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  614. #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  615. #define XML_TOK_IMPL_C
  616. #include "xmltok_impl.c"
  617. #undef XML_TOK_IMPL_C
  618. #undef MINBPC
  619. #undef BYTE_TYPE
  620. #undef BYTE_TO_ASCII
  621. #undef CHAR_MATCHES
  622. #undef IS_NAME_CHAR
  623. #undef IS_NAME_CHAR_MINBPC
  624. #undef IS_NMSTRT_CHAR
  625. #undef IS_NMSTRT_CHAR_MINBPC
  626. #undef IS_INVALID_CHAR
  627. #endif /* not XML_MIN_SIZE */
  /* Little-endian 2-byte encoding descriptors.  The last value of the
     ENCODING initializer is 1 when BYTEORDER is 1234 and 0 otherwise.  The
     byte-type table is the ASCII table followed by the Latin-1 table; only
     the STANDARD_VTABLE callbacks are listed.  The _ns variant is only
     compiled with XML_NS; the other one defines BT_COLON as BT_NMSTRT while
     including the ASCII table.
     NOTE(review): the meaning of the "2, 0, <flag>" values depends on the
     ENCODING declaration in xmltok.h -- confirm there. */
  628. #ifdef XML_NS
  629. static const struct normal_encoding little2_encoding_ns = {
  630. { VTABLE, 2, 0,
  631. #if BYTEORDER == 1234
  632. 1
  633. #else
  634. 0
  635. #endif
  636. },
  637. {
  638. #include "asciitab.h"
  639. #include "latin1tab.h"
  640. },
  641. STANDARD_VTABLE(little2_)
  642. };
  643. #endif
  644. static const struct normal_encoding little2_encoding = {
  645. { VTABLE, 2, 0,
  646. #if BYTEORDER == 1234
  647. 1
  648. #else
  649. 0
  650. #endif
  651. },
  652. {
  653. #define BT_COLON BT_NMSTRT
  654. #include "asciitab.h"
  655. #undef BT_COLON
  656. #include "latin1tab.h"
  657. },
  658. STANDARD_VTABLE(little2_)
  659. };
  /* Internal little-endian descriptors, compiled unless BYTEORDER is 4321.
     They use "iasciitab.h" for the ASCII part of the byte-type table and
     always pass 1 as the last ENCODING value.  The _ns variant additionally
     requires XML_NS. */
  660. #if BYTEORDER != 4321
  661. #ifdef XML_NS
  662. static const struct normal_encoding internal_little2_encoding_ns = {
  663. { VTABLE, 2, 0, 1 },
  664. {
  665. #include "iasciitab.h"
  666. #include "latin1tab.h"
  667. },
  668. STANDARD_VTABLE(little2_)
  669. };
  670. #endif
  671. static const struct normal_encoding internal_little2_encoding = {
  672. { VTABLE, 2, 0, 1 },
  673. {
  674. #define BT_COLON BT_NMSTRT
  675. #include "iasciitab.h"
  676. #undef BT_COLON
  677. #include "latin1tab.h"
  678. },
  679. STANDARD_VTABLE(little2_)
  680. };
  681. #endif
  /* Helpers for big-endian 2-byte characters (high byte at p[0], low byte
     at p[1]).
     BIG2_BYTE_TYPE: when the high byte is zero the type comes from the
     encoding's table indexed by the low byte, otherwise from
     unicode_byte_type().
     BIG2_BYTE_TO_ASCII: the low byte when the high byte is zero, else -1.
     BIG2_CHAR_MATCHES: true when the character equals the ASCII value c.
     BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: bitmap lookups
     through UCS2_GET_NAMING.
     NOTE(review): c and p are not parenthesized in the last three macros,
     so callers should pass simple expressions. */
  682. #define BIG2_BYTE_TYPE(enc, p) \
  683. ((p)[0] == 0 \
  684. ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  685. : unicode_byte_type((p)[0], (p)[1]))
  686. #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
  687. #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
  688. #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  689. UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
  690. #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  691. UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
  /* Two ways of providing the big-endian tokenizer.
     With XML_MIN_SIZE: the BIG2_* helpers are wrapped in big2_ callback
     functions (the ones STANDARD_VTABLE(big2_) names), and VTABLE is
     redefined so that only the two converters are big2_ specific; PREFIX
     is not changed in this branch.
     Without XML_MIN_SIZE: PREFIX becomes big2_, the helper macros are bound
     to the BIG2_* forms with MINBPC fixed at 2, and the tokenizer template
     is included again to generate a dedicated set of big2_ functions.  The
     multi-byte name tests expand to 0 here. */
  692. #ifdef XML_MIN_SIZE
  693. static int PTRFASTCALL
  694. big2_byteType(const ENCODING *enc, const char *p)
  695. {
  696. return BIG2_BYTE_TYPE(enc, p);
  697. }
  698. static int PTRFASTCALL
  699. big2_byteToAscii(const ENCODING *enc, const char *p)
  700. {
  701. return BIG2_BYTE_TO_ASCII(enc, p);
  702. }
  703. static int PTRCALL
  704. big2_charMatches(const ENCODING *enc, const char *p, int c)
  705. {
  706. return BIG2_CHAR_MATCHES(enc, p, c);
  707. }
  708. static int PTRFASTCALL
  709. big2_isNameMin(const ENCODING *enc, const char *p)
  710. {
  711. return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
  712. }
  713. static int PTRFASTCALL
  714. big2_isNmstrtMin(const ENCODING *enc, const char *p)
  715. {
  716. return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
  717. }
  718. #undef VTABLE
  719. #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
  720. #else /* not XML_MIN_SIZE */
  721. #undef PREFIX
  722. #define PREFIX(ident) big2_ ## ident
  723. #define MINBPC(enc) 2
  724. /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
  725. #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
  726. #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
  727. #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
  728. #define IS_NAME_CHAR(enc, p, n) 0
  729. #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
  730. #define IS_NMSTRT_CHAR(enc, p, n) (0)
  731. #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
  732. #define XML_TOK_IMPL_C
  733. #include "xmltok_impl.c"
  734. #undef XML_TOK_IMPL_C
  735. #undef MINBPC
  736. #undef BYTE_TYPE
  737. #undef BYTE_TO_ASCII
  738. #undef CHAR_MATCHES
  739. #undef IS_NAME_CHAR
  740. #undef IS_NAME_CHAR_MINBPC
  741. #undef IS_NMSTRT_CHAR
  742. #undef IS_NMSTRT_CHAR_MINBPC
  743. #undef IS_INVALID_CHAR
  744. #endif /* not XML_MIN_SIZE */
  /* Big-endian 2-byte encoding descriptors.  The last value of the ENCODING
     initializer is 1 when BYTEORDER is 4321 and 0 otherwise.  The byte-type
     table is the ASCII table followed by the Latin-1 table; only the
     STANDARD_VTABLE callbacks are listed.  The _ns variant is only compiled
     with XML_NS; the other one defines BT_COLON as BT_NMSTRT while
     including the ASCII table.
     NOTE(review): the meaning of the "2, 0, <flag>" values depends on the
     ENCODING declaration in xmltok.h -- confirm there. */
  745. #ifdef XML_NS
  746. static const struct normal_encoding big2_encoding_ns = {
  747. { VTABLE, 2, 0,
  748. #if BYTEORDER == 4321
  749. 1
  750. #else
  751. 0
  752. #endif
  753. },
  754. {
  755. #include "asciitab.h"
  756. #include "latin1tab.h"
  757. },
  758. STANDARD_VTABLE(big2_)
  759. };
  760. #endif
  761. static const struct normal_encoding big2_encoding = {
  762. { VTABLE, 2, 0,
  763. #if BYTEORDER == 4321
  764. 1
  765. #else
  766. 0
  767. #endif
  768. },
  769. {
  770. #define BT_COLON BT_NMSTRT
  771. #include "asciitab.h"
  772. #undef BT_COLON
  773. #include "latin1tab.h"
  774. },
  775. STANDARD_VTABLE(big2_)
  776. };
  /* Internal big-endian descriptors, compiled unless BYTEORDER is 1234.
     They use "iasciitab.h" for the ASCII part of the byte-type table and
     always pass 1 as the last ENCODING value.  The _ns variant additionally
     requires XML_NS. */
  777. #if BYTEORDER != 1234
  778. #ifdef XML_NS
  779. static const struct normal_encoding internal_big2_encoding_ns = {
  780. { VTABLE, 2, 0, 1 },
  781. {
  782. #include "iasciitab.h"
  783. #include "latin1tab.h"
  784. },
  785. STANDARD_VTABLE(big2_)
  786. };
  787. #endif
  788. static const struct normal_encoding internal_big2_encoding = {
  789. { VTABLE, 2, 0, 1 },
  790. {
  791. #define BT_COLON BT_NMSTRT
  792. #include "iasciitab.h"
  793. #undef BT_COLON
  794. #include "latin1tab.h"
  795. },
  796. STANDARD_VTABLE(big2_)
  797. };
  798. #endif
  /* No more tokenizer instantiations follow; PREFIX is retired here. */
  799. #undef PREFIX
  800. static int FASTCALL
  801. streqci(const char *s1, const char *s2)
  802. {
  803. for (;;) {
  804. char c1 = *s1++;
  805. char c2 = *s2++;
  806. if (ASCII_a <= c1 && c1 <= ASCII_z)
  807. c1 += ASCII_A - ASCII_a;
  808. if (ASCII_a <= c2 && c2 <= ASCII_z)
  809. c2 += ASCII_A - ASCII_a;
  810. if (c1 != c2)
  811. return 0;
  812. if (!c1)
  813. break;
  814. }
  815. return 1;
  816. }
  817. static void PTRCALL
  818. initUpdatePosition(const ENCODING *enc, const char *ptr,
  819. const char *end, POSITION *pos)
  820. {
  821. normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
  822. }
  823. static int
  824. toAscii(const ENCODING *enc, const char *ptr, const char *end)
  825. {
  826. char buf[1];
  827. char *p = buf;
  828. XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  829. if (p == buf)
  830. return -1;
  831. else
  832. return buf[0];
  833. }
  834. static int FASTCALL
  835. isSpace(int c)
  836. {
  837. switch (c) {
  838. case 0x20:
  839. case 0xD:
  840. case 0xA:
  841. case 0x9:
  842. return 1;
  843. }
  844. return 0;
  845. }
  846. /* Return 1 if there's just optional white space or there's an S
  847. followed by name=val.
  848. */
  849. static int
  850. parsePseudoAttribute(const ENCODING *enc,
  851. const char *ptr,
  852. const char *end,
  853. const char **namePtr,
  854. const char **nameEndPtr,
  855. const char **valPtr,
  856. const char **nextTokPtr)
  857. {
  858. int c;
  859. char open;
  860. if (ptr == end) {
  861. *namePtr = NULL;
  862. return 1;
  863. }
  864. if (!isSpace(toAscii(enc, ptr, end))) {
  865. *nextTokPtr = ptr;
  866. return 0;
  867. }
  868. do {
  869. ptr += enc->minBytesPerChar;
  870. } while (isSpace(toAscii(enc, ptr, end)));
  871. if (ptr == end) {
  872. *namePtr = NULL;
  873. return 1;
  874. }
  875. *namePtr = ptr;
  876. for (;;) {
  877. c = toAscii(enc, ptr, end);
  878. if (c == -1) {
  879. *nextTokPtr = ptr;
  880. return 0;
  881. }
  882. if (c == ASCII_EQUALS) {
  883. *nameEndPtr = ptr;
  884. break;
  885. }
  886. if (isSpace(c)) {
  887. *nameEndPtr = ptr;
  888. do {
  889. ptr += enc->minBytesPerChar;
  890. } while (isSpace(c = toAscii(enc, ptr, end)));
  891. if (c != ASCII_EQUALS) {
  892. *nextTokPtr = ptr;
  893. return 0;
  894. }
  895. break;
  896. }
  897. ptr += enc->minBytesPerChar;
  898. }
  899. if (ptr == *namePtr) {
  900. *nextTokPtr = ptr;
  901. return 0;
  902. }
  903. ptr += enc->minBytesPerChar;
  904. c = toAscii(enc, ptr, end);
  905. while (isSpace(c)) {
  906. ptr += enc->minBytesPerChar;
  907. c = toAscii(enc, ptr, end);
  908. }
  909. if (c != ASCII_QUOT && c != ASCII_APOS) {
  910. *nextTokPtr = ptr;
  911. return 0;
  912. }
  913. open = (char)c;
  914. ptr += enc->minBytesPerChar;
  915. *valPtr = ptr;
  916. for (;; ptr += enc->minBytesPerChar) {
  917. c = toAscii(enc, ptr, end);
  918. if (c == open)
  919. break;
  920. if (!(ASCII_a <= c && c <= ASCII_z)
  921. && !(ASCII_A <= c && c <= ASCII_Z)
  922. && !(ASCII_0 <= c && c <= ASCII_9)
  923. && c != ASCII_PERIOD
  924. && c != ASCII_MINUS
  925. && c != ASCII_UNDERSCORE) {
  926. *nextTokPtr = ptr;
  927. return 0;
  928. }
  929. }
  930. *nextTokPtr = ptr + enc->minBytesPerChar;
  931. return 1;
  932. }
  933. static const char KW_version[] = {
  934. ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
  935. };
  936. static const char KW_encoding[] = {
  937. ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
  938. };
  939. static const char KW_standalone[] = {
  940. ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  941. ASCII_n, ASCII_e, '\0'
  942. };
  943. static const char KW_yes[] = {
  944. ASCII_y, ASCII_e, ASCII_s, '\0'
  945. };
  946. static const char KW_no[] = {
  947. ASCII_n, ASCII_o, '\0'
  948. };
  949. static int
  950. doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
  951. const char *,
  952. const char *),
  953. int isGeneralTextEntity,
  954. const ENCODING *enc,
  955. const char *ptr,
  956. const char *end,
  957. const char **badPtr,
  958. const char **versionPtr,
  959. const char **versionEndPtr,
  960. const char **encodingName,
  961. const ENCODING **encoding,
  962. int *standalone)
  963. {
  964. const char *val = NULL;
  965. const char *name = NULL;
  966. const char *nameEnd = NULL;
  967. ptr += 5 * enc->minBytesPerChar;
  968. end -= 2 * enc->minBytesPerChar;
  969. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
  970. || !name) {
  971. *badPtr = ptr;
  972. return 0;
  973. }
  974. if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
  975. if (!isGeneralTextEntity) {
  976. *badPtr = name;
  977. return 0;
  978. }
  979. }
  980. else {
  981. if (versionPtr)
  982. *versionPtr = val;
  983. if (versionEndPtr)
  984. *versionEndPtr = ptr;
  985. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  986. *badPtr = ptr;
  987. return 0;
  988. }
  989. if (!name) {
  990. if (isGeneralTextEntity) {
  991. /* a TextDecl must have an EncodingDecl */
  992. *badPtr = ptr;
  993. return 0;
  994. }
  995. return 1;
  996. }
  997. }
  998. if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
  999. int c = toAscii(enc, val, end);
  1000. if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
  1001. *badPtr = val;
  1002. return 0;
  1003. }
  1004. if (encodingName)
  1005. *encodingName = val;
  1006. if (encoding)
  1007. *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
  1008. if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
  1009. *badPtr = ptr;
  1010. return 0;
  1011. }
  1012. if (!name)
  1013. return 1;
  1014. }
  1015. if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
  1016. || isGeneralTextEntity) {
  1017. *badPtr = name;
  1018. return 0;
  1019. }
  1020. if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
  1021. if (standalone)
  1022. *standalone = 1;
  1023. }
  1024. else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
  1025. if (standalone)
  1026. *standalone = 0;
  1027. }
  1028. else {
  1029. *badPtr = val;
  1030. return 0;
  1031. }
  1032. while (isSpace(toAscii(enc, ptr, end)))
  1033. ptr += enc->minBytesPerChar;
  1034. if (ptr != end) {
  1035. *badPtr = ptr;
  1036. return 0;
  1037. }
  1038. return 1;
  1039. }
  1040. static int FASTCALL
  1041. checkCharRefNumber(int result)
  1042. {
  1043. switch (result >> 8) {
  1044. case 0xD8: case 0xD9: case 0xDA: case 0xDB:
  1045. case 0xDC: case 0xDD: case 0xDE: case 0xDF:
  1046. return -1;
  1047. case 0:
  1048. if (latin1_encoding.type[result] == BT_NONXML)
  1049. return -1;
  1050. break;
  1051. case 0xFF:
  1052. if (result == 0xFFFE || result == 0xFFFF)
  1053. return -1;
  1054. break;
  1055. }
  1056. return result;
  1057. }
  1058. int FASTCALL
  1059. XmlUtf8Encode(int c, char *buf)
  1060. {
  1061. enum {
  1062. /* minN is minimum legal resulting value for N byte sequence */
  1063. min2 = 0x80,
  1064. min3 = 0x800,
  1065. min4 = 0x10000
  1066. };
  1067. if (c < 0)
  1068. return 0;
  1069. if (c < min2) {
  1070. buf[0] = (char)(c | UTF8_cval1);
  1071. return 1;
  1072. }
  1073. if (c < min3) {
  1074. buf[0] = (char)((c >> 6) | UTF8_cval2);
  1075. buf[1] = (char)((c & 0x3f) | 0x80);
  1076. return 2;
  1077. }
  1078. if (c < min4) {
  1079. buf[0] = (char)((c >> 12) | UTF8_cval3);
  1080. buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
  1081. buf[2] = (char)((c & 0x3f) | 0x80);
  1082. return 3;
  1083. }
  1084. if (c < 0x110000) {
  1085. buf[0] = (char)((c >> 18) | UTF8_cval4);
  1086. buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
  1087. buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
  1088. buf[3] = (char)((c & 0x3f) | 0x80);
  1089. return 4;
  1090. }
  1091. return 0;
  1092. }
  1093. int FASTCALL
  1094. XmlUtf16Encode(int charNum, unsigned short *buf)
  1095. {
  1096. if (charNum < 0)
  1097. return 0;
  1098. if (charNum < 0x10000) {
  1099. buf[0] = (unsigned short)charNum;
  1100. return 1;
  1101. }
  1102. if (charNum < 0x110000) {
  1103. charNum -= 0x10000;
  1104. buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
  1105. buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
  1106. return 2;
  1107. }
  1108. return 0;
  1109. }
  1110. struct unknown_encoding {
  1111. struct normal_encoding normal;
  1112. CONVERTER convert;
  1113. void *userData;
  1114. unsigned short utf16[256];
  1115. char utf8[256][4];
  1116. };
  1117. #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
  1118. int
  1119. XmlSizeOfUnknownEncoding(void)
  1120. {
  1121. return sizeof(struct unknown_encoding);
  1122. }
  1123. static int PTRFASTCALL
  1124. unknown_isName(const ENCODING *enc, const char *p)
  1125. {
  1126. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1127. int c = uenc->convert(uenc->userData, p);
  1128. if (c & ~0xFFFF)
  1129. return 0;
  1130. return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
  1131. }
  1132. static int PTRFASTCALL
  1133. unknown_isNmstrt(const ENCODING *enc, const char *p)
  1134. {
  1135. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1136. int c = uenc->convert(uenc->userData, p);
  1137. if (c & ~0xFFFF)
  1138. return 0;
  1139. return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
  1140. }
  1141. static int PTRFASTCALL
  1142. unknown_isInvalid(const ENCODING *enc, const char *p)
  1143. {
  1144. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1145. int c = uenc->convert(uenc->userData, p);
  1146. return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
  1147. }
  1148. static void PTRCALL
  1149. unknown_toUtf8(const ENCODING *enc,
  1150. const char **fromP, const char *fromLim,
  1151. char **toP, const char *toLim)
  1152. {
  1153. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1154. char buf[XML_UTF8_ENCODE_MAX];
  1155. for (;;) {
  1156. const char *utf8;
  1157. int n;
  1158. if (*fromP == fromLim)
  1159. break;
  1160. utf8 = uenc->utf8[(unsigned char)**fromP];
  1161. n = *utf8++;
  1162. if (n == 0) {
  1163. int c = uenc->convert(uenc->userData, *fromP);
  1164. n = XmlUtf8Encode(c, buf);
  1165. if (n > toLim - *toP)
  1166. break;
  1167. utf8 = buf;
  1168. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1169. - (BT_LEAD2 - 2));
  1170. }
  1171. else {
  1172. if (n > toLim - *toP)
  1173. break;
  1174. (*fromP)++;
  1175. }
  1176. do {
  1177. *(*toP)++ = *utf8++;
  1178. } while (--n != 0);
  1179. }
  1180. }
  1181. static void PTRCALL
  1182. unknown_toUtf16(const ENCODING *enc,
  1183. const char **fromP, const char *fromLim,
  1184. unsigned short **toP, const unsigned short *toLim)
  1185. {
  1186. const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
  1187. while (*fromP != fromLim && *toP != toLim) {
  1188. unsigned short c = uenc->utf16[(unsigned char)**fromP];
  1189. if (c == 0) {
  1190. c = (unsigned short)
  1191. uenc->convert(uenc->userData, *fromP);
  1192. *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
  1193. - (BT_LEAD2 - 2));
  1194. }
  1195. else
  1196. (*fromP)++;
  1197. *(*toP)++ = c;
  1198. }
  1199. }
  1200. ENCODING *
  1201. XmlInitUnknownEncoding(void *mem,
  1202. int *table,
  1203. CONVERTER convert,
  1204. void *userData)
  1205. {
  1206. int i;
  1207. struct unknown_encoding *e = (struct unknown_encoding *)mem;
  1208. for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
  1209. ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  1210. for (i = 0; i < 128; i++)
  1211. if (latin1_encoding.type[i] != BT_OTHER
  1212. && latin1_encoding.type[i] != BT_NONXML
  1213. && table[i] != i)
  1214. return 0;
  1215. for (i = 0; i < 256; i++) {
  1216. int c = table[i];
  1217. if (c == -1) {
  1218. e->normal.type[i] = BT_MALFORM;
  1219. /* This shouldn't really get used. */
  1220. e->utf16[i] = 0xFFFF;
  1221. e->utf8[i][0] = 1;
  1222. e->utf8[i][1] = 0;
  1223. }
  1224. else if (c < 0) {
  1225. if (c < -4)
  1226. return 0;
  1227. e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
  1228. e->utf8[i][0] = 0;
  1229. e->utf16[i] = 0;
  1230. }
  1231. else if (c < 0x80) {
  1232. if (latin1_encoding.type[c] != BT_OTHER
  1233. && latin1_encoding.type[c] != BT_NONXML
  1234. && c != i)
  1235. return 0;
  1236. e->normal.type[i] = latin1_encoding.type[c];
  1237. e->utf8[i][0] = 1;
  1238. e->utf8[i][1] = (char)c;
  1239. e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
  1240. }
  1241. else if (checkCharRefNumber(c) < 0) {
  1242. e->normal.type[i] = BT_NONXML;
  1243. /* This shouldn't really get used. */
  1244. e->utf16[i] = 0xFFFF;
  1245. e->utf8[i][0] = 1;
  1246. e->utf8[i][1] = 0;
  1247. }
  1248. else {
  1249. if (c > 0xFFFF)
  1250. return 0;
  1251. if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
  1252. e->normal.type[i] = BT_NMSTRT;
  1253. else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
  1254. e->normal.type[i] = BT_NAME;
  1255. else
  1256. e->normal.type[i] = BT_OTHER;
  1257. e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
  1258. e->utf16[i] = (unsigned short)c;
  1259. }
  1260. }
  1261. e->userData = userData;
  1262. e->convert = convert;
  1263. if (convert) {
  1264. e->normal.isName2 = unknown_isName;
  1265. e->normal.isName3 = unknown_isName;
  1266. e->normal.isName4 = unknown_isName;
  1267. e->normal.isNmstrt2 = unknown_isNmstrt;
  1268. e->normal.isNmstrt3 = unknown_isNmstrt;
  1269. e->normal.isNmstrt4 = unknown_isNmstrt;
  1270. e->normal.isInvalid2 = unknown_isInvalid;
  1271. e->normal.isInvalid3 = unknown_isInvalid;
  1272. e->normal.isInvalid4 = unknown_isInvalid;
  1273. }
  1274. e->normal.enc.utf8Convert = unknown_toUtf8;
  1275. e->normal.enc.utf16Convert = unknown_toUtf16;
  1276. return &(e->normal.enc);
  1277. }
  /* Indices of the built-in encodings.

     If this enumeration is changed, getEncodingIndex and encodings
     must also be changed.  The values 0 .. NO_ENC-1 are positions in
     the encodingNames array of getEncodingIndex.
  */
  enum {
    UNKNOWN_ENC    = -1,   /* name not recognised */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC   = 1,
    UTF_8_ENC      = 2,
    UTF_16_ENC     = 3,
    UTF_16BE_ENC   = 4,
    UTF_16LE_ENC   = 5,
    /* must match encodingNames up to here */
    NO_ENC         = 6     /* no encoding name was given */
  };
  1291. static const char KW_ISO_8859_1[] = {
  1292. ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  1293. ASCII_MINUS, ASCII_1, '\0'
  1294. };
  1295. static const char KW_US_ASCII[] = {
  1296. ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  1297. '\0'
  1298. };
  1299. static const char KW_UTF_8[] = {
  1300. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
  1301. };
  1302. static const char KW_UTF_16[] = {
  1303. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
  1304. };
  1305. static const char KW_UTF_16BE[] = {
  1306. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  1307. '\0'
  1308. };
  1309. static const char KW_UTF_16LE[] = {
  1310. ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  1311. '\0'
  1312. };
  1313. static int FASTCALL
  1314. getEncodingIndex(const char *name)
  1315. {
  1316. static const char * const encodingNames[] = {
  1317. KW_ISO_8859_1,
  1318. KW_US_ASCII,
  1319. KW_UTF_8,
  1320. KW_UTF_16,
  1321. KW_UTF_16BE,
  1322. KW_UTF_16LE,
  1323. };
  1324. int i;
  1325. if (name == NULL)
  1326. return NO_ENC;
  1327. for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
  1328. if (streqci(name, encodingNames[i]))
  1329. return i;
  1330. return UNKNOWN_ENC;
  1331. }
  /* For binary compatibility, we store the index of the encoding
     specified at initialization in the isUtf16 member.

     INIT_ENC_INDEX(enc) reads that index back as an int.
     SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The
     argument is parenthesised so that the conversion applies to the
     whole expression passed as i, not just to its first operand.
  */
  #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
  #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
  1337. /* This is what detects the encoding. encodingTable maps from
  1338. encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
  1339. the external (protocol) specified encoding; state is
  1340. XML_CONTENT_STATE if we're parsing an external text entity, and
  1341. XML_PROLOG_STATE otherwise.
  1342. */
  1343. static int
  1344. initScan(const ENCODING * const *encodingTable,
  1345. const INIT_ENCODING *enc,
  1346. int state,
  1347. const char *ptr,
  1348. const char *end,
  1349. const char **nextTokPtr)
  1350. {
  1351. const ENCODING **encPtr;
  1352. if (ptr == end)
  1353. return XML_TOK_NONE;
  1354. encPtr = enc->encPtr;
  1355. if (ptr + 1 == end) {
  1356. /* only a single byte available for auto-detection */
  1357. #ifndef XML_DTD /* FIXME */
  1358. /* a well-formed document entity must have more than one byte */
  1359. if (state != XML_CONTENT_STATE)
  1360. return XML_TOK_PARTIAL;
  1361. #endif
  1362. /* so we're parsing an external text entity... */
  1363. /* if UTF-16 was externally specified, then we need at least 2 bytes */
  1364. switch (INIT_ENC_INDEX(enc)) {
  1365. case UTF_16_ENC:
  1366. case UTF_16LE_ENC:
  1367. case UTF_16BE_ENC:
  1368. return XML_TOK_PARTIAL;
  1369. }
  1370. switch ((unsigned char)*ptr) {
  1371. case 0xFE:
  1372. case 0xFF:
  1373. case 0xEF: /* possibly first byte of UTF-8 BOM */
  1374. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1375. && state == XML_CONTENT_STATE)
  1376. break;
  1377. /* fall through */
  1378. case 0x00:
  1379. case 0x3C:
  1380. return XML_TOK_PARTIAL;
  1381. }
  1382. }
  1383. else {
  1384. switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
  1385. case 0xFEFF:
  1386. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1387. && state == XML_CONTENT_STATE)
  1388. break;
  1389. *nextTokPtr = ptr + 2;
  1390. *encPtr = encodingTable[UTF_16BE_ENC];
  1391. return XML_TOK_BOM;
  1392. /* 00 3C is handled in the default case */
  1393. case 0x3C00:
  1394. if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
  1395. || INIT_ENC_INDEX(enc) == UTF_16_ENC)
  1396. && state == XML_CONTENT_STATE)
  1397. break;
  1398. *encPtr = encodingTable[UTF_16LE_ENC];
  1399. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1400. case 0xFFFE:
  1401. if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
  1402. && state == XML_CONTENT_STATE)
  1403. break;
  1404. *nextTokPtr = ptr + 2;
  1405. *encPtr = encodingTable[UTF_16LE_ENC];
  1406. return XML_TOK_BOM;
  1407. case 0xEFBB:
  1408. /* Maybe a UTF-8 BOM (EF BB BF) */
  1409. /* If there's an explicitly specified (external) encoding
  1410. of ISO-8859-1 or some flavour of UTF-16
  1411. and this is an external text entity,
  1412. don't look for the BOM,
  1413. because it might be a legal data.
  1414. */
  1415. if (state == XML_CONTENT_STATE) {
  1416. int e = INIT_ENC_INDEX(enc);
  1417. if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
  1418. || e == UTF_16LE_ENC || e == UTF_16_ENC)
  1419. break;
  1420. }
  1421. if (ptr + 2 == end)
  1422. return XML_TOK_PARTIAL;
  1423. if ((unsigned char)ptr[2] == 0xBF) {
  1424. *nextTokPtr = ptr + 3;
  1425. *encPtr = encodingTable[UTF_8_ENC];
  1426. return XML_TOK_BOM;
  1427. }
  1428. break;
  1429. default:
  1430. if (ptr[0] == '\0') {
  1431. /* 0 isn't a legal data character. Furthermore a document
  1432. entity can only start with ASCII characters. So the only
  1433. way this can fail to be big-endian UTF-16 if it it's an
  1434. external parsed general entity that's labelled as
  1435. UTF-16LE.
  1436. */
  1437. if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
  1438. break;
  1439. *encPtr = encodingTable[UTF_16BE_ENC];
  1440. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1441. }
  1442. else if (ptr[1] == '\0') {
  1443. /* We could recover here in the case:
  1444. - parsing an external entity
  1445. - second byte is 0
  1446. - no externally specified encoding
  1447. - no encoding declaration
  1448. by assuming UTF-16LE. But we don't, because this would mean when
  1449. presented just with a single byte, we couldn't reliably determine
  1450. whether we needed further bytes.
  1451. */
  1452. if (state == XML_CONTENT_STATE)
  1453. break;
  1454. *encPtr = encodingTable[UTF_16LE_ENC];
  1455. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1456. }
  1457. break;
  1458. }
  1459. }
  1460. *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  1461. return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
  1462. }
  1463. #define NS(x) x
  1464. #define ns(x) x
  1465. #define XML_TOK_NS_C
  1466. #include "xmltok_ns.c"
  1467. #undef XML_TOK_NS_C
  1468. #undef NS
  1469. #undef ns
  1470. #ifdef XML_NS
  1471. #define NS(x) x ## NS
  1472. #define ns(x) x ## _ns
  1473. #define XML_TOK_NS_C
  1474. #include "xmltok_ns.c"
  1475. #undef XML_TOK_NS_C
  1476. #undef NS
  1477. #undef ns
  1478. ENCODING *
  1479. XmlInitUnknownEncodingNS(void *mem,
  1480. int *table,
  1481. CONVERTER convert,
  1482. void *userData)
  1483. {
  1484. ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  1485. if (enc)
  1486. ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  1487. return enc;
  1488. }
  1489. #endif /* XML_NS */