archive_string.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. /*-
  2. * Copyright (c) 2003-2007 Tim Kientzle
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  15. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  18. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #ifndef _XOPEN_SOURCE
  26. # define _XOPEN_SOURCE 500 /* mbstate_t define on some hpux */
  27. #endif
  28. #include "archive_platform.h"
  29. __FBSDID("$FreeBSD: src/lib/libarchive/archive_string.c,v 1.17 2008/12/06 05:56:43 kientzle Exp $");
  30. /*
  31. * Basic resizable string support, to simplify manipulating arbitrary-sized
  32. * strings while minimizing heap activity.
  33. */
  34. #ifdef HAVE_STDLIB_H
  35. #include <stdlib.h>
  36. #endif
  37. #ifdef HAVE_STRING_H
  38. #include <string.h>
  39. #endif
  40. #ifdef HAVE_WCHAR_H
  41. #include <wchar.h>
  42. #endif
  43. #if defined(_WIN32) && !defined(__CYGWIN__)
  44. #include <windows.h>
  45. #endif
  46. #include "archive_private.h"
  47. #include "archive_string.h"
  48. struct archive_string *
  49. __archive_string_append(struct archive_string *as, const char *p, size_t s)
  50. {
  51. if (__archive_string_ensure(as, as->length + s + 1) == NULL)
  52. __archive_errx(1, "Out of memory");
  53. memcpy(as->s + as->length, p, s);
  54. as->s[as->length + s] = 0;
  55. as->length += s;
  56. return (as);
  57. }
  58. void
  59. __archive_string_copy(struct archive_string *dest, struct archive_string *src)
  60. {
  61. if (src->length == 0)
  62. dest->length = 0;
  63. else {
  64. if (__archive_string_ensure(dest, src->length + 1) == NULL)
  65. __archive_errx(1, "Out of memory");
  66. memcpy(dest->s, src->s, src->length);
  67. dest->length = src->length;
  68. dest->s[dest->length] = 0;
  69. }
  70. }
  71. void
  72. __archive_string_concat(struct archive_string *dest, struct archive_string *src)
  73. {
  74. if (src->length > 0) {
  75. if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL)
  76. __archive_errx(1, "Out of memory");
  77. memcpy(dest->s + dest->length, src->s, src->length);
  78. dest->length += src->length;
  79. dest->s[dest->length] = 0;
  80. }
  81. }
  82. void
  83. __archive_string_free(struct archive_string *as)
  84. {
  85. as->length = 0;
  86. as->buffer_length = 0;
  87. if (as->s != NULL) {
  88. free(as->s);
  89. as->s = NULL;
  90. }
  91. }
  92. /* Returns NULL on any allocation failure. */
  93. struct archive_string *
  94. __archive_string_ensure(struct archive_string *as, size_t s)
  95. {
  96. /* If buffer is already big enough, don't reallocate. */
  97. if (as->s && (s <= as->buffer_length))
  98. return (as);
  99. /*
  100. * Growing the buffer at least exponentially ensures that
  101. * append operations are always linear in the number of
  102. * characters appended. Using a smaller growth rate for
  103. * larger buffers reduces memory waste somewhat at the cost of
  104. * a larger constant factor.
  105. */
  106. if (as->buffer_length < 32)
  107. /* Start with a minimum 32-character buffer. */
  108. as->buffer_length = 32;
  109. else if (as->buffer_length < 8192)
  110. /* Buffers under 8k are doubled for speed. */
  111. as->buffer_length += as->buffer_length;
  112. else {
  113. /* Buffers 8k and over grow by at least 25% each time. */
  114. size_t old_length = as->buffer_length;
  115. as->buffer_length += as->buffer_length / 4;
  116. /* Be safe: If size wraps, release buffer and return NULL. */
  117. if (as->buffer_length < old_length) {
  118. free(as->s);
  119. as->s = NULL;
  120. return (NULL);
  121. }
  122. }
  123. /*
  124. * The computation above is a lower limit to how much we'll
  125. * grow the buffer. In any case, we have to grow it enough to
  126. * hold the request.
  127. */
  128. if (as->buffer_length < s)
  129. as->buffer_length = s;
  130. /* Now we can reallocate the buffer. */
  131. as->s = (char *)realloc(as->s, as->buffer_length);
  132. if (as->s == NULL)
  133. return (NULL);
  134. return (as);
  135. }
  /*
   * Append at most 'n' bytes from '_p' to 'as', stopping early at the
   * first NUL byte (which is not copied; the result is always
   * NUL-terminated by __archive_string_append()).
   *
   * Only the bytes p[0] .. p[n-1] are ever examined, so '_p' may refer to
   * an unterminated buffer of exactly 'n' bytes.  Returns 'as'.
   */
  struct archive_string *
  __archive_strncat(struct archive_string *as, const void *_p, size_t n)
  {
      const char *p = (const char *)_p;
      size_t s = 0;

      /*
       * Like strlen(p), bounded by n.  The bound is tested before the
       * byte is read so that p[n] itself is never dereferenced.
       */
      while (s < n && p[s] != '\0')
          s++;
      return (__archive_string_append(as, p, s));
  }
  /*
   * Append the single byte 'c' to 'as'.  Returns 'as'.
   */
  struct archive_string *
  __archive_strappend_char(struct archive_string *as, char c)
  {
      struct archive_string *result;

      result = __archive_string_append(as, &c, 1);
      return (result);
  }
  /*
   * Encode the wide-character string 'w' as UTF-8 and append it to 'as'.
   *
   * A leading surrogate (0xD800-0xDBFF) immediately followed by a trailing
   * surrogate (0xDC00-0xDFFF) is combined into a single code point before
   * encoding.  Values above 0x1fffff cannot be encoded by this routine;
   * each one is replaced by '?' and the function returns NULL, but the
   * best-effort conversion is still appended to 'as'.  Otherwise returns
   * 'as'.
   */
  struct archive_string *
  __archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w)
  {
      struct archive_string *status = as;
      char chunk[256];
      char *out = chunk;
      /* Not wchar_t: a combined code point can need more than 16 bits. */
      unsigned cp;

      /*
       * Encode one wide character at a time into 'chunk' and hand the
       * chunk over to the string whenever it is nearly full.
       */
      while (*w != L'\0') {
          /* Flush once 16 or fewer bytes remain free in the chunk. */
          if (out >= chunk + (sizeof(chunk) - 16)) {
              *out = '\0';
              archive_strcat(as, chunk);
              out = chunk;
          }

          cp = *w++;
          /* Surrogate pair: assemble the full code point. */
          if (cp >= 0xD800 && cp <= 0xDBff
              && *w >= 0xDC00 && *w <= 0xDFFF) {
              cp = 0x10000 + ((cp - 0xD800) << 10)
                  + (unsigned)(*w - 0xDC00);
              ++w;
          }

          /* Emit the code point as 1-4 UTF-8 bytes. */
          if (cp <= 0x7f) {
              *out++ = (char)cp;
          } else if (cp <= 0x7ff) {
              *out++ = (char)(0xc0 | ((cp >> 6) & 0x1f));
              *out++ = (char)(0x80 | (cp & 0x3f));
          } else if (cp <= 0xffff) {
              *out++ = (char)(0xe0 | ((cp >> 12) & 0x0f));
              *out++ = (char)(0x80 | ((cp >> 6) & 0x3f));
              *out++ = (char)(0x80 | (cp & 0x3f));
          } else if (cp <= 0x1fffff) {
              *out++ = (char)(0xf0 | ((cp >> 18) & 0x07));
              *out++ = (char)(0x80 | ((cp >> 12) & 0x3f));
              *out++ = (char)(0x80 | ((cp >> 6) & 0x3f));
              *out++ = (char)(0x80 | (cp & 0x3f));
          } else {
              /* Too large for a 4-byte sequence. */
              /* TODO: use \uXXXX escape here instead of ? */
              *out++ = '?';
              status = NULL;
          }
      }

      /* Hand over whatever is left in the chunk. */
      *out = '\0';
      archive_strcat(as, chunk);
      return (status);
  }
  /*
   * Decode a single UTF-8 sequence (1-4 bytes) starting at 's'.
   *
   * 'n' is the number of bytes the caller permits for this sequence; it
   * is consulted only for multi-byte sequences.  On success the decoded
   * value is stored in *pwc and the number of bytes consumed is returned.
   * Returns 0 (leaving *pwc untouched) when the first byte is NUL, and -1
   * (also leaving *pwc untouched) when the lead byte is invalid, the
   * sequence needs more than 'n' bytes, or a continuation byte is not of
   * the form 10xxxxxx.  Overlong encodings and encoded surrogates are not
   * rejected here.
   */
  static int
  utf8_to_unicode(int *pwc, const char *s, size_t n)
  {
      const unsigned char *bytes = (const unsigned char *)s;
      int lead = bytes[0];
      int code, len, i;

      if (lead == 0)
          return (0); /* Standard: return 0 for end-of-string. */

      /* Plain ASCII needs no continuation bytes and no length check. */
      if ((lead & 0x80) == 0) {
          *pwc = lead & 0x7f;
          return (1);
      }

      /* The lead byte fixes the sequence length and its payload bits. */
      if ((lead & 0xe0) == 0xc0) {
          len = 2;
          code = lead & 0x1f;
      } else if ((lead & 0xf0) == 0xe0) {
          len = 3;
          code = lead & 0x0f;
      } else if ((lead & 0xf8) == 0xf0) {
          len = 4;
          code = lead & 0x07;
      } else {
          /* Invalid first byte. */
          return (-1);
      }

      if (n < (size_t)len)
          return (-1);

      /* Each continuation byte contributes six more bits. */
      for (i = 1; i < len; i++) {
          if ((bytes[i] & 0xc0) != 0x80)
              return (-1);
          code = (code << 6) | (bytes[i] & 0x3f);
      }
      *pwc = code;
      return (len);
  }
  266. /*
  267. * Return a wide-character Unicode string by converting this archive_string
  268. * from UTF-8. We assume that systems with 16-bit wchar_t always use
  269. * UTF16 and systems with 32-bit wchar_t can accept UCS4.
  270. */
  271. wchar_t *
  272. __archive_string_utf8_w(struct archive_string *as)
  273. {
  274. wchar_t *ws, *dest;
  275. int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */
  276. const char *src;
  277. int n;
  278. int err;
  279. ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t));
  280. if (ws == NULL)
  281. __archive_errx(1, "Out of memory");
  282. err = 0;
  283. dest = ws;
  284. src = as->s;
  285. while (*src != '\0') {
  286. n = utf8_to_unicode(&wc, src, 8);
  287. if (n == 0)
  288. break;
  289. if (n < 0) {
  290. free(ws);
  291. return (NULL);
  292. }
  293. src += n;
  294. if (wc >= 0xDC00 && wc <= 0xDBFF) {
  295. /* This is a leading surrogate; some idiot
  296. * has translated UTF16 to UTF8 without combining
  297. * surrogates; rebuild the full code point before
  298. * continuing. */
  299. n = utf8_to_unicode(&wc2, src, 8);
  300. if (n < 0) {
  301. free(ws);
  302. return (NULL);
  303. }
  304. if (n == 0) /* Ignore the leading surrogate */
  305. break;
  306. if (wc2 < 0xDC00 || wc2 > 0xDFFF) {
  307. /* If the second character isn't a
  308. * trailing surrogate, then someone
  309. * has really screwed up and this is
  310. * invalid. */
  311. free(ws);
  312. return (NULL);
  313. } else {
  314. src += n;
  315. wc -= 0xD800;
  316. wc *= 0x400;
  317. wc += wc2 - 0xDC00;
  318. wc += 0x10000;
  319. }
  320. }
  321. if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) {
  322. /* We have a code point that won't fit into a
  323. * wchar_t; convert it to a surrogate pair. */
  324. wc -= 0x10000;
  325. *dest++ = ((wc >> 10) & 0x3ff) + 0xD800;
  326. *dest++ = (wc & 0x3ff) + 0xDC00;
  327. } else
  328. *dest++ = wc;
  329. }
  330. *dest++ = L'\0';
  331. return (ws);
  332. }
  333. #if defined(_WIN32) && !defined(__CYGWIN__)
  334. /*
  335. * Translates a wide character string into current locale character set
  336. * and appends to the archive_string. Note: returns NULL if conversion
  337. * fails.
  338. *
  339. * Win32 builds use WideCharToMultiByte from the Windows API.
  340. * (Maybe Cygwin should too? WideCharToMultiByte will know a
  341. * lot more about local character encodings than the wcrtomb()
  342. * wrapper is going to know.)
  343. */
  344. struct archive_string *
  345. __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
  346. {
  347. char *p;
  348. int l, wl;
  349. BOOL useDefaultChar = FALSE;
  350. wl = (int)wcslen(w);
  351. l = wl * 4 + 4;
  352. p = malloc(l);
  353. if (p == NULL)
  354. __archive_errx(1, "Out of memory");
  355. /* To check a useDefaultChar is to simulate error handling of
  356. * the my_wcstombs() which is running on non Windows system with
  357. * wctomb().
  358. * And to set NULL for last argument is necessary when a codepage
  359. * is not CP_ACP(current locale).
  360. */
  361. l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar);
  362. if (l == 0) {
  363. free(p);
  364. return (NULL);
  365. }
  366. __archive_string_append(as, p, l);
  367. free(p);
  368. return (as);
  369. }
  370. #else
  /*
   * Convert the wide-character string 'w' to the current locale's
   * multibyte encoding and append it to 'as'.  Returns 'as', or NULL as
   * soon as a character cannot be converted (characters still sitting in
   * the local staging buffer at that point are not appended).
   *
   * Non-Windows builds convert one character at a time with ISO C
   * wcrtomb() or wctomb().  If the platform provides neither, the
   * built-in UTF-8 conversion is used instead.
   */
  struct archive_string *
  __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
  {
  #if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB)
      /* No locale support at all: always produce UTF-8. */
      return __archive_strappend_w_utf8(as, w);
  #else
      /*
       * The standard wcstombs() cannot report how large the output
       * buffer must be, so this loops over wcrtomb() or wctomb(),
       * staging the output in a small local buffer and growing the
       * string as needed.  wcrtomb() is preferred when available
       * because it is thread-safe.
       */
      int written;
      char *out;
      char chunk[256];
      const wchar_t *in;
  #if HAVE_WCRTOMB
      mbstate_t shift_state;

      memset(&shift_state, 0, sizeof(shift_state));
  #else
      /* Reset wctomb()'s internal shift state before starting. */
      wctomb(NULL, L'\0');
  #endif

      out = chunk;
      for (in = w; *in != L'\0'; in++) {
          /*
           * Flush when MB_CUR_MAX or fewer bytes remain free, so the
           * next character and the final NUL are guaranteed to fit.
           */
          if ((size_t)(out - chunk) >= (size_t)(sizeof(chunk) - MB_CUR_MAX)) {
              *out = '\0';
              archive_strcat(as, chunk);
              out = chunk;
          }
  #if HAVE_WCRTOMB
          written = wcrtomb(out, *in, &shift_state);
  #else
          written = wctomb(out, *in);
  #endif
          /* Unconvertible character: report failure. */
          if (written == -1)
              return (NULL);
          out += written;
      }

      /* Append whatever is left in the staging buffer. */
      *out = '\0';
      archive_strcat(as, chunk);
      return (as);
  #endif
  }
  430. #endif /* _WIN32 && ! __CYGWIN__ */