stripctrl.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. /*
  2. * stripctrl.c: a facility for stripping control characters out of a
  3. * data stream (defined as any multibyte character in the system
  4. * locale which is neither printable nor \n), using the standard C
  5. * library multibyte character facilities.
  6. */
  7. #include <assert.h>
  8. #include <locale.h>
  9. #include <string.h>
  10. #include <wchar.h>
  11. #include <wctype.h>
  12. #include "putty.h"
  13. #ifndef WINSCP
  14. #include "terminal.h"
  15. #endif
  16. #include "misc.h"
  17. #include "marshal.h"
  18. #define SCC_BUFSIZE 64
  19. #define LINE_LIMIT 77
  20. typedef struct StripCtrlCharsImpl StripCtrlCharsImpl;
  21. struct StripCtrlCharsImpl {
  22. mbstate_t mbs_in, mbs_out;
  23. bool permit_cr;
  24. wchar_t substitution;
  25. char buf[SCC_BUFSIZE];
  26. size_t buflen;
  27. #ifndef WINSCP
  28. Terminal *term;
  29. bool last_term_utf;
  30. struct term_utf8_decode utf8;
  31. unsigned long (*translate)(Terminal *, term_utf8_decode *, unsigned char);
  32. #endif
  33. bool line_limit;
  34. bool line_start;
  35. size_t line_chars_remaining;
  36. BinarySink *bs_out;
  37. StripCtrlChars public;
  38. };
  39. static void stripctrl_locale_BinarySink_write(
  40. BinarySink *bs, const void *vp, size_t len);
  41. static void stripctrl_term_BinarySink_write(
  42. BinarySink *bs, const void *vp, size_t len);
  43. static StripCtrlCharsImpl *stripctrl_new_common(
  44. BinarySink *bs_out, bool permit_cr, wchar_t substitution)
  45. {
  46. StripCtrlCharsImpl *scc = snew(StripCtrlCharsImpl);
  47. memset(scc, 0, sizeof(StripCtrlCharsImpl)); /* zeroes mbstates */
  48. scc->bs_out = bs_out;
  49. scc->permit_cr = permit_cr;
  50. scc->substitution = substitution;
  51. return scc;
  52. }
  53. #ifndef WINSCP
  54. StripCtrlChars *stripctrl_new(
  55. BinarySink *bs_out, bool permit_cr, wchar_t substitution)
  56. {
  57. StripCtrlCharsImpl *scc = stripctrl_new_common(
  58. bs_out, permit_cr, substitution);
  59. BinarySink_INIT(&scc->public, stripctrl_locale_BinarySink_write);
  60. return &scc->public;
  61. }
  62. StripCtrlChars *stripctrl_new_term_fn(
  63. BinarySink *bs_out, bool permit_cr, wchar_t substitution,
  64. Terminal *term, unsigned long (*translate)(
  65. Terminal *, term_utf8_decode *, unsigned char))
  66. {
  67. StripCtrlCharsImpl *scc = stripctrl_new_common(
  68. bs_out, permit_cr, substitution);
  69. scc->term = term;
  70. scc->translate = translate;
  71. BinarySink_INIT(&scc->public, stripctrl_term_BinarySink_write);
  72. return &scc->public;
  73. }
  74. #endif
  75. void stripctrl_retarget(StripCtrlChars *sccpub, BinarySink *new_bs_out)
  76. {
  77. StripCtrlCharsImpl *scc =
  78. container_of(sccpub, StripCtrlCharsImpl, public);
  79. scc->bs_out = new_bs_out;
  80. stripctrl_reset(sccpub);
  81. }
  82. void stripctrl_reset(StripCtrlChars *sccpub)
  83. {
  84. StripCtrlCharsImpl *scc =
  85. container_of(sccpub, StripCtrlCharsImpl, public);
  86. /*
  87. * Clear all the fields that might have been in the middle of a
  88. * multibyte character or non-default shift state, so that we can
  89. * start converting a fresh piece of data to send to a channel
  90. * that hasn't seen the previous output.
  91. */
  92. #ifndef WINSCP
  93. memset(&scc->utf8, 0, sizeof(scc->utf8));
  94. #endif
  95. memset(&scc->mbs_in, 0, sizeof(scc->mbs_in));
  96. memset(&scc->mbs_out, 0, sizeof(scc->mbs_out));
  97. /*
  98. * Also, reset the line-limiting system to its starting state.
  99. */
  100. scc->line_start = true;
  101. }
  102. void stripctrl_free(StripCtrlChars *sccpub)
  103. {
  104. StripCtrlCharsImpl *scc =
  105. container_of(sccpub, StripCtrlCharsImpl, public);
  106. smemclr(scc, sizeof(StripCtrlCharsImpl));
  107. sfree(scc);
  108. }
  109. void stripctrl_enable_line_limiting(StripCtrlChars *sccpub)
  110. {
  111. StripCtrlCharsImpl *scc =
  112. container_of(sccpub, StripCtrlCharsImpl, public);
  113. scc->line_limit = true;
  114. scc->line_start = true;
  115. }
  116. #ifndef WINSCP
  117. static inline bool stripctrl_ctrlchar_ok(StripCtrlCharsImpl *scc, wchar_t wc)
  118. {
  119. return wc == L'\n' || (wc == L'\r' && scc->permit_cr);
  120. }
  121. static inline void stripctrl_check_line_limit(
  122. StripCtrlCharsImpl *scc, wchar_t wc, size_t width)
  123. {
  124. if (!scc->line_limit)
  125. return; /* nothing to do */
  126. if (scc->line_start) {
  127. put_datapl(scc->bs_out, PTRLEN_LITERAL("| "));
  128. scc->line_start = false;
  129. scc->line_chars_remaining = LINE_LIMIT;
  130. }
  131. if (wc == '\n') {
  132. scc->line_start = true;
  133. return;
  134. }
  135. if (scc->line_chars_remaining < width) {
  136. put_datapl(scc->bs_out, PTRLEN_LITERAL("\r\n> "));
  137. scc->line_chars_remaining = LINE_LIMIT;
  138. }
  139. assert(width <= scc->line_chars_remaining);
  140. scc->line_chars_remaining -= width;
  141. }
  142. static inline void stripctrl_locale_put_wc(StripCtrlCharsImpl *scc, wchar_t wc)
  143. {
  144. int width = mk_wcwidth(wc);
  145. if ((iswprint(wc) && width >= 0) || stripctrl_ctrlchar_ok(scc, wc)) {
  146. /* Printable character, or one we're going to let through anyway. */
  147. if (width < 0)
  148. width = 0; /* sanitise for stripctrl_check_line_limit */
  149. } else if (scc->substitution) {
  150. wc = scc->substitution;
  151. width = mk_wcwidth(wc);
  152. assert(width >= 0);
  153. } else {
  154. /* No defined substitution, so don't write any output wchar_t. */
  155. return;
  156. }
  157. stripctrl_check_line_limit(scc, wc, width);
  158. char outbuf[MB_LEN_MAX];
  159. size_t produced = wcrtomb(outbuf, wc, &scc->mbs_out);
  160. if (produced > 0)
  161. put_data(scc->bs_out, outbuf, produced);
  162. }
  163. static inline void stripctrl_term_put_wc(
  164. StripCtrlCharsImpl *scc, unsigned long wc)
  165. {
  166. ptrlen prefix = PTRLEN_LITERAL("");
  167. int width = term_char_width(scc->term, wc);
  168. if (!(wc & ~0x9F) || width < 0) {
  169. /* This is something the terminal interprets as a control
  170. * character. */
  171. if (!stripctrl_ctrlchar_ok(scc, wc)) {
  172. if (!scc->substitution) {
  173. return;
  174. } else {
  175. wc = scc->substitution;
  176. width = term_char_width(scc->term, wc);
  177. assert(width >= 0);
  178. }
  179. } else {
  180. if (width < 0)
  181. width = 0; /* sanitise for stripctrl_check_line_limit */
  182. }
  183. if (wc == '\012') {
  184. /* Precede \n with \r, because our terminal will not
  185. * generally be in the ONLCR mode where it assumes that
  186. * internally, and any \r on input has been stripped
  187. * out. */
  188. prefix = PTRLEN_LITERAL("\r");
  189. }
  190. }
  191. stripctrl_check_line_limit(scc, wc, width);
  192. if (prefix.len)
  193. put_datapl(scc->bs_out, prefix);
  194. /*
  195. * The Terminal implementation encodes 7-bit ASCII characters in
  196. * UTF-8 mode, and all printing characters in non-UTF-8 (i.e.
  197. * single-byte character set) mode, as values in the surrogate
  198. * range (a conveniently unused piece of space in this context)
  199. * whose low byte is the original 1-byte representation of the
  200. * character.
  201. */
  202. if ((wc - 0xD800) < (0xE000 - 0xD800))
  203. wc &= 0xFF;
  204. if (in_utf(scc->term)) {
  205. put_utf8_char(scc->bs_out, wc);
  206. } else {
  207. put_byte(scc->bs_out, wc);
  208. }
  209. }
  210. static inline size_t stripctrl_locale_try_consume(
  211. StripCtrlCharsImpl *scc, const char *p, size_t len)
  212. {
  213. wchar_t wc;
  214. mbstate_t mbs_orig = scc->mbs_in;
  215. size_t consumed = mbrtowc(&wc, p, len, &scc->mbs_in);
  216. if (consumed == (size_t)-2) {
  217. /*
  218. * The buffer is too short to see the end of the multibyte
  219. * character that it appears to be starting with. We return 0
  220. * for 'no data consumed', restore the conversion state from
  221. * before consuming the partial character, and our caller will
  222. * come back when it has more data available.
  223. */
  224. scc->mbs_in = mbs_orig;
  225. return 0;
  226. }
  227. if (consumed == (size_t)-1) {
  228. /*
  229. * The buffer contains an illegal multibyte sequence. There's
  230. * no really good way to recover from this, so we'll just
  231. * reset our input state, consume a single byte without
  232. * emitting anything, and hope we can resynchronise to
  233. * _something_ sooner or later.
  234. */
  235. memset(&scc->mbs_in, 0, sizeof(scc->mbs_in));
  236. return 1;
  237. }
  238. if (consumed == 0) {
  239. /*
  240. * A zero wide character is encoded by the data, but mbrtowc
  241. * hasn't told us how many input bytes it takes. There isn't
  242. * really anything good we can do here, so we just advance by
  243. * one byte in the hope that that was the NUL.
  244. *
  245. * (If it wasn't - that is, if we're in a multibyte encoding
  246. * in which the terminator of a normal C string is encoded in
  247. * some way other than a single zero byte - then probably lots
  248. * of other things will have gone wrong before we get here!)
  249. */
  250. stripctrl_locale_put_wc(scc, L'\0');
  251. return 1;
  252. }
  253. /*
  254. * Otherwise, this is the easy case: consumed > 0, and we've eaten
  255. * a valid multibyte character.
  256. */
  257. stripctrl_locale_put_wc(scc, wc);
  258. return consumed;
  259. }
  260. static void stripctrl_locale_BinarySink_write(
  261. BinarySink *bs, const void *vp, size_t len)
  262. {
  263. StripCtrlChars *sccpub = BinarySink_DOWNCAST(bs, StripCtrlChars);
  264. StripCtrlCharsImpl *scc =
  265. container_of(sccpub, StripCtrlCharsImpl, public);
  266. const char *p = (const char *)vp;
  267. char *previous_locale = dupstr(setlocale(LC_CTYPE, NULL));
  268. setlocale(LC_CTYPE, "");
  269. /*
  270. * Deal with any partial multibyte character buffered from last
  271. * time.
  272. */
  273. while (scc->buflen > 0) {
  274. size_t to_copy = SCC_BUFSIZE - scc->buflen;
  275. if (to_copy > len)
  276. to_copy = len;
  277. memcpy(scc->buf + scc->buflen, p, to_copy);
  278. { // WINSCP
  279. size_t consumed = stripctrl_locale_try_consume(
  280. scc, scc->buf, scc->buflen + to_copy);
  281. if (consumed >= scc->buflen) {
  282. /*
  283. * We've consumed a multibyte character that includes all
  284. * the data buffered from last time. So we can clear our
  285. * buffer and move on to processing the main input string
  286. * in situ, having first discarded whatever initial
  287. * segment of it completed our previous character.
  288. */
  289. size_t consumed_from_main_string = consumed - scc->buflen;
  290. assert(consumed_from_main_string <= len);
  291. p += consumed_from_main_string;
  292. len -= consumed_from_main_string;
  293. scc->buflen = 0;
  294. break;
  295. }
  296. if (consumed == 0) {
  297. /*
  298. * If we didn't manage to consume anything, i.e. the whole
  299. * buffer contains an incomplete sequence, it had better
  300. * be because our entire input string _this_ time plus
  301. * whatever leftover data we had from _last_ time still
  302. * comes to less than SCC_BUFSIZE. In other words, we've
  303. * already copied all the new data on to the end of our
  304. * buffer, and it still hasn't helped. So increment buflen
  305. * to reflect the new data, and return.
  306. */
  307. assert(to_copy == len);
  308. scc->buflen += to_copy;
  309. goto out;
  310. }
  311. /*
  312. * Otherwise, we've somehow consumed _less_ data than we had
  313. * buffered, and yet we weren't able to consume that data in
  314. * the last call to this function. That sounds impossible, but
  315. * I can think of one situation in which it could happen: if
  316. * we had an incomplete MB sequence last time, and now more
  317. * data has arrived, it turns out to be an _illegal_ one, so
  318. * we consume one byte in the hope of resynchronising.
  319. *
  320. * Anyway, in this case we move the buffer up and go back
  321. * round this initial loop.
  322. */
  323. scc->buflen -= consumed;
  324. memmove(scc->buf, scc->buf + consumed, scc->buflen);
  325. } // WINSCP
  326. }
  327. /*
  328. * Now charge along the main string.
  329. */
  330. while (len > 0) {
  331. size_t consumed = stripctrl_locale_try_consume(scc, p, len);
  332. if (consumed == 0)
  333. break;
  334. assert(consumed <= len);
  335. p += consumed;
  336. len -= consumed;
  337. }
  338. /*
  339. * Any data remaining should be copied into our buffer, to keep
  340. * for next time.
  341. */
  342. assert(len <= SCC_BUFSIZE);
  343. memcpy(scc->buf, p, len);
  344. scc->buflen = len;
  345. out:
  346. setlocale(LC_CTYPE, previous_locale);
  347. sfree(previous_locale);
  348. }
  349. static void stripctrl_term_BinarySink_write(
  350. BinarySink *bs, const void *vp, size_t len)
  351. {
  352. StripCtrlChars *sccpub = BinarySink_DOWNCAST(bs, StripCtrlChars);
  353. StripCtrlCharsImpl *scc =
  354. container_of(sccpub, StripCtrlCharsImpl, public);
  355. bool utf = in_utf(scc->term);
  356. if (utf != scc->last_term_utf) {
  357. scc->last_term_utf = utf;
  358. scc->utf8.state = 0;
  359. }
  360. for (const unsigned char *p = (const unsigned char *)vp;
  361. len > 0; len--, p++) {
  362. unsigned long t = scc->translate(scc->term, &scc->utf8, *p);
  363. if (t == UCSTRUNCATED) {
  364. stripctrl_term_put_wc(scc, 0xFFFD);
  365. /* go round again */
  366. t = scc->translate(scc->term, &scc->utf8, *p);
  367. }
  368. if (t == UCSINCOMPLETE)
  369. continue;
  370. if (t == UCSINVALID)
  371. t = 0xFFFD;
  372. stripctrl_term_put_wc(scc, t);
  373. }
  374. }
  375. char *stripctrl_string_ptrlen(StripCtrlChars *sccpub, ptrlen str)
  376. {
  377. strbuf *out = strbuf_new();
  378. stripctrl_retarget(sccpub, BinarySink_UPCAST(out));
  379. put_datapl(sccpub, str);
  380. stripctrl_retarget(sccpub, NULL);
  381. return strbuf_to_str(out);
  382. }
  383. #endif
  384. #ifdef STRIPCTRL_TEST
  385. /*
  386. gcc -std=c99 -DSTRIPCTRL_TEST -o scctest stripctrl.c marshal.c utils.c memory.c wcwidth.c -I . -I unix -I charset
  387. */
  388. void out_of_memory(void) { fprintf(stderr, "out of memory\n"); abort(); }
  389. void stripctrl_write(BinarySink *bs, const void *vdata, size_t len)
  390. {
  391. const uint8_t *p = vdata;
  392. printf("[");
  393. for (size_t i = 0; i < len; i++)
  394. printf("%*s%02x", i?1:0, "", (unsigned)p[i]);
  395. printf("]");
  396. }
  397. void stripctrl_test(StripCtrlChars *scc, ptrlen pl)
  398. {
  399. stripctrl_write(NULL, pl.ptr, pl.len);
  400. printf(" -> ");
  401. put_datapl(scc, pl);
  402. printf("\n");
  403. }
  404. int main(void)
  405. {
  406. struct foo { BinarySink_IMPLEMENTATION; } foo;
  407. BinarySink_INIT(&foo, stripctrl_write);
  408. StripCtrlChars *scc = stripctrl_new(BinarySink_UPCAST(&foo), false, '?');
  409. stripctrl_test(scc, PTRLEN_LITERAL("a\033[1mb"));
  410. stripctrl_test(scc, PTRLEN_LITERAL("a\xC2\x9B[1mb"));
  411. stripctrl_test(scc, PTRLEN_LITERAL("a\xC2\xC2[1mb"));
  412. stripctrl_test(scc, PTRLEN_LITERAL("\xC3"));
  413. stripctrl_test(scc, PTRLEN_LITERAL("\xA9"));
  414. stripctrl_test(scc, PTRLEN_LITERAL("\xE2\x80\x8F"));
  415. stripctrl_test(scc, PTRLEN_LITERAL("a\0b"));
  416. stripctrl_free(scc);
  417. return 0;
  418. }
  419. #endif /* STRIPCTRL_TEST */