cm_codecvt.cxx 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
  2. file Copyright.txt or https://cmake.org/licensing for details. */
  3. #include "cm_codecvt.hxx"
  4. #if defined(_WIN32)
  5. # include <cassert>
  6. # include <cstring>
  7. # include <windows.h>
  8. # undef max
  9. # include "cmsys/Encoding.hxx"
  10. # include "cm_utf8.h"
  11. #endif
  12. codecvt::codecvt(codecvt_Encoding e)
  13. #if defined(_WIN32)
  14. : m_codepage(0)
  15. #endif
  16. {
  17. switch (e) {
  18. case codecvt_Encoding::ConsoleOutput:
  19. #if defined(_WIN32)
  20. m_noconv = false;
  21. m_codepage = GetConsoleOutputCP();
  22. break;
  23. #endif
  24. case codecvt_Encoding::ANSI:
  25. #if defined(_WIN32)
  26. m_noconv = false;
  27. m_codepage = CP_ACP;
  28. break;
  29. #endif
  30. // We don't know which ANSI encoding to use for other platforms than
  31. // Windows so we don't do any conversion there
  32. case codecvt_Encoding::UTF8:
  33. case codecvt_Encoding::UTF8_WITH_BOM:
  34. // Assume internal encoding is UTF-8
  35. case codecvt_Encoding::None:
  36. // No encoding
  37. default:
  38. this->m_noconv = true;
  39. }
  40. }
  // Destructor: defaulted here, out of line, so destruction is left entirely
  // to the compiler-generated member-wise behavior.
  41. codecvt::~codecvt() = default;
  42. bool codecvt::do_always_noconv() const noexcept
  43. {
  44. return this->m_noconv;
  45. }
  46. std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
  47. const char* from_end,
  48. const char*& from_next, char* to,
  49. char* to_end, char*& to_next) const
  50. {
  51. from_next = from;
  52. to_next = to;
  53. if (this->m_noconv) {
  54. return std::codecvt_base::noconv;
  55. }
  56. #if defined(_WIN32)
  57. // Use a const view of the state because we should not modify it until we
  58. // have fully processed and consume a byte (with sufficient space in the
  59. // output buffer). We call helpers to re-cast and modify the state
  60. State const& lstate = reinterpret_cast<State&>(state);
  61. while (from_next != from_end) {
  62. // Count leading ones in the bits of the next byte.
  63. unsigned char const ones =
  64. cm_utf8_ones[static_cast<unsigned char>(*from_next)];
  65. if (ones != 1 && lstate.buffered != 0) {
  66. // We have a buffered partial codepoint that we never completed.
  67. return std::codecvt_base::error;
  68. } else if (ones == 1 && lstate.buffered == 0) {
  69. // This is a continuation of a codepoint that never started.
  70. return std::codecvt_base::error;
  71. }
  72. // Compute the number of bytes in the current codepoint.
  73. int need = 0;
  74. switch (ones) {
  75. case 0: // 0xxx xxxx: new codepoint of size 1
  76. need = 1;
  77. break;
  78. case 1: // 10xx xxxx: continues a codepoint
  79. assert(lstate.size != 0);
  80. need = lstate.size;
  81. break;
  82. case 2: // 110x xxxx: new codepoint of size 2
  83. need = 2;
  84. break;
  85. case 3: // 1110 xxxx: new codepoint of size 3
  86. need = 3;
  87. break;
  88. case 4: // 1111 0xxx: new codepoint of size 4
  89. need = 4;
  90. break;
  91. default: // invalid byte
  92. return std::codecvt_base::error;
  93. }
  94. assert(need > 0);
  95. if (lstate.buffered + 1 == need) {
  96. // This byte completes a codepoint.
  97. std::codecvt_base::result decode_result =
  98. this->Decode(state, need, from_next, to_next, to_end);
  99. if (decode_result != std::codecvt_base::ok) {
  100. return decode_result;
  101. }
  102. } else {
  103. // This byte does not complete a codepoint.
  104. this->BufferPartial(state, need, from_next);
  105. }
  106. }
  107. return std::codecvt_base::ok;
  108. #else
  109. static_cast<void>(state);
  110. static_cast<void>(from);
  111. static_cast<void>(from_end);
  112. static_cast<void>(from_next);
  113. static_cast<void>(to);
  114. static_cast<void>(to_end);
  115. static_cast<void>(to_next);
  116. return std::codecvt_base::noconv;
  117. #endif
  118. }
  119. std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
  120. char* to_end,
  121. char*& to_next) const
  122. {
  123. to_next = to;
  124. if (this->m_noconv) {
  125. return std::codecvt_base::noconv;
  126. }
  127. #if defined(_WIN32)
  128. State& lstate = reinterpret_cast<State&>(state);
  129. if (lstate.buffered != 0) {
  130. return this->DecodePartial(state, to_next, to_end);
  131. }
  132. return std::codecvt_base::ok;
  133. #else
  134. static_cast<void>(state);
  135. static_cast<void>(to_end);
  136. return std::codecvt_base::ok;
  137. #endif
  138. }
  139. #if defined(_WIN32)
  140. std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
  141. const char*& from_next,
  142. char*& to_next, char* to_end) const
  143. {
  144. State& lstate = reinterpret_cast<State&>(state);
  145. // Collect all the bytes for this codepoint.
  146. char buf[4];
  147. memcpy(buf, lstate.partial, lstate.buffered);
  148. buf[lstate.buffered] = *from_next;
  149. // Convert the encoding.
  150. wchar_t wbuf[2];
  151. int wlen =
  152. MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
  153. if (wlen <= 0) {
  154. return std::codecvt_base::error;
  155. }
  156. int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
  157. to_end - to_next, nullptr, nullptr);
  158. if (tlen <= 0) {
  159. if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
  160. return std::codecvt_base::partial;
  161. }
  162. return std::codecvt_base::error;
  163. }
  164. // Move past the now-consumed byte in the input buffer.
  165. ++from_next;
  166. // Move past the converted codepoint in the output buffer.
  167. to_next += tlen;
  168. // Re-initialize the state for the next codepoint to start.
  169. lstate = State();
  170. return std::codecvt_base::ok;
  171. }
  172. std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
  173. char*& to_next,
  174. char* to_end) const
  175. {
  176. State& lstate = reinterpret_cast<State&>(state);
  177. // Try converting the partial codepoint.
  178. wchar_t wbuf[2];
  179. int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
  180. lstate.buffered, wbuf, 2);
  181. if (wlen <= 0) {
  182. return std::codecvt_base::error;
  183. }
  184. int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
  185. to_end - to_next, nullptr, nullptr);
  186. if (tlen <= 0) {
  187. if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
  188. return std::codecvt_base::partial;
  189. }
  190. return std::codecvt_base::error;
  191. }
  192. // Move past the converted codepoint in the output buffer.
  193. to_next += tlen;
  194. // Re-initialize the state for the next codepoint to start.
  195. lstate = State();
  196. return std::codecvt_base::ok;
  197. }
  198. void codecvt::BufferPartial(mbstate_t& state, int size,
  199. const char*& from_next) const
  200. {
  201. State& lstate = reinterpret_cast<State&>(state);
  202. // Save the byte in our buffer for later.
  203. lstate.partial[lstate.buffered++] = *from_next;
  204. lstate.size = size;
  205. // Move past the now-consumed byte in the input buffer.
  206. ++from_next;
  207. }
  208. #endif
  209. int codecvt::do_max_length() const noexcept
  210. {
  211. return 4;
  212. }
  213. int codecvt::do_encoding() const noexcept
  214. {
  215. return 0;
  216. }