cm_codecvt.cxx 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
     file Copyright.txt or https://cmake.org/licensing for details.  */
  3. #include "cm_codecvt.hxx"
  4. #include <limits>
  5. #if defined(_WIN32)
  6. #include <windows.h>
  7. #undef max
  8. #include <cmsys/Encoding.hxx>
  9. #endif
  10. codecvt::codecvt(Encoding e)
  11. : m_lastState(0)
  12. #if defined(_WIN32)
  13. , m_codepage(0)
  14. #endif
  15. {
  16. switch (e) {
  17. case codecvt::ANSI:
  18. #if defined(_WIN32)
  19. m_noconv = false;
  20. m_codepage = CP_ACP;
  21. break;
  22. #endif
  23. // We don't know which ANSI encoding to use for other platforms than
  24. // Windows so we don't do any conversion there
  25. case codecvt::UTF8:
  26. // Assume internal encoding is UTF-8
  27. case codecvt::None:
  28. // No encoding
  29. default:
  30. m_noconv = true;
  31. }
  32. }
  33. codecvt::~codecvt(){};
  34. bool codecvt::do_always_noconv() const throw()
  35. {
  36. return m_noconv;
  37. };
  38. std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
  39. const char* from_end,
  40. const char*& from_next, char* to,
  41. char* to_end, char*& to_next) const
  42. {
  43. if (m_noconv) {
  44. return noconv;
  45. }
  46. std::codecvt_base::result res = error;
  47. #if defined(_WIN32)
  48. from_next = from;
  49. to_next = to;
  50. bool convert = true;
  51. size_t count = from_end - from;
  52. const char* data = from;
  53. unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
  54. if (count == 0) {
  55. return codecvt::ok;
  56. } else if (count == 1) {
  57. if (stateId == 0) {
  58. // decode first byte for UTF-8
  59. if ((*from & 0xF8) == 0xF0 || // 1111 0xxx; 4 bytes for codepoint
  60. (*from & 0xF0) == 0xE0 || // 1110 xxxx; 3 bytes for codepoint
  61. (*from & 0xE0) == 0xC0) // 110x xxxx; 2 bytes for codepoint
  62. {
  63. stateId = findStateId();
  64. codecvt::State& s = m_states.at(stateId - 1);
  65. s.bytes[0] = *from;
  66. convert = false;
  67. if ((*from & 0xF8) == 0xF0) {
  68. s.totalBytes = 4;
  69. } else if ((*from & 0xF0) == 0xE0) {
  70. s.totalBytes = 3;
  71. } else if ((*from & 0xE0) == 0xC0) {
  72. s.totalBytes = 2;
  73. }
  74. s.bytesLeft = s.totalBytes - 1;
  75. };
  76. // else 1 byte for codepoint
  77. } else {
  78. codecvt::State& s = m_states.at(stateId - 1);
  79. s.bytes[s.totalBytes - s.bytesLeft] = *from;
  80. s.bytesLeft--;
  81. data = s.bytes;
  82. count = s.totalBytes - s.bytesLeft;
  83. if ((*from & 0xC0) == 0x80) { // 10xx xxxx
  84. convert = s.bytesLeft == 0;
  85. } else {
  86. // invalid multi-byte
  87. convert = true;
  88. }
  89. if (convert) {
  90. s.used = false;
  91. if (stateId == m_lastState) {
  92. m_lastState--;
  93. }
  94. stateId = 0;
  95. }
  96. }
  97. if (convert) {
  98. std::wstring wide = cmsys::Encoding::ToWide(std::string(data, count));
  99. int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
  100. static_cast<int>(wide.size()), to,
  101. to_end - to, NULL, NULL);
  102. if (r > 0) {
  103. from_next = from_end;
  104. to_next = to + r;
  105. res = ok;
  106. }
  107. } else {
  108. res = partial;
  109. from_next = from_end;
  110. to_next = to;
  111. }
  112. }
  113. #else
  114. static_cast<void>(state);
  115. static_cast<void>(from);
  116. static_cast<void>(from_end);
  117. static_cast<void>(from_next);
  118. static_cast<void>(to);
  119. static_cast<void>(to_end);
  120. static_cast<void>(to_next);
  121. res = codecvt::noconv;
  122. #endif
  123. return res;
  124. };
  125. std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
  126. char* to_end,
  127. char*& to_next) const
  128. {
  129. std::codecvt_base::result res = error;
  130. to_next = to;
  131. #if defined(_WIN32)
  132. unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
  133. if (stateId > 0) {
  134. codecvt::State& s = m_states.at(stateId - 1);
  135. s.used = false;
  136. if (stateId == m_lastState) {
  137. m_lastState--;
  138. }
  139. stateId = 0;
  140. std::wstring wide = cmsys::Encoding::ToWide(
  141. std::string(s.bytes, s.totalBytes - s.bytesLeft));
  142. int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
  143. static_cast<int>(wide.size()), to, to_end - to,
  144. NULL, NULL);
  145. if (r > 0) {
  146. to_next = to + r;
  147. res = ok;
  148. }
  149. } else {
  150. res = ok;
  151. }
  152. #else
  153. static_cast<void>(state);
  154. static_cast<void>(to_end);
  155. res = ok;
  156. #endif
  157. return res;
  158. };
  159. int codecvt::do_max_length() const throw()
  160. {
  161. return 4;
  162. };
  163. int codecvt::do_encoding() const throw()
  164. {
  165. return 0;
  166. };
  167. unsigned int codecvt::findStateId() const
  168. {
  169. unsigned int stateId = 0;
  170. bool add = false;
  171. const unsigned int maxSize = std::numeric_limits<unsigned int>::max();
  172. if (m_lastState >= maxSize) {
  173. m_lastState = 0;
  174. }
  175. if (m_states.size() <= m_lastState) {
  176. add = true;
  177. } else {
  178. unsigned int i = m_lastState;
  179. while (i < maxSize) {
  180. codecvt::State& s = m_states.at(i);
  181. i++;
  182. if (!s.used) {
  183. m_lastState = i;
  184. stateId = m_lastState;
  185. s.used = true;
  186. s.totalBytes = 0;
  187. s.bytesLeft = 0;
  188. break;
  189. }
  190. if (i >= m_states.size()) {
  191. i = 0;
  192. }
  193. if (i == m_lastState) {
  194. add = true;
  195. break;
  196. }
  197. }
  198. };
  199. if (add) {
  200. codecvt::State s = { true, 0, 0, { 0, 0, 0, 0 } };
  201. m_states.push_back(s);
  202. m_lastState = (unsigned int)m_states.size();
  203. stateId = m_lastState;
  204. }
  205. return stateId;
  206. };