8 年之前 · 690acadc17
--- a/Source/cm_codecvt.cxx
+++ b/Source/cm_codecvt.cxx
@@ -1,18 +1,23 @@
 
				 /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
			
 
				    file Copyright.txt or https://cmake.org/licensing for details.  */
			
 
				 #include "cm_codecvt.hxx"
			
 
				-#include <limits>
			
 
				 
			
 
				 #if defined(_WIN32)
			
 
				+#include <assert.h>
			
 
				+#include <string.h>
			
 
				 #include <windows.h>
			
 
				 #undef max
			
 
				 #include "cmsys/Encoding.hxx"
			
 
				 #endif
			
 
				 
			
 
				+#if defined(_WIN32)
			
 
				+/* Number of leading ones before a zero in the byte (see cm_utf8.c).  */
			
 
				+extern "C" unsigned char const cm_utf8_ones[256];
			
 
				+#endif
			
 
				+
			
 
				 codecvt::codecvt(Encoding e)
			
 
				-  : m_lastState(0)
			
 
				 #if defined(_WIN32)
			
 
				-  , m_codepage(0)
			
 
				+  : m_codepage(0)
			
 
				 #endif
			
 
				 {
			
 
				   switch (e) {
			
@@ -45,76 +50,68 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
 
				                                           const char*& from_next, char* to,
			
 
				                                           char* to_end, char*& to_next) const
			
 
				 {
			
 
				+  from_next = from;
			
 
				+  to_next = to;
			
 
				   if (m_noconv) {
			
 
				-    return noconv;
			
 
				+    return std::codecvt_base::noconv;
			
 
				   }
			
 
				-  std::codecvt_base::result res = error;
			
 
				 #if defined(_WIN32)
			
 
				-  from_next = from;
			
 
				-  to_next = to;
			
 
				-  bool convert = true;
			
 
				-  size_t count = from_end - from;
			
 
				-  const char* data = from;
			
 
				-  unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
			
 
				-  if (count == 0) {
			
 
				-    return codecvt::ok;
			
 
				-  } else if (count == 1) {
			
 
				-    if (stateId == 0) {
			
 
				-      // decode first byte for UTF-8
			
 
				-      if ((*from & 0xF8) == 0xF0 || // 1111 0xxx; 4 bytes for codepoint
			
 
				-          (*from & 0xF0) == 0xE0 || // 1110 xxxx; 3 bytes for codepoint
			
 
				-          (*from & 0xE0) == 0xC0)   // 110x xxxx; 2 bytes for codepoint
			
 
				-      {
			
 
				-        stateId = findStateId();
			
 
				-        codecvt::State& s = m_states.at(stateId - 1);
			
 
				-        s.bytes[0] = *from;
			
 
				-        convert = false;
			
 
				-        if ((*from & 0xF8) == 0xF0) {
			
 
				-          s.totalBytes = 4;
			
 
				-        } else if ((*from & 0xF0) == 0xE0) {
			
 
				-          s.totalBytes = 3;
			
 
				-        } else if ((*from & 0xE0) == 0xC0) {
			
 
				-          s.totalBytes = 2;
			
 
				-        }
			
 
				-        s.bytesLeft = s.totalBytes - 1;
			
 
				-      };
			
 
				-      // else 1 byte for codepoint
			
 
				-    } else {
			
 
				-      codecvt::State& s = m_states.at(stateId - 1);
			
 
				-      s.bytes[s.totalBytes - s.bytesLeft] = *from;
			
 
				-      s.bytesLeft--;
			
 
				-      data = s.bytes;
			
 
				-      count = s.totalBytes - s.bytesLeft;
			
 
				-      if ((*from & 0xC0) == 0x80) { // 10xx xxxx
			
 
				-        convert = s.bytesLeft == 0;
			
 
				-      } else {
			
 
				-        // invalid multi-byte
			
 
				-        convert = true;
			
 
				-      }
			
 
				-      if (convert) {
			
 
				-        s.used = false;
			
 
				-        if (stateId == m_lastState) {
			
 
				-          m_lastState--;
			
 
				-        }
			
 
				-        stateId = 0;
			
 
				-      }
			
 
				+  // Use a const view of the state because we should not modify it until we
			
 
				+  // have fully processed and consumed a byte (with sufficient space in the
			
 
				+  // output buffer).  We call helpers to re-cast and modify the state.
			
 
				+  State const& lstate = reinterpret_cast<State&>(state);
			
 
				+
			
 
				+  while (from_next != from_end) {
			
 
				+    // Count leading ones in the bits of the next byte.
			
 
				+    unsigned char const ones =
			
 
				+      cm_utf8_ones[static_cast<unsigned char>(*from_next)];
			
 
				+
			
 
				+    if (ones != 1 && lstate.buffered != 0) {
			
 
				+      // We have a buffered partial codepoint that we never completed.
			
 
				+      return std::codecvt_base::error;
			
 
				+    } else if (ones == 1 && lstate.buffered == 0) {
			
 
				+      // This is a continuation of a codepoint that never started.
			
 
				+      return std::codecvt_base::error;
			
 
				+    }
			
 
				+
			
 
				+    // Compute the number of bytes in the current codepoint.
			
 
				+    int need = 0;
			
 
				+    switch (ones) {
			
 
				+      case 0: // 0xxx xxxx: new codepoint of size 1
			
 
				+        need = 1;
			
 
				+        break;
			
 
				+      case 1: // 10xx xxxx: continues a codepoint
			
 
				+        assert(lstate.size != 0);
			
 
				+        need = lstate.size;
			
 
				+        break;
			
 
				+      case 2: // 110x xxxx: new codepoint of size 2
			
 
				+        need = 2;
			
 
				+        break;
			
 
				+      case 3: // 1110 xxxx: new codepoint of size 3
			
 
				+        need = 3;
			
 
				+        break;
			
 
				+      case 4: // 1111 0xxx: new codepoint of size 4
			
 
				+        need = 4;
			
 
				+        break;
			
 
				+      default: // invalid byte
			
 
				+        return std::codecvt_base::error;
			
 
				     }
			
 
				-    if (convert) {
			
 
				-      std::wstring wide = cmsys::Encoding::ToWide(std::string(data, count));
			
 
				-      int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
			
 
				-                                  static_cast<int>(wide.size()), to,
			
 
				-                                  to_end - to, NULL, NULL);
			
 
				-      if (r > 0) {
			
 
				-        from_next = from_end;
			
 
				-        to_next = to + r;
			
 
				-        res = ok;
			
 
				+    assert(need > 0);
			
 
				+
			
 
				+    if (lstate.buffered + 1 == need) {
			
 
				+      // This byte completes a codepoint.
			
 
				+      std::codecvt_base::result decode_result =
			
 
				+        this->Decode(state, need, from_next, to_next, to_end);
			
 
				+      if (decode_result != std::codecvt_base::ok) {
			
 
				+        return decode_result;
			
 
				       }
			
 
				     } else {
			
 
				-      res = partial;
			
 
				-      from_next = from_end;
			
 
				-      to_next = to;
			
 
				+      // This byte does not complete a codepoint.
			
 
				+      this->BufferPartial(state, need, from_next);
			
 
				     }
			
 
				   }
			
 
				+
			
 
				+  return std::codecvt_base::ok;
			
 
				 #else
			
 
				   static_cast<void>(state);
			
 
				   static_cast<void>(from);
			
@@ -123,46 +120,118 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
 
				   static_cast<void>(to);
			
 
				   static_cast<void>(to_end);
			
 
				   static_cast<void>(to_next);
			
 
				-  res = codecvt::noconv;
			
 
				+  return std::codecvt_base::noconv;
			
 
				 #endif
			
 
				-  return res;
			
 
				 };
			
 
				 
			
 
				 std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
			
 
				                                               char* to_end,
			
 
				                                               char*& to_next) const
			
 
				 {
			
 
				-  std::codecvt_base::result res = error;
			
 
				   to_next = to;
			
 
				+  if (m_noconv) {
			
 
				+    return std::codecvt_base::noconv;
			
 
				+  }
			
 
				 #if defined(_WIN32)
			
 
				-  unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
			
 
				-  if (stateId > 0) {
			
 
				-    codecvt::State& s = m_states.at(stateId - 1);
			
 
				-    s.used = false;
			
 
				-    if (stateId == m_lastState) {
			
 
				-      m_lastState--;
			
 
				-    }
			
 
				-    stateId = 0;
			
 
				-    std::wstring wide = cmsys::Encoding::ToWide(
			
 
				-      std::string(s.bytes, s.totalBytes - s.bytesLeft));
			
 
				-    int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
			
 
				-                                static_cast<int>(wide.size()), to, to_end - to,
			
 
				-                                NULL, NULL);
			
 
				-    if (r > 0) {
			
 
				-      to_next = to + r;
			
 
				-      res = ok;
			
 
				-    }
			
 
				-  } else {
			
 
				-    res = ok;
			
 
				+  State& lstate = reinterpret_cast<State&>(state);
			
 
				+  if (lstate.buffered != 0) {
			
 
				+    return this->DecodePartial(state, to_next, to_end);
			
 
				   }
			
 
				+  return std::codecvt_base::ok;
			
 
				 #else
			
 
				   static_cast<void>(state);
			
 
				   static_cast<void>(to_end);
			
 
				-  res = ok;
			
 
				+  return std::codecvt_base::ok;
			
 
				 #endif
			
 
				-  return res;
			
 
				 };
			
 
				 
			
 
				+#if defined(_WIN32)
			
 
				+// Convert one complete UTF-8 codepoint to the codepage in m_codepage.
				+//
				+// The codepoint is formed from the bytes already buffered in `state`
				+// followed by the byte at `*from_next`; `size` is its total length in
				+// bytes.
				+//
				+// Returns:
				+//   ok      - the byte at `*from_next` was consumed, `from_next` and
				+//             `to_next` were advanced, and the state was reset so the
				+//             next codepoint can start.
				+//   partial - [to_next, to_end) cannot hold the converted codepoint;
				+//             `from_next`, `to_next` and the state are unchanged.
				+//   error   - the bytes are not valid UTF-8 or the conversion failed;
				+//             `from_next`, `to_next` and the state are unchanged.
				+std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
				+                                          const char*& from_next,
				+                                          char*& to_next, char* to_end) const
				+{
				+  State& lstate = reinterpret_cast<State&>(state);
				+
				+  // Collect all the bytes for this codepoint.
				+  char buf[4];
				+  memcpy(buf, lstate.partial, lstate.buffered);
				+  buf[lstate.buffered] = *from_next;
				+
				+  // Convert the encoding.  Two wide characters are enough for a
				+  // surrogate pair.
				+  wchar_t wbuf[2];
				+  int wlen =
				+    MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
				+  if (wlen <= 0) {
				+    return std::codecvt_base::error;
				+  }
				+
				+  // WideCharToMultiByte interprets an output size of 0 as a query for the
				+  // required size: it returns a positive count without writing anything.
				+  // Treating that count as bytes written would push to_next past to_end,
				+  // so report a full output buffer ourselves.
				+  if (to_next >= to_end) {
				+    return std::codecvt_base::partial;
				+  }
				+
				+  int tlen =
				+    WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
				+                        static_cast<int>(to_end - to_next), NULL, NULL);
				+  if (tlen <= 0) {
				+    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
				+      return std::codecvt_base::partial;
				+    }
				+    return std::codecvt_base::error;
				+  }
				+
				+  // Move past the now-consumed byte in the input buffer.
				+  ++from_next;
				+
				+  // Move past the converted codepoint in the output buffer.
				+  to_next += tlen;
				+
				+  // Re-initialize the state for the next codepoint to start.
				+  lstate = State();
				+
				+  return std::codecvt_base::ok;
				+}
			
 
				+
			
 
				+// Convert whatever bytes of an unfinished codepoint are buffered in
				+// `state`.  do_unshift calls this when the buffered count is non-zero.
				+//
				+// Returns:
				+//   ok      - the converted bytes were written, `to_next` was advanced,
				+//             and the state was reset.
				+//   partial - [to_next, to_end) cannot hold the result; `to_next` and
				+//             the state are unchanged.
				+//   error   - the buffered bytes are not valid UTF-8 or the conversion
				+//             failed; `to_next` and the state are unchanged.
				+//
				+// NOTE(review): a truncated sequence is presumably rejected by
				+// MB_ERR_INVALID_CHARS, so `error` looks like the usual outcome here;
				+// confirm against the MultiByteToWideChar documentation.
				+std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
				+                                                 char*& to_next,
				+                                                 char* to_end) const
				+{
				+  State& lstate = reinterpret_cast<State&>(state);
				+
				+  // Try converting the partial codepoint.
				+  wchar_t wbuf[2];
				+  int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
				+                                 lstate.buffered, wbuf, 2);
				+  if (wlen <= 0) {
				+    return std::codecvt_base::error;
				+  }
				+
				+  // WideCharToMultiByte interprets an output size of 0 as a query for the
				+  // required size: it returns a positive count without writing anything.
				+  // Treating that count as bytes written would push to_next past to_end,
				+  // so report a full output buffer ourselves.
				+  if (to_next >= to_end) {
				+    return std::codecvt_base::partial;
				+  }
				+
				+  int tlen =
				+    WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
				+                        static_cast<int>(to_end - to_next), NULL, NULL);
				+  if (tlen <= 0) {
				+    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
				+      return std::codecvt_base::partial;
				+    }
				+    return std::codecvt_base::error;
				+  }
				+
				+  // Move past the converted codepoint in the output buffer.
				+  to_next += tlen;
				+
				+  // Re-initialize the state for the next codepoint to start.
				+  lstate = State();
				+
				+  return std::codecvt_base::ok;
				+}
			
 
				+
			
 
				+// Stash the byte at `*from_next` as part of a codepoint that is not yet
				+// complete.  The byte is appended to those already saved in `state`,
				+// `size` (the total length of the codepoint in bytes) is recorded for
				+// the continuation bytes that follow, and `from_next` is advanced.
				+void codecvt::BufferPartial(mbstate_t& state, int size,
				+                            const char*& from_next) const
				+{
				+  State& pending = reinterpret_cast<State&>(state);
				+
				+  // Append the byte after the ones buffered so far.
				+  pending.partial[pending.buffered] = *from_next;
				+  ++pending.buffered;
				+
				+  // Remember how long the whole codepoint is.
				+  pending.size = size;
				+
				+  // The input byte has now been consumed.
				+  ++from_next;
				+}
			
 
				+#endif
			
 
				+
			
 
				 int codecvt::do_max_length() const throw()
			
 
				 {
			
 
				   return 4;
			
@@ -172,44 +241,3 @@ int codecvt::do_encoding() const throw()
 
				 {
			
 
				   return 0;
			
 
				 };
			
 
				-
			
 
				-unsigned int codecvt::findStateId() const
			
 
				-{
			
 
				-  unsigned int stateId = 0;
			
 
				-  bool add = false;
			
 
				-  const unsigned int maxSize = std::numeric_limits<unsigned int>::max();
			
 
				-  if (m_lastState >= maxSize) {
			
 
				-    m_lastState = 0;
			
 
				-  }
			
 
				-  if (m_states.size() <= m_lastState) {
			
 
				-    add = true;
			
 
				-  } else {
			
 
				-    unsigned int i = m_lastState;
			
 
				-    while (i < maxSize) {
			
 
				-      codecvt::State& s = m_states.at(i);
			
 
				-      i++;
			
 
				-      if (!s.used) {
			
 
				-        m_lastState = i;
			
 
				-        stateId = m_lastState;
			
 
				-        s.used = true;
			
 
				-        s.totalBytes = 0;
			
 
				-        s.bytesLeft = 0;
			
 
				-        break;
			
 
				-      }
			
 
				-      if (i >= m_states.size()) {
			
 
				-        i = 0;
			
 
				-      }
			
 
				-      if (i == m_lastState) {
			
 
				-        add = true;
			
 
				-        break;
			
 
				-      }
			
 
				-    }
			
 
				-  };
			
 
				-  if (add) {
			
 
				-    codecvt::State s = { true, 0, 0, { 0, 0, 0, 0 } };
			
 
				-    m_states.push_back(s);
			
 
				-    m_lastState = (unsigned int)m_states.size();
			
 
				-    stateId = m_lastState;
			
 
				-  }
			
 
				-  return stateId;
			
 
				-};
			
--- a/Source/cm_codecvt.hxx
+++ b/Source/cm_codecvt.hxx
@@ -6,7 +6,6 @@
 
				 #include "cmConfigure.h"
			
 
				 
			
 
				 #include <locale>
			
 
				-#include <vector>
			
 
				 #include <wchar.h>
			
 
				 
			
 
				 class codecvt : public std::codecvt<char, char, mbstate_t>
			
@@ -35,21 +34,30 @@ protected:
 
				   int do_encoding() const throw() CM_OVERRIDE;
			
 
				 
			
 
				 private:
			
 
				-  typedef struct
			
 
				+  // The mbstate_t argument to do_out and do_unshift is responsible
			
 
				+  // for storing state between calls.  We cannot control the type
			
 
				+  // since we want to imbue on standard streams.  However, we do
			
 
				+  // know that it is a trivial type.  Define our own type to overlay
			
 
				+  // on it safely with no alignment requirements.
			
 
				+  struct State
			
 
				   {
			
 
				-    bool used;
			
 
				-    unsigned char totalBytes;
			
 
				-    unsigned char bytesLeft;
			
 
				-    char bytes[4];
			
 
				-  } State;
			
 
				+    // Buffer bytes we have consumed from a partial codepoint.
			
 
				+    char partial[3];
			
 
				 
			
 
				-  unsigned int findStateId() const;
			
 
				+    // Number of bytes we have buffered from a partial codepoint.
			
 
				+    unsigned char buffered : 4;
			
 
				+
			
 
				+    // Size of the current codepoint in bytes.
			
 
				+    unsigned char size : 4;
			
 
				+  };
			
 
				 
			
 
				   bool m_noconv;
			
 
				-  mutable std::vector<State> m_states;
			
 
				-  mutable unsigned int m_lastState;
			
 
				 #if defined(_WIN32)
			
 
				   unsigned int m_codepage;
			
 
				+  result Decode(mbstate_t& state, int need, const char*& from_next,
			
 
				+                char*& to_next, char* to_end) const;
			
 
				+  result DecodePartial(mbstate_t& state, char*& to_next, char* to_end) const;
			
 
				+  void BufferPartial(mbstate_t& state, int need, const char*& from_next) const;
			
 
				 #endif
			
 
				 
			
 
				 #endif
			
--- a/Source/cm_utf8.c
+++ b/Source/cm_utf8.c
@@ -15,7 +15,7 @@
 
				 */
			
 
				 
			
 
				 /* Number of leading ones before a zero in the byte.  */
			
 
				-static unsigned char const cm_utf8_ones[256] = {
			
 
				+unsigned char const cm_utf8_ones[256] = {
			
 
				   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			
 
				   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,