瀏覽代碼

StdIo: Add a Windows Console adaptor for cin, cout, and cerr

On Windows, the only reliable way to read/write Unicode text from/to a
Console is to use `ReadConsoleW` and `WriteConsoleW` and convert from/to
wide-character encoding.  When `cin`, `cout`, and/or `cerr` are attached
to a Windows Console, use a custom C++ `streambuf` to handle the I/O.

This will replace KWSys ConsoleBuf, whose implementation is more complex
to support narrow output streams on Windows non-UTF-8 narrow encodings.
We only need to support UTF-8.

Issue: #26924
Brad King 5 月之前
父節點
當前提交
f9f1f9a8cd
共有 5 個文件被更改,包括 463 次插入1 次删除
  1. 2 0
      Source/CMakeLists.txt
  2. 361 0
      Source/cmStdIoConsole.cxx
  3. 47 0
      Source/cmStdIoConsole.h
  4. 48 1
      Tests/CMakeLib/testStdIo.cxx
  5. 5 0
      bootstrap

+ 2 - 0
Source/CMakeLists.txt

@@ -468,6 +468,8 @@ add_library(
   cmStateSnapshot.cxx
   cmStateSnapshot.h
   cmStateTypes.h
+  cmStdIoConsole.h
+  cmStdIoConsole.cxx
   cmStdIoInit.h
   cmStdIoInit.cxx
   cmStdIoStream.h

+ 361 - 0
Source/cmStdIoConsole.cxx

@@ -0,0 +1,361 @@
+/* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
+   file LICENSE.rst or https://cmake.org/licensing for details.  */
+#include "cmStdIoConsole.h"
+
+#ifdef _WIN32
+#  include <cstddef>
+#  include <cstdlib>
+#  include <ios>
+#  include <streambuf>
+#  include <utility>
+#  include <vector>
+
+#  include <cm/memory>
+
+#  include <windows.h>
+
+#  include <fcntl.h> // for _O_BINARY
+#  include <io.h>    // for _setmode
+
+#  include "cm_utf8.h"
+
+#  include "cmStdIoStream.h"
+#endif
+
+namespace cm {
+namespace StdIo {
+
+namespace {
+
+#ifdef _WIN32
+// Base class for a streambuf that reads or writes a Windows Console.
+// It only stores the console handle; derived classes implement the I/O.
+class ConsoleBuf : public std::streambuf
+{
+public:
+  // `explicit` so that a raw HANDLE never converts to a streambuf implicitly.
+  explicit ConsoleBuf(HANDLE console)
+    : console_(console)
+  {
+  }
+
+  ~ConsoleBuf() throw() override {}
+
+protected:
+  // Console handle used by derived classes.  This class never closes it.
+  HANDLE console_ = nullptr;
+};
+
+// A streambuf that reads from a Windows Console using wide-character
+// encoding to avoid conversion through the console input code page.
+class ConsoleBufRead : public ConsoleBuf
+{
+public:
+  // `consoleMode` is the console mode captured by the caller.  It decides
+  // whether the cooked-mode Ctrl+Z handling in underflow() applies.
+  ConsoleBufRead(HANDLE console, DWORD consoleMode)
+    : ConsoleBuf(console)
+    , ConsoleMode_(consoleMode)
+  {
+  }
+
+  ~ConsoleBufRead() throw() override {}
+
+protected:
+  // Refill hook: called when a character is requested and the get area
+  // may be exhausted.  Returns the next character without consuming it,
+  // or eof if nothing could be read or converted.
+  int_type underflow() override
+  {
+    // Unread bytes remain from a previous refill: hand out the next one.
+    if (this->gptr() < this->egptr()) {
+      return traits_type::to_int_type(*this->gptr());
+    }
+
+    // The get area is exhausted.  Fetch wide characters from the console.
+    static constexpr std::size_t kWideCapacity = 4096;
+    this->TmpW_.resize(kWideCapacity);
+    wchar_t* const wide = this->TmpW_.data();
+    DWORD wideCount = 0;
+    if (!ReadConsoleW(this->console_, wide, DWORD(this->TmpW_.size()),
+                      &wideCount, nullptr)) {
+      return traits_type::eof();
+    }
+
+    // ReadFile on a console in "cooked mode" reports EOF for input that
+    // begins with Ctrl+Z.  Mimic that by discarding such a read.
+    static constexpr char kCtrlZ = 26; // Ctrl+Z is Ctrl + 26th letter.
+    bool const cooked = (this->ConsoleMode_ & ENABLE_LINE_INPUT) != 0;
+    if (cooked && wideCount > 0 && wide[0] == kCtrlZ) {
+      wideCount = 0;
+    }
+
+    // First pass: measure the UTF-8 size.  Zero means there is nothing to
+    // convert or the conversion failed; either way report eof.
+    int const narrowCount = WideCharToMultiByte(
+      CP_UTF8, 0, wide, int(wideCount), nullptr, 0, nullptr, nullptr);
+    if (narrowCount == 0) {
+      return traits_type::eof();
+    }
+
+    // Second pass: convert into the narrow buffer.
+    this->Buf_.resize(narrowCount);
+    if (WideCharToMultiByte(CP_UTF8, 0, wide, int(wideCount),
+                            this->Buf_.data(), int(narrowCount), nullptr,
+                            nullptr) == 0) {
+      return traits_type::eof();
+    }
+
+    // Expose the converted bytes as the new get area and return the first.
+    char* const first = this->Buf_.data();
+    this->setg(first, first, first + this->Buf_.size());
+    return traits_type::to_int_type(*this->gptr());
+  }
+
+private:
+  DWORD ConsoleMode_ = 0;     // Console mode given at construction.
+  std::vector<char> Buf_;     // UTF-8 bytes backing the get area.
+  std::vector<wchar_t> TmpW_; // Scratch space for ReadConsoleW.
+};
+
+// A streambuf that writes to a Windows Console using wide-character
+// encoding to avoid conversion through the console output code page.
+// Output is buffered as UTF-8 and converted to wide characters on flush.
+class ConsoleBufWrite : public ConsoleBuf
+{
+public:
+  explicit ConsoleBufWrite(HANDLE console)
+    : ConsoleBuf(console)
+  {
+    this->setp_();
+  }
+
+  // Flush whatever is still buffered before the buffer is destroyed.
+  ~ConsoleBufWrite() throw() override { sync(); }
+
+protected:
+  // Called to sync input and output buffers with the underlying device.
+  // Returns 0 on success and -1 if buffered output could not be written.
+  int sync() override
+  {
+    // Flush buffered output, if any.
+    if (this->pptr() != this->pbase()) {
+      // Use overflow() to flush the entire output buffer.
+      // It returns eof on failure.
+      if (traits_type::eq_int_type(this->overflow(), traits_type::eof())) {
+        return -1;
+      }
+    }
+    return 0;
+  }
+
+  // Called to flush at least some content from the output buffer.
+  //
+  // With `ch == eof` the whole buffer is flushed.  Otherwise `ch` is
+  // appended first and only complete UTF-8 codepoints are flushed; the
+  // bytes of a trailing partial codepoint stay buffered for the next call.
+  // Returns eof on failure and a non-eof value on success.
+  int_type overflow(int_type ch = traits_type::eof()) override
+  {
+    std::size_t nlen;     // Number of chars to emit.
+    std::size_t rlen = 0; // Number of chars to roll over.
+    if (traits_type::eq_int_type(ch, traits_type::eof())) {
+      // Our caller wants to flush the entire buffer.  If there is a
+      // trailing partial codepoint, it's the caller's fault.
+      nlen = this->pptr() - this->pbase();
+
+      // If the buffer is empty, trivially succeed.
+      if (nlen == 0) {
+        return traits_type::not_eof(ch);
+      }
+    } else {
+      // Our caller had no room for this character in the buffer.
+      // However, setp_() reserved one byte for us to store it.
+      *this->pptr() = traits_type::to_char_type(ch);
+      this->pbump(1);
+
+      // Flush all complete codepoints, of which we expect at least one.
+      // If there is a trailing partial codepoint, roll over those chars.
+      char const* p = this->pptr_();
+      nlen = p - this->pbase();
+      rlen = this->pptr() - p;
+    }
+
+    // Fail unless we emit at least one (wide) character.
+    int_type result = traits_type::eof();
+
+    // Convert our internal UTF-8 narrow encoding to wide-character
+    // encoding to write to the console.
+    if (int wlen = MultiByteToWideChar(CP_UTF8, 0, this->pbase(), int(nlen),
+                                       nullptr, 0)) {
+      this->TmpW_.resize(wlen);
+      if (MultiByteToWideChar(CP_UTF8, 0, this->pbase(), int(nlen),
+                              this->TmpW_.data(), int(wlen)) &&
+          WriteConsoleW(this->console_, this->TmpW_.data(), wlen, nullptr,
+                        nullptr)) {
+        result = traits_type::not_eof(ch);
+      }
+    }
+
+    // Remove emitted contents from the buffer.  This happens even when
+    // the write failed, so the rollover content moves to the front.
+    this->Buf_.erase(this->Buf_.begin(), this->Buf_.begin() + nlen);
+
+    // Re-initialize the output buffer.
+    this->setp_();
+
+    // If no complete codepoint was found in a full buffer, the rollover
+    // is the entire buffer including the reserved byte.  Keeping it would
+    // push the put-pointer beyond epptr(), and the next overflow(ch) would
+    // then store its character past the end of Buf_.  Such content can
+    // never be emitted, so discard it; this call already reports failure.
+    std::size_t const room =
+      static_cast<std::size_t>(this->epptr() - this->pbase());
+    if (rlen > room) {
+      rlen = 0;
+      result = traits_type::eof();
+    }
+
+    // Move the put-pointer past the rollover content.
+    this->pbump(static_cast<int>(rlen));
+
+    return result;
+  }
+
+private:
+  std::vector<char> Buf_;     // UTF-8 bytes backing the put area.
+  std::vector<wchar_t> TmpW_; // Scratch space for the wide conversion.
+
+  // Initialize the output buffer and set its put-pointer.
+  void setp_()
+  {
+    // Allocate the output buffer.
+    static constexpr std::size_t kBufSize = 4096;
+    this->Buf_.resize(kBufSize);
+
+    // Reserve one byte for the overflow() character.
+    this->setp(this->Buf_.data(), this->Buf_.data() + this->Buf_.size() - 1);
+  }
+
+  // Return pptr() adjusted backward past a partial codepoint.
+  // The scan walks back from the end until it finds a byte that starts
+  // a codepoint, then checks whether that codepoint is complete.
+  char const* pptr_() const
+  {
+    char const* p = this->pptr();
+    while (p != this->pbase()) {
+      --p;
+      switch (cm_utf8_ones[static_cast<unsigned char>(*p)]) {
+        case 0: // 0xxx xxxx: starts codepoint of size 1
+          return p + 1;
+        case 1: // 10xx xxxx: continues a codepoint
+          continue;
+        case 2: // 110x xxxx: starts codepoint of size 2
+          return ((p + 2) <= this->pptr()) ? (p + 2) : p;
+        case 3: // 1110 xxxx: starts codepoint of size 3
+          return ((p + 3) <= this->pptr()) ? (p + 3) : p;
+        case 4: // 1111 0xxx: starts codepoint of size 4
+          return ((p + 4) <= this->pptr()) ? (p + 4) : p;
+        default: // invalid byte
+          // Roll over the invalid byte.
+          // The next overflow() will fail to convert it.
+          return p;
+      }
+    }
+    // No complete codepoint found.  This overflow() will fail.
+    return p;
+  }
+};
+
+#endif
+
+} // anonymous namespace
+
+#ifdef _WIN32
+// Windows implementation behind Console.  It holds one adaptor per
+// standard stream; each adaptor is set up on construction and undone on
+// destruction.
+class Console::Impl
+{
+protected:
+  // Adapts a single stream for the lifetime of the object.  Either a
+  // console streambuf is installed on the stream, or the stream's file
+  // descriptor is switched to binary mode; the destructor reverses it.
+  class RAII
+  {
+    std::ios* IOS_ = nullptr;                // Stream being adapted.
+    int FD_ = -1;                            // Its descriptor, or negative.
+    std::unique_ptr<ConsoleBuf> ConsoleBuf_; // Installed streambuf, if any.
+    std::streambuf* OldStreamBuf_ = nullptr; // Streambuf to restore.
+    int OldMode_ = 0;                        // _setmode() mode to restore.
+
+    explicit RAII(Stream& s);
+    void Init();
+
+  public:
+    explicit RAII(IStream& is);
+    explicit RAII(OStream& os);
+    ~RAII();
+
+    // The destructor restores global stream state, so an instance must
+    // never be duplicated.
+    RAII(RAII const&) = delete;
+    RAII& operator=(RAII const&) = delete;
+  };
+  // Constructed in this order and destroyed in reverse.
+  RAII In_;
+  RAII Out_;
+  RAII Err_;
+
+public:
+  Impl();
+  ~Impl();
+};
+
+Console::Impl::RAII::RAII(Stream& s)
+  : IOS_(&s.IOS()) // Stream whose streambuf Init() may replace.
+  , FD_(s.FD())    // Descriptor for _setmode(); used only when >= 0.
+{
+}
+
+Console::Impl::RAII::RAII(IStream& is)
+  : RAII(static_cast<Stream&>(is))
+{
+  // A wide-character reader is needed only when the stream is attached to
+  // a console (GetConsoleMode succeeds) whose input code page is not
+  // UTF-8.  Otherwise Init() falls back to binary mode.
+  DWORD consoleMode;
+  bool const needsWideRead = is.Console() &&
+    GetConsoleMode(is.Console(), &consoleMode) && GetConsoleCP() != CP_UTF8;
+  if (needsWideRead) {
+    this->ConsoleBuf_ =
+      cm::make_unique<ConsoleBufRead>(is.Console(), consoleMode);
+  }
+  this->Init();
+}
+
+Console::Impl::RAII::RAII(OStream& os)
+  : RAII(static_cast<Stream&>(os))
+{
+  // A wide-character writer is needed only when the stream is attached to
+  // a console (GetConsoleMode succeeds) whose output code page is not
+  // UTF-8.  Otherwise Init() falls back to binary mode.
+  DWORD consoleMode;
+  bool const needsWideWrite = os.Console() &&
+    GetConsoleMode(os.Console(), &consoleMode) &&
+    GetConsoleOutputCP() != CP_UTF8;
+  if (needsWideWrite) {
+    this->ConsoleBuf_ = cm::make_unique<ConsoleBufWrite>(os.Console());
+  }
+  this->Init();
+}
+
+void Console::Impl::RAII::Init()
+{
+  // A console streambuf was chosen by the constructor: install it on the
+  // stream and remember the one it replaces.
+  if (ConsoleBuf* const replacement = this->ConsoleBuf_.get()) {
+    this->OldStreamBuf_ = this->IOS_->rdbuf(replacement);
+    return;
+  }
+
+  // Without a usable file descriptor there is nothing more to adjust.
+  if (this->FD_ < 0) {
+    return;
+  }
+
+  // The stream reads/writes a pipe, a file, or a console whose code
+  // page is UTF-8.  Keep the default streambuf for UTF-8 I/O, but turn
+  // off newline conversion to match ConsoleBuf behavior.
+  this->OldMode_ = _setmode(this->FD_, _O_BINARY);
+}
+
+// Undo what Init() did: put back the original streambuf, or restore the
+// file descriptor's previous translation mode.
+Console::Impl::RAII::~RAII()
+{
+  if (this->ConsoleBuf_) {
+    // Reinstall the original streambuf before destroying ours.
+    this->IOS_->rdbuf(this->OldStreamBuf_);
+    this->OldStreamBuf_ = nullptr;
+    this->ConsoleBuf_.reset();
+  } else if (this->FD_ >= 0) {
+    // Push out pending data while the descriptor is still in binary mode.
+    if (std::streambuf* sb = this->IOS_->rdbuf()) {
+      sb->pubsync();
+    }
+    // _setmode() returns -1 on failure.  If Init() could not change the
+    // mode there is nothing to restore, and passing -1 back would be an
+    // invalid argument to the CRT.
+    if (this->OldMode_ != -1) {
+      _setmode(this->FD_, this->OldMode_);
+    }
+    this->OldMode_ = 0;
+  }
+  this->FD_ = -1;
+  this->IOS_ = nullptr;
+}
+
+Console::Impl::Impl()
+  : In_(In())   // Adapt the stream returned by In().
+  , Out_(Out()) // Adapt the stream returned by Out().
+  , Err_(Err()) // Adapt the stream returned by Err().
+{
+}
+
+Console::Impl::~Impl() = default;
+
+Console::Console()
+  : Impl_(cm::make_unique<Impl>()) // Constructing Impl adapts the streams.
+{
+}
+#else
+Console::Console() = default;
+#endif
+
+Console::~Console() = default;
+
+Console::Console(Console&&) noexcept = default;
+Console& Console::operator=(Console&&) noexcept = default;
+
+}
+}

+ 47 - 0
Source/cmStdIoConsole.h

@@ -0,0 +1,47 @@
+/* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
+   file LICENSE.rst or https://cmake.org/licensing for details.  */
+#pragma once
+
+#include "cmConfigure.h" // IWYU pragma: keep
+
+#ifdef _WIN32
+#  include <memory>
+#endif
+
+namespace cm {
+namespace StdIo {
+
+/**
+ * On Windows, enables I/O with `cin`, `cout`, and `cerr` in UTF-8 encoding.
+ * On non-Windows platforms, does nothing.
+ *
+ * Construct an instance of this at the beginning of `main`:
+ *
+ * * If `cin`, `cout`, or `cerr` is attached to a Windows Console whose
+ *   input/output code page is not UTF-8, this replaces its `streambuf`
+ *   with one that reads/writes from/to the console using wide-character
+ *   Windows APIs to avoid limitations of the code page's narrow encoding.
+ *
+ * * If `cin`, `cout`, or `cerr` is not attached to a Windows Console,
+ *   this sets its stream to binary mode for consistency with the case
+ *   that it's attached to a console.
+ *
+ * Destroy the instance of this to restore the original `streambuf`s.
+ */
+class Console
+{
+#ifdef _WIN32
+  class Impl; // Defined in cmStdIoConsole.cxx; exists only on Windows.
+  std::unique_ptr<Impl> Impl_;
+#endif
+public:
+  Console();
+  ~Console(); // NOLINT(performance-trivially-destructible)
+  Console(Console&&) noexcept;
+  Console(Console const&) = delete; // Movable but not copyable.
+  Console& operator=(Console&&) noexcept;
+  Console& operator=(Console const&) = delete;
+};
+
+}
+}

+ 48 - 1
Tests/CMakeLib/testStdIo.cxx

@@ -1,9 +1,12 @@
 /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
    file LICENSE.rst or https://cmake.org/licensing for details.  */
 
+#include <string>
+
 #include <cm/string_view>
 #include <cmext/string_view>
 
+#include "cmStdIoConsole.h"
 #include "cmStdIoInit.h"
 #include "cmStdIoStream.h"
 
@@ -11,6 +14,18 @@
 
 namespace {
 
+#ifdef _WIN32
+cm::string_view const kUTF8 =
+  "  Chinese Hindi  Greek English Russian\n  "
+  "\xe6\xb3\xa8\xe6\x84\x8f    "                             // Chinese
+  "\xe0\xa4\xaf\xe0\xa5\x82\xe0\xa4\xa8\xe0"                 // ...
+  "\xa4\xbf\xe0\xa4\x95\xe0\xa5\x8b\xe0\xa4\xa1 "            // Hindi
+  "\xce\xb5\xce\xaf\xce\xbd\xce\xb1\xce\xb9 "                // Greek
+  "very    "                                                 // English
+  "\xd0\xb7\xd0\xb4\xd0\xbe\xd1\x80\xd0\xbe\xd0\xb2\xd0\xbe" // Russian
+  "!"_s;
+#endif
+
 void printTermKind(cm::string_view t, cm::StdIo::Stream& s)
 {
   switch (s.Kind()) {
@@ -37,12 +52,44 @@ bool testStream()
   return true;
 }
 
+bool testConsoleStdIn = false;
+
+// Print a banner (plus a multilingual UTF-8 sample on Windows) and, when
+// stdin testing was requested, echo one line read from std::cin.
+// Always returns true.
+bool testConsole()
+{
+  std::cout << "testConsole()\n";
+#ifdef _WIN32
+  std::cout << kUTF8 << '\n';
+#endif
+
+  // The interactive part runs only when enabled by the caller.
+  if (!testConsoleStdIn) {
+    return true;
+  }
+
+  // Flush the prompt so it is visible before blocking on input.
+  std::cout << "  input: " << std::flush;
+  std::string line;
+  if (std::getline(std::cin, line)) {
+    std::cout << " output: " << line << '\n';
+  }
+  return true;
+}
+
+cm::string_view const kUsage = "usage: CMakeLibTests testStdIo [--stdin]"_s;
+
 }
 
-int testStdIo(int /*unused*/, char* /*unused*/[])
+int testStdIo(int argc, char* argv[])
 {
   cm::StdIo::Init();
+  cm::StdIo::Console console;
+
+  for (int i = 1; i < argc; ++i) {
+    if (argv[i] == "--stdin"_s && !testConsoleStdIn) {
+      testConsoleStdIn = true;
+    } else {
+      std::cerr << kUsage << '\n';
+      return 1;
+    }
+  }
+
   return runTests({
     testStream,
+    testConsole,
   });
 }

+ 5 - 0
bootstrap

@@ -489,6 +489,7 @@ CMAKE_CXX_SOURCES="\
   cmState \
   cmStateDirectory \
   cmStateSnapshot \
+  cmStdIoConsole \
   cmStdIoInit \
   cmStdIoStream \
   cmString \
@@ -544,6 +545,10 @@ if ${cmake_system_mingw}; then
   "
 fi
 
+CMAKE_C_SOURCES="\
+  cm_utf8 \
+"
+
 CMAKE_STD_CXX_HEADERS="\
   filesystem \
   memory \