hai 11 anos · 5b30ec28f9
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
 
				 ``REGEX <regex>``
			
 
				  Consider only strings that match the given regular expression.
			
 
				 
			
 
				+``ENCODING <encoding-type>``
			
 
				+ Consider strings of a given encoding.  Currently only ``UTF-8`` is supported.
			
 
				+
			
 
				 For example, the code
			
 
				 
			
 
				 .. code-block:: cmake
			
--- a/Help/release/dev/file-strings-encoding.rst
+++ b/Help/release/dev/file-strings-encoding.rst
@@ -0,0 +1,5 @@
 
				+file-strings-encoding
			
 
				+---------------------
			
 
				+
			
 
				+* The :command:`file(STRINGS)` command gained a new ``ENCODING``
			
 
				+  option to enable extraction of ``UTF-8`` strings.
			
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
 
				          arg_length_minimum,
			
 
				          arg_length_maximum,
			
 
				          arg__maximum,
			
 
				-         arg_regex };
			
 
				+         arg_regex,
			
 
				+         arg_encoding };
			
 
				   unsigned int minlen = 0;
			
 
				   unsigned int maxlen = 0;
			
 
				   int limit_input = -1;
			
@@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
 
				   bool have_regex = false;
			
 
				   bool newline_consume = false;
			
 
				   bool hex_conversion_enabled = true;
			
 
				+  bool utf8_encoding = false;
			
 
				   int arg_mode = arg_none;
			
 
				   for(unsigned int i=3; i < args.size(); ++i)
			
 
				     {
			
@@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
 
				       hex_conversion_enabled = false;
			
 
				       arg_mode = arg_none;
			
 
				       }
			
 
				+    else if(args[i] == "ENCODING")
			
 
				+      {
			
 
				+      arg_mode = arg_encoding;
			
 
				+      }
			
 
				     else if(arg_mode == arg_limit_input)
			
 
				       {
			
 
				       if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
			
@@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
 
				       have_regex = true;
			
 
				       arg_mode = arg_none;
			
 
				       }
			
 
				+    else if(arg_mode == arg_encoding)
			
 
				+      {
			
 
				+      if(args[i] == "UTF-8")
			
 
				+        {
			
 
				+        utf8_encoding = true;
			
 
				+        }
			
 
				+      else
			
 
				+        {
			
 
				+        cmOStringStream e;
			
 
				+        e << "STRINGS option ENCODING \""
			
 
				+          << args[i] << "\" not recognized.";
			
 
				+        this->SetError(e.str());
			
 
				+        return false;
			
 
				+        }
			
 
				+      arg_mode = arg_none;
			
 
				+      }
			
 
				     else
			
 
				       {
			
 
				       cmOStringStream e;
			
@@ -618,6 +640,52 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
 
				       // c is guaranteed to fit in char by the above if...
			
 
				       current_str += static_cast<char>(c);
			
 
				       }
			
 
				+    else if(utf8_encoding)
			
 
				+      {
			
 
				+      // Check for UTF-8 encoded string (up to 4 octets)
			
 
				+      static const unsigned char utf8_check_table[3][2] =
			
 
				+        {
			
 
				+          {0xE0, 0xC0},
			
 
				+          {0xF0, 0xE0},
			
 
				+          {0xF8, 0xF0},
			
 
				+        };
			
 
				+
			
 
				+      // how many octets are there?
			
 
				+      unsigned int num_utf8_bytes = 0;
			
 
				+      for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
			
 
				+        {
			
 
				+        if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
			
 
				+          num_utf8_bytes = j+2;
			
 
				+        }
			
 
				+
			
 
				+      // get subsequent octets and check that they are valid
			
 
				+      for(unsigned int j=0; j<num_utf8_bytes; j++)
			
 
				+        {
			
 
				+        if(j != 0)
			
 
				+          {
			
 
				+          c = fin.get();
			
 
				+          if(!fin || (c & 0xC0) != 0x80)
			
 
				+            {
			
 
				+            fin.putback(static_cast<char>(c));
			
 
				+            break;
			
 
				+            }
			
 
				+          }
			
 
				+        current_str += static_cast<char>(c);
			
 
				+        }
			
 
				+
			
 
				+      // if this was an invalid utf8 sequence, discard the data, and put
			
 
				+      // back subsequent characters
			
 
				+      if((current_str.length() != num_utf8_bytes))
			
 
				+        {
			
 
				+        for(unsigned int j=0; j<current_str.size()-1; j++)
			
 
				+          {
			
 
				+          c = current_str[current_str.size() - 1 - j];
			
 
				+          fin.putback(static_cast<char>(c));
			
 
				+          }
			
 
				+        current_str = "";
			
 
				+        }
			
 
				+      }
			
 
				+
			
 
				 
			
 
				     if(c == '\n' && !newline_consume)
			
 
				       {
			
--- a/Tests/StringFileTest/CMakeLists.txt
+++ b/Tests/StringFileTest/CMakeLists.txt
@@ -55,6 +55,16 @@ else()
 
				     "file(STRINGS) incorrectly read from srec file [${infile_strings}]")
			
 
				 endif()
			
 
				 
			
 
				+# test.utf8 has UTF-8 content; exactly 3 strings must be extracted from it
			
 
				+file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
			
 
				+list(LENGTH infile_strings content_len)
			
 
				+if(content_len EQUAL 3)
			
 
				+  message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
			
 
				+else()
			
 
				+  message(SEND_ERROR
			
 
				+    "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
			
 
				+endif()
			
 
				+
			
 
				 # String test
			
 
				 string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
			
 
				 string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
			
--- a/Tests/StringFileTest/test.utf8
+++ b/Tests/StringFileTest/test.utf8
@@ -0,0 +1,3 @@
 
				+The value of π (pi) is 3.141593
			
 
				+Line mixed with binary partially matches valid utf8: π is à93.1593
			
 
				+à