Procházet zdrojové kódy

file: Teach STRINGS to support UTF-16 and UTF-32 encodings

Justin Borodinsky před 10 roky
rodič
revize
1f77a7001b

+ 4 - 1
Help/command/file.rst

@@ -65,7 +65,10 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
  Consider only strings that match the given regular expression.
 
 ``ENCODING <encoding-type>``
- Consider strings of a given encoding.  "UTF-8" is currently supported.
+ Consider strings of a given encoding.  Currently supported encodings are:
+ UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE.  If the ``ENCODING``
+ option is not provided and the file begins with a Byte Order Mark, the
+ encoding indicated by the Byte Order Mark is used.
 
 For example, the code
 

+ 5 - 0
Help/release/dev/file-strings-utf-16.rst

@@ -0,0 +1,5 @@
+file-strings-utf-16
+-------------------
+
+* The :command:`file(STRINGS)` command now supports the UTF-16LE,
+  UTF-16BE, UTF-32LE, and UTF-32BE encodings via the ``ENCODING`` option.

+ 61 - 3
Source/cmFileCommand.cxx

@@ -472,7 +472,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
   bool have_regex = false;
   bool newline_consume = false;
   bool hex_conversion_enabled = true;
-  bool utf8_encoding = false;
+  enum { encoding_none = cmsys::FStream::BOM_None,
+         encoding_utf8 = cmsys::FStream::BOM_UTF8,
+         encoding_utf16le = cmsys::FStream::BOM_UTF16LE,
+         encoding_utf16be = cmsys::FStream::BOM_UTF16BE,
+         encoding_utf32le = cmsys::FStream::BOM_UTF32LE,
+         encoding_utf32be = cmsys::FStream::BOM_UTF32BE};
+  int encoding = encoding_none;
   int arg_mode = arg_none;
   for(unsigned int i=3; i < args.size(); ++i)
     {
@@ -599,7 +605,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       {
       if(args[i] == "UTF-8")
         {
-        utf8_encoding = true;
+        encoding = encoding_utf8;
+        }
+      else if(args[i] == "UTF-16LE")
+        {
+        encoding = encoding_utf16le;
+        }
+      else if(args[i] == "UTF-16BE")
+        {
+        encoding = encoding_utf16be;
+        }
+      else if(args[i] == "UTF-32LE")
+        {
+        encoding = encoding_utf32le;
+        }
+      else if(args[i] == "UTF-32BE")
+        {
+        encoding = encoding_utf32be;
         }
       else
         {
@@ -647,6 +669,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
     return false;
     }
 
+  //If BOM is found and encoding was not specified, use the BOM
+  int bom_found = cmsys::FStream::ReadBOM(fin);
+  if(encoding == encoding_none && bom_found != cmsys::FStream::BOM_None)
+    {
+    encoding = bom_found;
+    }
+
+  unsigned int bytes_rem = 0;
+  if(encoding == encoding_utf16le || encoding == encoding_utf16be)
+    {
+    bytes_rem = 1;
+    }
+  if(encoding == encoding_utf32le || encoding == encoding_utf32be)
+    {
+    bytes_rem = 3;
+    }
+
   // Parse strings out of the file.
   int output_size = 0;
   std::vector<std::string> strings;
@@ -658,6 +697,25 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
     std::string current_str;
 
     int c = fin.get();
+    for(unsigned int i=0; i<bytes_rem; ++i)
+      {
+      int c1 = fin.get();
+      if(!fin)
+        {
+        fin.putback(static_cast<char>(c1));
+        break;
+        }
+      c = (c << 8) | c1;
+      }
+    if(encoding == encoding_utf16le)
+      {
+      c = ((c & 0xFF) << 8) | ((c & 0xFF00) >> 8);
+      }
+    else if(encoding == encoding_utf32le)
+      {
+       c = (((c & 0xFF) << 24) | ((c & 0xFF00) << 8) |
+          ((c & 0xFF0000) >> 8) | ((c & 0xFF000000) >> 24));
+      }
 
     if(c == '\r')
       {
@@ -673,7 +731,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
       // c is guaranteed to fit in char by the above if...
       current_str += static_cast<char>(c);
       }
-    else if(utf8_encoding)
+    else if(encoding == encoding_utf8)
       {
       // Check for UTF-8 encoded string (up to 4 octets)
       static const unsigned char utf8_check_table[3][2] =

+ 5 - 0
Tests/RunCMake/string/RunCMakeTest.cmake

@@ -12,3 +12,8 @@ run_cmake(UuidMissingTypeValue)
 run_cmake(UuidBadType)
 
 run_cmake(RegexClear)
+
+run_cmake(UTF-16BE)
+run_cmake(UTF-16LE)
+run_cmake(UTF-32BE)
+run_cmake(UTF-32LE)

+ 2 - 0
Tests/RunCMake/string/UTF-16BE-stderr.txt

@@ -0,0 +1,2 @@
+Hello World
+Hello World

+ 4 - 0
Tests/RunCMake/string/UTF-16BE.cmake

@@ -0,0 +1,4 @@
+file(STRINGS UTF-16BE.txt str ENCODING UTF-16BE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-16BE.txt str LENGTH_MINIMUM 4)
+message("${str}")

binární
Tests/RunCMake/string/UTF-16BE.txt


+ 2 - 0
Tests/RunCMake/string/UTF-16LE-stderr.txt

@@ -0,0 +1,2 @@
+Hello World
+Hello World

+ 4 - 0
Tests/RunCMake/string/UTF-16LE.cmake

@@ -0,0 +1,4 @@
+file(STRINGS UTF-16LE.txt str ENCODING UTF-16LE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-16LE.txt str LENGTH_MINIMUM 4)
+message("${str}")

binární
Tests/RunCMake/string/UTF-16LE.txt


+ 2 - 0
Tests/RunCMake/string/UTF-32BE-stderr.txt

@@ -0,0 +1,2 @@
+Hello World
+Hello World

+ 4 - 0
Tests/RunCMake/string/UTF-32BE.cmake

@@ -0,0 +1,4 @@
+file(STRINGS UTF-32BE.txt str ENCODING UTF-32BE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-32BE.txt str LENGTH_MINIMUM 4)
+message("${str}")

binární
Tests/RunCMake/string/UTF-32BE.txt


+ 2 - 0
Tests/RunCMake/string/UTF-32LE-stderr.txt

@@ -0,0 +1,2 @@
+Hello World
+Hello World

+ 4 - 0
Tests/RunCMake/string/UTF-32LE.cmake

@@ -0,0 +1,4 @@
+file(STRINGS UTF-32LE.txt str ENCODING UTF-32LE LENGTH_MINIMUM 4)
+message("${str}")
+file(STRINGS UTF-32LE.txt str LENGTH_MINIMUM 4)
+message("${str}")

binární
Tests/RunCMake/string/UTF-32LE.txt