2
0

TextOperations.cpp 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. /*
  2. * TextOperations.cpp, part of VCMI engine
  3. *
  4. * Authors: listed in file AUTHORS in main folder
  5. *
  6. * License: GNU General Public License v2.0 or later
  7. * Full text of license available in license.txt file, in main folder
  8. *
  9. */
  10. #include "StdInc.h"
  11. #include "TextOperations.h"
  12. #include "CGeneralTextHandler.h"
  13. #include <boost/locale.hpp>
  14. VCMI_LIB_NAMESPACE_BEGIN
  15. size_t TextOperations::getUnicodeCharacterSize(char firstByte)
  16. {
  17. // length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
  18. // 0xxxxxxx -> 1 - ASCII chars
  19. // 110xxxxx -> 2
  20. // 1110xxxx -> 3
  21. // 11110xxx -> 4 - last allowed in current standard
  22. auto value = static_cast<uint8_t>(firstByte);
  23. if ((value & 0b10000000) == 0)
  24. return 1; // ASCII
  25. if ((value & 0b11100000) == 0b11000000)
  26. return 2;
  27. if ((value & 0b11110000) == 0b11100000)
  28. return 3;
  29. if ((value & 0b11111000) == 0b11110000)
  30. return 4;
  31. assert(0);// invalid unicode sequence
  32. return 4;
  33. }
  34. bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)
  35. {
  36. assert(maxSize > 0);
  37. auto value = static_cast<uint8_t>(character[0]);
  38. // ASCII
  39. if ( value < 0b10000000)
  40. return maxSize > 0;
  41. // can't be first byte in UTF8
  42. if (value < 0b11000000)
  43. return false;
  44. // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
  45. if (value > 0b11110000)
  46. return false;
  47. // first character must follow rules checked in getUnicodeCharacterSize
  48. size_t size = getUnicodeCharacterSize(character[0]);
  49. if (size > maxSize)
  50. return false;
  51. // remaining characters must have highest bit set to 1
  52. for (size_t i = 1; i < size; i++)
  53. {
  54. auto characterValue = static_cast<uint8_t>(character[i]);
  55. if (characterValue < 0b10000000)
  56. return false;
  57. }
  58. return true;
  59. }
  60. bool TextOperations::isValidASCII(const std::string & text)
  61. {
  62. for (const char & ch : text)
  63. if (static_cast<uint8_t>(ch) >= 0x80 )
  64. return false;
  65. return true;
  66. }
  67. bool TextOperations::isValidASCII(const char * data, size_t size)
  68. {
  69. for (size_t i=0; i<size; i++)
  70. if (static_cast<uint8_t>(data[i]) >= 0x80 )
  71. return false;
  72. return true;
  73. }
  74. bool TextOperations::isValidUnicodeString(const std::string & text)
  75. {
  76. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  77. {
  78. if (!isValidUnicodeCharacter(text.data() + i, text.size() - i))
  79. return false;
  80. }
  81. return true;
  82. }
  83. bool TextOperations::isValidUnicodeString(const char * data, size_t size)
  84. {
  85. for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i]))
  86. {
  87. if (!isValidUnicodeCharacter(data + i, size - i))
  88. return false;
  89. }
  90. return true;
  91. }
  92. std::string TextOperations::toUnicode(const std::string &text)
  93. {
  94. return toUnicode(text, CGeneralTextHandler::getInstalledEncoding());
  95. }
  96. std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)
  97. {
  98. return boost::locale::conv::to_utf<char>(text, encoding);
  99. }
  100. std::string TextOperations::fromUnicode(const std::string & text)
  101. {
  102. return fromUnicode(text, CGeneralTextHandler::getInstalledEncoding());
  103. }
  104. std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)
  105. {
  106. return boost::locale::conv::from_utf<char>(text, encoding);
  107. }
  108. void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
  109. {
  110. if(text.empty())
  111. return;
  112. //todo: more efficient algorithm
  113. for(int i = 0; i< amount; i++){
  114. auto b = text.begin();
  115. auto e = text.end();
  116. size_t lastLen = 0;
  117. size_t len = 0;
  118. while (b != e) {
  119. lastLen = len;
  120. size_t n = getUnicodeCharacterSize(*b);
  121. if(!isValidUnicodeCharacter(&(*b),e-b))
  122. {
  123. logGlobal->error("Invalid UTF8 sequence");
  124. break;//invalid sequence will be trimmed
  125. }
  126. len += n;
  127. b += n;
  128. }
  129. text.resize(lastLen);
  130. }
  131. }
  132. std::string TextOperations::escapeString(std::string input)
  133. {
  134. boost::replace_all(input, "\\", "\\\\");
  135. boost::replace_all(input, "\n", "\\n");
  136. boost::replace_all(input, "\r", "\\r");
  137. boost::replace_all(input, "\t", "\\t");
  138. boost::replace_all(input, "\"", "\\\"");
  139. return input;
  140. }
  141. VCMI_LIB_NAMESPACE_END