TextOperations.cpp 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. /*
  2. * TextOperations.cpp, part of VCMI engine
  3. *
  4. * Authors: listed in file AUTHORS in main folder
  5. *
  6. * License: GNU General Public License v2.0 or later
  7. * Full text of license available in license.txt file, in main folder
  8. *
  9. */
  10. #include "StdInc.h"
  11. #include "TextOperations.h"
  12. #include "CGeneralTextHandler.h"
  13. #include <boost/locale.hpp>
  14. VCMI_LIB_NAMESPACE_BEGIN
  15. size_t Unicode::getCharacterSize(char firstByte)
  16. {
  17. // length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
  18. // 0xxxxxxx -> 1 - ASCII chars
  19. // 110xxxxx -> 2
  20. // 11110xxx -> 4 - last allowed in current standard
  21. // 1111110x -> 6 - last allowed in original standard
  22. if ((ui8)firstByte < 0x80)
  23. return 1; // ASCII
  24. size_t ret = 0;
  25. for (size_t i=0; i<8; i++)
  26. {
  27. if (((ui8)firstByte & (0x80 >> i)) != 0)
  28. ret++;
  29. else
  30. break;
  31. }
  32. return ret;
  33. }
  34. bool Unicode::isValidCharacter(const char * character, size_t maxSize)
  35. {
  36. // can't be first byte in UTF8
  37. if ((ui8)character[0] >= 0x80 && (ui8)character[0] < 0xC0)
  38. return false;
  39. // first character must follow rules checked in getCharacterSize
  40. size_t size = getCharacterSize((ui8)character[0]);
  41. if ((ui8)character[0] > 0xF4)
  42. return false; // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
  43. if (size > maxSize)
  44. return false;
  45. // remaining characters must have highest bit set to 1
  46. for (size_t i = 1; i < size; i++)
  47. {
  48. if (((ui8)character[i] & 0x80) == 0)
  49. return false;
  50. }
  51. return true;
  52. }
  53. bool Unicode::isValidASCII(const std::string & text)
  54. {
  55. for (const char & ch : text)
  56. if (ui8(ch) >= 0x80 )
  57. return false;
  58. return true;
  59. }
  60. bool Unicode::isValidASCII(const char * data, size_t size)
  61. {
  62. for (size_t i=0; i<size; i++)
  63. if (ui8(data[i]) >= 0x80 )
  64. return false;
  65. return true;
  66. }
  67. bool Unicode::isValidString(const std::string & text)
  68. {
  69. for (size_t i=0; i<text.size(); i += getCharacterSize(text[i]))
  70. {
  71. if (!isValidCharacter(text.data() + i, text.size() - i))
  72. return false;
  73. }
  74. return true;
  75. }
  76. bool Unicode::isValidString(const char * data, size_t size)
  77. {
  78. for (size_t i=0; i<size; i += getCharacterSize(data[i]))
  79. {
  80. if (!isValidCharacter(data + i, size - i))
  81. return false;
  82. }
  83. return true;
  84. }
  85. std::string Unicode::toUnicode(const std::string &text)
  86. {
  87. return toUnicode(text, CGeneralTextHandler::getInstalledEncoding());
  88. }
  89. std::string Unicode::toUnicode(const std::string &text, const std::string &encoding)
  90. {
  91. return boost::locale::conv::to_utf<char>(text, encoding);
  92. }
  93. std::string Unicode::fromUnicode(const std::string & text)
  94. {
  95. return fromUnicode(text, CGeneralTextHandler::getInstalledEncoding());
  96. }
  97. std::string Unicode::fromUnicode(const std::string &text, const std::string &encoding)
  98. {
  99. return boost::locale::conv::from_utf<char>(text, encoding);
  100. }
  101. void Unicode::trimRight(std::string & text, const size_t amount)
  102. {
  103. if(text.empty())
  104. return;
  105. //todo: more efficient algorithm
  106. for(int i = 0; i< amount; i++){
  107. auto b = text.begin();
  108. auto e = text.end();
  109. size_t lastLen = 0;
  110. size_t len = 0;
  111. while (b != e) {
  112. lastLen = len;
  113. size_t n = getCharacterSize(*b);
  114. if(!isValidCharacter(&(*b),e-b))
  115. {
  116. logGlobal->error("Invalid UTF8 sequence");
  117. break;//invalid sequence will be trimmed
  118. }
  119. len += n;
  120. b += n;
  121. }
  122. text.resize(lastLen);
  123. }
  124. }
  125. VCMI_LIB_NAMESPACE_END