TextOperations.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. /*
  2. * TextOperations.cpp, part of VCMI engine
  3. *
  4. * Authors: listed in file AUTHORS in main folder
  5. *
  6. * License: GNU General Public License v2.0 or later
  7. * Full text of license available in license.txt file, in main folder
  8. *
  9. */
  10. #include "StdInc.h"
  11. #include "TextOperations.h"
  12. #include "texts/CGeneralTextHandler.h"
  13. #include "Languages.h"
  14. #include "CConfigHandler.h"
  15. #include <vstd/DateUtils.h>
  16. #include <boost/locale.hpp>
  17. VCMI_LIB_NAMESPACE_BEGIN
  18. size_t TextOperations::getUnicodeCharacterSize(char firstByte)
  19. {
  20. // length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
  21. // 0xxxxxxx -> 1 - ASCII chars
  22. // 110xxxxx -> 2
  23. // 1110xxxx -> 3
  24. // 11110xxx -> 4 - last allowed in current standard
  25. auto value = static_cast<uint8_t>(firstByte);
  26. if ((value & 0b10000000) == 0)
  27. return 1; // ASCII
  28. if ((value & 0b11100000) == 0b11000000)
  29. return 2;
  30. if ((value & 0b11110000) == 0b11100000)
  31. return 3;
  32. if ((value & 0b11111000) == 0b11110000)
  33. return 4;
  34. assert(0);// invalid unicode sequence
  35. return 4;
  36. }
  37. bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)
  38. {
  39. assert(maxSize > 0);
  40. auto value = static_cast<uint8_t>(character[0]);
  41. // ASCII
  42. if ( value < 0b10000000)
  43. return maxSize > 0;
  44. // can't be first byte in UTF8
  45. if (value < 0b11000000)
  46. return false;
  47. // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
  48. if (value > 0b11110000)
  49. return false;
  50. // first character must follow rules checked in getUnicodeCharacterSize
  51. size_t size = getUnicodeCharacterSize(character[0]);
  52. if (size > maxSize)
  53. return false;
  54. // remaining characters must have highest bit set to 1
  55. for (size_t i = 1; i < size; i++)
  56. {
  57. auto characterValue = static_cast<uint8_t>(character[i]);
  58. if (characterValue < 0b10000000)
  59. return false;
  60. }
  61. return true;
  62. }
  63. bool TextOperations::isValidASCII(const std::string & text)
  64. {
  65. for (const char & ch : text)
  66. if (static_cast<uint8_t>(ch) >= 0x80 )
  67. return false;
  68. return true;
  69. }
  70. bool TextOperations::isValidASCII(const char * data, size_t size)
  71. {
  72. for (size_t i=0; i<size; i++)
  73. if (static_cast<uint8_t>(data[i]) >= 0x80 )
  74. return false;
  75. return true;
  76. }
  77. bool TextOperations::isValidUnicodeString(const std::string & text)
  78. {
  79. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  80. {
  81. if (!isValidUnicodeCharacter(text.data() + i, text.size() - i))
  82. return false;
  83. }
  84. return true;
  85. }
  86. bool TextOperations::isValidUnicodeString(const char * data, size_t size)
  87. {
  88. for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i]))
  89. {
  90. if (!isValidUnicodeCharacter(data + i, size - i))
  91. return false;
  92. }
  93. return true;
  94. }
  95. uint32_t TextOperations::getUnicodeCodepoint(const char * data, size_t maxSize)
  96. {
  97. assert(isValidUnicodeCharacter(data, maxSize));
  98. if (!isValidUnicodeCharacter(data, maxSize))
  99. return 0;
  100. // https://en.wikipedia.org/wiki/UTF-8#Encoding
  101. switch (getUnicodeCharacterSize(data[0]))
  102. {
  103. case 1:
  104. return static_cast<uint8_t>(data[0]) & 0b1111111;
  105. case 2:
  106. return
  107. ((static_cast<uint8_t>(data[0]) & 0b11111 ) << 6) +
  108. ((static_cast<uint8_t>(data[1]) & 0b111111) << 0) ;
  109. case 3:
  110. return
  111. ((static_cast<uint8_t>(data[0]) & 0b1111 ) << 12) +
  112. ((static_cast<uint8_t>(data[1]) & 0b111111) << 6) +
  113. ((static_cast<uint8_t>(data[2]) & 0b111111) << 0) ;
  114. case 4:
  115. return
  116. ((static_cast<uint8_t>(data[0]) & 0b111 ) << 18) +
  117. ((static_cast<uint8_t>(data[1]) & 0b111111) << 12) +
  118. ((static_cast<uint8_t>(data[2]) & 0b111111) << 6) +
  119. ((static_cast<uint8_t>(data[3]) & 0b111111) << 0) ;
  120. }
  121. assert(0);
  122. return 0;
  123. }
  124. uint32_t TextOperations::getUnicodeCodepoint(char data, const std::string & encoding )
  125. {
  126. std::string stringNative(1, data);
  127. std::string stringUnicode = toUnicode(stringNative, encoding);
  128. if (stringUnicode.empty())
  129. return 0;
  130. return getUnicodeCodepoint(stringUnicode.data(), stringUnicode.size());
  131. }
  132. std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)
  133. {
  134. try {
  135. return boost::locale::conv::to_utf<char>(text, encoding);
  136. }
  137. catch (const boost::locale::conv::conversion_error &)
  138. {
  139. throw std::runtime_error("Failed to convert text '" + text + "' from encoding " + encoding );
  140. }
  141. }
  142. std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)
  143. {
  144. try {
  145. return boost::locale::conv::from_utf<char>(text, encoding);
  146. }
  147. catch (const boost::locale::conv::conversion_error &)
  148. {
  149. throw std::runtime_error("Failed to convert text '" + text + "' to encoding " + encoding );
  150. }
  151. }
  152. void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
  153. {
  154. if(text.empty())
  155. return;
  156. //todo: more efficient algorithm
  157. for(int i = 0; i< amount; i++){
  158. auto b = text.begin();
  159. auto e = text.end();
  160. size_t lastLen = 0;
  161. size_t len = 0;
  162. while (b != e) {
  163. lastLen = len;
  164. size_t n = getUnicodeCharacterSize(*b);
  165. if(!isValidUnicodeCharacter(&(*b),e-b))
  166. {
  167. logGlobal->error("Invalid UTF8 sequence");
  168. break;//invalid sequence will be trimmed
  169. }
  170. len += n;
  171. b += n;
  172. }
  173. text.resize(lastLen);
  174. }
  175. }
  176. size_t TextOperations::getUnicodeCharactersCount(const std::string & text)
  177. {
  178. size_t charactersCount = 0;
  179. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  180. charactersCount++;
  181. return charactersCount;
  182. }
  183. std::string TextOperations::escapeString(std::string input)
  184. {
  185. boost::replace_all(input, "\\", "\\\\");
  186. boost::replace_all(input, "\n", "\\n");
  187. boost::replace_all(input, "\r", "\\r");
  188. boost::replace_all(input, "\t", "\\t");
  189. boost::replace_all(input, "\"", "\\\"");
  190. return input;
  191. }
  192. std::string TextOperations::getFormattedDateTimeLocal(std::time_t dt)
  193. {
  194. return vstd::getFormattedDateTime(dt, Languages::getLanguageOptions(settings["general"]["language"].String()).dateTimeFormat);
  195. }
  196. std::string TextOperations::getFormattedTimeLocal(std::time_t dt)
  197. {
  198. return vstd::getFormattedDateTime(dt, "%H:%M");
  199. }
  200. std::string TextOperations::getCurrentFormattedTimeLocal(std::chrono::seconds timeOffset)
  201. {
  202. auto timepoint = std::chrono::system_clock::now() + timeOffset;
  203. return TextOperations::getFormattedTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
  204. }
  205. std::string TextOperations::getCurrentFormattedDateTimeLocal(std::chrono::seconds timeOffset)
  206. {
  207. auto timepoint = std::chrono::system_clock::now() + timeOffset;
  208. return TextOperations::getFormattedDateTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
  209. }
  210. int TextOperations::getLevenshteinDistance(const std::string & s, const std::string & t)
  211. {
  212. int n = t.size();
  213. int m = s.size();
  214. // create two work vectors of integer distances
  215. std::vector<int> v0(n+1, 0);
  216. std::vector<int> v1(n+1, 0);
  217. // initialize v0 (the previous row of distances)
  218. // this row is A[0][i]: edit distance from an empty s to t;
  219. // that distance is the number of characters to append to s to make t.
  220. for (int i = 0; i < n; ++i)
  221. v0[i] = i;
  222. for (int i = 0; i < m; ++i)
  223. {
  224. // calculate v1 (current row distances) from the previous row v0
  225. // first element of v1 is A[i + 1][0]
  226. // edit distance is delete (i + 1) chars from s to match empty t
  227. v1[0] = i + 1;
  228. // use formula to fill in the rest of the row
  229. for (int j = 0; j < n; ++j)
  230. {
  231. // calculating costs for A[i + 1][j + 1]
  232. int deletionCost = v0[j + 1] + 1;
  233. int insertionCost = v1[j] + 1;
  234. int substitutionCost;
  235. if (s[i] == t[j])
  236. substitutionCost = v0[j];
  237. else
  238. substitutionCost = v0[j] + 1;
  239. v1[j + 1] = std::min({deletionCost, insertionCost, substitutionCost});
  240. }
  241. // copy v1 (current row) to v0 (previous row) for next iteration
  242. // since data in v1 is always invalidated, a swap without copy could be more efficient
  243. std::swap(v0, v1);
  244. }
  245. // after the last swap, the results of v1 are now in v0
  246. return v0[n];
  247. }
  248. bool TextOperations::textSearchSimilar(const std::string & s, const std::string & t)
  249. {
  250. boost::locale::generator gen;
  251. std::locale loc = gen("en_US.UTF-8"); // support for UTF8 lowercase
  252. auto haystack = boost::locale::to_lower(t, loc);
  253. auto needle = boost::locale::to_lower(s, loc);
  254. if(boost::algorithm::contains(haystack, needle))
  255. return true;
  256. if(needle.size() > haystack.size())
  257. return false;
  258. for(int i = 0; i < haystack.size() - needle.size() + 1; i++)
  259. {
  260. auto dist = getLevenshteinDistance(haystack.substr(i, needle.size()), needle);
  261. if(needle.size() > 2 && dist <= 1)
  262. return true;
  263. else if(needle.size() > 4 && dist <= 2)
  264. return true;
  265. }
  266. return false;
  267. }
  268. VCMI_LIB_NAMESPACE_END