TextOperations.cpp 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. /*
  2. * TextOperations.cpp, part of VCMI engine
  3. *
  4. * Authors: listed in file AUTHORS in main folder
  5. *
  6. * License: GNU General Public License v2.0 or later
  7. * Full text of license available in license.txt file, in main folder
  8. *
  9. */
  10. #include "StdInc.h"
  11. #include "TextOperations.h"
  12. #include "../GameLibrary.h"
  13. #include "../texts/CGeneralTextHandler.h"
  14. #include "Languages.h"
  15. #include "CConfigHandler.h"
  16. #include <vstd/DateUtils.h>
  17. #include <boost/locale.hpp>
  18. VCMI_LIB_NAMESPACE_BEGIN
  19. size_t TextOperations::getUnicodeCharacterSize(char firstByte)
  20. {
  21. // length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
  22. // 0xxxxxxx -> 1 - ASCII chars
  23. // 110xxxxx -> 2
  24. // 1110xxxx -> 3
  25. // 11110xxx -> 4 - last allowed in current standard
  26. auto value = static_cast<uint8_t>(firstByte);
  27. if ((value & 0b10000000) == 0)
  28. return 1; // ASCII
  29. if ((value & 0b11100000) == 0b11000000)
  30. return 2;
  31. if ((value & 0b11110000) == 0b11100000)
  32. return 3;
  33. if ((value & 0b11111000) == 0b11110000)
  34. return 4;
  35. assert(0);// invalid unicode sequence
  36. return 4;
  37. }
  38. bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)
  39. {
  40. assert(maxSize > 0);
  41. auto value = static_cast<uint8_t>(character[0]);
  42. // ASCII
  43. if ( value < 0b10000000)
  44. return maxSize > 0;
  45. // can't be first byte in UTF8
  46. if (value < 0b11000000)
  47. return false;
  48. // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
  49. if (value > 0b11110000)
  50. return false;
  51. // first character must follow rules checked in getUnicodeCharacterSize
  52. size_t size = getUnicodeCharacterSize(character[0]);
  53. if (size > maxSize)
  54. return false;
  55. // remaining characters must have highest bit set to 1
  56. for (size_t i = 1; i < size; i++)
  57. {
  58. auto characterValue = static_cast<uint8_t>(character[i]);
  59. if (characterValue < 0b10000000)
  60. return false;
  61. }
  62. return true;
  63. }
  64. bool TextOperations::isValidASCII(const std::string & text)
  65. {
  66. for (const char & ch : text)
  67. if (static_cast<uint8_t>(ch) >= 0x80 )
  68. return false;
  69. return true;
  70. }
  71. bool TextOperations::isValidASCII(const char * data, size_t size)
  72. {
  73. for (size_t i=0; i<size; i++)
  74. if (static_cast<uint8_t>(data[i]) >= 0x80 )
  75. return false;
  76. return true;
  77. }
  78. bool TextOperations::isValidUnicodeString(const std::string & text)
  79. {
  80. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  81. {
  82. if (!isValidUnicodeCharacter(text.data() + i, text.size() - i))
  83. return false;
  84. }
  85. return true;
  86. }
  87. bool TextOperations::isValidUnicodeString(const char * data, size_t size)
  88. {
  89. for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i]))
  90. {
  91. if (!isValidUnicodeCharacter(data + i, size - i))
  92. return false;
  93. }
  94. return true;
  95. }
  96. uint32_t TextOperations::getUnicodeCodepoint(const char * data, size_t maxSize)
  97. {
  98. assert(isValidUnicodeCharacter(data, maxSize));
  99. if (!isValidUnicodeCharacter(data, maxSize))
  100. return 0;
  101. // https://en.wikipedia.org/wiki/UTF-8#Encoding
  102. switch (getUnicodeCharacterSize(data[0]))
  103. {
  104. case 1:
  105. return static_cast<uint8_t>(data[0]) & 0b1111111;
  106. case 2:
  107. return
  108. ((static_cast<uint8_t>(data[0]) & 0b11111 ) << 6) +
  109. ((static_cast<uint8_t>(data[1]) & 0b111111) << 0) ;
  110. case 3:
  111. return
  112. ((static_cast<uint8_t>(data[0]) & 0b1111 ) << 12) +
  113. ((static_cast<uint8_t>(data[1]) & 0b111111) << 6) +
  114. ((static_cast<uint8_t>(data[2]) & 0b111111) << 0) ;
  115. case 4:
  116. return
  117. ((static_cast<uint8_t>(data[0]) & 0b111 ) << 18) +
  118. ((static_cast<uint8_t>(data[1]) & 0b111111) << 12) +
  119. ((static_cast<uint8_t>(data[2]) & 0b111111) << 6) +
  120. ((static_cast<uint8_t>(data[3]) & 0b111111) << 0) ;
  121. }
  122. assert(0);
  123. return 0;
  124. }
  125. uint32_t TextOperations::getUnicodeCodepoint(char data, const std::string & encoding )
  126. {
  127. std::string stringNative(1, data);
  128. std::string stringUnicode = toUnicode(stringNative, encoding);
  129. if (stringUnicode.empty())
  130. return 0;
  131. return getUnicodeCodepoint(stringUnicode.data(), stringUnicode.size());
  132. }
  133. std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)
  134. {
  135. try {
  136. return boost::locale::conv::to_utf<char>(text, encoding);
  137. }
  138. catch (const boost::locale::conv::conversion_error &)
  139. {
  140. throw std::runtime_error("Failed to convert text '" + text + "' from encoding " + encoding );
  141. }
  142. }
  143. std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)
  144. {
  145. try {
  146. return boost::locale::conv::from_utf<char>(text, encoding);
  147. }
  148. catch (const boost::locale::conv::conversion_error &)
  149. {
  150. throw std::runtime_error("Failed to convert text '" + text + "' to encoding " + encoding );
  151. }
  152. }
  153. void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
  154. {
  155. if(text.empty())
  156. return;
  157. //todo: more efficient algorithm
  158. for(int i = 0; i< amount; i++){
  159. auto b = text.begin();
  160. auto e = text.end();
  161. size_t lastLen = 0;
  162. size_t len = 0;
  163. while (b != e) {
  164. lastLen = len;
  165. size_t n = getUnicodeCharacterSize(*b);
  166. if(!isValidUnicodeCharacter(&(*b),e-b))
  167. {
  168. logGlobal->error("Invalid UTF8 sequence");
  169. break;//invalid sequence will be trimmed
  170. }
  171. len += n;
  172. b += n;
  173. }
  174. text.resize(lastLen);
  175. }
  176. }
  177. size_t TextOperations::getUnicodeCharactersCount(const std::string & text)
  178. {
  179. size_t charactersCount = 0;
  180. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  181. charactersCount++;
  182. return charactersCount;
  183. }
  184. std::string TextOperations::escapeString(std::string input)
  185. {
  186. boost::replace_all(input, "\\", "\\\\");
  187. boost::replace_all(input, "\n", "\\n");
  188. boost::replace_all(input, "\r", "\\r");
  189. boost::replace_all(input, "\t", "\\t");
  190. boost::replace_all(input, "\"", "\\\"");
  191. return input;
  192. }
  193. std::string TextOperations::getFormattedDateTimeLocal(std::time_t dt)
  194. {
  195. return vstd::getFormattedDateTime(dt, Languages::getLanguageOptions(settings["general"]["language"].String()).dateTimeFormat);
  196. }
  197. std::string TextOperations::getFormattedTimeLocal(std::time_t dt)
  198. {
  199. return vstd::getFormattedDateTime(dt, "%H:%M");
  200. }
  201. std::string TextOperations::getCurrentFormattedTimeLocal(std::chrono::seconds timeOffset)
  202. {
  203. auto timepoint = std::chrono::system_clock::now() + timeOffset;
  204. return TextOperations::getFormattedTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
  205. }
  206. std::string TextOperations::getCurrentFormattedDateTimeLocal(std::chrono::seconds timeOffset)
  207. {
  208. auto timepoint = std::chrono::system_clock::now() + timeOffset;
  209. return TextOperations::getFormattedDateTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
  210. }
  211. int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view t)
  212. {
  213. int n = t.size();
  214. int m = s.size();
  215. // create two work vectors of integer distances
  216. std::vector<int> v0(n+1, 0);
  217. std::vector<int> v1(n+1, 0);
  218. // initialize v0 (the previous row of distances)
  219. // this row is A[0][i]: edit distance from an empty s to t;
  220. // that distance is the number of characters to append to s to make t.
  221. for (int i = 0; i < n; ++i)
  222. v0[i] = i;
  223. for (int i = 0; i < m; ++i)
  224. {
  225. // calculate v1 (current row distances) from the previous row v0
  226. // first element of v1 is A[i + 1][0]
  227. // edit distance is delete (i + 1) chars from s to match empty t
  228. v1[0] = i + 1;
  229. // use formula to fill in the rest of the row
  230. for (int j = 0; j < n; ++j)
  231. {
  232. // calculating costs for A[i + 1][j + 1]
  233. int deletionCost = v0[j + 1] + 1;
  234. int insertionCost = v1[j] + 1;
  235. int substitutionCost;
  236. if (s[i] == t[j])
  237. substitutionCost = v0[j];
  238. else
  239. substitutionCost = v0[j] + 1;
  240. v1[j + 1] = std::min({deletionCost, insertionCost, substitutionCost});
  241. }
  242. // copy v1 (current row) to v0 (previous row) for next iteration
  243. // since data in v1 is always invalidated, a swap without copy could be more efficient
  244. std::swap(v0, v1);
  245. }
  246. // after the last swap, the results of v1 are now in v0
  247. return v0[n];
  248. }
  249. DLL_LINKAGE std::string TextOperations::getLocaleName()
  250. {
  251. return Languages::getLanguageOptions(LIBRARY->generaltexth->getPreferredLanguage()).localeName;
  252. }
  253. DLL_LINKAGE bool TextOperations::compareLocalizedStrings(std::string_view str1, std::string_view str2)
  254. {
  255. static const std::locale loc(getLocaleName());
  256. static const std::collate<char> & col = std::use_facet<std::collate<char>>(loc);
  257. return col.compare(str1.data(), str1.data() + str1.size(),
  258. str2.data(), str2.data() + str2.size()) < 0;
  259. }
  260. std::optional<int> TextOperations::textSearchSimilarityScore(const std::string & s, const std::string & t)
  261. {
  262. static const std::locale loc = boost::locale::generator().generate(getLocaleName());
  263. auto haystack = boost::locale::to_lower(t, loc);
  264. auto needle = boost::locale::to_lower(s, loc);
  265. // 0 - Best possible match: text starts with the search string
  266. if(haystack.rfind(needle, 0) == 0)
  267. return 0;
  268. // 1 - Strong match: text contains the search string
  269. if(haystack.find(needle) != std::string::npos)
  270. return 1;
  271. // Dynamic threshold: Reject if too many typos based on word length
  272. int maxAllowedDistance = std::max(2, static_cast<int>(needle.size() / 2));
  273. // Compute Levenshtein distance for fuzzy similarity
  274. int minDist = std::numeric_limits<int>::max();
  275. for(size_t i = 0; i <= haystack.size() - needle.size(); i++)
  276. {
  277. int dist = getLevenshteinDistance(haystack.substr(i, needle.size()), needle);
  278. minDist = std::min(minDist, dist);
  279. }
  280. // Apply scaling: Short words tolerate smaller distances
  281. if(needle.size() > 2 && minDist <= 2)
  282. minDist += 1;
  283. return (minDist > maxAllowedDistance) ? std::nullopt : std::optional<int>{ minDist };
  284. }
  285. VCMI_LIB_NAMESPACE_END