TextOperations.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. /*
  2. * TextOperations.cpp, part of VCMI engine
  3. *
  4. * Authors: listed in file AUTHORS in main folder
  5. *
  6. * License: GNU General Public License v2.0 or later
  7. * Full text of license available in license.txt file, in main folder
  8. *
  9. */
  10. #include "StdInc.h"
  11. #include "TextOperations.h"
  12. #include "../GameLibrary.h"
  13. #include "../texts/CGeneralTextHandler.h"
  14. #include "Languages.h"
  15. #include "CConfigHandler.h"
  16. #include <vstd/DateUtils.h>
  17. #include <boost/locale/encoding.hpp>
  18. VCMI_LIB_NAMESPACE_BEGIN
  19. size_t TextOperations::getUnicodeCharacterSize(char firstByte)
  20. {
  21. // length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
  22. // 0xxxxxxx -> 1 - ASCII chars
  23. // 110xxxxx -> 2
  24. // 1110xxxx -> 3
  25. // 11110xxx -> 4 - last allowed in current standard
  26. auto value = static_cast<uint8_t>(firstByte);
  27. if ((value & 0b10000000) == 0)
  28. return 1; // ASCII
  29. if ((value & 0b11100000) == 0b11000000)
  30. return 2;
  31. if ((value & 0b11110000) == 0b11100000)
  32. return 3;
  33. if ((value & 0b11111000) == 0b11110000)
  34. return 4;
  35. assert(0);// invalid unicode sequence
  36. return 4;
  37. }
  38. bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)
  39. {
  40. assert(maxSize > 0);
  41. auto value = static_cast<uint8_t>(character[0]);
  42. // ASCII
  43. if ( value < 0b10000000)
  44. return maxSize > 0;
  45. // can't be first byte in UTF8
  46. if (value < 0b11000000)
  47. return false;
  48. // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
  49. if (value > 0b11110000)
  50. return false;
  51. // first character must follow rules checked in getUnicodeCharacterSize
  52. size_t size = getUnicodeCharacterSize(character[0]);
  53. if (size > maxSize)
  54. return false;
  55. // remaining characters must have highest bit set to 1
  56. for (size_t i = 1; i < size; i++)
  57. {
  58. auto characterValue = static_cast<uint8_t>(character[i]);
  59. if (characterValue < 0b10000000)
  60. return false;
  61. }
  62. return true;
  63. }
  64. bool TextOperations::isValidASCII(const std::string & text)
  65. {
  66. for (const char & ch : text)
  67. if (static_cast<uint8_t>(ch) >= 0x80 )
  68. return false;
  69. return true;
  70. }
  71. bool TextOperations::isValidASCII(const char * data, size_t size)
  72. {
  73. for (size_t i=0; i<size; i++)
  74. if (static_cast<uint8_t>(data[i]) >= 0x80 )
  75. return false;
  76. return true;
  77. }
  78. bool TextOperations::isValidUnicodeString(std::string_view text)
  79. {
  80. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  81. {
  82. if (!isValidUnicodeCharacter(text.data() + i, text.size() - i))
  83. return false;
  84. }
  85. return true;
  86. }
  87. bool TextOperations::isValidUnicodeString(const char * data, size_t size)
  88. {
  89. for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i]))
  90. {
  91. if (!isValidUnicodeCharacter(data + i, size - i))
  92. return false;
  93. }
  94. return true;
  95. }
  96. uint32_t TextOperations::getUnicodeCodepoint(const char * data, size_t maxSize)
  97. {
  98. assert(isValidUnicodeCharacter(data, maxSize));
  99. if (!isValidUnicodeCharacter(data, maxSize))
  100. return 0;
  101. // https://en.wikipedia.org/wiki/UTF-8#Encoding
  102. switch (getUnicodeCharacterSize(data[0]))
  103. {
  104. case 1:
  105. return static_cast<uint8_t>(data[0]) & 0b1111111;
  106. case 2:
  107. return
  108. ((static_cast<uint8_t>(data[0]) & 0b11111 ) << 6) +
  109. ((static_cast<uint8_t>(data[1]) & 0b111111) << 0) ;
  110. case 3:
  111. return
  112. ((static_cast<uint8_t>(data[0]) & 0b1111 ) << 12) +
  113. ((static_cast<uint8_t>(data[1]) & 0b111111) << 6) +
  114. ((static_cast<uint8_t>(data[2]) & 0b111111) << 0) ;
  115. case 4:
  116. return
  117. ((static_cast<uint8_t>(data[0]) & 0b111 ) << 18) +
  118. ((static_cast<uint8_t>(data[1]) & 0b111111) << 12) +
  119. ((static_cast<uint8_t>(data[2]) & 0b111111) << 6) +
  120. ((static_cast<uint8_t>(data[3]) & 0b111111) << 0) ;
  121. }
  122. assert(0);
  123. return 0;
  124. }
  125. uint32_t TextOperations::getUnicodeCodepoint(char data, const std::string & encoding )
  126. {
  127. std::string stringNative(1, data);
  128. std::string stringUnicode = toUnicode(stringNative, encoding);
  129. if (stringUnicode.empty())
  130. return 0;
  131. return getUnicodeCodepoint(stringUnicode.data(), stringUnicode.size());
  132. }
  133. std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)
  134. {
  135. try {
  136. return boost::locale::conv::to_utf<char>(text, encoding);
  137. }
  138. catch (const boost::locale::conv::conversion_error &)
  139. {
  140. throw std::runtime_error("Failed to convert text '" + text + "' from encoding " + encoding );
  141. }
  142. }
  143. std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)
  144. {
  145. try {
  146. return boost::locale::conv::from_utf<char>(text, encoding);
  147. }
  148. catch (const boost::locale::conv::conversion_error &)
  149. {
  150. throw std::runtime_error("Failed to convert text '" + text + "' to encoding " + encoding );
  151. }
  152. }
  153. void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
  154. {
  155. if(text.empty())
  156. return;
  157. //todo: more efficient algorithm
  158. for(int i = 0; i< amount; i++){
  159. auto b = text.begin();
  160. auto e = text.end();
  161. size_t lastLen = 0;
  162. size_t len = 0;
  163. while (b != e) {
  164. lastLen = len;
  165. size_t n = getUnicodeCharacterSize(*b);
  166. if(!isValidUnicodeCharacter(&(*b),e-b))
  167. {
  168. logGlobal->error("Invalid UTF8 sequence");
  169. break;//invalid sequence will be trimmed
  170. }
  171. len += n;
  172. b += n;
  173. }
  174. text.resize(lastLen);
  175. }
  176. }
  177. size_t TextOperations::getUnicodeCharactersCount(std::string_view text)
  178. {
  179. size_t charactersCount = 0;
  180. for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
  181. charactersCount++;
  182. return charactersCount;
  183. }
  184. std::string TextOperations::escapeString(std::string input)
  185. {
  186. boost::replace_all(input, "\\", "\\\\");
  187. boost::replace_all(input, "\n", "\\n");
  188. boost::replace_all(input, "\r", "\\r");
  189. boost::replace_all(input, "\t", "\\t");
  190. boost::replace_all(input, "\"", "\\\"");
  191. return input;
  192. }
  193. std::string TextOperations::getFormattedDateTimeLocal(std::time_t dt)
  194. {
  195. return vstd::getFormattedDateTime(dt, Languages::getLanguageOptions(settings["general"]["language"].String()).dateTimeFormat);
  196. }
  197. std::string TextOperations::getFormattedTimeLocal(std::time_t dt)
  198. {
  199. return vstd::getFormattedDateTime(dt, "%H:%M");
  200. }
  201. std::string TextOperations::getCurrentFormattedTimeLocal(std::chrono::seconds timeOffset)
  202. {
  203. auto timepoint = std::chrono::system_clock::now() + timeOffset;
  204. return TextOperations::getFormattedTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
  205. }
  206. std::string TextOperations::getCurrentFormattedDateTimeLocal(std::chrono::seconds timeOffset)
  207. {
  208. auto timepoint = std::chrono::system_clock::now() + timeOffset;
  209. return TextOperations::getFormattedDateTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
  210. }
  211. static const std::locale & getLocale()
  212. {
  213. auto getLocale = []() -> std::locale
  214. {
  215. const std::string & baseLocaleName = Languages::getLanguageOptions(LIBRARY->generaltexth->getPreferredLanguage()).localeName;
  216. const std::string fallbackLocale = Languages::getLanguageOptions(Languages::ELanguages::ENGLISH).localeName;
  217. for (const auto & localeName : { baseLocaleName + ".UTF-8", baseLocaleName, fallbackLocale + ".UTF-8", fallbackLocale })
  218. {
  219. try
  220. {
  221. // Locale generation may fail (and throw an exception) in two cases:
  222. // - if the corresponding locale is not installed on the system
  223. // - on Android named locales are not supported at all and always throw an exception
  224. return std::locale(localeName);
  225. }
  226. catch (const std::exception & e)
  227. {
  228. logGlobal->warn("Failed to set locale '%s'", localeName);
  229. }
  230. }
  231. return std::locale();
  232. };
  233. static const std::locale locale = getLocale();
  234. return locale;
  235. }
  236. int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view t)
  237. {
  238. assert(isValidUnicodeString(s));
  239. assert(isValidUnicodeString(t));
  240. auto charactersEqual = [&s, &t](int sPoint, int tPoint)
  241. {
  242. uint32_t sUTF32 = getUnicodeCodepoint(s.data() + sPoint, s.size() - sPoint);
  243. uint32_t tUTF32 = getUnicodeCodepoint(t.data() + tPoint, t.size() - tPoint);
  244. if (sUTF32 == tUTF32)
  245. return true;
  246. // Windows - wchar_t represents UTF-16 symbol that does not cover entire Unicode
  247. // In UTF-16 such characters can only be represented as 2 wchar_t's, but toupper can only operate on single wchar
  248. // Assume symbols are different if one of them cannot be represented as a single UTF-16 symbol
  249. if constexpr (sizeof(wchar_t) == 2)
  250. {
  251. if (sUTF32 > 0xFFFF || (sUTF32 >= 0xD800 && sUTF32 <= 0xDFFF ))
  252. return false;
  253. if (tUTF32 > 0xFFFF || (tUTF32 >= 0xD800 && tUTF32 <= 0xDFFF ))
  254. return false;
  255. }
  256. const auto & facet = std::use_facet<std::ctype<wchar_t>>(getLocale());
  257. return facet.toupper(sUTF32) == facet.toupper(tUTF32);
  258. };
  259. int n = getUnicodeCharactersCount(t);
  260. int m = getUnicodeCharactersCount(s);
  261. // create two work vectors of integer distances
  262. std::vector<int> v0(n+1, 0);
  263. std::vector<int> v1(n+1, 0);
  264. // initialize v0 (the previous row of distances)
  265. // this row is A[0][i]: edit distance from an empty s to t;
  266. // that distance is the number of characters to append to s to make t.
  267. for (int i = 0; i < n; ++i)
  268. v0[i] = i;
  269. for (int i = 0, iPoint = 0; i < m; ++i, iPoint += getUnicodeCharacterSize(s[iPoint]))
  270. {
  271. // calculate v1 (current row distances) from the previous row v0
  272. // first element of v1 is A[i + 1][0]
  273. // edit distance is delete (i + 1) chars from s to match empty t
  274. v1[0] = i + 1;
  275. // use formula to fill in the rest of the row
  276. for (int j = 0, jPoint = 0; j < n; ++j, jPoint += getUnicodeCharacterSize(t[jPoint]))
  277. {
  278. // calculating costs for A[i + 1][j + 1]
  279. int deletionCost = v0[j + 1] + 1;
  280. int insertionCost = v1[j] + 1;
  281. int substitutionCost;
  282. if (charactersEqual(iPoint, jPoint))
  283. substitutionCost = v0[j];
  284. else
  285. substitutionCost = v0[j] + 1;
  286. v1[j + 1] = std::min({deletionCost, insertionCost, substitutionCost});
  287. }
  288. // copy v1 (current row) to v0 (previous row) for next iteration
  289. // since data in v1 is always invalidated, a swap without copy could be more efficient
  290. std::swap(v0, v1);
  291. }
  292. // after the last swap, the results of v1 are now in v0
  293. return v0[n];
  294. }
  295. DLL_LINKAGE bool TextOperations::compareLocalizedStrings(std::string_view str1, std::string_view str2)
  296. {
  297. const std::collate<char> & col = std::use_facet<std::collate<char>>(getLocale());
  298. return col.compare(
  299. str1.data(), str1.data() + str1.size(),
  300. str2.data(), str2.data() + str2.size()
  301. ) < 0;
  302. }
  303. std::optional<int> TextOperations::textSearchSimilarityScore(const std::string & needle, const std::string & haystack)
  304. {
  305. // 0 - Best possible match: text starts with the search string
  306. if(haystack.rfind(needle, 0) == 0)
  307. return 0;
  308. // 1 - Strong match: text contains the search string
  309. if(haystack.find(needle) != std::string::npos)
  310. return 1;
  311. // Dynamic threshold: Reject if too many typos based on word length
  312. int haystackCodepoints = getUnicodeCharactersCount(haystack);
  313. int needleCodepoints = getUnicodeCharactersCount(needle);
  314. int maxAllowedDistance = needleCodepoints / 2;
  315. // Compute Levenshtein distance for fuzzy similarity
  316. int minDist = std::numeric_limits<int>::max();
  317. for(int i = 0; i <= haystackCodepoints - needleCodepoints; ++i)
  318. {
  319. int haystackBegin = 0;
  320. for(int j = 0; j < i; ++j)
  321. haystackBegin += getUnicodeCharacterSize(haystack[haystackBegin]);
  322. int haystackEnd = haystackBegin;
  323. for(int j = 0; j < needleCodepoints; ++j)
  324. haystackEnd += getUnicodeCharacterSize(haystack[haystackEnd]);
  325. int dist = getLevenshteinDistance(haystack.substr(haystackBegin, haystackEnd - haystackBegin), needle);
  326. minDist = std::min(minDist, dist);
  327. }
  328. // Apply scaling: Short words tolerate smaller distances
  329. if(needle.size() > 2 && minDist <= 2)
  330. minDist += 1;
  331. return (minDist > maxAllowedDistance) ? std::nullopt : std::optional<int>{ minDist };
  332. }
  333. std::string TextOperations::filesystemPathToUtf8(const boost::filesystem::path& path)
  334. {
  335. #ifdef VCMI_WINDOWS
  336. return boost::locale::conv::utf_to_utf<char>(path.native());
  337. #else
  338. return path.string();
  339. #endif
  340. }
  341. boost::filesystem::path TextOperations::Utf8TofilesystemPath(const std::string& path)
  342. {
  343. #ifdef VCMI_WINDOWS
  344. return boost::filesystem::path(boost::locale::conv::utf_to_utf<wchar_t>(path));
  345. #else
  346. return boost::filesystem::path(path.string());
  347. #endif
  348. }
  349. VCMI_LIB_NAMESPACE_END