|
@@ -16,39 +16,52 @@
|
|
|
|
|
|
VCMI_LIB_NAMESPACE_BEGIN
|
|
|
|
|
|
-size_t Unicode::getCharacterSize(char firstByte)
|
|
|
+size_t TextOperations::getUnicodeCharacterSize(char firstByte)
|
|
|
{
|
|
|
// length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
|
|
|
// 0xxxxxxx -> 1 - ASCII chars
|
|
|
// 110xxxxx -> 2
|
|
|
+ // 1110xxxx -> 3
|
|
|
// 11110xxx -> 4 - last allowed in current standard
|
|
|
- // 1111110x -> 6 - last allowed in original standard
|
|
|
|
|
|
- if ((ui8)firstByte < 0x80)
|
|
|
+ auto value = static_cast<uint8_t>(firstByte);
|
|
|
+
|
|
|
+ if ((value & 0b10000000) == 0)
|
|
|
return 1; // ASCII
|
|
|
|
|
|
- size_t ret = 0;
|
|
|
+ if ((value & 0b11100000) == 0b11000000)
|
|
|
+ return 2;
|
|
|
|
|
|
- for (size_t i=0; i<8; i++)
|
|
|
- {
|
|
|
- if (((ui8)firstByte & (0x80 >> i)) != 0)
|
|
|
- ret++;
|
|
|
- else
|
|
|
- break;
|
|
|
- }
|
|
|
- return ret;
|
|
|
+ if ((value & 0b11110000) == 0b11100000)
|
|
|
+ return 3;
|
|
|
+
|
|
|
+ if ((value & 0b11111000) == 0b11110000)
|
|
|
+ return 4;
|
|
|
+
|
|
|
+ assert(0);// invalid unicode sequence
|
|
|
+ return 4;
|
|
|
}
|
|
|
|
|
|
-bool Unicode::isValidCharacter(const char * character, size_t maxSize)
|
|
|
+bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)
|
|
|
{
|
|
|
+ assert(maxSize > 0);
|
|
|
+
|
|
|
+ auto value = static_cast<uint8_t>(character[0]);
|
|
|
+
|
|
|
+ // ASCII
|
|
|
+ if ( value < 0b10000000)
|
|
|
+ return maxSize > 0;
|
|
|
+
|
|
|
// can't be first byte in UTF8
|
|
|
- if ((ui8)character[0] >= 0x80 && (ui8)character[0] < 0xC0)
|
|
|
+ if (value < 0b11000000)
|
|
|
+ return false;
|
|
|
+
|
|
|
+ // lead bytes above 0xF4 can only encode values past the maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
|
|
|
+ if (value > 0b11110100)
|
|
|
return false;
|
|
|
- // first character must follow rules checked in getCharacterSize
|
|
|
- size_t size = getCharacterSize((ui8)character[0]);
|
|
|
|
|
|
- if ((ui8)character[0] > 0xF4)
|
|
|
- return false; // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
|
|
|
+ // first character must follow rules checked in getUnicodeCharacterSize
|
|
|
+ size_t size = getUnicodeCharacterSize(character[0]);
|
|
|
|
|
|
if (size > maxSize)
|
|
|
return false;
|
|
@@ -56,69 +69,70 @@ bool Unicode::isValidCharacter(const char * character, size_t maxSize)
|
|
|
// remaining characters must have highest bit set to 1
|
|
|
for (size_t i = 1; i < size; i++)
|
|
|
{
|
|
|
- if (((ui8)character[i] & 0x80) == 0)
|
|
|
+ auto characterValue = static_cast<uint8_t>(character[i]);
|
|
|
+ if (characterValue < 0b10000000)
|
|
|
return false;
|
|
|
}
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-bool Unicode::isValidASCII(const std::string & text)
|
|
|
+bool TextOperations::isValidASCII(const std::string & text)
|
|
|
{
|
|
|
for (const char & ch : text)
|
|
|
- if (ui8(ch) >= 0x80 )
|
|
|
+ if (static_cast<uint8_t>(ch) >= 0x80 )
|
|
|
return false;
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-bool Unicode::isValidASCII(const char * data, size_t size)
|
|
|
+bool TextOperations::isValidASCII(const char * data, size_t size)
|
|
|
{
|
|
|
for (size_t i=0; i<size; i++)
|
|
|
- if (ui8(data[i]) >= 0x80 )
|
|
|
+ if (static_cast<uint8_t>(data[i]) >= 0x80 )
|
|
|
return false;
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-bool Unicode::isValidString(const std::string & text)
|
|
|
+bool TextOperations::isValidUnicodeString(const std::string & text)
|
|
|
{
|
|
|
- for (size_t i=0; i<text.size(); i += getCharacterSize(text[i]))
|
|
|
+ for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
|
|
|
{
|
|
|
- if (!isValidCharacter(text.data() + i, text.size() - i))
|
|
|
+ if (!isValidUnicodeCharacter(text.data() + i, text.size() - i))
|
|
|
return false;
|
|
|
}
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-bool Unicode::isValidString(const char * data, size_t size)
|
|
|
+bool TextOperations::isValidUnicodeString(const char * data, size_t size)
|
|
|
{
|
|
|
- for (size_t i=0; i<size; i += getCharacterSize(data[i]))
|
|
|
+ for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i]))
|
|
|
{
|
|
|
- if (!isValidCharacter(data + i, size - i))
|
|
|
+ if (!isValidUnicodeCharacter(data + i, size - i))
|
|
|
return false;
|
|
|
}
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-std::string Unicode::toUnicode(const std::string &text)
|
|
|
+std::string TextOperations::toUnicode(const std::string &text)
|
|
|
{
|
|
|
return toUnicode(text, CGeneralTextHandler::getInstalledEncoding());
|
|
|
}
|
|
|
|
|
|
-std::string Unicode::toUnicode(const std::string &text, const std::string &encoding)
|
|
|
+std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)
|
|
|
{
|
|
|
return boost::locale::conv::to_utf<char>(text, encoding);
|
|
|
}
|
|
|
|
|
|
-std::string Unicode::fromUnicode(const std::string & text)
|
|
|
+std::string TextOperations::fromUnicode(const std::string & text)
|
|
|
{
|
|
|
return fromUnicode(text, CGeneralTextHandler::getInstalledEncoding());
|
|
|
}
|
|
|
|
|
|
-std::string Unicode::fromUnicode(const std::string &text, const std::string &encoding)
|
|
|
+std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)
|
|
|
{
|
|
|
return boost::locale::conv::from_utf<char>(text, encoding);
|
|
|
}
|
|
|
|
|
|
-void Unicode::trimRight(std::string & text, const size_t amount)
|
|
|
+void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
|
|
|
{
|
|
|
if(text.empty())
|
|
|
return;
|
|
@@ -130,9 +144,9 @@ void Unicode::trimRight(std::string & text, const size_t amount)
|
|
|
size_t len = 0;
|
|
|
while (b != e) {
|
|
|
lastLen = len;
|
|
|
- size_t n = getCharacterSize(*b);
|
|
|
+ size_t n = getUnicodeCharacterSize(*b);
|
|
|
|
|
|
- if(!isValidCharacter(&(*b),e-b))
|
|
|
+ if(!isValidUnicodeCharacter(&(*b),e-b))
|
|
|
{
|
|
|
logGlobal->error("Invalid UTF8 sequence");
|
|
|
break;//invalid sequence will be trimmed
|
|
@@ -146,4 +160,15 @@ void Unicode::trimRight(std::string & text, const size_t amount)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+std::string TextOperations::escapeString(std::string input)
|
|
|
+{
|
|
|
+ boost::replace_all(input, "\\", "\\\\");
|
|
|
+ boost::replace_all(input, "\n", "\\n");
|
|
|
+ boost::replace_all(input, "\r", "\\r");
|
|
|
+ boost::replace_all(input, "\t", "\\t");
|
|
|
+ boost::replace_all(input, "\"", "\\\"");
|
|
|
+
|
|
|
+ return input;
|
|
|
+}
|
|
|
+
|
|
|
VCMI_LIB_NAMESPACE_END
|